Total coverage: 112430 (7%)of 1829435
1700 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_JUMP_LABEL_H #define _LINUX_JUMP_LABEL_H /* * Jump label support * * Copyright (C) 2009-2012 Jason Baron <jbaron@redhat.com> * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra * * DEPRECATED API: * * The use of 'struct static_key' directly, is now DEPRECATED. In addition * static_key_{true,false}() is also DEPRECATED. IE DO NOT use the following: * * struct static_key false = STATIC_KEY_INIT_FALSE; * struct static_key true = STATIC_KEY_INIT_TRUE; * static_key_true() * static_key_false() * * The updated API replacements are: * * DEFINE_STATIC_KEY_TRUE(key); * DEFINE_STATIC_KEY_FALSE(key); * DEFINE_STATIC_KEY_ARRAY_TRUE(keys, count); * DEFINE_STATIC_KEY_ARRAY_FALSE(keys, count); * static_branch_likely() * static_branch_unlikely() * * Jump labels provide an interface to generate dynamic branches using * self-modifying code. Assuming toolchain and architecture support, if we * define a "key" that is initially false via "DEFINE_STATIC_KEY_FALSE(key)", * an "if (static_branch_unlikely(&key))" statement is an unconditional branch * (which defaults to false - and the true block is placed out of line). * Similarly, we can define an initially true key via * "DEFINE_STATIC_KEY_TRUE(key)", and use it in the same * "if (static_branch_unlikely(&key))", in which case we will generate an * unconditional branch to the out-of-line true branch. Keys that are * initially true or false can be using in both static_branch_unlikely() * and static_branch_likely() statements. * * At runtime we can change the branch target by setting the key * to true via a call to static_branch_enable(), or false using * static_branch_disable(). If the direction of the branch is switched by * these calls then we run-time modify the branch target via a * no-op -> jump or jump -> no-op conversion. For example, for an * initially false key that is used in an "if (static_branch_unlikely(&key))" * statement, setting the key to true requires us to patch in a jump * to the out-of-line of true branch. * * In addition to static_branch_{enable,disable}, we can also reference count * the key or branch direction via static_branch_{inc,dec}. Thus, * static_branch_inc() can be thought of as a 'make more true' and * static_branch_dec() as a 'make more false'. * * Since this relies on modifying code, the branch modifying functions * must be considered absolute slow paths (machine wide synchronization etc.). * OTOH, since the affected branches are unconditional, their runtime overhead * will be absolutely minimal, esp. in the default (off) case where the total * effect is a single NOP of appropriate size. The on case will patch in a jump * to the out-of-line block. * * When the control is directly exposed to userspace, it is prudent to delay the * decrement to avoid high frequency code modifications which can (and do) * cause significant performance degradation. Struct static_key_deferred and * static_key_slow_dec_deferred() provide for this. * * Lacking toolchain and or architecture support, static keys fall back to a * simple conditional branch. * * Additional babbling in: Documentation/staging/static-keys.rst */ #ifndef __ASSEMBLY__ #include <linux/types.h> #include <linux/compiler.h> #include <linux/cleanup.h> extern bool static_key_initialized; #define STATIC_KEY_CHECK_USE(key) WARN(!static_key_initialized, \ "%s(): static key '%pS' used before call to jump_label_init()", \ __func__, (key)) struct static_key { atomic_t enabled; #ifdef CONFIG_JUMP_LABEL /* * Note: * To make anonymous unions work with old compilers, the static * initialization of them requires brackets. This creates a dependency * on the order of the struct with the initializers. If any fields * are added, STATIC_KEY_INIT_TRUE and STATIC_KEY_INIT_FALSE may need * to be modified. * * bit 0 => 1 if key is initially true * 0 if initially false * bit 1 => 1 if points to struct static_key_mod * 0 if points to struct jump_entry */ union { unsigned long type; struct jump_entry *entries; struct static_key_mod *next; }; #endif /* CONFIG_JUMP_LABEL */ }; #endif /* __ASSEMBLY__ */ #ifdef CONFIG_JUMP_LABEL #include <asm/jump_label.h> #ifndef __ASSEMBLY__ #ifdef CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE struct jump_entry { s32 code; s32 target; long key; // key may be far away from the core kernel under KASLR }; static inline unsigned long jump_entry_code(const struct jump_entry *entry) { return (unsigned long)&entry->code + entry->code; } static inline unsigned long jump_entry_target(const struct jump_entry *entry) { return (unsigned long)&entry->target + entry->target; } static inline struct static_key *jump_entry_key(const struct jump_entry *entry) { long offset = entry->key & ~3L; return (struct static_key *)((unsigned long)&entry->key + offset); } #else static inline unsigned long jump_entry_code(const struct jump_entry *entry) { return entry->code; } static inline unsigned long jump_entry_target(const struct jump_entry *entry) { return entry->target; } static inline struct static_key *jump_entry_key(const struct jump_entry *entry) { return (struct static_key *)((unsigned long)entry->key & ~3UL); } #endif static inline bool jump_entry_is_branch(const struct jump_entry *entry) { return (unsigned long)entry->key & 1UL; } static inline bool jump_entry_is_init(const struct jump_entry *entry) { return (unsigned long)entry->key & 2UL; } static inline void jump_entry_set_init(struct jump_entry *entry, bool set) { if (set) entry->key |= 2; else entry->key &= ~2; } static inline int jump_entry_size(struct jump_entry *entry) { #ifdef JUMP_LABEL_NOP_SIZE return JUMP_LABEL_NOP_SIZE; #else return arch_jump_entry_size(entry); #endif } #endif #endif #ifndef __ASSEMBLY__ enum jump_label_type { JUMP_LABEL_NOP = 0, JUMP_LABEL_JMP, }; struct module; #ifdef CONFIG_JUMP_LABEL #define JUMP_TYPE_FALSE 0UL #define JUMP_TYPE_TRUE 1UL #define JUMP_TYPE_LINKED 2UL #define JUMP_TYPE_MASK 3UL static __always_inline bool static_key_false(struct static_key *key) { return arch_static_branch(key, false); } static __always_inline bool static_key_true(struct static_key *key) { return !arch_static_branch(key, true); } extern struct jump_entry __start___jump_table[]; extern struct jump_entry __stop___jump_table[]; extern void jump_label_init(void); extern void jump_label_init_ro(void); extern void jump_label_lock(void); extern void jump_label_unlock(void); extern void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type); extern bool arch_jump_label_transform_queue(struct jump_entry *entry, enum jump_label_type type); extern void arch_jump_label_transform_apply(void); extern int jump_label_text_reserved(void *start, void *end); extern bool static_key_slow_inc(struct static_key *key); extern bool static_key_fast_inc_not_disabled(struct static_key *key); extern void static_key_slow_dec(struct static_key *key); extern bool static_key_slow_inc_cpuslocked(struct static_key *key); extern void static_key_slow_dec_cpuslocked(struct static_key *key); extern int static_key_count(struct static_key *key); extern void static_key_enable(struct static_key *key); extern void static_key_disable(struct static_key *key); extern void static_key_enable_cpuslocked(struct static_key *key); extern void static_key_disable_cpuslocked(struct static_key *key); extern enum jump_label_type jump_label_init_type(struct jump_entry *entry); /* * We should be using ATOMIC_INIT() for initializing .enabled, but * the inclusion of atomic.h is problematic for inclusion of jump_label.h * in 'low-level' headers. Thus, we are initializing .enabled with a * raw value, but have added a BUILD_BUG_ON() to catch any issues in * jump_label_init() see: kernel/jump_label.c. */ #define STATIC_KEY_INIT_TRUE \ { .enabled = { 1 }, \ { .type = JUMP_TYPE_TRUE } } #define STATIC_KEY_INIT_FALSE \ { .enabled = { 0 }, \ { .type = JUMP_TYPE_FALSE } } #else /* !CONFIG_JUMP_LABEL */ #include <linux/atomic.h> #include <linux/bug.h> static __always_inline int static_key_count(struct static_key *key) { return raw_atomic_read(&key->enabled); } static __always_inline void jump_label_init(void) { static_key_initialized = true; } static __always_inline void jump_label_init_ro(void) { } static __always_inline bool static_key_false(struct static_key *key) { if (unlikely_notrace(static_key_count(key) > 0)) return true; return false; } static __always_inline bool static_key_true(struct static_key *key) { if (likely_notrace(static_key_count(key) > 0)) return true; return false; } static inline bool static_key_fast_inc_not_disabled(struct static_key *key) { int v; STATIC_KEY_CHECK_USE(key); /* * Prevent key->enabled getting negative to follow the same semantics * as for CONFIG_JUMP_LABEL=y, see kernel/jump_label.c comment. */ v = atomic_read(&key->enabled); do { if (v < 0 || (v + 1) < 0) return false; } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1))); return true; } #define static_key_slow_inc(key) static_key_fast_inc_not_disabled(key) static inline void static_key_slow_dec(struct static_key *key) { STATIC_KEY_CHECK_USE(key); atomic_dec(&key->enabled); } #define static_key_slow_inc_cpuslocked(key) static_key_slow_inc(key) #define static_key_slow_dec_cpuslocked(key) static_key_slow_dec(key) static inline int jump_label_text_reserved(void *start, void *end) { return 0; } static inline void jump_label_lock(void) {} static inline void jump_label_unlock(void) {} static inline void static_key_enable(struct static_key *key) { STATIC_KEY_CHECK_USE(key); if (atomic_read(&key->enabled) != 0) { WARN_ON_ONCE(atomic_read(&key->enabled) != 1); return; } atomic_set(&key->enabled, 1); } static inline void static_key_disable(struct static_key *key) { STATIC_KEY_CHECK_USE(key); if (atomic_read(&key->enabled) != 1) { WARN_ON_ONCE(atomic_read(&key->enabled) != 0); return; } atomic_set(&key->enabled, 0); } #define static_key_enable_cpuslocked(k) static_key_enable((k)) #define static_key_disable_cpuslocked(k) static_key_disable((k)) #define STATIC_KEY_INIT_TRUE { .enabled = ATOMIC_INIT(1) } #define STATIC_KEY_INIT_FALSE { .enabled = ATOMIC_INIT(0) } #endif /* CONFIG_JUMP_LABEL */ DEFINE_LOCK_GUARD_0(jump_label_lock, jump_label_lock(), jump_label_unlock()) #define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE #define jump_label_enabled static_key_enabled /* -------------------------------------------------------------------------- */ /* * Two type wrappers around static_key, such that we can use compile time * type differentiation to emit the right code. * * All the below code is macros in order to play type games. */ struct static_key_true { struct static_key key; }; struct static_key_false { struct static_key key; }; #define STATIC_KEY_TRUE_INIT (struct static_key_true) { .key = STATIC_KEY_INIT_TRUE, } #define STATIC_KEY_FALSE_INIT (struct static_key_false){ .key = STATIC_KEY_INIT_FALSE, } #define DEFINE_STATIC_KEY_TRUE(name) \ struct static_key_true name = STATIC_KEY_TRUE_INIT #define DEFINE_STATIC_KEY_TRUE_RO(name) \ struct static_key_true name __ro_after_init = STATIC_KEY_TRUE_INIT #define DECLARE_STATIC_KEY_TRUE(name) \ extern struct static_key_true name #define DEFINE_STATIC_KEY_FALSE(name) \ struct static_key_false name = STATIC_KEY_FALSE_INIT #define DEFINE_STATIC_KEY_FALSE_RO(name) \ struct static_key_false name __ro_after_init = STATIC_KEY_FALSE_INIT #define DECLARE_STATIC_KEY_FALSE(name) \ extern struct static_key_false name #define DEFINE_STATIC_KEY_ARRAY_TRUE(name, count) \ struct static_key_true name[count] = { \ [0 ... (count) - 1] = STATIC_KEY_TRUE_INIT, \ } #define DEFINE_STATIC_KEY_ARRAY_FALSE(name, count) \ struct static_key_false name[count] = { \ [0 ... (count) - 1] = STATIC_KEY_FALSE_INIT, \ } #define _DEFINE_STATIC_KEY_1(name) DEFINE_STATIC_KEY_TRUE(name) #define _DEFINE_STATIC_KEY_0(name) DEFINE_STATIC_KEY_FALSE(name) #define DEFINE_STATIC_KEY_MAYBE(cfg, name) \ __PASTE(_DEFINE_STATIC_KEY_, IS_ENABLED(cfg))(name) #define _DEFINE_STATIC_KEY_RO_1(name) DEFINE_STATIC_KEY_TRUE_RO(name) #define _DEFINE_STATIC_KEY_RO_0(name) DEFINE_STATIC_KEY_FALSE_RO(name) #define DEFINE_STATIC_KEY_MAYBE_RO(cfg, name) \ __PASTE(_DEFINE_STATIC_KEY_RO_, IS_ENABLED(cfg))(name) #define _DECLARE_STATIC_KEY_1(name) DECLARE_STATIC_KEY_TRUE(name) #define _DECLARE_STATIC_KEY_0(name) DECLARE_STATIC_KEY_FALSE(name) #define DECLARE_STATIC_KEY_MAYBE(cfg, name) \ __PASTE(_DECLARE_STATIC_KEY_, IS_ENABLED(cfg))(name) extern bool ____wrong_branch_error(void); #define static_key_enabled(x) \ ({ \ if (!__builtin_types_compatible_p(typeof(*x), struct static_key) && \ !__builtin_types_compatible_p(typeof(*x), struct static_key_true) &&\ !__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \ ____wrong_branch_error(); \ static_key_count((struct static_key *)x) > 0; \ }) #ifdef CONFIG_JUMP_LABEL /* * Combine the right initial value (type) with the right branch order * to generate the desired result. * * * type\branch| likely (1) | unlikely (0) * -----------+-----------------------+------------------ * | | * true (1) | ... | ... * | NOP | JMP L * | <br-stmts> | 1: ... * | L: ... | * | | * | | L: <br-stmts> * | | jmp 1b * | | * -----------+-----------------------+------------------ * | | * false (0) | ... | ... * | JMP L | NOP * | <br-stmts> | 1: ... * | L: ... | * | | * | | L: <br-stmts> * | | jmp 1b * | | * -----------+-----------------------+------------------ * * The initial value is encoded in the LSB of static_key::entries, * type: 0 = false, 1 = true. * * The branch type is encoded in the LSB of jump_entry::key, * branch: 0 = unlikely, 1 = likely. * * This gives the following logic table: * * enabled type branch instuction * -----------------------------+----------- * 0 0 0 | NOP * 0 0 1 | JMP * 0 1 0 | NOP * 0 1 1 | JMP * * 1 0 0 | JMP * 1 0 1 | NOP * 1 1 0 | JMP * 1 1 1 | NOP * * Which gives the following functions: * * dynamic: instruction = enabled ^ branch * static: instruction = type ^ branch * * See jump_label_type() / jump_label_init_type(). */ #define static_branch_likely(x) \ ({ \ bool branch; \ if (__builtin_types_compatible_p(typeof(*x), struct static_key_true)) \ branch = !arch_static_branch(&(x)->key, true); \ else if (__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \ branch = !arch_static_branch_jump(&(x)->key, true); \ else \ branch = ____wrong_branch_error(); \ likely_notrace(branch); \ }) #define static_branch_unlikely(x) \ ({ \ bool branch; \ if (__builtin_types_compatible_p(typeof(*x), struct static_key_true)) \ branch = arch_static_branch_jump(&(x)->key, false); \ else if (__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \ branch = arch_static_branch(&(x)->key, false); \ else \ branch = ____wrong_branch_error(); \ unlikely_notrace(branch); \ }) #else /* !CONFIG_JUMP_LABEL */ #define static_branch_likely(x) likely_notrace(static_key_enabled(&(x)->key)) #define static_branch_unlikely(x) unlikely_notrace(static_key_enabled(&(x)->key)) #endif /* CONFIG_JUMP_LABEL */ #define static_branch_maybe(config, x) \ (IS_ENABLED(config) ? static_branch_likely(x) \ : static_branch_unlikely(x)) /* * Advanced usage; refcount, branch is enabled when: count != 0 */ #define static_branch_inc(x) static_key_slow_inc(&(x)->key) #define static_branch_dec(x) static_key_slow_dec(&(x)->key) #define static_branch_inc_cpuslocked(x) static_key_slow_inc_cpuslocked(&(x)->key) #define static_branch_dec_cpuslocked(x) static_key_slow_dec_cpuslocked(&(x)->key) /* * Normal usage; boolean enable/disable. */ #define static_branch_enable(x) static_key_enable(&(x)->key) #define static_branch_disable(x) static_key_disable(&(x)->key) #define static_branch_enable_cpuslocked(x) static_key_enable_cpuslocked(&(x)->key) #define static_branch_disable_cpuslocked(x) static_key_disable_cpuslocked(&(x)->key) #endif /* __ASSEMBLY__ */ #endif /* _LINUX_JUMP_LABEL_H */
47 1 47 44 1 44 47 47 44 44 1 1 1 1 44 2 2 2 2 1 1 123 109 5 9 9 5 5 9 6 3 6 1 6 6 6 6 11 3 3 6 11 8 7 1 7 7 7 1 2 5 1 6 4 5 3 11 11 4 11 323 18 317 319 319 319 7 8 8 8 14 6 1 7 7 190 8 183 183 162 9 9 9 62 62 168 163 1 14 2 1 13 13 9 9 1 9 1 9 9 2 9 151 150 11 2 5 9 5 5 2 5 5 41 41 143 5 25 27 18 9 24 7 23 23 1 5 1 6 6 6 6 6 1 6 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 41 41 4 20 41 41 13 2 2 43 43 7 8 24 27 22 13 42 7 3 2 1 13 41 43 43 43 43 43 6 6 6 6 6 1 1 1 1 1 1 1 143 2 2 25 30 41 41 43 43 8 43 8 43 43 2 43 43 49 3 49 1 47 46 47 47 44 1 3 44 44 44 44 44 1 1 1 2 63 62 62 2 2 2 2 2 2 2 5 57 1 42 43 43 43 62 181 181 118 182 157 9 15 182 181 1 41 115 145 173 12 150 150 150 135 97 144 141 110 76 76 72 4 3 73 72 4 136 8 19 125 124 19 150 150 150 144 76 150 149 71 71 71 55 24 24 24 24 22 20 3 3 62 62 173 62 5 5 62 2 5 12 12 12 12 12 12 2 10 12 182 183 4 7 1 2 1 12 5 10 12 12 9 4 29 29 29 61 58 3 62 62 61 61 62 62 62 62 1 1 1 1 1 2 2 1 10 10 3 10 10 10 1 1 1 1 1 1 1 7 7 7 5 1 3 3 3 2 1 1 20 1 2 17 148 62 18 49 10 64 41 5 7 46 3 47 1 47 46 46 39 11 61 9 132 125 4 18 104 125 6 7 144 2 144 144 143 144 138 7 143 144 143 2 9 2 2 2 144 144 68 87 87 68 25 124 124 125 125 123 123 2 124 59 123 2 62 62 62 68 62 63 9 9 9 9 9 2 7 7 7 2 7 7 7 7 7 7 7 1 323 324 27 19 7 26 26 5 2 17 17 173 173 172 173 172 173 173 173 173 173 173 173 173 172 173 172 62 62 61 62 61 62 51 51 60 62 51 242 240 47 236 46 46 46 170 2 2 48 48 48 48 48 48 48 48 48 47 47 47 63 16 48 1 48 48 48 46 2 48 46 1 46 2 46 2 48 48 62 1 63 182 180 179 179 178 23 155 178 177 177 177 176 176 176 176 175 175 174 173 1 173 173 173 171 171 170 171 23 170 171 170 1 170 2 1 22 170 2 164 1 164 164 164 164 163 164 164 162 163 163 161 162 162 164 164 163 163 163 163 163 162 162 3 162 162 162 29 29 296 298 5 169 159 159 159 159 159 159 159 159 159 159 182 184 28 154 182 183 182 176 4 7 2 3 159 19 23 9 33 7 35 3 1 38 42 1 42 42 42 42 42 42 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 // SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/segment.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ */ #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/sched/mm.h> #include <linux/prefetch.h> #include <linux/kthread.h> #include <linux/swap.h> #include <linux/timer.h> #include <linux/freezer.h> #include <linux/sched/signal.h> #include <linux/random.h> #include "f2fs.h" #include "segment.h" #include "node.h" #include "gc.h" #include "iostat.h" #include <trace/events/f2fs.h> #define __reverse_ffz(x) __reverse_ffs(~(x)) static struct kmem_cache *discard_entry_slab; static struct kmem_cache *discard_cmd_slab; static struct kmem_cache *sit_entry_set_slab; static struct kmem_cache *revoke_entry_slab; static unsigned long __reverse_ulong(unsigned char *str) { unsigned long tmp = 0; int shift = 24, idx = 0; #if BITS_PER_LONG == 64 shift = 56; #endif while (shift >= 0) { tmp |= (unsigned long)str[idx++] << shift; shift -= BITS_PER_BYTE; } return tmp; } /* * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since * MSB and LSB are reversed in a byte by f2fs_set_bit. */ static inline unsigned long __reverse_ffs(unsigned long word) { int num = 0; #if BITS_PER_LONG == 64 if ((word & 0xffffffff00000000UL) == 0) num += 32; else word >>= 32; #endif if ((word & 0xffff0000) == 0) num += 16; else word >>= 16; if ((word & 0xff00) == 0) num += 8; else word >>= 8; if ((word & 0xf0) == 0) num += 4; else word >>= 4; if ((word & 0xc) == 0) num += 2; else word >>= 2; if ((word & 0x2) == 0) num += 1; return num; } /* * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because * f2fs_set_bit makes MSB and LSB reversed in a byte. * @size must be integral times of unsigned long. * Example: * MSB <--> LSB * f2fs_set_bit(0, bitmap) => 1000 0000 * f2fs_set_bit(7, bitmap) => 0000 0001 */ static unsigned long __find_rev_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BIT_WORD(offset); unsigned long result = size; unsigned long tmp; if (offset >= size) return size; size -= (offset & ~(BITS_PER_LONG - 1)); offset %= BITS_PER_LONG; while (1) { if (*p == 0) goto pass; tmp = __reverse_ulong((unsigned char *)p); tmp &= ~0UL >> offset; if (size < BITS_PER_LONG) tmp &= (~0UL << (BITS_PER_LONG - size)); if (tmp) goto found; pass: if (size <= BITS_PER_LONG) break; size -= BITS_PER_LONG; offset = 0; p++; } return result; found: return result - size + __reverse_ffs(tmp); } static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BIT_WORD(offset); unsigned long result = size; unsigned long tmp; if (offset >= size) return size; size -= (offset & ~(BITS_PER_LONG - 1)); offset %= BITS_PER_LONG; while (1) { if (*p == ~0UL) goto pass; tmp = __reverse_ulong((unsigned char *)p); if (offset) tmp |= ~0UL << (BITS_PER_LONG - offset); if (size < BITS_PER_LONG) tmp |= ~0UL >> size; if (tmp != ~0UL) goto found; pass: if (size <= BITS_PER_LONG) break; size -= BITS_PER_LONG; offset = 0; p++; } return result; found: return result - size + __reverse_ffz(tmp); } bool f2fs_need_SSR(struct f2fs_sb_info *sbi) { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); if (f2fs_lfs_mode(sbi)) return false; if (sbi->gc_mode == GC_URGENT_HIGH) return true; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return true; return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + SM_I(sbi)->min_ssr_sections + reserved_sections(sbi)); } void f2fs_abort_atomic_write(struct inode *inode, bool clean) { struct f2fs_inode_info *fi = F2FS_I(inode); if (!f2fs_is_atomic_file(inode)) return; if (clean) truncate_inode_pages_final(inode->i_mapping); release_atomic_write_cnt(inode); clear_inode_flag(inode, FI_ATOMIC_COMMITTED); clear_inode_flag(inode, FI_ATOMIC_REPLACE); clear_inode_flag(inode, FI_ATOMIC_FILE); if (is_inode_flag_set(inode, FI_ATOMIC_DIRTIED)) { clear_inode_flag(inode, FI_ATOMIC_DIRTIED); /* * The vfs inode keeps clean during commit, but the f2fs inode * doesn't. So clear the dirty state after commit and let * f2fs_mark_inode_dirty_sync ensure a consistent dirty state. */ f2fs_inode_synced(inode); f2fs_mark_inode_dirty_sync(inode, true); } stat_dec_atomic_inode(inode); F2FS_I(inode)->atomic_write_task = NULL; if (clean) { f2fs_i_size_write(inode, fi->original_i_size); fi->original_i_size = 0; } /* avoid stale dirty inode during eviction */ sync_inode_metadata(inode, 0); } static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, block_t new_addr, block_t *old_addr, bool recover) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; struct node_info ni; int err; retry: set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE); if (err) { if (err == -ENOMEM) { f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto retry; } return err; } err = f2fs_get_node_info(sbi, dn.nid, &ni, false); if (err) { f2fs_put_dnode(&dn); return err; } if (recover) { /* dn.data_blkaddr is always valid */ if (!__is_valid_data_blkaddr(new_addr)) { if (new_addr == NULL_ADDR) dec_valid_block_count(sbi, inode, 1); f2fs_invalidate_blocks(sbi, dn.data_blkaddr, 1); f2fs_update_data_blkaddr(&dn, new_addr); } else { f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, ni.version, true, true); } } else { blkcnt_t count = 1; err = inc_valid_block_count(sbi, inode, &count, true); if (err) { f2fs_put_dnode(&dn); return err; } *old_addr = dn.data_blkaddr; f2fs_truncate_data_blocks_range(&dn, 1); dec_valid_block_count(sbi, F2FS_I(inode)->cow_inode, count); f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, ni.version, true, false); } f2fs_put_dnode(&dn); trace_f2fs_replace_atomic_write_block(inode, F2FS_I(inode)->cow_inode, index, old_addr ? *old_addr : 0, new_addr, recover); return 0; } static void __complete_revoke_list(struct inode *inode, struct list_head *head, bool revoke) { struct revoke_entry *cur, *tmp; pgoff_t start_index = 0; bool truncate = is_inode_flag_set(inode, FI_ATOMIC_REPLACE); list_for_each_entry_safe(cur, tmp, head, list) { if (revoke) { __replace_atomic_write_block(inode, cur->index, cur->old_addr, NULL, true); } else if (truncate) { f2fs_truncate_hole(inode, start_index, cur->index); start_index = cur->index + 1; } list_del(&cur->list); kmem_cache_free(revoke_entry_slab, cur); } if (!revoke && truncate) f2fs_do_truncate_blocks(inode, start_index * PAGE_SIZE, false); } static int __f2fs_commit_atomic_write(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); struct inode *cow_inode = fi->cow_inode; struct revoke_entry *new; struct list_head revoke_list; block_t blkaddr; struct dnode_of_data dn; pgoff_t len = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); pgoff_t off = 0, blen, index; int ret = 0, i; INIT_LIST_HEAD(&revoke_list); while (len) { blen = min_t(pgoff_t, ADDRS_PER_BLOCK(cow_inode), len); set_new_dnode(&dn, cow_inode, NULL, NULL, 0); ret = f2fs_get_dnode_of_data(&dn, off, LOOKUP_NODE_RA); if (ret && ret != -ENOENT) { goto out; } else if (ret == -ENOENT) { ret = 0; if (dn.max_level == 0) goto out; goto next; } blen = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, cow_inode), len); index = off; for (i = 0; i < blen; i++, dn.ofs_in_node++, index++) { blkaddr = f2fs_data_blkaddr(&dn); if (!__is_valid_data_blkaddr(blkaddr)) { continue; } else if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) { f2fs_put_dnode(&dn); ret = -EFSCORRUPTED; goto out; } new = f2fs_kmem_cache_alloc(revoke_entry_slab, GFP_NOFS, true, NULL); ret = __replace_atomic_write_block(inode, index, blkaddr, &new->old_addr, false); if (ret) { f2fs_put_dnode(&dn); kmem_cache_free(revoke_entry_slab, new); goto out; } f2fs_update_data_blkaddr(&dn, NULL_ADDR); new->index = index; list_add_tail(&new->list, &revoke_list); } f2fs_put_dnode(&dn); next: off += blen; len -= blen; } out: if (ret) { sbi->revoked_atomic_block += fi->atomic_write_cnt; } else { sbi->committed_atomic_block += fi->atomic_write_cnt; set_inode_flag(inode, FI_ATOMIC_COMMITTED); if (is_inode_flag_set(inode, FI_ATOMIC_DIRTIED)) { clear_inode_flag(inode, FI_ATOMIC_DIRTIED); f2fs_mark_inode_dirty_sync(inode, true); } } __complete_revoke_list(inode, &revoke_list, ret ? true : false); return ret; } int f2fs_commit_atomic_write(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); int err; err = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (err) return err; f2fs_down_write(&fi->i_gc_rwsem[WRITE]); f2fs_lock_op(sbi); err = __f2fs_commit_atomic_write(inode); f2fs_unlock_op(sbi); f2fs_up_write(&fi->i_gc_rwsem[WRITE]); return err; } /* * This function balances dirty node and dentry pages. * In addition, it controls garbage collection. */ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { if (f2fs_cp_error(sbi)) return; if (time_to_inject(sbi, FAULT_CHECKPOINT)) f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_FAULT_INJECT); /* balance_fs_bg is able to be pending */ if (need && excess_cached_nats(sbi)) f2fs_balance_fs_bg(sbi, false); if (!f2fs_is_checkpoint_ready(sbi)) return; /* * We should do GC or end up with checkpoint, if there are so many dirty * dir/node pages without enough free segments. */ if (has_enough_free_secs(sbi, 0, 0)) return; if (test_opt(sbi, GC_MERGE) && sbi->gc_thread && sbi->gc_thread->f2fs_gc_task) { DEFINE_WAIT(wait); prepare_to_wait(&sbi->gc_thread->fggc_wq, &wait, TASK_UNINTERRUPTIBLE); wake_up(&sbi->gc_thread->gc_wait_queue_head); io_schedule(); finish_wait(&sbi->gc_thread->fggc_wq, &wait); } else { struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, .init_gc_type = BG_GC, .no_bg_gc = true, .should_migrate_blocks = false, .err_gc_skipped = false, .nr_free_secs = 1 }; f2fs_down_write(&sbi->gc_lock); stat_inc_gc_call_count(sbi, FOREGROUND); f2fs_gc(sbi, &gc_control); } } static inline bool excess_dirty_threshold(struct f2fs_sb_info *sbi) { int factor = f2fs_rwsem_is_locked(&sbi->cp_rwsem) ? 3 : 2; unsigned int dents = get_pages(sbi, F2FS_DIRTY_DENTS); unsigned int qdata = get_pages(sbi, F2FS_DIRTY_QDATA); unsigned int nodes = get_pages(sbi, F2FS_DIRTY_NODES); unsigned int meta = get_pages(sbi, F2FS_DIRTY_META); unsigned int imeta = get_pages(sbi, F2FS_DIRTY_IMETA); unsigned int threshold = SEGS_TO_BLKS(sbi, (factor * DEFAULT_DIRTY_THRESHOLD)); unsigned int global_threshold = threshold * 3 / 2; if (dents >= threshold || qdata >= threshold || nodes >= threshold || meta >= threshold || imeta >= threshold) return true; return dents + qdata + nodes + meta + imeta > global_threshold; } void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) { if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return; /* try to shrink extent cache when there is no enough memory */ if (!f2fs_available_free_memory(sbi, READ_EXTENT_CACHE)) f2fs_shrink_read_extent_tree(sbi, READ_EXTENT_CACHE_SHRINK_NUMBER); /* try to shrink age extent cache when there is no enough memory */ if (!f2fs_available_free_memory(sbi, AGE_EXTENT_CACHE)) f2fs_shrink_age_extent_tree(sbi, AGE_EXTENT_CACHE_SHRINK_NUMBER); /* check the # of cached NAT entries */ if (!f2fs_available_free_memory(sbi, NAT_ENTRIES)) f2fs_try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); if (!f2fs_available_free_memory(sbi, FREE_NIDS)) f2fs_try_to_free_nids(sbi, MAX_FREE_NIDS); else f2fs_build_free_nids(sbi, false, false); if (excess_dirty_nats(sbi) || excess_dirty_threshold(sbi) || excess_prefree_segs(sbi) || !f2fs_space_for_roll_forward(sbi)) goto do_sync; /* there is background inflight IO or foreground operation recently */ if (is_inflight_io(sbi, REQ_TIME) || (!f2fs_time_over(sbi, REQ_TIME) && f2fs_rwsem_is_locked(&sbi->cp_rwsem))) return; /* exceed periodical checkpoint timeout threshold */ if (f2fs_time_over(sbi, CP_TIME)) goto do_sync; /* checkpoint is the only way to shrink partial cached entries */ if (f2fs_available_free_memory(sbi, NAT_ENTRIES) && f2fs_available_free_memory(sbi, INO_ENTRIES)) return; do_sync: if (test_opt(sbi, DATA_FLUSH) && from_bg) { struct blk_plug plug; mutex_lock(&sbi->flush_lock); blk_start_plug(&plug); f2fs_sync_dirty_inodes(sbi, FILE_INODE, false); blk_finish_plug(&plug); mutex_unlock(&sbi->flush_lock); } stat_inc_cp_call_count(sbi, BACKGROUND); f2fs_sync_fs(sbi->sb, 1); } static int __submit_flush_wait(struct f2fs_sb_info *sbi, struct block_device *bdev) { int ret = blkdev_issue_flush(bdev); trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER), test_opt(sbi, FLUSH_MERGE), ret); if (!ret) f2fs_update_iostat(sbi, NULL, FS_FLUSH_IO, 0); return ret; } static int submit_flush_wait(struct f2fs_sb_info *sbi, nid_t ino) { int ret = 0; int i; if (!f2fs_is_multi_device(sbi)) return __submit_flush_wait(sbi, sbi->sb->s_bdev); for (i = 0; i < sbi->s_ndevs; i++) { if (!f2fs_is_dirty_device(sbi, ino, i, FLUSH_INO)) continue; ret = __submit_flush_wait(sbi, FDEV(i).bdev); if (ret) break; } return ret; } static int issue_flush_thread(void *data) { struct f2fs_sb_info *sbi = data; struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; wait_queue_head_t *q = &fcc->flush_wait_queue; repeat: if (kthread_should_stop()) return 0; if (!llist_empty(&fcc->issue_list)) { struct flush_cmd *cmd, *next; int ret; fcc->dispatch_list = llist_del_all(&fcc->issue_list); fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); cmd = llist_entry(fcc->dispatch_list, struct flush_cmd, llnode); ret = submit_flush_wait(sbi, cmd->ino); atomic_inc(&fcc->issued_flush); llist_for_each_entry_safe(cmd, next, fcc->dispatch_list, llnode) { cmd->ret = ret; complete(&cmd->wait); } fcc->dispatch_list = NULL; } wait_event_interruptible(*q, kthread_should_stop() || !llist_empty(&fcc->issue_list)); goto repeat; } int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino) { struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; struct flush_cmd cmd; int ret; if (test_opt(sbi, NOBARRIER)) return 0; if (!test_opt(sbi, FLUSH_MERGE)) { atomic_inc(&fcc->queued_flush); ret = submit_flush_wait(sbi, ino); atomic_dec(&fcc->queued_flush); atomic_inc(&fcc->issued_flush); return ret; } if (atomic_inc_return(&fcc->queued_flush) == 1 || f2fs_is_multi_device(sbi)) { ret = submit_flush_wait(sbi, ino); atomic_dec(&fcc->queued_flush); atomic_inc(&fcc->issued_flush); return ret; } cmd.ino = ino; init_completion(&cmd.wait); llist_add(&cmd.llnode, &fcc->issue_list); /* * update issue_list before we wake up issue_flush thread, this * smp_mb() pairs with another barrier in ___wait_event(), see * more details in comments of waitqueue_active(). */ smp_mb(); if (waitqueue_active(&fcc->flush_wait_queue)) wake_up(&fcc->flush_wait_queue); if (fcc->f2fs_issue_flush) { wait_for_completion(&cmd.wait); atomic_dec(&fcc->queued_flush); } else { struct llist_node *list; list = llist_del_all(&fcc->issue_list); if (!list) { wait_for_completion(&cmd.wait); atomic_dec(&fcc->queued_flush); } else { struct flush_cmd *tmp, *next; ret = submit_flush_wait(sbi, ino); llist_for_each_entry_safe(tmp, next, list, llnode) { if (tmp == &cmd) { cmd.ret = ret; atomic_dec(&fcc->queued_flush); continue; } tmp->ret = ret; complete(&tmp->wait); } } } return cmd.ret; } int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct flush_cmd_control *fcc; if (SM_I(sbi)->fcc_info) { fcc = SM_I(sbi)->fcc_info; if (fcc->f2fs_issue_flush) return 0; goto init_thread; } fcc = f2fs_kzalloc(sbi, sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; atomic_set(&fcc->issued_flush, 0); atomic_set(&fcc->queued_flush, 0); init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->fcc_info = fcc; if (!test_opt(sbi, FLUSH_MERGE)) return 0; init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { int err = PTR_ERR(fcc->f2fs_issue_flush); fcc->f2fs_issue_flush = NULL; return err; } return 0; } void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) { struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; if (fcc && fcc->f2fs_issue_flush) { struct task_struct *flush_thread = fcc->f2fs_issue_flush; fcc->f2fs_issue_flush = NULL; kthread_stop(flush_thread); } if (free) { kfree(fcc); SM_I(sbi)->fcc_info = NULL; } } int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) { int ret = 0, i; if (!f2fs_is_multi_device(sbi)) return 0; if (test_opt(sbi, NOBARRIER)) return 0; for (i = 1; i < sbi->s_ndevs; i++) { int count = DEFAULT_RETRY_IO_COUNT; if (!f2fs_test_bit(i, (char *)&sbi->dirty_device)) continue; do { ret = __submit_flush_wait(sbi, FDEV(i).bdev); if (ret) f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); } while (ret && --count); if (ret) { f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_FLUSH_FAIL); break; } spin_lock(&sbi->dev_lock); f2fs_clear_bit(i, (char *)&sbi->dirty_device); spin_unlock(&sbi->dev_lock); } return ret; } static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, enum dirty_type dirty_type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); /* need not be added */ if (IS_CURSEG(sbi, segno)) return; if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type])) dirty_i->nr_dirty[dirty_type]++; if (dirty_type == DIRTY) { struct seg_entry *sentry = get_seg_entry(sbi, segno); enum dirty_type t = sentry->type; if (unlikely(t >= DIRTY)) { f2fs_bug_on(sbi, 1); return; } if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t])) dirty_i->nr_dirty[t]++; if (__is_large_section(sbi)) { unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); block_t valid_blocks = get_valid_blocks(sbi, segno, true); f2fs_bug_on(sbi, (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) && !valid_blocks) || valid_blocks == CAP_BLKS_PER_SEC(sbi)); if (!IS_CURSEC(sbi, secno)) set_bit(secno, dirty_i->dirty_secmap); } } } static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, enum dirty_type dirty_type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); block_t valid_blocks; if (test_and_clear_bit(segno, dirty_i->dirty_segmap[dirty_type])) dirty_i->nr_dirty[dirty_type]--; if (dirty_type == DIRTY) { struct seg_entry *sentry = get_seg_entry(sbi, segno); enum dirty_type t = sentry->type; if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) dirty_i->nr_dirty[t]--; valid_blocks = get_valid_blocks(sbi, segno, true); if (valid_blocks == 0) { clear_bit(GET_SEC_FROM_SEG(sbi, segno), dirty_i->victim_secmap); #ifdef CONFIG_F2FS_CHECK_FS clear_bit(segno, SIT_I(sbi)->invalid_segmap); #endif } if (__is_large_section(sbi)) { unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); if (!valid_blocks || valid_blocks == CAP_BLKS_PER_SEC(sbi)) { clear_bit(secno, dirty_i->dirty_secmap); return; } if (!IS_CURSEC(sbi, secno)) set_bit(secno, dirty_i->dirty_secmap); } } } /* * Should not occur error such as -ENOMEM. * Adding dirty entry into seglist is not critical operation. * If a given segment is one of current working segments, it won't be added. */ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned short valid_blocks, ckpt_valid_blocks; unsigned int usable_blocks; if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno)) return; usable_blocks = f2fs_usable_blks_in_seg(sbi, segno); mutex_lock(&dirty_i->seglist_lock); valid_blocks = get_valid_blocks(sbi, segno, false); ckpt_valid_blocks = get_ckpt_valid_blocks(sbi, segno, false); if (valid_blocks == 0 && (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) || ckpt_valid_blocks == usable_blocks)) { __locate_dirty_segment(sbi, segno, PRE); __remove_dirty_segment(sbi, segno, DIRTY); } else if (valid_blocks < usable_blocks) { __locate_dirty_segment(sbi, segno, DIRTY); } else { /* Recovery routine with SSR needs this */ __remove_dirty_segment(sbi, segno, DIRTY); } mutex_unlock(&dirty_i->seglist_lock); } /* This moves currently empty dirty blocks to prefree. Must hold seglist_lock */ void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int segno; mutex_lock(&dirty_i->seglist_lock); for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { if (get_valid_blocks(sbi, segno, false)) continue; if (IS_CURSEG(sbi, segno)) continue; __locate_dirty_segment(sbi, segno, PRE); __remove_dirty_segment(sbi, segno, DIRTY); } mutex_unlock(&dirty_i->seglist_lock); } block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi) { int ovp_hole_segs = (overprovision_segments(sbi) - reserved_segments(sbi)); block_t ovp_holes = SEGS_TO_BLKS(sbi, ovp_hole_segs); struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); block_t holes[2] = {0, 0}; /* DATA and NODE */ block_t unusable; struct seg_entry *se; unsigned int segno; mutex_lock(&dirty_i->seglist_lock); for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { se = get_seg_entry(sbi, segno); if (IS_NODESEG(se->type)) holes[NODE] += f2fs_usable_blks_in_seg(sbi, segno) - se->valid_blocks; else holes[DATA] += f2fs_usable_blks_in_seg(sbi, segno) - se->valid_blocks; } mutex_unlock(&dirty_i->seglist_lock); unusable = max(holes[DATA], holes[NODE]); if (unusable > ovp_holes) return unusable - ovp_holes; return 0; } int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable) { int ovp_hole_segs = (overprovision_segments(sbi) - reserved_segments(sbi)); if (F2FS_OPTION(sbi).unusable_cap_perc == 100) return 0; if (unusable > F2FS_OPTION(sbi).unusable_cap) return -EAGAIN; if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK) && dirty_segments(sbi) > ovp_hole_segs) return -EAGAIN; if (has_not_enough_free_secs(sbi, 0, 0)) return -EAGAIN; return 0; } /* This is only used by SBI_CP_DISABLED */ static unsigned int get_free_segment(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int segno = 0; mutex_lock(&dirty_i->seglist_lock); for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { if (get_valid_blocks(sbi, segno, false)) continue; if (get_ckpt_valid_blocks(sbi, segno, false)) continue; mutex_unlock(&dirty_i->seglist_lock); return segno; } mutex_unlock(&dirty_i->seglist_lock); return NULL_SEGNO; } static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t lstart, block_t start, block_t len) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *pend_list; struct discard_cmd *dc; f2fs_bug_on(sbi, !len); pend_list = &dcc->pend_list[plist_idx(len)]; dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS, true, NULL); INIT_LIST_HEAD(&dc->list); dc->bdev = bdev; dc->di.lstart = lstart; dc->di.start = start; dc->di.len = len; dc->ref = 0; dc->state = D_PREP; dc->queued = 0; dc->error = 0; init_completion(&dc->wait); list_add_tail(&dc->list, pend_list); spin_lock_init(&dc->lock); dc->bio_ref = 0; atomic_inc(&dcc->discard_cmd_cnt); dcc->undiscard_blks += len; return dc; } static bool f2fs_check_discard_tree(struct f2fs_sb_info *sbi) { #ifdef CONFIG_F2FS_CHECK_FS struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct rb_node *cur = rb_first_cached(&dcc->root), *next; struct discard_cmd *cur_dc, *next_dc; while (cur) { next = rb_next(cur); if (!next) return true; cur_dc = rb_entry(cur, struct discard_cmd, rb_node); next_dc = rb_entry(next, struct discard_cmd, rb_node); if (cur_dc->di.lstart + cur_dc->di.len > next_dc->di.lstart) { f2fs_info(sbi, "broken discard_rbtree, " "cur(%u, %u) next(%u, %u)", cur_dc->di.lstart, cur_dc->di.len, next_dc->di.lstart, next_dc->di.len); return false; } cur = next; } #endif return true; } static struct discard_cmd *__lookup_discard_cmd(struct f2fs_sb_info *sbi, block_t blkaddr) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct rb_node *node = dcc->root.rb_root.rb_node; struct discard_cmd *dc; while (node) { dc = rb_entry(node, struct discard_cmd, rb_node); if (blkaddr < dc->di.lstart) node = node->rb_left; else if (blkaddr >= dc->di.lstart + dc->di.len) node = node->rb_right; else return dc; } return NULL; } static struct discard_cmd *__lookup_discard_cmd_ret(struct rb_root_cached *root, block_t blkaddr, struct discard_cmd **prev_entry, struct discard_cmd **next_entry, struct rb_node ***insert_p, struct rb_node **insert_parent) { struct rb_node **pnode = &root->rb_root.rb_node; struct rb_node *parent = NULL, *tmp_node; struct discard_cmd *dc; *insert_p = NULL; *insert_parent = NULL; *prev_entry = NULL; *next_entry = NULL; if (RB_EMPTY_ROOT(&root->rb_root)) return NULL; while (*pnode) { parent = *pnode; dc = rb_entry(*pnode, struct discard_cmd, rb_node); if (blkaddr < dc->di.lstart) pnode = &(*pnode)->rb_left; else if (blkaddr >= dc->di.lstart + dc->di.len) pnode = &(*pnode)->rb_right; else goto lookup_neighbors; } *insert_p = pnode; *insert_parent = parent; dc = rb_entry(parent, struct discard_cmd, rb_node); tmp_node = parent; if (parent && blkaddr > dc->di.lstart) tmp_node = rb_next(parent); *next_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node); tmp_node = parent; if (parent && blkaddr < dc->di.lstart) tmp_node = rb_prev(parent); *prev_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node); return NULL; lookup_neighbors: /* lookup prev node for merging backward later */ tmp_node = rb_prev(&dc->rb_node); *prev_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node); /* lookup next node for merging frontward later */ tmp_node = rb_next(&dc->rb_node); *next_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node); return dc; } static void __detach_discard_cmd(struct discard_cmd_control *dcc, struct discard_cmd *dc) { if (dc->state == D_DONE) atomic_sub(dc->queued, &dcc->queued_discard); list_del(&dc->list); rb_erase_cached(&dc->rb_node, &dcc->root); dcc->undiscard_blks -= dc->di.len; kmem_cache_free(discard_cmd_slab, dc); atomic_dec(&dcc->discard_cmd_cnt); } static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; unsigned long flags; trace_f2fs_remove_discard(dc->bdev, dc->di.start, dc->di.len); spin_lock_irqsave(&dc->lock, flags); if (dc->bio_ref) { spin_unlock_irqrestore(&dc->lock, flags); return; } spin_unlock_irqrestore(&dc->lock, flags); f2fs_bug_on(sbi, dc->ref); if (dc->error == -EOPNOTSUPP) dc->error = 0; if (dc->error) f2fs_info_ratelimited(sbi, "Issue discard(%u, %u, %u) failed, ret: %d", dc->di.lstart, dc->di.start, dc->di.len, dc->error); __detach_discard_cmd(dcc, dc); } static void f2fs_submit_discard_endio(struct bio *bio) { struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; unsigned long flags; spin_lock_irqsave(&dc->lock, flags); if (!dc->error) dc->error = blk_status_to_errno(bio->bi_status); dc->bio_ref--; if (!dc->bio_ref && dc->state == D_SUBMIT) { dc->state = D_DONE; complete_all(&dc->wait); } spin_unlock_irqrestore(&dc->lock, flags); bio_put(bio); } static void __check_sit_bitmap(struct f2fs_sb_info *sbi, block_t start, block_t end) { #ifdef CONFIG_F2FS_CHECK_FS struct seg_entry *sentry; unsigned int segno; block_t blk = start; unsigned long offset, size, *map; while (blk < end) { segno = GET_SEGNO(sbi, blk); sentry = get_seg_entry(sbi, segno); offset = GET_BLKOFF_FROM_SEG0(sbi, blk); if (end < START_BLOCK(sbi, segno + 1)) size = GET_BLKOFF_FROM_SEG0(sbi, end); else size = BLKS_PER_SEG(sbi); map = (unsigned long *)(sentry->cur_valid_map); offset = __find_rev_next_bit(map, size, offset); f2fs_bug_on(sbi, offset != size); blk = START_BLOCK(sbi, segno + 1); } #endif } static void __init_discard_policy(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, int discard_type, unsigned int granularity) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; /* common policy */ dpolicy->type = discard_type; dpolicy->sync = true; dpolicy->ordered = false; dpolicy->granularity = granularity; dpolicy->max_requests = dcc->max_discard_request; dpolicy->io_aware_gran = dcc->discard_io_aware_gran; dpolicy->timeout = false; if (discard_type == DPOLICY_BG) { dpolicy->min_interval = dcc->min_discard_issue_time; dpolicy->mid_interval = dcc->mid_discard_issue_time; dpolicy->max_interval = dcc->max_discard_issue_time; if (dcc->discard_io_aware == DPOLICY_IO_AWARE_ENABLE) dpolicy->io_aware = true; else if (dcc->discard_io_aware == DPOLICY_IO_AWARE_DISABLE) dpolicy->io_aware = false; dpolicy->sync = false; dpolicy->ordered = true; if (utilization(sbi) > dcc->discard_urgent_util) { dpolicy->granularity = MIN_DISCARD_GRANULARITY; if (atomic_read(&dcc->discard_cmd_cnt)) dpolicy->max_interval = dcc->min_discard_issue_time; } } else if (discard_type == DPOLICY_FORCE) { dpolicy->min_interval = dcc->min_discard_issue_time; dpolicy->mid_interval = dcc->mid_discard_issue_time; dpolicy->max_interval = dcc->max_discard_issue_time; dpolicy->io_aware = false; } else if (discard_type == DPOLICY_FSTRIM) { dpolicy->io_aware = false; } else if (discard_type == DPOLICY_UMOUNT) { dpolicy->io_aware = false; /* we need to issue all to keep CP_TRIMMED_FLAG */ dpolicy->granularity = MIN_DISCARD_GRANULARITY; dpolicy->timeout = true; } } static void __update_discard_tree_range(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t lstart, block_t start, block_t len); #ifdef CONFIG_BLK_DEV_ZONED static void __submit_zone_reset_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc, blk_opf_t flag, struct list_head *wait_list, unsigned int *issued) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct block_device *bdev = dc->bdev; struct bio *bio = bio_alloc(bdev, 0, REQ_OP_ZONE_RESET | flag, GFP_NOFS); unsigned long flags; trace_f2fs_issue_reset_zone(bdev, dc->di.start); spin_lock_irqsave(&dc->lock, flags); dc->state = D_SUBMIT; dc->bio_ref++; spin_unlock_irqrestore(&dc->lock, flags); if (issued) (*issued)++; atomic_inc(&dcc->queued_discard); dc->queued++; list_move_tail(&dc->list, wait_list); /* sanity check on discard range */ __check_sit_bitmap(sbi, dc->di.lstart, dc->di.lstart + dc->di.len); bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(dc->di.start); bio->bi_private = dc; bio->bi_end_io = f2fs_submit_discard_endio; submit_bio(bio); atomic_inc(&dcc->issued_discard); f2fs_update_iostat(sbi, NULL, FS_ZONE_RESET_IO, dc->di.len * F2FS_BLKSIZE); } #endif /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, struct discard_cmd *dc, int *issued) { struct block_device *bdev = dc->bdev; unsigned int max_discard_blocks = SECTOR_TO_BLOCK(bdev_max_discard_sectors(bdev)); struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? &(dcc->fstrim_list) : &(dcc->wait_list); blk_opf_t flag = dpolicy->sync ? REQ_SYNC : 0; block_t lstart, start, len, total_len; int err = 0; if (dc->state != D_PREP) return 0; if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) return 0; #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev)) { int devi = f2fs_bdev_index(sbi, bdev); if (devi < 0) return -EINVAL; if (f2fs_blkz_is_seq(sbi, devi, dc->di.start)) { __submit_zone_reset_cmd(sbi, dc, flag, wait_list, issued); return 0; } } #endif /* * stop issuing discard for any of below cases: * 1. device is conventional zone, but it doesn't support discard. * 2. device is regulare device, after snapshot it doesn't support * discard. */ if (!bdev_max_discard_sectors(bdev)) return -EOPNOTSUPP; trace_f2fs_issue_discard(bdev, dc->di.start, dc->di.len); lstart = dc->di.lstart; start = dc->di.start; len = dc->di.len; total_len = len; dc->di.len = 0; while (total_len && *issued < dpolicy->max_requests && !err) { struct bio *bio = NULL; unsigned long flags; bool last = true; if (len > max_discard_blocks) { len = max_discard_blocks; last = false; } (*issued)++; if (*issued == dpolicy->max_requests) last = true; dc->di.len += len; if (time_to_inject(sbi, FAULT_DISCARD)) { err = -EIO; } else { err = __blkdev_issue_discard(bdev, SECTOR_FROM_BLOCK(start), SECTOR_FROM_BLOCK(len), GFP_NOFS, &bio); } if (err) { spin_lock_irqsave(&dc->lock, flags); if (dc->state == D_PARTIAL) dc->state = D_SUBMIT; spin_unlock_irqrestore(&dc->lock, flags); break; } f2fs_bug_on(sbi, !bio); /* * should keep before submission to avoid D_DONE * right away */ spin_lock_irqsave(&dc->lock, flags); if (last) dc->state = D_SUBMIT; else dc->state = D_PARTIAL; dc->bio_ref++; spin_unlock_irqrestore(&dc->lock, flags); atomic_inc(&dcc->queued_discard); dc->queued++; list_move_tail(&dc->list, wait_list); /* sanity check on discard range */ __check_sit_bitmap(sbi, lstart, lstart + len); bio->bi_private = dc; bio->bi_end_io = f2fs_submit_discard_endio; bio->bi_opf |= flag; submit_bio(bio); atomic_inc(&dcc->issued_discard); f2fs_update_iostat(sbi, NULL, FS_DISCARD_IO, len * F2FS_BLKSIZE); lstart += len; start += len; total_len -= len; len = total_len; } if (!err && len) { dcc->undiscard_blks -= len; __update_discard_tree_range(sbi, bdev, lstart, start, len); } return err; } static void __insert_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t lstart, block_t start, block_t len) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct rb_node **p = &dcc->root.rb_root.rb_node; struct rb_node *parent = NULL; struct discard_cmd *dc; bool leftmost = true; /* look up rb tree to find parent node */ while (*p) { parent = *p; dc = rb_entry(parent, struct discard_cmd, rb_node); if (lstart < dc->di.lstart) { p = &(*p)->rb_left; } else if (lstart >= dc->di.lstart + dc->di.len) { p = &(*p)->rb_right; leftmost = false; } else { /* Let's skip to add, if exists */ return; } } dc = __create_discard_cmd(sbi, bdev, lstart, start, len); rb_link_node(&dc->rb_node, parent, p); rb_insert_color_cached(&dc->rb_node, &dcc->root, leftmost); } static void __relocate_discard_cmd(struct discard_cmd_control *dcc, struct discard_cmd *dc) { list_move_tail(&dc->list, &dcc->pend_list[plist_idx(dc->di.len)]); } static void __punch_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc, block_t blkaddr) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_info di = dc->di; bool modified = false; if (dc->state == D_DONE || dc->di.len == 1) { __remove_discard_cmd(sbi, dc); return; } dcc->undiscard_blks -= di.len; if (blkaddr > di.lstart) { dc->di.len = blkaddr - dc->di.lstart; dcc->undiscard_blks += dc->di.len; __relocate_discard_cmd(dcc, dc); modified = true; } if (blkaddr < di.lstart + di.len - 1) { if (modified) { __insert_discard_cmd(sbi, dc->bdev, blkaddr + 1, di.start + blkaddr + 1 - di.lstart, di.lstart + di.len - 1 - blkaddr); } else { dc->di.lstart++; dc->di.len--; dc->di.start++; dcc->undiscard_blks += dc->di.len; __relocate_discard_cmd(dcc, dc); } } } static void __update_discard_tree_range(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t lstart, block_t start, block_t len) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_cmd *prev_dc = NULL, *next_dc = NULL; struct discard_cmd *dc; struct discard_info di = {0}; struct rb_node **insert_p = NULL, *insert_parent = NULL; unsigned int max_discard_blocks = SECTOR_TO_BLOCK(bdev_max_discard_sectors(bdev)); block_t end = lstart + len; dc = __lookup_discard_cmd_ret(&dcc->root, lstart, &prev_dc, &next_dc, &insert_p, &insert_parent); if (dc) prev_dc = dc; if (!prev_dc) { di.lstart = lstart; di.len = next_dc ? next_dc->di.lstart - lstart : len; di.len = min(di.len, len); di.start = start; } while (1) { struct rb_node *node; bool merged = false; struct discard_cmd *tdc = NULL; if (prev_dc) { di.lstart = prev_dc->di.lstart + prev_dc->di.len; if (di.lstart < lstart) di.lstart = lstart; if (di.lstart >= end) break; if (!next_dc || next_dc->di.lstart > end) di.len = end - di.lstart; else di.len = next_dc->di.lstart - di.lstart; di.start = start + di.lstart - lstart; } if (!di.len) goto next; if (prev_dc && prev_dc->state == D_PREP && prev_dc->bdev == bdev && __is_discard_back_mergeable(&di, &prev_dc->di, max_discard_blocks)) { prev_dc->di.len += di.len; dcc->undiscard_blks += di.len; __relocate_discard_cmd(dcc, prev_dc); di = prev_dc->di; tdc = prev_dc; merged = true; } if (next_dc && next_dc->state == D_PREP && next_dc->bdev == bdev && __is_discard_front_mergeable(&di, &next_dc->di, max_discard_blocks)) { next_dc->di.lstart = di.lstart; next_dc->di.len += di.len; next_dc->di.start = di.start; dcc->undiscard_blks += di.len; __relocate_discard_cmd(dcc, next_dc); if (tdc) __remove_discard_cmd(sbi, tdc); merged = true; } if (!merged) __insert_discard_cmd(sbi, bdev, di.lstart, di.start, di.len); next: prev_dc = next_dc; if (!prev_dc) break; node = rb_next(&prev_dc->rb_node); next_dc = rb_entry_safe(node, struct discard_cmd, rb_node); } } #ifdef CONFIG_BLK_DEV_ZONED static void __queue_zone_reset_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t lblkstart, block_t blklen) { trace_f2fs_queue_reset_zone(bdev, blkstart); mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock); __insert_discard_cmd(sbi, bdev, lblkstart, blkstart, blklen); mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock); } #endif static void __queue_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { block_t lblkstart = blkstart; if (!f2fs_bdev_support_discard(bdev)) return; trace_f2fs_queue_discard(bdev, blkstart, blklen); if (f2fs_is_multi_device(sbi)) { int devi = f2fs_target_device_index(sbi, blkstart); blkstart -= FDEV(devi).start_blk; } mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock); __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen); mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock); } static void __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, int *issued) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_cmd *prev_dc = NULL, *next_dc = NULL; struct rb_node **insert_p = NULL, *insert_parent = NULL; struct discard_cmd *dc; struct blk_plug plug; bool io_interrupted = false; mutex_lock(&dcc->cmd_lock); dc = __lookup_discard_cmd_ret(&dcc->root, dcc->next_pos, &prev_dc, &next_dc, &insert_p, &insert_parent); if (!dc) dc = next_dc; blk_start_plug(&plug); while (dc) { struct rb_node *node; int err = 0; if (dc->state != D_PREP) goto next; if (dpolicy->io_aware && !is_idle(sbi, DISCARD_TIME)) { io_interrupted = true; break; } dcc->next_pos = dc->di.lstart + dc->di.len; err = __submit_discard_cmd(sbi, dpolicy, dc, issued); if (*issued >= dpolicy->max_requests) break; next: node = rb_next(&dc->rb_node); if (err) __remove_discard_cmd(sbi, dc); dc = rb_entry_safe(node, struct discard_cmd, rb_node); } blk_finish_plug(&plug); if (!dc) dcc->next_pos = 0; mutex_unlock(&dcc->cmd_lock); if (!(*issued) && io_interrupted) *issued = -1; } static unsigned int __wait_all_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy); static int __issue_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *pend_list; struct discard_cmd *dc, *tmp; struct blk_plug plug; int i, issued; bool io_interrupted = false; if (dpolicy->timeout) f2fs_update_time(sbi, UMOUNT_DISCARD_TIMEOUT); retry: issued = 0; for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { if (dpolicy->timeout && f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT)) break; if (i + 1 < dpolicy->granularity) break; if (i + 1 < dcc->max_ordered_discard && dpolicy->ordered) { __issue_discard_cmd_orderly(sbi, dpolicy, &issued); return issued; } pend_list = &dcc->pend_list[i]; mutex_lock(&dcc->cmd_lock); if (list_empty(pend_list)) goto next; if (unlikely(dcc->rbtree_check)) f2fs_bug_on(sbi, !f2fs_check_discard_tree(sbi)); blk_start_plug(&plug); list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); if (dpolicy->timeout && f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT)) break; if (dpolicy->io_aware && i < dpolicy->io_aware_gran && !is_idle(sbi, DISCARD_TIME)) { io_interrupted = true; break; } __submit_discard_cmd(sbi, dpolicy, dc, &issued); if (issued >= dpolicy->max_requests) break; } blk_finish_plug(&plug); next: mutex_unlock(&dcc->cmd_lock); if (issued >= dpolicy->max_requests || io_interrupted) break; } if (dpolicy->type == DPOLICY_UMOUNT && issued) { __wait_all_discard_cmd(sbi, dpolicy); goto retry; } if (!issued && io_interrupted) issued = -1; return issued; } static bool __drop_discard_cmd(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *pend_list; struct discard_cmd *dc, *tmp; int i; bool dropped = false; mutex_lock(&dcc->cmd_lock); for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { pend_list = &dcc->pend_list[i]; list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); __remove_discard_cmd(sbi, dc); dropped = true; } } mutex_unlock(&dcc->cmd_lock); return dropped; } void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi) { __drop_discard_cmd(sbi); } static unsigned int __wait_one_discard_bio(struct f2fs_sb_info *sbi, struct discard_cmd *dc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; unsigned int len = 0; wait_for_completion_io(&dc->wait); mutex_lock(&dcc->cmd_lock); f2fs_bug_on(sbi, dc->state != D_DONE); dc->ref--; if (!dc->ref) { if (!dc->error) len = dc->di.len; __remove_discard_cmd(sbi, dc); } mutex_unlock(&dcc->cmd_lock); return len; } static unsigned int __wait_discard_cmd_range(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, block_t start, block_t end) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? &(dcc->fstrim_list) : &(dcc->wait_list); struct discard_cmd *dc = NULL, *iter, *tmp; unsigned int trimmed = 0; next: dc = NULL; mutex_lock(&dcc->cmd_lock); list_for_each_entry_safe(iter, tmp, wait_list, list) { if (iter->di.lstart + iter->di.len <= start || end <= iter->di.lstart) continue; if (iter->di.len < dpolicy->granularity) continue; if (iter->state == D_DONE && !iter->ref) { wait_for_completion_io(&iter->wait); if (!iter->error) trimmed += iter->di.len; __remove_discard_cmd(sbi, iter); } else { iter->ref++; dc = iter; break; } } mutex_unlock(&dcc->cmd_lock); if (dc) { trimmed += __wait_one_discard_bio(sbi, dc); goto next; } return trimmed; } static unsigned int __wait_all_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy) { struct discard_policy dp; unsigned int discard_blks; if (dpolicy) return __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX); /* wait all */ __init_discard_policy(sbi, &dp, DPOLICY_FSTRIM, MIN_DISCARD_GRANULARITY); discard_blks = __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); __init_discard_policy(sbi, &dp, DPOLICY_UMOUNT, MIN_DISCARD_GRANULARITY); discard_blks += __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); return discard_blks; } /* This should be covered by global mutex, &sit_i->sentry_lock */ static void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_cmd *dc; bool need_wait = false; mutex_lock(&dcc->cmd_lock); dc = __lookup_discard_cmd(sbi, blkaddr); #ifdef CONFIG_BLK_DEV_ZONED if (dc && f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(dc->bdev)) { int devi = f2fs_bdev_index(sbi, dc->bdev); if (devi < 0) { mutex_unlock(&dcc->cmd_lock); return; } if (f2fs_blkz_is_seq(sbi, devi, dc->di.start)) { /* force submit zone reset */ if (dc->state == D_PREP) __submit_zone_reset_cmd(sbi, dc, REQ_SYNC, &dcc->wait_list, NULL); dc->ref++; mutex_unlock(&dcc->cmd_lock); /* wait zone reset */ __wait_one_discard_bio(sbi, dc); return; } } #endif if (dc) { if (dc->state == D_PREP) { __punch_discard_cmd(sbi, dc, blkaddr); } else { dc->ref++; need_wait = true; } } mutex_unlock(&dcc->cmd_lock); if (need_wait) __wait_one_discard_bio(sbi, dc); } void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; if (dcc && dcc->f2fs_issue_discard) { struct task_struct *discard_thread = dcc->f2fs_issue_discard; dcc->f2fs_issue_discard = NULL; kthread_stop(discard_thread); } } /** * f2fs_issue_discard_timeout() - Issue all discard cmd within UMOUNT_DISCARD_TIMEOUT * @sbi: the f2fs_sb_info data for discard cmd to issue * * When UMOUNT_DISCARD_TIMEOUT is exceeded, all remaining discard commands will be dropped * * Return true if issued all discard cmd or no discard cmd need issue, otherwise return false. */ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_policy dpolicy; bool dropped; if (!atomic_read(&dcc->discard_cmd_cnt)) return true; __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); __issue_discard_cmd(sbi, &dpolicy); dropped = __drop_discard_cmd(sbi); /* just to make sure there is no pending discard commands */ __wait_all_discard_cmd(sbi, NULL); f2fs_bug_on(sbi, atomic_read(&dcc->discard_cmd_cnt)); return !dropped; } static int issue_discard_thread(void *data) { struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = &dcc->discard_wait_queue; struct discard_policy dpolicy; unsigned int wait_ms = dcc->min_discard_issue_time; int issued; set_freezable(); do { wait_event_freezable_timeout(*q, kthread_should_stop() || dcc->discard_wake, msecs_to_jiffies(wait_ms)); if (sbi->gc_mode == GC_URGENT_HIGH || !f2fs_available_free_memory(sbi, DISCARD_CACHE)) __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, MIN_DISCARD_GRANULARITY); else __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, dcc->discard_granularity); if (dcc->discard_wake) dcc->discard_wake = false; /* clean up pending candidates before going to sleep */ if (atomic_read(&dcc->queued_discard)) __wait_all_discard_cmd(sbi, NULL); if (f2fs_readonly(sbi->sb)) continue; if (kthread_should_stop()) return 0; if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) || !atomic_read(&dcc->discard_cmd_cnt)) { wait_ms = dpolicy.max_interval; continue; } sb_start_intwrite(sbi->sb); issued = __issue_discard_cmd(sbi, &dpolicy); if (issued > 0) { __wait_all_discard_cmd(sbi, &dpolicy); wait_ms = dpolicy.min_interval; } else if (issued == -1) { wait_ms = f2fs_time_to_wait(sbi, DISCARD_TIME); if (!wait_ms) wait_ms = dpolicy.mid_interval; } else { wait_ms = dpolicy.max_interval; } if (!atomic_read(&dcc->discard_cmd_cnt)) wait_ms = dpolicy.max_interval; sb_end_intwrite(sbi->sb); } while (!kthread_should_stop()); return 0; } #ifdef CONFIG_BLK_DEV_ZONED static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { sector_t sector, nr_sects; block_t lblkstart = blkstart; int devi = 0; u64 remainder = 0; if (f2fs_is_multi_device(sbi)) { devi = f2fs_target_device_index(sbi, blkstart); if (blkstart < FDEV(devi).start_blk || blkstart > FDEV(devi).end_blk) { f2fs_err(sbi, "Invalid block %x", blkstart); return -EIO; } blkstart -= FDEV(devi).start_blk; } /* For sequential zones, reset the zone write pointer */ if (f2fs_blkz_is_seq(sbi, devi, blkstart)) { sector = SECTOR_FROM_BLOCK(blkstart); nr_sects = SECTOR_FROM_BLOCK(blklen); div64_u64_rem(sector, bdev_zone_sectors(bdev), &remainder); if (remainder || nr_sects != bdev_zone_sectors(bdev)) { f2fs_err(sbi, "(%d) %s: Unaligned zone reset attempted (block %x + %x)", devi, sbi->s_ndevs ? FDEV(devi).path : "", blkstart, blklen); return -EIO; } if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) { unsigned int nofs_flags; int ret; trace_f2fs_issue_reset_zone(bdev, blkstart); nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, sector, nr_sects); memalloc_nofs_restore(nofs_flags); return ret; } __queue_zone_reset_cmd(sbi, bdev, blkstart, lblkstart, blklen); return 0; } /* For conventional zones, use regular discard if supported */ __queue_discard_cmd(sbi, bdev, lblkstart, blklen); return 0; } #endif static int __issue_discard_async(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev)) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif __queue_discard_cmd(sbi, bdev, blkstart, blklen); return 0; } static int f2fs_issue_discard(struct f2fs_sb_info *sbi, block_t blkstart, block_t blklen) { sector_t start = blkstart, len = 0; struct block_device *bdev; struct seg_entry *se; unsigned int offset; block_t i; int err = 0; bdev = f2fs_target_device(sbi, blkstart, NULL); for (i = blkstart; i < blkstart + blklen; i++, len++) { if (i != start) { struct block_device *bdev2 = f2fs_target_device(sbi, i, NULL); if (bdev2 != bdev) { err = __issue_discard_async(sbi, bdev, start, len); if (err) return err; bdev = bdev2; start = i; len = 0; } } se = get_seg_entry(sbi, GET_SEGNO(sbi, i)); offset = GET_BLKOFF_FROM_SEG0(sbi, i); if (f2fs_block_unit_discard(sbi) && !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } if (len) err = __issue_discard_async(sbi, bdev, start, len); return err; } static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, bool check_only) { int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start); unsigned long *cur_map = (unsigned long *)se->cur_valid_map; unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; unsigned long *discard_map = (unsigned long *)se->discard_map; unsigned long *dmap = SIT_I(sbi)->tmp_map; unsigned int start = 0, end = -1; bool force = (cpc->reason & CP_DISCARD); struct discard_entry *de = NULL; struct list_head *head = &SM_I(sbi)->dcc_info->entry_list; int i; if (se->valid_blocks == BLKS_PER_SEG(sbi) || !f2fs_hw_support_discard(sbi) || !f2fs_block_unit_discard(sbi)) return false; if (!force) { if (!f2fs_realtime_discard_enable(sbi) || !se->valid_blocks || SM_I(sbi)->dcc_info->nr_discards >= SM_I(sbi)->dcc_info->max_discards) return false; } /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */ for (i = 0; i < entries; i++) dmap[i] = force ? ~ckpt_map[i] & ~discard_map[i] : (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; while (force || SM_I(sbi)->dcc_info->nr_discards <= SM_I(sbi)->dcc_info->max_discards) { start = __find_rev_next_bit(dmap, BLKS_PER_SEG(sbi), end + 1); if (start >= BLKS_PER_SEG(sbi)) break; end = __find_rev_next_zero_bit(dmap, BLKS_PER_SEG(sbi), start + 1); if (force && start && end != BLKS_PER_SEG(sbi) && (end - start) < cpc->trim_minlen) continue; if (check_only) return true; if (!de) { de = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_F2FS_ZERO, true, NULL); de->start_blkaddr = START_BLOCK(sbi, cpc->trim_start); list_add_tail(&de->list, head); } for (i = start; i < end; i++) __set_bit_le(i, (void *)de->discard_map); SM_I(sbi)->dcc_info->nr_discards += end - start; } return false; } static void release_discard_addr(struct discard_entry *entry) { list_del(&entry->list); kmem_cache_free(discard_entry_slab, entry); } void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi) { struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); struct discard_entry *entry, *this; /* drop caches */ list_for_each_entry_safe(entry, this, head, list) release_discard_addr(entry); } /* * Should call f2fs_clear_prefree_segments after checkpoint is done. */ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int segno; mutex_lock(&dirty_i->seglist_lock); for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi)) __set_test_and_free(sbi, segno, false); mutex_unlock(&dirty_i->seglist_lock); } void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *head = &dcc->entry_list; struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; unsigned int secno, start_segno; bool force = (cpc->reason & CP_DISCARD); bool section_alignment = F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION; if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) section_alignment = true; mutex_lock(&dirty_i->seglist_lock); while (1) { int i; if (section_alignment && end != -1) end--; start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1); if (start >= MAIN_SEGS(sbi)) break; end = find_next_zero_bit(prefree_map, MAIN_SEGS(sbi), start + 1); if (section_alignment) { start = rounddown(start, SEGS_PER_SEC(sbi)); end = roundup(end, SEGS_PER_SEC(sbi)); } for (i = start; i < end; i++) { if (test_and_clear_bit(i, prefree_map)) dirty_i->nr_dirty[PRE]--; } if (!f2fs_realtime_discard_enable(sbi)) continue; if (force && start >= cpc->trim_start && (end - 1) <= cpc->trim_end) continue; /* Should cover 2MB zoned device for zone-based reset */ if (!f2fs_sb_has_blkzoned(sbi) && (!f2fs_lfs_mode(sbi) || !__is_large_section(sbi))) { f2fs_issue_discard(sbi, START_BLOCK(sbi, start), SEGS_TO_BLKS(sbi, end - start)); continue; } next: secno = GET_SEC_FROM_SEG(sbi, start); start_segno = GET_SEG_FROM_SEC(sbi, secno); if (!IS_CURSEC(sbi, secno) && !get_valid_blocks(sbi, start, true)) f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno), BLKS_PER_SEC(sbi)); start = start_segno + SEGS_PER_SEC(sbi); if (start < end) goto next; else end = start - 1; } mutex_unlock(&dirty_i->seglist_lock); if (!f2fs_block_unit_discard(sbi)) goto wakeup; /* send small discards */ list_for_each_entry_safe(entry, this, head, list) { unsigned int cur_pos = 0, next_pos, len, total_len = 0; bool is_valid = test_bit_le(0, entry->discard_map); find_next: if (is_valid) { next_pos = find_next_zero_bit_le(entry->discard_map, BLKS_PER_SEG(sbi), cur_pos); len = next_pos - cur_pos; if (f2fs_sb_has_blkzoned(sbi) || (force && len < cpc->trim_minlen)) goto skip; f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos, len); total_len += len; } else { next_pos = find_next_bit_le(entry->discard_map, BLKS_PER_SEG(sbi), cur_pos); } skip: cur_pos = next_pos; is_valid = !is_valid; if (cur_pos < BLKS_PER_SEG(sbi)) goto find_next; release_discard_addr(entry); dcc->nr_discards -= total_len; } wakeup: wake_up_discard_thread(sbi, false); } int f2fs_start_discard_thread(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; int err = 0; if (f2fs_sb_has_readonly(sbi)) { f2fs_info(sbi, "Skip to start discard thread for readonly image"); return 0; } if (!f2fs_realtime_discard_enable(sbi)) return 0; dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi, "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(dcc->f2fs_issue_discard)) { err = PTR_ERR(dcc->f2fs_issue_discard); dcc->f2fs_issue_discard = NULL; } return err; } static int create_discard_cmd_control(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc; int err = 0, i; if (SM_I(sbi)->dcc_info) { dcc = SM_I(sbi)->dcc_info; goto init_thread; } dcc = f2fs_kzalloc(sbi, sizeof(struct discard_cmd_control), GFP_KERNEL); if (!dcc) return -ENOMEM; dcc->discard_io_aware_gran = MAX_PLIST_NUM; dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY; dcc->discard_io_aware = DPOLICY_IO_AWARE_ENABLE; if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) dcc->discard_granularity = BLKS_PER_SEG(sbi); else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) dcc->discard_granularity = BLKS_PER_SEC(sbi); INIT_LIST_HEAD(&dcc->entry_list); for (i = 0; i < MAX_PLIST_NUM; i++) INIT_LIST_HEAD(&dcc->pend_list[i]); INIT_LIST_HEAD(&dcc->wait_list); INIT_LIST_HEAD(&dcc->fstrim_list); mutex_init(&dcc->cmd_lock); atomic_set(&dcc->issued_discard, 0); atomic_set(&dcc->queued_discard, 0); atomic_set(&dcc->discard_cmd_cnt, 0); dcc->nr_discards = 0; dcc->max_discards = SEGS_TO_BLKS(sbi, MAIN_SEGS(sbi)); dcc->max_discard_request = DEF_MAX_DISCARD_REQUEST; dcc->min_discard_issue_time = DEF_MIN_DISCARD_ISSUE_TIME; dcc->mid_discard_issue_time = DEF_MID_DISCARD_ISSUE_TIME; dcc->max_discard_issue_time = DEF_MAX_DISCARD_ISSUE_TIME; dcc->discard_urgent_util = DEF_DISCARD_URGENT_UTIL; dcc->undiscard_blks = 0; dcc->next_pos = 0; dcc->root = RB_ROOT_CACHED; dcc->rbtree_check = false; init_waitqueue_head(&dcc->discard_wait_queue); SM_I(sbi)->dcc_info = dcc; init_thread: err = f2fs_start_discard_thread(sbi); if (err) { kfree(dcc); SM_I(sbi)->dcc_info = NULL; } return err; } static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; if (!dcc) return; f2fs_stop_discard_thread(sbi); /* * Recovery can cache discard commands, so in error path of * fill_super(), it needs to give a chance to handle them. */ f2fs_issue_discard_timeout(sbi); kfree(dcc); SM_I(sbi)->dcc_info = NULL; } static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap)) { sit_i->dirty_sentries++; return false; } return true; } static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type, unsigned int segno, int modified) { struct seg_entry *se = get_seg_entry(sbi, segno); se->type = type; if (modified) __mark_sit_entry_dirty(sbi, segno); } static inline unsigned long long get_segment_mtime(struct f2fs_sb_info *sbi, block_t blkaddr) { unsigned int segno = GET_SEGNO(sbi, blkaddr); if (segno == NULL_SEGNO) return 0; return get_seg_entry(sbi, segno)->mtime; } static void update_segment_mtime(struct f2fs_sb_info *sbi, block_t blkaddr, unsigned long long old_mtime) { struct seg_entry *se; unsigned int segno = GET_SEGNO(sbi, blkaddr); unsigned long long ctime = get_mtime(sbi, false); unsigned long long mtime = old_mtime ? old_mtime : ctime; if (segno == NULL_SEGNO) return; se = get_seg_entry(sbi, segno); if (!se->mtime) se->mtime = mtime; else se->mtime = div_u64(se->mtime * se->valid_blocks + mtime, se->valid_blocks + 1); if (ctime > SIT_I(sbi)->max_mtime) SIT_I(sbi)->max_mtime = ctime; } /* * NOTE: when updating multiple blocks at the same time, please ensure * that the consecutive input blocks belong to the same segment. */ static int update_sit_entry_for_release(struct f2fs_sb_info *sbi, struct seg_entry *se, block_t blkaddr, unsigned int offset, int del) { bool exist; #ifdef CONFIG_F2FS_CHECK_FS bool mir_exist; #endif int i; int del_count = -del; f2fs_bug_on(sbi, GET_SEGNO(sbi, blkaddr) != GET_SEGNO(sbi, blkaddr + del_count - 1)); for (i = 0; i < del_count; i++) { exist = f2fs_test_and_clear_bit(offset + i, se->cur_valid_map); #ifdef CONFIG_F2FS_CHECK_FS mir_exist = f2fs_test_and_clear_bit(offset + i, se->cur_valid_map_mir); if (unlikely(exist != mir_exist)) { f2fs_err(sbi, "Inconsistent error when clearing bitmap, blk:%u, old bit:%d", blkaddr + i, exist); f2fs_bug_on(sbi, 1); } #endif if (unlikely(!exist)) { f2fs_err(sbi, "Bitmap was wrongly cleared, blk:%u", blkaddr + i); f2fs_bug_on(sbi, 1); se->valid_blocks++; del += 1; } else if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { /* * If checkpoints are off, we must not reuse data that * was used in the previous checkpoint. If it was used * before, we must track that to know how much space we * really have. */ if (f2fs_test_bit(offset + i, se->ckpt_valid_map)) { spin_lock(&sbi->stat_lock); sbi->unusable_block_count++; spin_unlock(&sbi->stat_lock); } } if (f2fs_block_unit_discard(sbi) && f2fs_test_and_clear_bit(offset + i, se->discard_map)) sbi->discard_blks++; if (!f2fs_test_bit(offset + i, se->ckpt_valid_map)) se->ckpt_valid_blocks -= 1; } return del; } static int update_sit_entry_for_alloc(struct f2fs_sb_info *sbi, struct seg_entry *se, block_t blkaddr, unsigned int offset, int del) { bool exist; #ifdef CONFIG_F2FS_CHECK_FS bool mir_exist; #endif exist = f2fs_test_and_set_bit(offset, se->cur_valid_map); #ifdef CONFIG_F2FS_CHECK_FS mir_exist = f2fs_test_and_set_bit(offset, se->cur_valid_map_mir); if (unlikely(exist != mir_exist)) { f2fs_err(sbi, "Inconsistent error when setting bitmap, blk:%u, old bit:%d", blkaddr, exist); f2fs_bug_on(sbi, 1); } #endif if (unlikely(exist)) { f2fs_err(sbi, "Bitmap was wrongly set, blk:%u", blkaddr); f2fs_bug_on(sbi, 1); se->valid_blocks--; del = 0; } if (f2fs_block_unit_discard(sbi) && !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; /* * SSR should never reuse block which is checkpointed * or newly invalidated. */ if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map)) se->ckpt_valid_blocks++; } if (!f2fs_test_bit(offset, se->ckpt_valid_map)) se->ckpt_valid_blocks += del; return del; } /* * If releasing blocks, this function supports updating multiple consecutive blocks * at one time, but please note that these consecutive blocks need to belong to the * same segment. */ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) { struct seg_entry *se; unsigned int segno, offset; long int new_vblocks; segno = GET_SEGNO(sbi, blkaddr); if (segno == NULL_SEGNO) return; se = get_seg_entry(sbi, segno); new_vblocks = se->valid_blocks + del; offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); f2fs_bug_on(sbi, (new_vblocks < 0 || (new_vblocks > f2fs_usable_blks_in_seg(sbi, segno)))); se->valid_blocks = new_vblocks; /* Update valid block bitmap */ if (del > 0) { del = update_sit_entry_for_alloc(sbi, se, blkaddr, offset, del); } else { del = update_sit_entry_for_release(sbi, se, blkaddr, offset, del); } __mark_sit_entry_dirty(sbi, segno); /* update total number of valid blocks to be written in ckpt area */ SIT_I(sbi)->written_valid_blocks += del; if (__is_large_section(sbi)) get_sec_entry(sbi, segno)->valid_blocks += del; } void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr, unsigned int len) { unsigned int segno = GET_SEGNO(sbi, addr); struct sit_info *sit_i = SIT_I(sbi); block_t addr_start = addr, addr_end = addr + len - 1; unsigned int seg_num = GET_SEGNO(sbi, addr_end) - segno + 1; unsigned int i = 1, max_blocks = sbi->blocks_per_seg, cnt; f2fs_bug_on(sbi, addr == NULL_ADDR); if (addr == NEW_ADDR || addr == COMPRESS_ADDR) return; f2fs_invalidate_internal_cache(sbi, addr, len); /* add it into sit main buffer */ down_write(&sit_i->sentry_lock); if (seg_num == 1) cnt = len; else cnt = max_blocks - GET_BLKOFF_FROM_SEG0(sbi, addr); do { update_segment_mtime(sbi, addr_start, 0); update_sit_entry(sbi, addr_start, -cnt); /* add it into dirty seglist */ locate_dirty_segment(sbi, segno); /* update @addr_start and @cnt and @segno */ addr_start = START_BLOCK(sbi, ++segno); if (++i == seg_num) cnt = GET_BLKOFF_FROM_SEG0(sbi, addr_end) + 1; else cnt = max_blocks; } while (i <= seg_num); up_write(&sit_i->sentry_lock); } bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) { struct sit_info *sit_i = SIT_I(sbi); unsigned int segno, offset; struct seg_entry *se; bool is_cp = false; if (!__is_valid_data_blkaddr(blkaddr)) return true; down_read(&sit_i->sentry_lock); segno = GET_SEGNO(sbi, blkaddr); se = get_seg_entry(sbi, segno); offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); if (f2fs_test_bit(offset, se->ckpt_valid_map)) is_cp = true; up_read(&sit_i->sentry_lock); return is_cp; } static unsigned short f2fs_curseg_valid_blocks(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); if (sbi->ckpt->alloc_type[type] == SSR) return BLKS_PER_SEG(sbi); return curseg->next_blkoff; } /* * Calculate the number of current summary pages for writing */ int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) { int valid_sum_count = 0; int i, sum_in_page; for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { if (sbi->ckpt->alloc_type[i] != SSR && for_ra) valid_sum_count += le16_to_cpu(F2FS_CKPT(sbi)->cur_data_blkoff[i]); else valid_sum_count += f2fs_curseg_valid_blocks(sbi, i); } sum_in_page = (PAGE_SIZE - 2 * SUM_JOURNAL_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE; if (valid_sum_count <= sum_in_page) return 1; else if ((valid_sum_count - sum_in_page) <= (PAGE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE) return 2; return 3; } /* * Caller should put this summary page */ struct page *f2fs_get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno) { if (unlikely(f2fs_cp_error(sbi))) return ERR_PTR(-EIO); return f2fs_get_meta_page_retry(sbi, GET_SUM_BLOCK(sbi, segno)); } void f2fs_update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr) { struct page *page = f2fs_grab_meta_page(sbi, blk_addr); memcpy(page_address(page), src, PAGE_SIZE); set_page_dirty(page); f2fs_put_page(page, 1); } static void write_sum_page(struct f2fs_sb_info *sbi, struct f2fs_summary_block *sum_blk, block_t blk_addr) { f2fs_update_meta_page(sbi, (void *)sum_blk, blk_addr); } static void write_current_sum_page(struct f2fs_sb_info *sbi, int type, block_t blk_addr) { struct curseg_info *curseg = CURSEG_I(sbi, type); struct page *page = f2fs_grab_meta_page(sbi, blk_addr); struct f2fs_summary_block *src = curseg->sum_blk; struct f2fs_summary_block *dst; dst = (struct f2fs_summary_block *)page_address(page); memset(dst, 0, PAGE_SIZE); mutex_lock(&curseg->curseg_mutex); down_read(&curseg->journal_rwsem); memcpy(&dst->journal, curseg->journal, SUM_JOURNAL_SIZE); up_read(&curseg->journal_rwsem); memcpy(dst->entries, src->entries, SUM_ENTRY_SIZE); memcpy(&dst->footer, &src->footer, SUM_FOOTER_SIZE); mutex_unlock(&curseg->curseg_mutex); set_page_dirty(page); f2fs_put_page(page, 1); } static int is_next_segment_free(struct f2fs_sb_info *sbi, struct curseg_info *curseg) { unsigned int segno = curseg->segno + 1; struct free_segmap_info *free_i = FREE_I(sbi); if (segno < MAIN_SEGS(sbi) && segno % SEGS_PER_SEC(sbi)) return !test_bit(segno, free_i->free_segmap); return 0; } /* * Find a new segment from the free segments bitmap to right order * This function should be returned with success, otherwise BUG */ static int get_new_segment(struct f2fs_sb_info *sbi, unsigned int *newseg, bool new_sec, bool pinning) { struct free_segmap_info *free_i = FREE_I(sbi); unsigned int segno, secno, zoneno; unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone; unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg); unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg); bool init = true; int i; int ret = 0; spin_lock(&free_i->segmap_lock); if (time_to_inject(sbi, FAULT_NO_SEGMENT)) { ret = -ENOSPC; goto out_unlock; } if (!new_sec && ((*newseg + 1) % SEGS_PER_SEC(sbi))) { segno = find_next_zero_bit(free_i->free_segmap, GET_SEG_FROM_SEC(sbi, hint + 1), *newseg + 1); if (segno < GET_SEG_FROM_SEC(sbi, hint + 1)) goto got_it; } #ifdef CONFIG_BLK_DEV_ZONED /* * If we format f2fs on zoned storage, let's try to get pinned sections * from beginning of the storage, which should be a conventional one. */ if (f2fs_sb_has_blkzoned(sbi)) { /* Prioritize writing to conventional zones */ if (sbi->blkzone_alloc_policy == BLKZONE_ALLOC_PRIOR_CONV || pinning) segno = 0; else segno = max(sbi->first_zoned_segno, *newseg); hint = GET_SEC_FROM_SEG(sbi, segno); } #endif find_other_zone: secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint); #ifdef CONFIG_BLK_DEV_ZONED if (secno >= MAIN_SECS(sbi) && f2fs_sb_has_blkzoned(sbi)) { /* Write only to sequential zones */ if (sbi->blkzone_alloc_policy == BLKZONE_ALLOC_ONLY_SEQ) { hint = GET_SEC_FROM_SEG(sbi, sbi->first_zoned_segno); secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint); } else secno = find_first_zero_bit(free_i->free_secmap, MAIN_SECS(sbi)); if (secno >= MAIN_SECS(sbi)) { ret = -ENOSPC; f2fs_bug_on(sbi, 1); goto out_unlock; } } #endif if (secno >= MAIN_SECS(sbi)) { secno = find_first_zero_bit(free_i->free_secmap, MAIN_SECS(sbi)); if (secno >= MAIN_SECS(sbi)) { ret = -ENOSPC; f2fs_bug_on(sbi, 1); goto out_unlock; } } segno = GET_SEG_FROM_SEC(sbi, secno); zoneno = GET_ZONE_FROM_SEC(sbi, secno); /* give up on finding another zone */ if (!init) goto got_it; if (sbi->secs_per_zone == 1) goto got_it; if (zoneno == old_zoneno) goto got_it; for (i = 0; i < NR_CURSEG_TYPE; i++) if (CURSEG_I(sbi, i)->zone == zoneno) break; if (i < NR_CURSEG_TYPE) { /* zone is in user, try another */ if (zoneno + 1 >= total_zones) hint = 0; else hint = (zoneno + 1) * sbi->secs_per_zone; init = false; goto find_other_zone; } got_it: /* set it as dirty segment in free segmap */ f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap)); /* no free section in conventional zone */ if (new_sec && pinning && !f2fs_valid_pinned_area(sbi, START_BLOCK(sbi, segno))) { ret = -EAGAIN; goto out_unlock; } __set_inuse(sbi, segno); *newseg = segno; out_unlock: spin_unlock(&free_i->segmap_lock); if (ret == -ENOSPC) f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_NO_SEGMENT); return ret; } static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) { struct curseg_info *curseg = CURSEG_I(sbi, type); struct summary_footer *sum_footer; unsigned short seg_type = curseg->seg_type; /* only happen when get_new_segment() fails */ if (curseg->next_segno == NULL_SEGNO) return; curseg->inited = true; curseg->segno = curseg->next_segno; curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno); curseg->next_blkoff = 0; curseg->next_segno = NULL_SEGNO; sum_footer = &(curseg->sum_blk->footer); memset(sum_footer, 0, sizeof(struct summary_footer)); sanity_check_seg_type(sbi, seg_type); if (IS_DATASEG(seg_type)) SET_SUM_TYPE(sum_footer, SUM_TYPE_DATA); if (IS_NODESEG(seg_type)) SET_SUM_TYPE(sum_footer, SUM_TYPE_NODE); __set_sit_entry_type(sbi, seg_type, curseg->segno, modified); } static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned short seg_type = curseg->seg_type; sanity_check_seg_type(sbi, seg_type); if (__is_large_section(sbi)) { if (f2fs_need_rand_seg(sbi)) { unsigned int hint = GET_SEC_FROM_SEG(sbi, curseg->segno); if (GET_SEC_FROM_SEG(sbi, curseg->segno + 1) != hint) return curseg->segno; return get_random_u32_inclusive(curseg->segno + 1, GET_SEG_FROM_SEC(sbi, hint + 1) - 1); } return curseg->segno; } else if (f2fs_need_rand_seg(sbi)) { return get_random_u32_below(MAIN_SECS(sbi) * SEGS_PER_SEC(sbi)); } /* inmem log may not locate on any segment after mount */ if (!curseg->inited) return 0; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return 0; if (seg_type == CURSEG_HOT_DATA || IS_NODESEG(seg_type)) return 0; if (SIT_I(sbi)->last_victim[ALLOC_NEXT]) return SIT_I(sbi)->last_victim[ALLOC_NEXT]; /* find segments from 0 to reuse freed segments */ if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE) return 0; return curseg->segno; } /* * Allocate a current working segment. * This function always allocates a free segment in LFS manner. */ static int new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int segno = curseg->segno; bool pinning = type == CURSEG_COLD_DATA_PINNED; int ret; if (curseg->inited) write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, segno)); segno = __get_next_segno(sbi, type); ret = get_new_segment(sbi, &segno, new_sec, pinning); if (ret) { if (ret == -ENOSPC) curseg->segno = NULL_SEGNO; return ret; } curseg->next_segno = segno; reset_curseg(sbi, type, 1); curseg->alloc_type = LFS; if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) curseg->fragment_remained_chunk = get_random_u32_inclusive(1, sbi->max_fragment_chunk); return 0; } static int __next_free_blkoff(struct f2fs_sb_info *sbi, int segno, block_t start) { struct seg_entry *se = get_seg_entry(sbi, segno); int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); unsigned long *target_map = SIT_I(sbi)->tmp_map; unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; unsigned long *cur_map = (unsigned long *)se->cur_valid_map; int i; for (i = 0; i < entries; i++) target_map[i] = ckpt_map[i] | cur_map[i]; return __find_rev_next_zero_bit(target_map, BLKS_PER_SEG(sbi), start); } static int f2fs_find_next_ssr_block(struct f2fs_sb_info *sbi, struct curseg_info *seg) { return __next_free_blkoff(sbi, seg->segno, seg->next_blkoff + 1); } bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno) { return __next_free_blkoff(sbi, segno, 0) < BLKS_PER_SEG(sbi); } /* * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ static int change_curseg(struct f2fs_sb_info *sbi, int type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int new_segno = curseg->next_segno; struct f2fs_summary_block *sum_node; struct page *sum_page; if (curseg->inited) write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno)); __set_test_and_inuse(sbi, new_segno); mutex_lock(&dirty_i->seglist_lock); __remove_dirty_segment(sbi, new_segno, PRE); __remove_dirty_segment(sbi, new_segno, DIRTY); mutex_unlock(&dirty_i->seglist_lock); reset_curseg(sbi, type, 1); curseg->alloc_type = SSR; curseg->next_blkoff = __next_free_blkoff(sbi, curseg->segno, 0); sum_page = f2fs_get_sum_page(sbi, new_segno); if (IS_ERR(sum_page)) { /* GC won't be able to use stale summary pages by cp_error */ memset(curseg->sum_blk, 0, SUM_ENTRY_SIZE); return PTR_ERR(sum_page); } sum_node = (struct f2fs_summary_block *)page_address(sum_page); memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); f2fs_put_page(sum_page, 1); return 0; } static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, int alloc_mode, unsigned long long age); static int get_atssr_segment(struct f2fs_sb_info *sbi, int type, int target_type, int alloc_mode, unsigned long long age) { struct curseg_info *curseg = CURSEG_I(sbi, type); int ret = 0; curseg->seg_type = target_type; if (get_ssr_segment(sbi, type, alloc_mode, age)) { struct seg_entry *se = get_seg_entry(sbi, curseg->next_segno); curseg->seg_type = se->type; ret = change_curseg(sbi, type); } else { /* allocate cold segment by default */ curseg->seg_type = CURSEG_COLD_DATA; ret = new_curseg(sbi, type, true); } stat_inc_seg_type(sbi, curseg); return ret; } static int __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi, bool force) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC); int ret = 0; if (!sbi->am.atgc_enabled && !force) return 0; f2fs_down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); down_write(&SIT_I(sbi)->sentry_lock); ret = get_atssr_segment(sbi, CURSEG_ALL_DATA_ATGC, CURSEG_COLD_DATA, SSR, 0); up_write(&SIT_I(sbi)->sentry_lock); mutex_unlock(&curseg->curseg_mutex); f2fs_up_read(&SM_I(sbi)->curseg_lock); return ret; } int f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi) { return __f2fs_init_atgc_curseg(sbi, false); } int f2fs_reinit_atgc_curseg(struct f2fs_sb_info *sbi) { int ret; if (!test_opt(sbi, ATGC)) return 0; if (sbi->am.atgc_enabled) return 0; if (le64_to_cpu(F2FS_CKPT(sbi)->elapsed_time) < sbi->am.age_threshold) return 0; ret = __f2fs_init_atgc_curseg(sbi, true); if (!ret) { sbi->am.atgc_enabled = true; f2fs_info(sbi, "reenabled age threshold GC"); } return ret; } static void __f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); if (!curseg->inited) goto out; if (get_valid_blocks(sbi, curseg->segno, false)) { write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno)); } else { mutex_lock(&DIRTY_I(sbi)->seglist_lock); __set_test_and_free(sbi, curseg->segno, true); mutex_unlock(&DIRTY_I(sbi)->seglist_lock); } out: mutex_unlock(&curseg->curseg_mutex); } void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi) { __f2fs_save_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED); if (sbi->am.atgc_enabled) __f2fs_save_inmem_curseg(sbi, CURSEG_ALL_DATA_ATGC); } static void __f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); if (!curseg->inited) goto out; if (get_valid_blocks(sbi, curseg->segno, false)) goto out; mutex_lock(&DIRTY_I(sbi)->seglist_lock); __set_test_and_inuse(sbi, curseg->segno); mutex_unlock(&DIRTY_I(sbi)->seglist_lock); out: mutex_unlock(&curseg->curseg_mutex); } void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi) { __f2fs_restore_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED); if (sbi->am.atgc_enabled) __f2fs_restore_inmem_curseg(sbi, CURSEG_ALL_DATA_ATGC); } static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, int alloc_mode, unsigned long long age) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned segno = NULL_SEGNO; unsigned short seg_type = curseg->seg_type; int i, cnt; bool reversed = false; sanity_check_seg_type(sbi, seg_type); /* f2fs_need_SSR() already forces to do this */ if (!f2fs_get_victim(sbi, &segno, BG_GC, seg_type, alloc_mode, age, false)) { curseg->next_segno = segno; return 1; } /* For node segments, let's do SSR more intensively */ if (IS_NODESEG(seg_type)) { if (seg_type >= CURSEG_WARM_NODE) { reversed = true; i = CURSEG_COLD_NODE; } else { i = CURSEG_HOT_NODE; } cnt = NR_CURSEG_NODE_TYPE; } else { if (seg_type >= CURSEG_WARM_DATA) { reversed = true; i = CURSEG_COLD_DATA; } else { i = CURSEG_HOT_DATA; } cnt = NR_CURSEG_DATA_TYPE; } for (; cnt-- > 0; reversed ? i-- : i++) { if (i == seg_type) continue; if (!f2fs_get_victim(sbi, &segno, BG_GC, i, alloc_mode, age, false)) { curseg->next_segno = segno; return 1; } } /* find valid_blocks=0 in dirty list */ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { segno = get_free_segment(sbi); if (segno != NULL_SEGNO) { curseg->next_segno = segno; return 1; } } return 0; } static bool need_new_seg(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && curseg->seg_type == CURSEG_WARM_NODE) return true; if (curseg->alloc_type == LFS && is_next_segment_free(sbi, curseg) && likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return true; if (!f2fs_need_SSR(sbi) || !get_ssr_segment(sbi, type, SSR, 0)) return true; return false; } int f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, unsigned int start, unsigned int end) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int segno; int ret = 0; f2fs_down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); down_write(&SIT_I(sbi)->sentry_lock); segno = CURSEG_I(sbi, type)->segno; if (segno < start || segno > end) goto unlock; if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type, SSR, 0)) ret = change_curseg(sbi, type); else ret = new_curseg(sbi, type, true); stat_inc_seg_type(sbi, curseg); locate_dirty_segment(sbi, segno); unlock: up_write(&SIT_I(sbi)->sentry_lock); if (segno != curseg->segno) f2fs_notice(sbi, "For resize: curseg of type %d: %u ==> %u", type, segno, curseg->segno); mutex_unlock(&curseg->curseg_mutex); f2fs_up_read(&SM_I(sbi)->curseg_lock); return ret; } static int __allocate_new_segment(struct f2fs_sb_info *sbi, int type, bool new_sec, bool force) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int old_segno; int err = 0; if (type == CURSEG_COLD_DATA_PINNED && !curseg->inited) goto allocate; if (!force && curseg->inited && !curseg->next_blkoff && !get_valid_blocks(sbi, curseg->segno, new_sec) && !get_ckpt_valid_blocks(sbi, curseg->segno, new_sec)) return 0; allocate: old_segno = curseg->segno; err = new_curseg(sbi, type, true); if (err) return err; stat_inc_seg_type(sbi, curseg); locate_dirty_segment(sbi, old_segno); return 0; } int f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force) { int ret; f2fs_down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); ret = __allocate_new_segment(sbi, type, true, force); up_write(&SIT_I(sbi)->sentry_lock); f2fs_up_read(&SM_I(sbi)->curseg_lock); return ret; } int f2fs_allocate_pinning_section(struct f2fs_sb_info *sbi) { int err; bool gc_required = true; retry: f2fs_lock_op(sbi); err = f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false); f2fs_unlock_op(sbi); if (f2fs_sb_has_blkzoned(sbi) && err == -EAGAIN && gc_required) { f2fs_down_write(&sbi->gc_lock); err = f2fs_gc_range(sbi, 0, GET_SEGNO(sbi, FDEV(0).end_blk), true, ZONED_PIN_SEC_REQUIRED_COUNT); f2fs_up_write(&sbi->gc_lock); gc_required = false; if (!err) goto retry; } return err; } int f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) { int i; int err = 0; f2fs_down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) err += __allocate_new_segment(sbi, i, false, false); up_write(&SIT_I(sbi)->sentry_lock); f2fs_up_read(&SM_I(sbi)->curseg_lock); return err; } bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) { __u64 trim_start = cpc->trim_start; bool has_candidate = false; down_write(&SIT_I(sbi)->sentry_lock); for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) { if (add_discard_addrs(sbi, cpc, true)) { has_candidate = true; break; } } up_write(&SIT_I(sbi)->sentry_lock); cpc->trim_start = trim_start; return has_candidate; } static unsigned int __issue_discard_cmd_range(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, unsigned int start, unsigned int end) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_cmd *prev_dc = NULL, *next_dc = NULL; struct rb_node **insert_p = NULL, *insert_parent = NULL; struct discard_cmd *dc; struct blk_plug plug; int issued; unsigned int trimmed = 0; next: issued = 0; mutex_lock(&dcc->cmd_lock); if (unlikely(dcc->rbtree_check)) f2fs_bug_on(sbi, !f2fs_check_discard_tree(sbi)); dc = __lookup_discard_cmd_ret(&dcc->root, start, &prev_dc, &next_dc, &insert_p, &insert_parent); if (!dc) dc = next_dc; blk_start_plug(&plug); while (dc && dc->di.lstart <= end) { struct rb_node *node; int err = 0; if (dc->di.len < dpolicy->granularity) goto skip; if (dc->state != D_PREP) { list_move_tail(&dc->list, &dcc->fstrim_list); goto skip; } err = __submit_discard_cmd(sbi, dpolicy, dc, &issued); if (issued >= dpolicy->max_requests) { start = dc->di.lstart + dc->di.len; if (err) __remove_discard_cmd(sbi, dc); blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); trimmed += __wait_all_discard_cmd(sbi, NULL); f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto next; } skip: node = rb_next(&dc->rb_node); if (err) __remove_discard_cmd(sbi, dc); dc = rb_entry_safe(node, struct discard_cmd, rb_node); if (fatal_signal_pending(current)) break; } blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); return trimmed; } int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) { __u64 start = F2FS_BYTES_TO_BLK(range->start); __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1; unsigned int start_segno, end_segno; block_t start_block, end_block; struct cp_control cpc; struct discard_policy dpolicy; unsigned long long trimmed = 0; int err = 0; bool need_align = f2fs_lfs_mode(sbi) && __is_large_section(sbi); if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) return -EINVAL; if (end < MAIN_BLKADDR(sbi)) goto out; if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { f2fs_warn(sbi, "Found FS corruption, run fsck to fix."); return -EFSCORRUPTED; } /* start/end segment number in main_area */ start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start); end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 : GET_SEGNO(sbi, end); if (need_align) { start_segno = rounddown(start_segno, SEGS_PER_SEC(sbi)); end_segno = roundup(end_segno + 1, SEGS_PER_SEC(sbi)) - 1; } cpc.reason = CP_DISCARD; cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen)); cpc.trim_start = start_segno; cpc.trim_end = end_segno; if (sbi->discard_blks == 0) goto out; f2fs_down_write(&sbi->gc_lock); stat_inc_cp_call_count(sbi, TOTAL_CALL); err = f2fs_write_checkpoint(sbi, &cpc); f2fs_up_write(&sbi->gc_lock); if (err) goto out; /* * We filed discard candidates, but actually we don't need to wait for * all of them, since they'll be issued in idle time along with runtime * discard option. User configuration looks like using runtime discard * or periodic fstrim instead of it. */ if (f2fs_realtime_discard_enable(sbi)) goto out; start_block = START_BLOCK(sbi, start_segno); end_block = START_BLOCK(sbi, end_segno + 1); __init_discard_policy(sbi, &dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); trimmed = __issue_discard_cmd_range(sbi, &dpolicy, start_block, end_block); trimmed += __wait_discard_cmd_range(sbi, &dpolicy, start_block, end_block); out: if (!err) range->len = F2FS_BLK_TO_BYTES(trimmed); return err; } int f2fs_rw_hint_to_seg_type(struct f2fs_sb_info *sbi, enum rw_hint hint) { if (F2FS_OPTION(sbi).active_logs == 2) return CURSEG_HOT_DATA; else if (F2FS_OPTION(sbi).active_logs == 4) return CURSEG_COLD_DATA; /* active_log == 6 */ switch (hint) { case WRITE_LIFE_SHORT: return CURSEG_HOT_DATA; case WRITE_LIFE_EXTREME: return CURSEG_COLD_DATA; default: return CURSEG_WARM_DATA; } } /* * This returns write hints for each segment type. This hints will be * passed down to block layer as below by default. * * User F2FS Block * ---- ---- ----- * META WRITE_LIFE_NONE|REQ_META * HOT_NODE WRITE_LIFE_NONE * WARM_NODE WRITE_LIFE_MEDIUM * COLD_NODE WRITE_LIFE_LONG * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME * extension list " " * * -- buffered io * COLD_DATA WRITE_LIFE_EXTREME * HOT_DATA WRITE_LIFE_SHORT * WARM_DATA WRITE_LIFE_NOT_SET * * -- direct io * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET * WRITE_LIFE_NONE " WRITE_LIFE_NONE * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM * WRITE_LIFE_LONG " WRITE_LIFE_LONG */ enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, enum page_type type, enum temp_type temp) { switch (type) { case DATA: switch (temp) { case WARM: return WRITE_LIFE_NOT_SET; case HOT: return WRITE_LIFE_SHORT; case COLD: return WRITE_LIFE_EXTREME; default: return WRITE_LIFE_NONE; } case NODE: switch (temp) { case WARM: return WRITE_LIFE_MEDIUM; case HOT: return WRITE_LIFE_NONE; case COLD: return WRITE_LIFE_LONG; default: return WRITE_LIFE_NONE; } case META: return WRITE_LIFE_NONE; default: return WRITE_LIFE_NONE; } } static int __get_segment_type_2(struct f2fs_io_info *fio) { if (fio->type == DATA) return CURSEG_HOT_DATA; else return CURSEG_HOT_NODE; } static int __get_segment_type_4(struct f2fs_io_info *fio) { if (fio->type == DATA) { struct inode *inode = fio->page->mapping->host; if (S_ISDIR(inode->i_mode)) return CURSEG_HOT_DATA; else return CURSEG_COLD_DATA; } else { if (IS_DNODE(fio->page) && is_cold_node(fio->page)) return CURSEG_WARM_NODE; else return CURSEG_COLD_NODE; } } static int __get_age_segment_type(struct inode *inode, pgoff_t pgofs) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_info ei = {}; if (f2fs_lookup_age_extent_cache(inode, pgofs, &ei)) { if (!ei.age) return NO_CHECK_TYPE; if (ei.age <= sbi->hot_data_age_threshold) return CURSEG_HOT_DATA; if (ei.age <= sbi->warm_data_age_threshold) return CURSEG_WARM_DATA; return CURSEG_COLD_DATA; } return NO_CHECK_TYPE; } static int __get_segment_type_6(struct f2fs_io_info *fio) { if (fio->type == DATA) { struct inode *inode = fio->page->mapping->host; int type; if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) return CURSEG_COLD_DATA_PINNED; if (page_private_gcing(fio->page)) { if (fio->sbi->am.atgc_enabled && (fio->io_type == FS_DATA_IO) && (fio->sbi->gc_mode != GC_URGENT_HIGH) && __is_valid_data_blkaddr(fio->old_blkaddr) && !is_inode_flag_set(inode, FI_OPU_WRITE)) return CURSEG_ALL_DATA_ATGC; else return CURSEG_COLD_DATA; } if (file_is_cold(inode) || f2fs_need_compress_data(inode)) return CURSEG_COLD_DATA; type = __get_age_segment_type(inode, page_folio(fio->page)->index); if (type != NO_CHECK_TYPE) return type; if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || f2fs_is_cow_file(inode)) return CURSEG_HOT_DATA; return f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode), inode->i_write_hint); } else { if (IS_DNODE(fio->page)) return is_cold_node(fio->page) ? CURSEG_WARM_NODE : CURSEG_HOT_NODE; return CURSEG_COLD_NODE; } } enum temp_type f2fs_get_segment_temp(struct f2fs_sb_info *sbi, enum log_type type) { struct curseg_info *curseg = CURSEG_I(sbi, type); enum temp_type temp = COLD; switch (curseg->seg_type) { case CURSEG_HOT_NODE: case CURSEG_HOT_DATA: temp = HOT; break; case CURSEG_WARM_NODE: case CURSEG_WARM_DATA: temp = WARM; break; case CURSEG_COLD_NODE: case CURSEG_COLD_DATA: temp = COLD; break; default: f2fs_bug_on(sbi, 1); } return temp; } static int __get_segment_type(struct f2fs_io_info *fio) { enum log_type type = CURSEG_HOT_DATA; switch (F2FS_OPTION(fio->sbi).active_logs) { case 2: type = __get_segment_type_2(fio); break; case 4: type = __get_segment_type_4(fio); break; case 6: type = __get_segment_type_6(fio); break; default: f2fs_bug_on(fio->sbi, true); } fio->temp = f2fs_get_segment_temp(fio->sbi, type); return type; } static void f2fs_randomize_chunk(struct f2fs_sb_info *sbi, struct curseg_info *seg) { /* To allocate block chunks in different sizes, use random number */ if (--seg->fragment_remained_chunk > 0) return; seg->fragment_remained_chunk = get_random_u32_inclusive(1, sbi->max_fragment_chunk); seg->next_blkoff += get_random_u32_inclusive(1, sbi->max_fragment_hole); } static void reset_curseg_fields(struct curseg_info *curseg) { curseg->inited = false; curseg->segno = NULL_SEGNO; curseg->next_segno = 0; } int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, struct f2fs_io_info *fio) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned long long old_mtime; bool from_gc = (type == CURSEG_ALL_DATA_ATGC); struct seg_entry *se = NULL; bool segment_full = false; int ret = 0; f2fs_down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); down_write(&sit_i->sentry_lock); if (curseg->segno == NULL_SEGNO) { ret = -ENOSPC; goto out_err; } if (from_gc) { f2fs_bug_on(sbi, GET_SEGNO(sbi, old_blkaddr) == NULL_SEGNO); se = get_seg_entry(sbi, GET_SEGNO(sbi, old_blkaddr)); sanity_check_seg_type(sbi, se->type); f2fs_bug_on(sbi, IS_NODESEG(se->type)); } *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); f2fs_bug_on(sbi, curseg->next_blkoff >= BLKS_PER_SEG(sbi)); f2fs_wait_discard_bio(sbi, *new_blkaddr); curseg->sum_blk->entries[curseg->next_blkoff] = *sum; if (curseg->alloc_type == SSR) { curseg->next_blkoff = f2fs_find_next_ssr_block(sbi, curseg); } else { curseg->next_blkoff++; if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) f2fs_randomize_chunk(sbi, curseg); } if (curseg->next_blkoff >= f2fs_usable_blks_in_seg(sbi, curseg->segno)) segment_full = true; stat_inc_block_count(sbi, curseg); if (from_gc) { old_mtime = get_segment_mtime(sbi, old_blkaddr); } else { update_segment_mtime(sbi, old_blkaddr, 0); old_mtime = 0; } update_segment_mtime(sbi, *new_blkaddr, old_mtime); /* * SIT information should be updated before segment allocation, * since SSR needs latest valid block information. */ update_sit_entry(sbi, *new_blkaddr, 1); update_sit_entry(sbi, old_blkaddr, -1); /* * If the current segment is full, flush it out and replace it with a * new segment. */ if (segment_full) { if (type == CURSEG_COLD_DATA_PINNED && !((curseg->segno + 1) % sbi->segs_per_sec)) { write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno)); reset_curseg_fields(curseg); goto skip_new_segment; } if (from_gc) { ret = get_atssr_segment(sbi, type, se->type, AT_SSR, se->mtime); } else { if (need_new_seg(sbi, type)) ret = new_curseg(sbi, type, false); else ret = change_curseg(sbi, type); stat_inc_seg_type(sbi, curseg); } if (ret) goto out_err; } skip_new_segment: /* * segment dirty status should be updated after segment allocation, * so we just need to update status only one time after previous * segment being closed. */ locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr)); if (IS_DATASEG(curseg->seg_type)) atomic64_inc(&sbi->allocated_data_blocks); up_write(&sit_i->sentry_lock); if (page && IS_NODESEG(curseg->seg_type)) { fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); f2fs_inode_chksum_set(sbi, page); } if (fio) { struct f2fs_bio_info *io; INIT_LIST_HEAD(&fio->list); fio->in_list = 1; io = sbi->write_io[fio->type] + fio->temp; spin_lock(&io->io_lock); list_add_tail(&fio->list, &io->io_list); spin_unlock(&io->io_lock); } mutex_unlock(&curseg->curseg_mutex); f2fs_up_read(&SM_I(sbi)->curseg_lock); return 0; out_err: *new_blkaddr = NULL_ADDR; up_write(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); f2fs_up_read(&SM_I(sbi)->curseg_lock); return ret; } void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, block_t blkaddr, unsigned int blkcnt) { if (!f2fs_is_multi_device(sbi)) return; while (1) { unsigned int devidx = f2fs_target_device_index(sbi, blkaddr); unsigned int blks = FDEV(devidx).end_blk - blkaddr + 1; /* update device state for fsync */ f2fs_set_dirty_device(sbi, ino, devidx, FLUSH_INO); /* update device state for checkpoint */ if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) { spin_lock(&sbi->dev_lock); f2fs_set_bit(devidx, (char *)&sbi->dirty_device); spin_unlock(&sbi->dev_lock); } if (blkcnt <= blks) break; blkcnt -= blks; blkaddr += blks; } } static int log_type_to_seg_type(enum log_type type) { int seg_type = CURSEG_COLD_DATA; switch (type) { case CURSEG_HOT_DATA: case CURSEG_WARM_DATA: case CURSEG_COLD_DATA: case CURSEG_HOT_NODE: case CURSEG_WARM_NODE: case CURSEG_COLD_NODE: seg_type = (int)type; break; case CURSEG_COLD_DATA_PINNED: case CURSEG_ALL_DATA_ATGC: seg_type = CURSEG_COLD_DATA; break; default: break; } return seg_type; } static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { enum log_type type = __get_segment_type(fio); int seg_type = log_type_to_seg_type(type); bool keep_order = (f2fs_lfs_mode(fio->sbi) && seg_type == CURSEG_COLD_DATA); if (keep_order) f2fs_down_read(&fio->sbi->io_order_lock); if (f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type, fio)) { if (fscrypt_inode_uses_fs_layer_crypto(fio->page->mapping->host)) fscrypt_finalize_bounce_page(&fio->encrypted_page); end_page_writeback(fio->page); if (f2fs_in_warm_node_list(fio->sbi, fio->page)) f2fs_del_fsync_node_entry(fio->sbi, fio->page); goto out; } if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) f2fs_invalidate_internal_cache(fio->sbi, fio->old_blkaddr, 1); /* writeout dirty page into bdev */ f2fs_submit_page_write(fio); f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1); out: if (keep_order) f2fs_up_read(&fio->sbi->io_order_lock); } void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct folio *folio, enum iostat_type io_type) { struct f2fs_io_info fio = { .sbi = sbi, .type = META, .temp = HOT, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_META | REQ_PRIO, .old_blkaddr = folio->index, .new_blkaddr = folio->index, .page = folio_page(folio, 0), .encrypted_page = NULL, .in_list = 0, }; if (unlikely(folio->index >= MAIN_BLKADDR(sbi))) fio.op_flags &= ~REQ_META; folio_start_writeback(folio); f2fs_submit_page_write(&fio); stat_inc_meta_count(sbi, folio->index); f2fs_update_iostat(sbi, NULL, io_type, F2FS_BLKSIZE); } void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio) { struct f2fs_summary sum; set_summary(&sum, nid, 0, 0); do_write_page(&sum, fio); f2fs_update_iostat(fio->sbi, NULL, fio->io_type, F2FS_BLKSIZE); } void f2fs_outplace_write_data(struct dnode_of_data *dn, struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; struct f2fs_summary sum; f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR); if (fio->io_type == FS_DATA_IO || fio->io_type == FS_CP_DATA_IO) f2fs_update_age_extent_cache(dn); set_summary(&sum, dn->nid, dn->ofs_in_node, fio->version); do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); f2fs_update_iostat(sbi, dn->inode, fio->io_type, F2FS_BLKSIZE); } int f2fs_inplace_write_data(struct f2fs_io_info *fio) { int err; struct f2fs_sb_info *sbi = fio->sbi; unsigned int segno; fio->new_blkaddr = fio->old_blkaddr; /* i/o temperature is needed for passing down write hints */ __get_segment_type(fio); segno = GET_SEGNO(sbi, fio->new_blkaddr); if (!IS_DATASEG(get_seg_entry(sbi, segno)->type)) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: incorrect segment(%u) type, run fsck to fix.", __func__, segno); err = -EFSCORRUPTED; f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE); goto drop_bio; } if (f2fs_cp_error(sbi)) { err = -EIO; goto drop_bio; } if (fio->meta_gc) f2fs_truncate_meta_inode_pages(sbi, fio->new_blkaddr, 1); stat_inc_inplace_blocks(fio->sbi); if (fio->bio && !IS_F2FS_IPU_NOCACHE(sbi)) err = f2fs_merge_page_bio(fio); else err = f2fs_submit_page_bio(fio); if (!err) { f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1); f2fs_update_iostat(fio->sbi, fio->page->mapping->host, fio->io_type, F2FS_BLKSIZE); } return err; drop_bio: if (fio->bio && *(fio->bio)) { struct bio *bio = *(fio->bio); bio->bi_status = BLK_STS_IOERR; bio_endio(bio); *(fio->bio) = NULL; } return err; } static inline int __f2fs_get_curseg(struct f2fs_sb_info *sbi, unsigned int segno) { int i; for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) { if (CURSEG_I(sbi, i)->segno == segno) break; } return i; } void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, bool recover_curseg, bool recover_newaddr, bool from_gc) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg; unsigned int segno, old_cursegno; struct seg_entry *se; int type; unsigned short old_blkoff; unsigned char old_alloc_type; segno = GET_SEGNO(sbi, new_blkaddr); se = get_seg_entry(sbi, segno); type = se->type; f2fs_down_write(&SM_I(sbi)->curseg_lock); if (!recover_curseg) { /* for recovery flow */ if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) { if (old_blkaddr == NULL_ADDR) type = CURSEG_COLD_DATA; else type = CURSEG_WARM_DATA; } } else { if (IS_CURSEG(sbi, segno)) { /* se->type is volatile as SSR allocation */ type = __f2fs_get_curseg(sbi, segno); f2fs_bug_on(sbi, type == NO_CHECK_TYPE); } else { type = CURSEG_WARM_DATA; } } curseg = CURSEG_I(sbi, type); f2fs_bug_on(sbi, !IS_DATASEG(curseg->seg_type)); mutex_lock(&curseg->curseg_mutex); down_write(&sit_i->sentry_lock); old_cursegno = curseg->segno; old_blkoff = curseg->next_blkoff; old_alloc_type = curseg->alloc_type; /* change the current segment */ if (segno != curseg->segno) { curseg->next_segno = segno; if (change_curseg(sbi, type)) goto out_unlock; } curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); curseg->sum_blk->entries[curseg->next_blkoff] = *sum; if (!recover_curseg || recover_newaddr) { if (!from_gc) update_segment_mtime(sbi, new_blkaddr, 0); update_sit_entry(sbi, new_blkaddr, 1); } if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) { f2fs_invalidate_internal_cache(sbi, old_blkaddr, 1); if (!from_gc) update_segment_mtime(sbi, old_blkaddr, 0); update_sit_entry(sbi, old_blkaddr, -1); } locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); locate_dirty_segment(sbi, GET_SEGNO(sbi, new_blkaddr)); locate_dirty_segment(sbi, old_cursegno); if (recover_curseg) { if (old_cursegno != curseg->segno) { curseg->next_segno = old_cursegno; if (change_curseg(sbi, type)) goto out_unlock; } curseg->next_blkoff = old_blkoff; curseg->alloc_type = old_alloc_type; } out_unlock: up_write(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); f2fs_up_write(&SM_I(sbi)->curseg_lock); } void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, block_t old_addr, block_t new_addr, unsigned char version, bool recover_curseg, bool recover_newaddr) { struct f2fs_summary sum; set_summary(&sum, dn->nid, dn->ofs_in_node, version); f2fs_do_replace_block(sbi, &sum, old_addr, new_addr, recover_curseg, recover_newaddr, false); f2fs_update_data_blkaddr(dn, new_addr); } void f2fs_wait_on_page_writeback(struct page *page, enum page_type type, bool ordered, bool locked) { if (folio_test_writeback(page_folio(page))) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); /* submit cached LFS IO */ f2fs_submit_merged_write_cond(sbi, NULL, page, 0, type); /* submit cached IPU IO */ f2fs_submit_merged_ipu_write(sbi, NULL, page); if (ordered) { wait_on_page_writeback(page); f2fs_bug_on(sbi, locked && folio_test_writeback(page_folio(page))); } else { wait_for_stable_page(page); } } } void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *cpage; if (!f2fs_meta_inode_gc_required(inode)) return; if (!__is_valid_data_blkaddr(blkaddr)) return; cpage = find_lock_page(META_MAPPING(sbi), blkaddr); if (cpage) { f2fs_wait_on_page_writeback(cpage, DATA, true, true); f2fs_put_page(cpage, 1); } } void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr, block_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); block_t i; if (!f2fs_meta_inode_gc_required(inode)) return; for (i = 0; i < len; i++) f2fs_wait_on_block_writeback(inode, blkaddr + i); f2fs_truncate_meta_inode_pages(sbi, blkaddr, len); } static int read_compacted_summaries(struct f2fs_sb_info *sbi) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct curseg_info *seg_i; unsigned char *kaddr; struct page *page; block_t start; int i, j, offset; start = start_sum_block(sbi); page = f2fs_get_meta_page(sbi, start++); if (IS_ERR(page)) return PTR_ERR(page); kaddr = (unsigned char *)page_address(page); /* Step 1: restore nat cache */ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); memcpy(seg_i->journal, kaddr, SUM_JOURNAL_SIZE); /* Step 2: restore sit cache */ seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); memcpy(seg_i->journal, kaddr + SUM_JOURNAL_SIZE, SUM_JOURNAL_SIZE); offset = 2 * SUM_JOURNAL_SIZE; /* Step 3: restore summary entries */ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { unsigned short blk_off; unsigned int segno; seg_i = CURSEG_I(sbi, i); segno = le32_to_cpu(ckpt->cur_data_segno[i]); blk_off = le16_to_cpu(ckpt->cur_data_blkoff[i]); seg_i->next_segno = segno; reset_curseg(sbi, i, 0); seg_i->alloc_type = ckpt->alloc_type[i]; seg_i->next_blkoff = blk_off; if (seg_i->alloc_type == SSR) blk_off = BLKS_PER_SEG(sbi); for (j = 0; j < blk_off; j++) { struct f2fs_summary *s; s = (struct f2fs_summary *)(kaddr + offset); seg_i->sum_blk->entries[j] = *s; offset += SUMMARY_SIZE; if (offset + SUMMARY_SIZE <= PAGE_SIZE - SUM_FOOTER_SIZE) continue; f2fs_put_page(page, 1); page = NULL; page = f2fs_get_meta_page(sbi, start++); if (IS_ERR(page)) return PTR_ERR(page); kaddr = (unsigned char *)page_address(page); offset = 0; } } f2fs_put_page(page, 1); return 0; } static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct f2fs_summary_block *sum; struct curseg_info *curseg; struct page *new; unsigned short blk_off; unsigned int segno = 0; block_t blk_addr = 0; int err = 0; /* get segment number and block addr */ if (IS_DATASEG(type)) { segno = le32_to_cpu(ckpt->cur_data_segno[type]); blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type - CURSEG_HOT_DATA]); if (__exist_node_summaries(sbi)) blk_addr = sum_blk_addr(sbi, NR_CURSEG_PERSIST_TYPE, type); else blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type); } else { segno = le32_to_cpu(ckpt->cur_node_segno[type - CURSEG_HOT_NODE]); blk_off = le16_to_cpu(ckpt->cur_node_blkoff[type - CURSEG_HOT_NODE]); if (__exist_node_summaries(sbi)) blk_addr = sum_blk_addr(sbi, NR_CURSEG_NODE_TYPE, type - CURSEG_HOT_NODE); else blk_addr = GET_SUM_BLOCK(sbi, segno); } new = f2fs_get_meta_page(sbi, blk_addr); if (IS_ERR(new)) return PTR_ERR(new); sum = (struct f2fs_summary_block *)page_address(new); if (IS_NODESEG(type)) { if (__exist_node_summaries(sbi)) { struct f2fs_summary *ns = &sum->entries[0]; int i; for (i = 0; i < BLKS_PER_SEG(sbi); i++, ns++) { ns->version = 0; ns->ofs_in_node = 0; } } else { err = f2fs_restore_node_summary(sbi, segno, sum); if (err) goto out; } } /* set uncompleted segment to curseg */ curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); /* update journal info */ down_write(&curseg->journal_rwsem); memcpy(curseg->journal, &sum->journal, SUM_JOURNAL_SIZE); up_write(&curseg->journal_rwsem); memcpy(curseg->sum_blk->entries, sum->entries, SUM_ENTRY_SIZE); memcpy(&curseg->sum_blk->footer, &sum->footer, SUM_FOOTER_SIZE); curseg->next_segno = segno; reset_curseg(sbi, type, 0); curseg->alloc_type = ckpt->alloc_type[type]; curseg->next_blkoff = blk_off; mutex_unlock(&curseg->curseg_mutex); out: f2fs_put_page(new, 1); return err; } static int restore_curseg_summaries(struct f2fs_sb_info *sbi) { struct f2fs_journal *sit_j = CURSEG_I(sbi, CURSEG_COLD_DATA)->journal; struct f2fs_journal *nat_j = CURSEG_I(sbi, CURSEG_HOT_DATA)->journal; int type = CURSEG_HOT_DATA; int err; if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) { int npages = f2fs_npages_for_summary_flush(sbi, true); if (npages >= 2) f2fs_ra_meta_pages(sbi, start_sum_block(sbi), npages, META_CP, true); /* restore for compacted data summary */ err = read_compacted_summaries(sbi); if (err) return err; type = CURSEG_HOT_NODE; } if (__exist_node_summaries(sbi)) f2fs_ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_PERSIST_TYPE, type), NR_CURSEG_PERSIST_TYPE - type, META_CP, true); for (; type <= CURSEG_COLD_NODE; type++) { err = read_normal_summaries(sbi, type); if (err) return err; } /* sanity check for summary blocks */ if (nats_in_cursum(nat_j) > NAT_JOURNAL_ENTRIES || sits_in_cursum(sit_j) > SIT_JOURNAL_ENTRIES) { f2fs_err(sbi, "invalid journal entries nats %u sits %u", nats_in_cursum(nat_j), sits_in_cursum(sit_j)); return -EINVAL; } return 0; } static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) { struct page *page; unsigned char *kaddr; struct f2fs_summary *summary; struct curseg_info *seg_i; int written_size = 0; int i, j; page = f2fs_grab_meta_page(sbi, blkaddr++); kaddr = (unsigned char *)page_address(page); memset(kaddr, 0, PAGE_SIZE); /* Step 1: write nat cache */ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); memcpy(kaddr, seg_i->journal, SUM_JOURNAL_SIZE); written_size += SUM_JOURNAL_SIZE; /* Step 2: write sit cache */ seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); memcpy(kaddr + written_size, seg_i->journal, SUM_JOURNAL_SIZE); written_size += SUM_JOURNAL_SIZE; /* Step 3: write summary entries */ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { seg_i = CURSEG_I(sbi, i); for (j = 0; j < f2fs_curseg_valid_blocks(sbi, i); j++) { if (!page) { page = f2fs_grab_meta_page(sbi, blkaddr++); kaddr = (unsigned char *)page_address(page); memset(kaddr, 0, PAGE_SIZE); written_size = 0; } summary = (struct f2fs_summary *)(kaddr + written_size); *summary = seg_i->sum_blk->entries[j]; written_size += SUMMARY_SIZE; if (written_size + SUMMARY_SIZE <= PAGE_SIZE - SUM_FOOTER_SIZE) continue; set_page_dirty(page); f2fs_put_page(page, 1); page = NULL; } } if (page) { set_page_dirty(page); f2fs_put_page(page, 1); } } static void write_normal_summaries(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { int i, end; if (IS_DATASEG(type)) end = type + NR_CURSEG_DATA_TYPE; else end = type + NR_CURSEG_NODE_TYPE; for (i = type; i < end; i++) write_current_sum_page(sbi, i, blkaddr + (i - type)); } void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) { if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) write_compacted_summaries(sbi, start_blk); else write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA); } void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) { write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); } int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, unsigned int val, int alloc) { int i; if (type == NAT_JOURNAL) { for (i = 0; i < nats_in_cursum(journal); i++) { if (le32_to_cpu(nid_in_journal(journal, i)) == val) return i; } if (alloc && __has_cursum_space(journal, 1, NAT_JOURNAL)) return update_nats_in_cursum(journal, 1); } else if (type == SIT_JOURNAL) { for (i = 0; i < sits_in_cursum(journal); i++) if (le32_to_cpu(segno_in_journal(journal, i)) == val) return i; if (alloc && __has_cursum_space(journal, 1, SIT_JOURNAL)) return update_sits_in_cursum(journal, 1); } return -1; } static struct page *get_current_sit_page(struct f2fs_sb_info *sbi, unsigned int segno) { return f2fs_get_meta_page(sbi, current_sit_addr(sbi, segno)); } static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, unsigned int start) { struct sit_info *sit_i = SIT_I(sbi); struct page *page; pgoff_t src_off, dst_off; src_off = current_sit_addr(sbi, start); dst_off = next_sit_addr(sbi, src_off); page = f2fs_grab_meta_page(sbi, dst_off); seg_info_to_sit_page(sbi, page, start); set_page_dirty(page); set_to_next_sit(sit_i, start); return page; } static struct sit_entry_set *grab_sit_entry_set(void) { struct sit_entry_set *ses = f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_NOFS, true, NULL); ses->entry_cnt = 0; INIT_LIST_HEAD(&ses->set_list); return ses; } static void release_sit_entry_set(struct sit_entry_set *ses) { list_del(&ses->set_list); kmem_cache_free(sit_entry_set_slab, ses); } static void adjust_sit_entry_set(struct sit_entry_set *ses, struct list_head *head) { struct sit_entry_set *next = ses; if (list_is_last(&ses->set_list, head)) return; list_for_each_entry_continue(next, head, set_list) if (ses->entry_cnt <= next->entry_cnt) { list_move_tail(&ses->set_list, &next->set_list); return; } list_move_tail(&ses->set_list, head); } static void add_sit_entry(unsigned int segno, struct list_head *head) { struct sit_entry_set *ses; unsigned int start_segno = START_SEGNO(segno); list_for_each_entry(ses, head, set_list) { if (ses->start_segno == start_segno) { ses->entry_cnt++; adjust_sit_entry_set(ses, head); return; } } ses = grab_sit_entry_set(); ses->start_segno = start_segno; ses->entry_cnt++; list_add(&ses->set_list, head); } static void add_sits_in_set(struct f2fs_sb_info *sbi) { struct f2fs_sm_info *sm_info = SM_I(sbi); struct list_head *set_list = &sm_info->sit_entry_set; unsigned long *bitmap = SIT_I(sbi)->dirty_sentries_bitmap; unsigned int segno; for_each_set_bit(segno, bitmap, MAIN_SEGS(sbi)) add_sit_entry(segno, set_list); } static void remove_sits_in_journal(struct f2fs_sb_info *sbi) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); struct f2fs_journal *journal = curseg->journal; int i; down_write(&curseg->journal_rwsem); for (i = 0; i < sits_in_cursum(journal); i++) { unsigned int segno; bool dirtied; segno = le32_to_cpu(segno_in_journal(journal, i)); dirtied = __mark_sit_entry_dirty(sbi, segno); if (!dirtied) add_sit_entry(segno, &SM_I(sbi)->sit_entry_set); } update_sits_in_cursum(journal, -i); up_write(&curseg->journal_rwsem); } /* * CP calls this function, which flushes SIT entries including sit_journal, * and moves prefree segs to free segs. */ void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct sit_info *sit_i = SIT_I(sbi); unsigned long *bitmap = sit_i->dirty_sentries_bitmap; struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); struct f2fs_journal *journal = curseg->journal; struct sit_entry_set *ses, *tmp; struct list_head *head = &SM_I(sbi)->sit_entry_set; bool to_journal = !is_sbi_flag_set(sbi, SBI_IS_RESIZEFS); struct seg_entry *se; down_write(&sit_i->sentry_lock); if (!sit_i->dirty_sentries) goto out; /* * add and account sit entries of dirty bitmap in sit entry * set temporarily */ add_sits_in_set(sbi); /* * if there are no enough space in journal to store dirty sit * entries, remove all entries from journal and add and account * them in sit entry set. */ if (!__has_cursum_space(journal, sit_i->dirty_sentries, SIT_JOURNAL) || !to_journal) remove_sits_in_journal(sbi); /* * there are two steps to flush sit entries: * #1, flush sit entries to journal in current cold data summary block. * #2, flush sit entries to sit page. */ list_for_each_entry_safe(ses, tmp, head, set_list) { struct page *page = NULL; struct f2fs_sit_block *raw_sit = NULL; unsigned int start_segno = ses->start_segno; unsigned int end = min(start_segno + SIT_ENTRY_PER_BLOCK, (unsigned long)MAIN_SEGS(sbi)); unsigned int segno = start_segno; if (to_journal && !__has_cursum_space(journal, ses->entry_cnt, SIT_JOURNAL)) to_journal = false; if (to_journal) { down_write(&curseg->journal_rwsem); } else { page = get_next_sit_page(sbi, start_segno); raw_sit = page_address(page); } /* flush dirty sit entries in region of current sit set */ for_each_set_bit_from(segno, bitmap, end) { int offset, sit_offset; se = get_seg_entry(sbi, segno); #ifdef CONFIG_F2FS_CHECK_FS if (memcmp(se->cur_valid_map, se->cur_valid_map_mir, SIT_VBLOCK_MAP_SIZE)) f2fs_bug_on(sbi, 1); #endif /* add discard candidates */ if (!(cpc->reason & CP_DISCARD)) { cpc->trim_start = segno; add_discard_addrs(sbi, cpc, false); } if (to_journal) { offset = f2fs_lookup_journal_in_cursum(journal, SIT_JOURNAL, segno, 1); f2fs_bug_on(sbi, offset < 0); segno_in_journal(journal, offset) = cpu_to_le32(segno); seg_info_to_raw_sit(se, &sit_in_journal(journal, offset)); check_block_count(sbi, segno, &sit_in_journal(journal, offset)); } else { sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); seg_info_to_raw_sit(se, &raw_sit->entries[sit_offset]); check_block_count(sbi, segno, &raw_sit->entries[sit_offset]); } __clear_bit(segno, bitmap); sit_i->dirty_sentries--; ses->entry_cnt--; } if (to_journal) up_write(&curseg->journal_rwsem); else f2fs_put_page(page, 1); f2fs_bug_on(sbi, ses->entry_cnt); release_sit_entry_set(ses); } f2fs_bug_on(sbi, !list_empty(head)); f2fs_bug_on(sbi, sit_i->dirty_sentries); out: if (cpc->reason & CP_DISCARD) { __u64 trim_start = cpc->trim_start; for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) add_discard_addrs(sbi, cpc, false); cpc->trim_start = trim_start; } up_write(&sit_i->sentry_lock); set_prefree_as_free_segments(sbi); } static int build_sit_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct sit_info *sit_i; unsigned int sit_segs, start; char *src_bitmap, *bitmap; unsigned int bitmap_size, main_bitmap_size, sit_bitmap_size; unsigned int discard_map = f2fs_block_unit_discard(sbi) ? 1 : 0; /* allocate memory for SIT information */ sit_i = f2fs_kzalloc(sbi, sizeof(struct sit_info), GFP_KERNEL); if (!sit_i) return -ENOMEM; SM_I(sbi)->sit_info = sit_i; sit_i->sentries = f2fs_kvzalloc(sbi, array_size(sizeof(struct seg_entry), MAIN_SEGS(sbi)), GFP_KERNEL); if (!sit_i->sentries) return -ENOMEM; main_bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(sbi, main_bitmap_size, GFP_KERNEL); if (!sit_i->dirty_sentries_bitmap) return -ENOMEM; #ifdef CONFIG_F2FS_CHECK_FS bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * (3 + discard_map); #else bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * (2 + discard_map); #endif sit_i->bitmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!sit_i->bitmap) return -ENOMEM; bitmap = sit_i->bitmap; for (start = 0; start < MAIN_SEGS(sbi); start++) { sit_i->sentries[start].cur_valid_map = bitmap; bitmap += SIT_VBLOCK_MAP_SIZE; sit_i->sentries[start].ckpt_valid_map = bitmap; bitmap += SIT_VBLOCK_MAP_SIZE; #ifdef CONFIG_F2FS_CHECK_FS sit_i->sentries[start].cur_valid_map_mir = bitmap; bitmap += SIT_VBLOCK_MAP_SIZE; #endif if (discard_map) { sit_i->sentries[start].discard_map = bitmap; bitmap += SIT_VBLOCK_MAP_SIZE; } } sit_i->tmp_map = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); if (!sit_i->tmp_map) return -ENOMEM; if (__is_large_section(sbi)) { sit_i->sec_entries = f2fs_kvzalloc(sbi, array_size(sizeof(struct sec_entry), MAIN_SECS(sbi)), GFP_KERNEL); if (!sit_i->sec_entries) return -ENOMEM; } /* get information related with SIT */ sit_segs = le32_to_cpu(raw_super->segment_count_sit) >> 1; /* setup SIT bitmap from ckeckpoint pack */ sit_bitmap_size = __bitmap_size(sbi, SIT_BITMAP); src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP); sit_i->sit_bitmap = kmemdup(src_bitmap, sit_bitmap_size, GFP_KERNEL); if (!sit_i->sit_bitmap) return -ENOMEM; #ifdef CONFIG_F2FS_CHECK_FS sit_i->sit_bitmap_mir = kmemdup(src_bitmap, sit_bitmap_size, GFP_KERNEL); if (!sit_i->sit_bitmap_mir) return -ENOMEM; sit_i->invalid_segmap = f2fs_kvzalloc(sbi, main_bitmap_size, GFP_KERNEL); if (!sit_i->invalid_segmap) return -ENOMEM; #endif sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); sit_i->sit_blocks = SEGS_TO_BLKS(sbi, sit_segs); sit_i->written_valid_blocks = 0; sit_i->bitmap_size = sit_bitmap_size; sit_i->dirty_sentries = 0; sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK; sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time); sit_i->mounted_time = ktime_get_boottime_seconds(); init_rwsem(&sit_i->sentry_lock); return 0; } static int build_free_segmap(struct f2fs_sb_info *sbi) { struct free_segmap_info *free_i; unsigned int bitmap_size, sec_bitmap_size; /* allocate memory for free segmap information */ free_i = f2fs_kzalloc(sbi, sizeof(struct free_segmap_info), GFP_KERNEL); if (!free_i) return -ENOMEM; SM_I(sbi)->free_info = free_i; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); free_i->free_segmap = f2fs_kvmalloc(sbi, bitmap_size, GFP_KERNEL); if (!free_i->free_segmap) return -ENOMEM; sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); free_i->free_secmap = f2fs_kvmalloc(sbi, sec_bitmap_size, GFP_KERNEL); if (!free_i->free_secmap) return -ENOMEM; /* set all segments as dirty temporarily */ memset(free_i->free_segmap, 0xff, bitmap_size); memset(free_i->free_secmap, 0xff, sec_bitmap_size); /* init free segmap information */ free_i->start_segno = GET_SEGNO_FROM_SEG0(sbi, MAIN_BLKADDR(sbi)); free_i->free_segments = 0; free_i->free_sections = 0; spin_lock_init(&free_i->segmap_lock); return 0; } static int build_curseg(struct f2fs_sb_info *sbi) { struct curseg_info *array; int i; array = f2fs_kzalloc(sbi, array_size(NR_CURSEG_TYPE, sizeof(*array)), GFP_KERNEL); if (!array) return -ENOMEM; SM_I(sbi)->curseg_array = array; for (i = 0; i < NO_CHECK_TYPE; i++) { mutex_init(&array[i].curseg_mutex); array[i].sum_blk = f2fs_kzalloc(sbi, PAGE_SIZE, GFP_KERNEL); if (!array[i].sum_blk) return -ENOMEM; init_rwsem(&array[i].journal_rwsem); array[i].journal = f2fs_kzalloc(sbi, sizeof(struct f2fs_journal), GFP_KERNEL); if (!array[i].journal) return -ENOMEM; array[i].seg_type = log_type_to_seg_type(i); reset_curseg_fields(&array[i]); } return restore_curseg_summaries(sbi); } static int build_sit_entries(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); struct f2fs_journal *journal = curseg->journal; struct seg_entry *se; struct f2fs_sit_entry sit; int sit_blk_cnt = SIT_BLK_CNT(sbi); unsigned int i, start, end; unsigned int readed, start_blk = 0; int err = 0; block_t sit_valid_blocks[2] = {0, 0}; do { readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_VECS, META_SIT, true); start = start_blk * sit_i->sents_per_block; end = (start_blk + readed) * sit_i->sents_per_block; for (; start < end && start < MAIN_SEGS(sbi); start++) { struct f2fs_sit_block *sit_blk; struct page *page; se = &sit_i->sentries[start]; page = get_current_sit_page(sbi, start); if (IS_ERR(page)) return PTR_ERR(page); sit_blk = (struct f2fs_sit_block *)page_address(page); sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; f2fs_put_page(page, 1); err = check_block_count(sbi, start, &sit); if (err) return err; seg_info_from_raw_sit(se, &sit); if (se->type >= NR_PERSISTENT_LOG) { f2fs_err(sbi, "Invalid segment type: %u, segno: %u", se->type, start); f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE); return -EFSCORRUPTED; } sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks; if (!f2fs_block_unit_discard(sbi)) goto init_discard_map_done; /* build discard map only one time */ if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE); goto init_discard_map_done; } memcpy(se->discard_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); sbi->discard_blks += BLKS_PER_SEG(sbi) - se->valid_blocks; init_discard_map_done: if (__is_large_section(sbi)) get_sec_entry(sbi, start)->valid_blocks += se->valid_blocks; } start_blk += readed; } while (start_blk < sit_blk_cnt); down_read(&curseg->journal_rwsem); for (i = 0; i < sits_in_cursum(journal); i++) { unsigned int old_valid_blocks; start = le32_to_cpu(segno_in_journal(journal, i)); if (start >= MAIN_SEGS(sbi)) { f2fs_err(sbi, "Wrong journal entry on segno %u", start); err = -EFSCORRUPTED; f2fs_handle_error(sbi, ERROR_CORRUPTED_JOURNAL); break; } se = &sit_i->sentries[start]; sit = sit_in_journal(journal, i); old_valid_blocks = se->valid_blocks; sit_valid_blocks[SE_PAGETYPE(se)] -= old_valid_blocks; err = check_block_count(sbi, start, &sit); if (err) break; seg_info_from_raw_sit(se, &sit); if (se->type >= NR_PERSISTENT_LOG) { f2fs_err(sbi, "Invalid segment type: %u, segno: %u", se->type, start); err = -EFSCORRUPTED; f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE); break; } sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks; if (f2fs_block_unit_discard(sbi)) { if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE); } else { memcpy(se->discard_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); sbi->discard_blks += old_valid_blocks; sbi->discard_blks -= se->valid_blocks; } } if (__is_large_section(sbi)) { get_sec_entry(sbi, start)->valid_blocks += se->valid_blocks; get_sec_entry(sbi, start)->valid_blocks -= old_valid_blocks; } } up_read(&curseg->journal_rwsem); if (err) return err; if (sit_valid_blocks[NODE] != valid_node_count(sbi)) { f2fs_err(sbi, "SIT is corrupted node# %u vs %u", sit_valid_blocks[NODE], valid_node_count(sbi)); f2fs_handle_error(sbi, ERROR_INCONSISTENT_NODE_COUNT); return -EFSCORRUPTED; } if (sit_valid_blocks[DATA] + sit_valid_blocks[NODE] > valid_user_blocks(sbi)) { f2fs_err(sbi, "SIT is corrupted data# %u %u vs %u", sit_valid_blocks[DATA], sit_valid_blocks[NODE], valid_user_blocks(sbi)); f2fs_handle_error(sbi, ERROR_INCONSISTENT_BLOCK_COUNT); return -EFSCORRUPTED; } return 0; } static void init_free_segmap(struct f2fs_sb_info *sbi) { unsigned int start; int type; struct seg_entry *sentry; for (start = 0; start < MAIN_SEGS(sbi); start++) { if (f2fs_usable_blks_in_seg(sbi, start) == 0) continue; sentry = get_seg_entry(sbi, start); if (!sentry->valid_blocks) __set_free(sbi, start); else SIT_I(sbi)->written_valid_blocks += sentry->valid_blocks; } /* set use the current segments */ for (type = CURSEG_HOT_DATA; type <= CURSEG_COLD_NODE; type++) { struct curseg_info *curseg_t = CURSEG_I(sbi, type); __set_test_and_inuse(sbi, curseg_t->segno); } } static void init_dirty_segmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct free_segmap_info *free_i = FREE_I(sbi); unsigned int segno = 0, offset = 0, secno; block_t valid_blocks, usable_blks_in_seg; while (1) { /* find dirty segment based on free segmap */ segno = find_next_inuse(free_i, MAIN_SEGS(sbi), offset); if (segno >= MAIN_SEGS(sbi)) break; offset = segno + 1; valid_blocks = get_valid_blocks(sbi, segno, false); usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno); if (valid_blocks == usable_blks_in_seg || !valid_blocks) continue; if (valid_blocks > usable_blks_in_seg) { f2fs_bug_on(sbi, 1); continue; } mutex_lock(&dirty_i->seglist_lock); __locate_dirty_segment(sbi, segno, DIRTY); mutex_unlock(&dirty_i->seglist_lock); } if (!__is_large_section(sbi)) return; mutex_lock(&dirty_i->seglist_lock); for (segno = 0; segno < MAIN_SEGS(sbi); segno += SEGS_PER_SEC(sbi)) { valid_blocks = get_valid_blocks(sbi, segno, true); secno = GET_SEC_FROM_SEG(sbi, segno); if (!valid_blocks || valid_blocks == CAP_BLKS_PER_SEC(sbi)) continue; if (IS_CURSEC(sbi, secno)) continue; set_bit(secno, dirty_i->dirty_secmap); } mutex_unlock(&dirty_i->seglist_lock); } static int init_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); dirty_i->victim_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) return -ENOMEM; dirty_i->pinned_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!dirty_i->pinned_secmap) return -ENOMEM; dirty_i->pinned_secmap_cnt = 0; dirty_i->enable_pin_section = true; return 0; } static int build_dirty_segmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i; unsigned int bitmap_size, i; /* allocate memory for dirty segments list information */ dirty_i = f2fs_kzalloc(sbi, sizeof(struct dirty_seglist_info), GFP_KERNEL); if (!dirty_i) return -ENOMEM; SM_I(sbi)->dirty_info = dirty_i; mutex_init(&dirty_i->seglist_lock); bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); for (i = 0; i < NR_DIRTY_TYPE; i++) { dirty_i->dirty_segmap[i] = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!dirty_i->dirty_segmap[i]) return -ENOMEM; } if (__is_large_section(sbi)) { bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); dirty_i->dirty_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!dirty_i->dirty_secmap) return -ENOMEM; } init_dirty_segmap(sbi); return init_victim_secmap(sbi); } static int sanity_check_curseg(struct f2fs_sb_info *sbi) { int i; /* * In LFS/SSR curseg, .next_blkoff should point to an unused blkaddr; * In LFS curseg, all blkaddr after .next_blkoff should be unused. */ for (i = 0; i < NR_PERSISTENT_LOG; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); struct seg_entry *se = get_seg_entry(sbi, curseg->segno); unsigned int blkofs = curseg->next_blkoff; if (f2fs_sb_has_readonly(sbi) && i != CURSEG_HOT_DATA && i != CURSEG_HOT_NODE) continue; sanity_check_seg_type(sbi, curseg->seg_type); if (curseg->alloc_type != LFS && curseg->alloc_type != SSR) { f2fs_err(sbi, "Current segment has invalid alloc_type:%d", curseg->alloc_type); f2fs_handle_error(sbi, ERROR_INVALID_CURSEG); return -EFSCORRUPTED; } if (f2fs_test_bit(blkofs, se->cur_valid_map)) goto out; if (curseg->alloc_type == SSR) continue; for (blkofs += 1; blkofs < BLKS_PER_SEG(sbi); blkofs++) { if (!f2fs_test_bit(blkofs, se->cur_valid_map)) continue; out: f2fs_err(sbi, "Current segment's next free block offset is inconsistent with bitmap, logtype:%u, segno:%u, type:%u, next_blkoff:%u, blkofs:%u", i, curseg->segno, curseg->alloc_type, curseg->next_blkoff, blkofs); f2fs_handle_error(sbi, ERROR_INVALID_CURSEG); return -EFSCORRUPTED; } } return 0; } #ifdef CONFIG_BLK_DEV_ZONED static int check_zone_write_pointer(struct f2fs_sb_info *sbi, struct f2fs_dev_info *fdev, struct blk_zone *zone) { unsigned int zone_segno; block_t zone_block, valid_block_cnt; unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT; int ret; unsigned int nofs_flags; if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ) return 0; zone_block = fdev->start_blk + (zone->start >> log_sectors_per_block); zone_segno = GET_SEGNO(sbi, zone_block); /* * Skip check of zones cursegs point to, since * fix_curseg_write_pointer() checks them. */ if (zone_segno >= MAIN_SEGS(sbi)) return 0; /* * Get # of valid block of the zone. */ valid_block_cnt = get_valid_blocks(sbi, zone_segno, true); if (IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, zone_segno))) { f2fs_notice(sbi, "Open zones: valid block[0x%x,0x%x] cond[%s]", zone_segno, valid_block_cnt, blk_zone_cond_str(zone->cond)); return 0; } if ((!valid_block_cnt && zone->cond == BLK_ZONE_COND_EMPTY) || (valid_block_cnt && zone->cond == BLK_ZONE_COND_FULL)) return 0; if (!valid_block_cnt) { f2fs_notice(sbi, "Zone without valid block has non-zero write " "pointer. Reset the write pointer: cond[%s]", blk_zone_cond_str(zone->cond)); ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block, zone->len >> log_sectors_per_block); if (ret) f2fs_err(sbi, "Discard zone failed: %s (errno=%d)", fdev->path, ret); return ret; } /* * If there are valid blocks and the write pointer doesn't match * with them, we need to report the inconsistency and fill * the zone till the end to close the zone. This inconsistency * does not cause write error because the zone will not be * selected for write operation until it get discarded. */ f2fs_notice(sbi, "Valid blocks are not aligned with write " "pointer: valid block[0x%x,0x%x] cond[%s]", zone_segno, valid_block_cnt, blk_zone_cond_str(zone->cond)); nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH, zone->start, zone->len); memalloc_nofs_restore(nofs_flags); if (ret == -EOPNOTSUPP) { ret = blkdev_issue_zeroout(fdev->bdev, zone->wp, zone->len - (zone->wp - zone->start), GFP_NOFS, 0); if (ret) f2fs_err(sbi, "Fill up zone failed: %s (errno=%d)", fdev->path, ret); } else if (ret) { f2fs_err(sbi, "Finishing zone failed: %s (errno=%d)", fdev->path, ret); } return ret; } static struct f2fs_dev_info *get_target_zoned_dev(struct f2fs_sb_info *sbi, block_t zone_blkaddr) { int i; for (i = 0; i < sbi->s_ndevs; i++) { if (!bdev_is_zoned(FDEV(i).bdev)) continue; if (sbi->s_ndevs == 1 || (FDEV(i).start_blk <= zone_blkaddr && zone_blkaddr <= FDEV(i).end_blk)) return &FDEV(i); } return NULL; } static int report_one_zone_cb(struct blk_zone *zone, unsigned int idx, void *data) { memcpy(data, zone, sizeof(struct blk_zone)); return 0; } static int do_fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) { struct curseg_info *cs = CURSEG_I(sbi, type); struct f2fs_dev_info *zbd; struct blk_zone zone; unsigned int cs_section, wp_segno, wp_blkoff, wp_sector_off; block_t cs_zone_block, wp_block; unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT; sector_t zone_sector; int err; cs_section = GET_SEC_FROM_SEG(sbi, cs->segno); cs_zone_block = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, cs_section)); zbd = get_target_zoned_dev(sbi, cs_zone_block); if (!zbd) return 0; /* report zone for the sector the curseg points to */ zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) << log_sectors_per_block; err = blkdev_report_zones(zbd->bdev, zone_sector, 1, report_one_zone_cb, &zone); if (err != 1) { f2fs_err(sbi, "Report zone failed: %s errno=(%d)", zbd->path, err); return err; } if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ) return 0; /* * When safely unmounted in the previous mount, we could use current * segments. Otherwise, allocate new sections. */ if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { wp_block = zbd->start_blk + (zone.wp >> log_sectors_per_block); wp_segno = GET_SEGNO(sbi, wp_block); wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno); wp_sector_off = zone.wp & GENMASK(log_sectors_per_block - 1, 0); if (cs->segno == wp_segno && cs->next_blkoff == wp_blkoff && wp_sector_off == 0) return 0; f2fs_notice(sbi, "Unaligned curseg[%d] with write pointer: " "curseg[0x%x,0x%x] wp[0x%x,0x%x]", type, cs->segno, cs->next_blkoff, wp_segno, wp_blkoff); } /* Allocate a new section if it's not new. */ if (cs->next_blkoff || cs->segno != GET_SEG_FROM_SEC(sbi, GET_ZONE_FROM_SEC(sbi, cs_section))) { unsigned int old_segno = cs->segno, old_blkoff = cs->next_blkoff; f2fs_allocate_new_section(sbi, type, true); f2fs_notice(sbi, "Assign new section to curseg[%d]: " "[0x%x,0x%x] -> [0x%x,0x%x]", type, old_segno, old_blkoff, cs->segno, cs->next_blkoff); } /* check consistency of the zone curseg pointed to */ if (check_zone_write_pointer(sbi, zbd, &zone)) return -EIO; /* check newly assigned zone */ cs_section = GET_SEC_FROM_SEG(sbi, cs->segno); cs_zone_block = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, cs_section)); zbd = get_target_zoned_dev(sbi, cs_zone_block); if (!zbd) return 0; zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) << log_sectors_per_block; err = blkdev_report_zones(zbd->bdev, zone_sector, 1, report_one_zone_cb, &zone); if (err != 1) { f2fs_err(sbi, "Report zone failed: %s errno=(%d)", zbd->path, err); return err; } if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ) return 0; if (zone.wp != zone.start) { f2fs_notice(sbi, "New zone for curseg[%d] is not yet discarded. " "Reset the zone: curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff); err = __f2fs_issue_discard_zone(sbi, zbd->bdev, cs_zone_block, zone.len >> log_sectors_per_block); if (err) { f2fs_err(sbi, "Discard zone failed: %s (errno=%d)", zbd->path, err); return err; } } return 0; } static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi) { int i, ret; for (i = 0; i < NR_PERSISTENT_LOG; i++) { ret = do_fix_curseg_write_pointer(sbi, i); if (ret) return ret; } return 0; } struct check_zone_write_pointer_args { struct f2fs_sb_info *sbi; struct f2fs_dev_info *fdev; }; static int check_zone_write_pointer_cb(struct blk_zone *zone, unsigned int idx, void *data) { struct check_zone_write_pointer_args *args; args = (struct check_zone_write_pointer_args *)data; return check_zone_write_pointer(args->sbi, args->fdev, zone); } static int check_write_pointer(struct f2fs_sb_info *sbi) { int i, ret; struct check_zone_write_pointer_args args; for (i = 0; i < sbi->s_ndevs; i++) { if (!bdev_is_zoned(FDEV(i).bdev)) continue; args.sbi = sbi; args.fdev = &FDEV(i); ret = blkdev_report_zones(FDEV(i).bdev, 0, BLK_ALL_ZONES, check_zone_write_pointer_cb, &args); if (ret < 0) return ret; } return 0; } int f2fs_check_and_fix_write_pointer(struct f2fs_sb_info *sbi) { int ret; if (!f2fs_sb_has_blkzoned(sbi) || f2fs_readonly(sbi->sb) || f2fs_hw_is_readonly(sbi)) return 0; f2fs_notice(sbi, "Checking entire write pointers"); ret = fix_curseg_write_pointer(sbi); if (!ret) ret = check_write_pointer(sbi); return ret; } /* * Return the number of usable blocks in a segment. The number of blocks * returned is always equal to the number of blocks in a segment for * segments fully contained within a sequential zone capacity or a * conventional zone. For segments partially contained in a sequential * zone capacity, the number of usable blocks up to the zone capacity * is returned. 0 is returned in all other cases. */ static inline unsigned int f2fs_usable_zone_blks_in_seg( struct f2fs_sb_info *sbi, unsigned int segno) { block_t seg_start, sec_start_blkaddr, sec_cap_blkaddr; unsigned int secno; if (!sbi->unusable_blocks_per_sec) return BLKS_PER_SEG(sbi); secno = GET_SEC_FROM_SEG(sbi, segno); seg_start = START_BLOCK(sbi, segno); sec_start_blkaddr = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, secno)); sec_cap_blkaddr = sec_start_blkaddr + CAP_BLKS_PER_SEC(sbi); /* * If segment starts before zone capacity and spans beyond * zone capacity, then usable blocks are from seg start to * zone capacity. If the segment starts after the zone capacity, * then there are no usable blocks. */ if (seg_start >= sec_cap_blkaddr) return 0; if (seg_start + BLKS_PER_SEG(sbi) > sec_cap_blkaddr) return sec_cap_blkaddr - seg_start; return BLKS_PER_SEG(sbi); } #else int f2fs_check_and_fix_write_pointer(struct f2fs_sb_info *sbi) { return 0; } static inline unsigned int f2fs_usable_zone_blks_in_seg(struct f2fs_sb_info *sbi, unsigned int segno) { return 0; } #endif unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, unsigned int segno) { if (f2fs_sb_has_blkzoned(sbi)) return f2fs_usable_zone_blks_in_seg(sbi, segno); return BLKS_PER_SEG(sbi); } unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi) { if (f2fs_sb_has_blkzoned(sbi)) return CAP_SEGS_PER_SEC(sbi); return SEGS_PER_SEC(sbi); } unsigned long long f2fs_get_section_mtime(struct f2fs_sb_info *sbi, unsigned int segno) { unsigned int usable_segs_per_sec = f2fs_usable_segs_in_sec(sbi); unsigned int secno = 0, start = 0; unsigned int total_valid_blocks = 0; unsigned long long mtime = 0; unsigned int i = 0; secno = GET_SEC_FROM_SEG(sbi, segno); start = GET_SEG_FROM_SEC(sbi, secno); if (!__is_large_section(sbi)) { mtime = get_seg_entry(sbi, start + i)->mtime; goto out; } for (i = 0; i < usable_segs_per_sec; i++) { /* for large section, only check the mtime of valid segments */ struct seg_entry *se = get_seg_entry(sbi, start+i); mtime += se->mtime * se->valid_blocks; total_valid_blocks += se->valid_blocks; } if (total_valid_blocks == 0) return INVALID_MTIME; mtime = div_u64(mtime, total_valid_blocks); out: if (unlikely(mtime == INVALID_MTIME)) mtime -= 1; return mtime; } /* * Update min, max modified time for cost-benefit GC algorithm */ static void init_min_max_mtime(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); unsigned int segno; down_write(&sit_i->sentry_lock); sit_i->min_mtime = ULLONG_MAX; for (segno = 0; segno < MAIN_SEGS(sbi); segno += SEGS_PER_SEC(sbi)) { unsigned long long mtime = 0; mtime = f2fs_get_section_mtime(sbi, segno); if (sit_i->min_mtime > mtime) sit_i->min_mtime = mtime; } sit_i->max_mtime = get_mtime(sbi, false); sit_i->dirty_max_mtime = 0; up_write(&sit_i->sentry_lock); } int f2fs_build_segment_manager(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct f2fs_sm_info *sm_info; int err; sm_info = f2fs_kzalloc(sbi, sizeof(struct f2fs_sm_info), GFP_KERNEL); if (!sm_info) return -ENOMEM; /* init sm info */ sbi->sm_info = sm_info; sm_info->seg0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); sm_info->main_blkaddr = le32_to_cpu(raw_super->main_blkaddr); sm_info->segment_count = le32_to_cpu(raw_super->segment_count); sm_info->reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count); sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main); sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); sm_info->rec_prefree_segments = sm_info->main_segments * DEF_RECLAIM_PREFREE_SEGMENTS / 100; if (sm_info->rec_prefree_segments > DEF_MAX_RECLAIM_PREFREE_SEGMENTS) sm_info->rec_prefree_segments = DEF_MAX_RECLAIM_PREFREE_SEGMENTS; if (!f2fs_lfs_mode(sbi)) sm_info->ipu_policy = BIT(F2FS_IPU_FSYNC); sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; sm_info->min_seq_blocks = BLKS_PER_SEG(sbi); sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; sm_info->min_ssr_sections = reserved_sections(sbi); INIT_LIST_HEAD(&sm_info->sit_entry_set); init_f2fs_rwsem(&sm_info->curseg_lock); err = f2fs_create_flush_cmd_control(sbi); if (err) return err; err = create_discard_cmd_control(sbi); if (err) return err; err = build_sit_info(sbi); if (err) return err; err = build_free_segmap(sbi); if (err) return err; err = build_curseg(sbi); if (err) return err; /* reinit free segmap based on SIT */ err = build_sit_entries(sbi); if (err) return err; init_free_segmap(sbi); err = build_dirty_segmap(sbi); if (err) return err; err = sanity_check_curseg(sbi); if (err) return err; init_min_max_mtime(sbi); return 0; } static void discard_dirty_segmap(struct f2fs_sb_info *sbi, enum dirty_type dirty_type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); mutex_lock(&dirty_i->seglist_lock); kvfree(dirty_i->dirty_segmap[dirty_type]); dirty_i->nr_dirty[dirty_type] = 0; mutex_unlock(&dirty_i->seglist_lock); } static void destroy_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); kvfree(dirty_i->pinned_secmap); kvfree(dirty_i->victim_secmap); } static void destroy_dirty_segmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); int i; if (!dirty_i) return; /* discard pre-free/dirty segments list */ for (i = 0; i < NR_DIRTY_TYPE; i++) discard_dirty_segmap(sbi, i); if (__is_large_section(sbi)) { mutex_lock(&dirty_i->seglist_lock); kvfree(dirty_i->dirty_secmap); mutex_unlock(&dirty_i->seglist_lock); } destroy_victim_secmap(sbi); SM_I(sbi)->dirty_info = NULL; kfree(dirty_i); } static void destroy_curseg(struct f2fs_sb_info *sbi) { struct curseg_info *array = SM_I(sbi)->curseg_array; int i; if (!array) return; SM_I(sbi)->curseg_array = NULL; for (i = 0; i < NR_CURSEG_TYPE; i++) { kfree(array[i].sum_blk); kfree(array[i].journal); } kfree(array); } static void destroy_free_segmap(struct f2fs_sb_info *sbi) { struct free_segmap_info *free_i = SM_I(sbi)->free_info; if (!free_i) return; SM_I(sbi)->free_info = NULL; kvfree(free_i->free_segmap); kvfree(free_i->free_secmap); kfree(free_i); } static void destroy_sit_info(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); if (!sit_i) return; if (sit_i->sentries) kvfree(sit_i->bitmap); kfree(sit_i->tmp_map); kvfree(sit_i->sentries); kvfree(sit_i->sec_entries); kvfree(sit_i->dirty_sentries_bitmap); SM_I(sbi)->sit_info = NULL; kvfree(sit_i->sit_bitmap); #ifdef CONFIG_F2FS_CHECK_FS kvfree(sit_i->sit_bitmap_mir); kvfree(sit_i->invalid_segmap); #endif kfree(sit_i); } void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi) { struct f2fs_sm_info *sm_info = SM_I(sbi); if (!sm_info) return; f2fs_destroy_flush_cmd_control(sbi, true); destroy_discard_cmd_control(sbi); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); destroy_sit_info(sbi); sbi->sm_info = NULL; kfree(sm_info); } int __init f2fs_create_segment_manager_caches(void) { discard_entry_slab = f2fs_kmem_cache_create("f2fs_discard_entry", sizeof(struct discard_entry)); if (!discard_entry_slab) goto fail; discard_cmd_slab = f2fs_kmem_cache_create("f2fs_discard_cmd", sizeof(struct discard_cmd)); if (!discard_cmd_slab) goto destroy_discard_entry; sit_entry_set_slab = f2fs_kmem_cache_create("f2fs_sit_entry_set", sizeof(struct sit_entry_set)); if (!sit_entry_set_slab) goto destroy_discard_cmd; revoke_entry_slab = f2fs_kmem_cache_create("f2fs_revoke_entry", sizeof(struct revoke_entry)); if (!revoke_entry_slab) goto destroy_sit_entry_set; return 0; destroy_sit_entry_set: kmem_cache_destroy(sit_entry_set_slab); destroy_discard_cmd: kmem_cache_destroy(discard_cmd_slab); destroy_discard_entry: kmem_cache_destroy(discard_entry_slab); fail: return -ENOMEM; } void f2fs_destroy_segment_manager_caches(void) { kmem_cache_destroy(sit_entry_set_slab); kmem_cache_destroy(discard_cmd_slab); kmem_cache_destroy(discard_entry_slab); kmem_cache_destroy(revoke_entry_slab); }
9 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 #ifndef __LINUX_MROUTE_BASE_H #define __LINUX_MROUTE_BASE_H #include <linux/netdevice.h> #include <linux/rhashtable-types.h> #include <linux/spinlock.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/fib_notifier.h> #include <net/ip_fib.h> /** * struct vif_device - interface representor for multicast routing * @dev: network device being used * @dev_tracker: refcount tracker for @dev reference * @bytes_in: statistic; bytes ingressing * @bytes_out: statistic; bytes egresing * @pkt_in: statistic; packets ingressing * @pkt_out: statistic; packets egressing * @rate_limit: Traffic shaping (NI) * @threshold: TTL threshold * @flags: Control flags * @link: Physical interface index * @dev_parent_id: device parent id * @local: Local address * @remote: Remote address for tunnels */ struct vif_device { struct net_device __rcu *dev; netdevice_tracker dev_tracker; unsigned long bytes_in, bytes_out; unsigned long pkt_in, pkt_out; unsigned long rate_limit; unsigned char threshold; unsigned short flags; int link; /* Currently only used by ipmr */ struct netdev_phys_item_id dev_parent_id; __be32 local, remote; }; struct vif_entry_notifier_info { struct fib_notifier_info info; struct net_device *dev; unsigned short vif_index; unsigned short vif_flags; u32 tb_id; }; static inline int mr_call_vif_notifier(struct notifier_block *nb, unsigned short family, enum fib_event_type event_type, struct vif_device *vif, struct net_device *vif_dev, unsigned short vif_index, u32 tb_id, struct netlink_ext_ack *extack) { struct vif_entry_notifier_info info = { .info = { .family = family, .extack = extack, }, .dev = vif_dev, .vif_index = vif_index, .vif_flags = vif->flags, .tb_id = tb_id, }; return call_fib_notifier(nb, event_type, &info.info); } static inline int mr_call_vif_notifiers(struct net *net, unsigned short family, enum fib_event_type event_type, struct vif_device *vif, struct net_device *vif_dev, unsigned short vif_index, u32 tb_id, unsigned int *ipmr_seq) { struct vif_entry_notifier_info info = { .info = { .family = family, }, .dev = vif_dev, .vif_index = vif_index, .vif_flags = vif->flags, .tb_id = tb_id, }; ASSERT_RTNL(); (*ipmr_seq)++; return call_fib_notifiers(net, event_type, &info.info); } #ifndef MAXVIFS /* This one is nasty; value is defined in uapi using different symbols for * mroute and morute6 but both map into same 32. */ #define MAXVIFS 32 #endif /* Note: This helper is deprecated. */ #define VIF_EXISTS(_mrt, _idx) (!!rcu_access_pointer((_mrt)->vif_table[_idx].dev)) /* mfc_flags: * MFC_STATIC - the entry was added statically (not by a routing daemon) * MFC_OFFLOAD - the entry was offloaded to the hardware */ enum { MFC_STATIC = BIT(0), MFC_OFFLOAD = BIT(1), }; /** * struct mr_mfc - common multicast routing entries * @mnode: rhashtable list * @mfc_parent: source interface (iif) * @mfc_flags: entry flags * @expires: unresolved entry expire time * @unresolved: unresolved cached skbs * @last_assert: time of last assert * @minvif: minimum VIF id * @maxvif: maximum VIF id * @bytes: bytes that have passed for this entry * @pkt: packets that have passed for this entry * @wrong_if: number of wrong source interface hits * @lastuse: time of last use of the group (traffic or update) * @ttls: OIF TTL threshold array * @refcount: reference count for this entry * @list: global entry list * @rcu: used for entry destruction * @free: Operation used for freeing an entry under RCU */ struct mr_mfc { struct rhlist_head mnode; unsigned short mfc_parent; int mfc_flags; union { struct { unsigned long expires; struct sk_buff_head unresolved; } unres; struct { unsigned long last_assert; int minvif; int maxvif; atomic_long_t bytes; atomic_long_t pkt; atomic_long_t wrong_if; unsigned long lastuse; unsigned char ttls[MAXVIFS]; refcount_t refcount; } res; } mfc_un; struct list_head list; struct rcu_head rcu; void (*free)(struct rcu_head *head); }; static inline void mr_cache_put(struct mr_mfc *c) { if (refcount_dec_and_test(&c->mfc_un.res.refcount)) call_rcu(&c->rcu, c->free); } static inline void mr_cache_hold(struct mr_mfc *c) { refcount_inc(&c->mfc_un.res.refcount); } struct mfc_entry_notifier_info { struct fib_notifier_info info; struct mr_mfc *mfc; u32 tb_id; }; static inline int mr_call_mfc_notifier(struct notifier_block *nb, unsigned short family, enum fib_event_type event_type, struct mr_mfc *mfc, u32 tb_id, struct netlink_ext_ack *extack) { struct mfc_entry_notifier_info info = { .info = { .family = family, .extack = extack, }, .mfc = mfc, .tb_id = tb_id }; return call_fib_notifier(nb, event_type, &info.info); } static inline int mr_call_mfc_notifiers(struct net *net, unsigned short family, enum fib_event_type event_type, struct mr_mfc *mfc, u32 tb_id, unsigned int *ipmr_seq) { struct mfc_entry_notifier_info info = { .info = { .family = family, }, .mfc = mfc, .tb_id = tb_id }; ASSERT_RTNL(); (*ipmr_seq)++; return call_fib_notifiers(net, event_type, &info.info); } struct mr_table; /** * struct mr_table_ops - callbacks and info for protocol-specific ops * @rht_params: parameters for accessing the MFC hash * @cmparg_any: a hash key to be used for matching on (*,*) routes */ struct mr_table_ops { const struct rhashtable_params *rht_params; void *cmparg_any; }; /** * struct mr_table - a multicast routing table * @list: entry within a list of multicast routing tables * @net: net where this table belongs * @ops: protocol specific operations * @id: identifier of the table * @mroute_sk: socket associated with the table * @ipmr_expire_timer: timer for handling unresolved routes * @mfc_unres_queue: list of unresolved MFC entries * @vif_table: array containing all possible vifs * @mfc_hash: Hash table of all resolved routes for easy lookup * @mfc_cache_list: list of resovled routes for possible traversal * @maxvif: Identifier of highest value vif currently in use * @cache_resolve_queue_len: current size of unresolved queue * @mroute_do_assert: Whether to inform userspace on wrong ingress * @mroute_do_pim: Whether to receive IGMP PIMv1 * @mroute_reg_vif_num: PIM-device vif index */ struct mr_table { struct list_head list; possible_net_t net; struct mr_table_ops ops; u32 id; struct sock __rcu *mroute_sk; struct timer_list ipmr_expire_timer; struct list_head mfc_unres_queue; struct vif_device vif_table[MAXVIFS]; struct rhltable mfc_hash; struct list_head mfc_cache_list; int maxvif; atomic_t cache_resolve_queue_len; bool mroute_do_assert; bool mroute_do_pim; bool mroute_do_wrvifwhole; int mroute_reg_vif_num; }; #ifdef CONFIG_IP_MROUTE_COMMON void vif_device_init(struct vif_device *v, struct net_device *dev, unsigned long rate_limit, unsigned char threshold, unsigned short flags, unsigned short get_iflink_mask); struct mr_table * mr_table_alloc(struct net *net, u32 id, struct mr_table_ops *ops, void (*expire_func)(struct timer_list *t), void (*table_set)(struct mr_table *mrt, struct net *net)); /* These actually return 'struct mr_mfc *', but to avoid need for explicit * castings they simply return void. */ void *mr_mfc_find_parent(struct mr_table *mrt, void *hasharg, int parent); void *mr_mfc_find_any_parent(struct mr_table *mrt, int vifi); void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg); int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct mr_mfc *c, struct rtmsg *rtm); int mr_table_dump(struct mr_table *mrt, struct sk_buff *skb, struct netlink_callback *cb, int (*fill)(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags), spinlock_t *lock, struct fib_dump_filter *filter); int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, struct mr_table *(*iter)(struct net *net, struct mr_table *mrt), int (*fill)(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags), spinlock_t *lock, struct fib_dump_filter *filter); int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, int (*rules_dump)(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack), struct mr_table *(*mr_iter)(struct net *net, struct mr_table *mrt), struct netlink_ext_ack *extack); #else static inline void vif_device_init(struct vif_device *v, struct net_device *dev, unsigned long rate_limit, unsigned char threshold, unsigned short flags, unsigned short get_iflink_mask) { } static inline void *mr_mfc_find_parent(struct mr_table *mrt, void *hasharg, int parent) { return NULL; } static inline void *mr_mfc_find_any_parent(struct mr_table *mrt, int vifi) { return NULL; } static inline struct mr_mfc *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg) { return NULL; } static inline int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct mr_mfc *c, struct rtmsg *rtm) { return -EINVAL; } static inline int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, struct mr_table *(*iter)(struct net *net, struct mr_table *mrt), int (*fill)(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags), spinlock_t *lock, struct fib_dump_filter *filter) { return -EINVAL; } static inline int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, int (*rules_dump)(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack), struct mr_table *(*mr_iter)(struct net *net, struct mr_table *mrt), struct netlink_ext_ack *extack) { return -EINVAL; } #endif static inline void *mr_mfc_find(struct mr_table *mrt, void *hasharg) { return mr_mfc_find_parent(mrt, hasharg, -1); } #ifdef CONFIG_PROC_FS struct mr_vif_iter { struct seq_net_private p; struct mr_table *mrt; int ct; }; struct mr_mfc_iter { struct seq_net_private p; struct mr_table *mrt; struct list_head *cache; /* Lock protecting the mr_table's unresolved queue */ spinlock_t *lock; }; #ifdef CONFIG_IP_MROUTE_COMMON void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter, loff_t pos); void *mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos); static inline void *mr_vif_seq_start(struct seq_file *seq, loff_t *pos) { return *pos ? mr_vif_seq_idx(seq_file_net(seq), seq->private, *pos - 1) : SEQ_START_TOKEN; } /* These actually return 'struct mr_mfc *', but to avoid need for explicit * castings they simply return void. */ void *mr_mfc_seq_idx(struct net *net, struct mr_mfc_iter *it, loff_t pos); void *mr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos); static inline void *mr_mfc_seq_start(struct seq_file *seq, loff_t *pos, struct mr_table *mrt, spinlock_t *lock) { struct mr_mfc_iter *it = seq->private; it->mrt = mrt; it->cache = NULL; it->lock = lock; return *pos ? mr_mfc_seq_idx(seq_file_net(seq), seq->private, *pos - 1) : SEQ_START_TOKEN; } static inline void mr_mfc_seq_stop(struct seq_file *seq, void *v) { struct mr_mfc_iter *it = seq->private; struct mr_table *mrt = it->mrt; if (it->cache == &mrt->mfc_unres_queue) spin_unlock_bh(it->lock); else if (it->cache == &mrt->mfc_cache_list) rcu_read_unlock(); } #else static inline void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter, loff_t pos) { return NULL; } static inline void *mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) { return NULL; } static inline void *mr_vif_seq_start(struct seq_file *seq, loff_t *pos) { return NULL; } static inline void *mr_mfc_seq_idx(struct net *net, struct mr_mfc_iter *it, loff_t pos) { return NULL; } static inline void *mr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { return NULL; } static inline void *mr_mfc_seq_start(struct seq_file *seq, loff_t *pos, struct mr_table *mrt, spinlock_t *lock) { return NULL; } static inline void mr_mfc_seq_stop(struct seq_file *seq, void *v) { } #endif #endif #endif
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 // SPDX-License-Identifier: GPL-2.0-or-later /* * ip_vs_proto_tcp.c: TCP load balancing support for IPVS * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * Julian Anastasov <ja@ssi.bg> * * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com> * * Network name space (netns) aware. * Global data moved to netns i.e struct netns_ipvs * tcp_timeouts table has copy per netns in a hash table per * protocol ip_vs_proto_data and is handled by netns */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/kernel.h> #include <linux/ip.h> #include <linux/tcp.h> /* for tcphdr */ #include <net/ip.h> #include <net/tcp.h> /* for csum_tcpudp_magic */ #include <net/ip6_checksum.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/indirect_call_wrapper.h> #include <net/ip_vs.h> static int tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp); static int tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph) { struct ip_vs_service *svc; struct tcphdr _tcph, *th; __be16 _ports[2], *ports = NULL; /* In the event of icmp, we're only guaranteed to have the first 8 * bytes of the transport header, so we only check the rest of the * TCP packet for non-ICMP packets */ if (likely(!ip_vs_iph_icmp(iph))) { th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); if (th) { if (th->rst || !(sysctl_sloppy_tcp(ipvs) || th->syn)) return 1; ports = &th->source; } } else { ports = skb_header_pointer( skb, iph->len, sizeof(_ports), &_ports); } if (!ports) { *verdict = NF_DROP; return 0; } /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */ if (likely(!ip_vs_iph_inverse(iph))) svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, &iph->daddr, ports[1]); else svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, &iph->saddr, ports[0]); if (svc) { int ignored; if (ip_vs_todrop(ipvs)) { /* * It seems that we are very loaded. * We have to drop this packet :( */ *verdict = NF_DROP; return 0; } /* * Let the virtual server select a real server for the * incoming connection, and create a connection entry. */ *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph); if (!*cpp && ignored <= 0) { if (!ignored) *verdict = ip_vs_leave(svc, skb, pd, iph); else *verdict = NF_DROP; return 0; } } /* NF_ACCEPT */ return 1; } static inline void tcp_fast_csum_update(int af, struct tcphdr *tcph, const union nf_inet_addr *oldip, const union nf_inet_addr *newip, __be16 oldport, __be16 newport) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) tcph->check = csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, ip_vs_check_diff2(oldport, newport, ~csum_unfold(tcph->check)))); else #endif tcph->check = csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, ip_vs_check_diff2(oldport, newport, ~csum_unfold(tcph->check)))); } static inline void tcp_partial_csum_update(int af, struct tcphdr *tcph, const union nf_inet_addr *oldip, const union nf_inet_addr *newip, __be16 oldlen, __be16 newlen) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) tcph->check = ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, ip_vs_check_diff2(oldlen, newlen, csum_unfold(tcph->check)))); else #endif tcph->check = ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, ip_vs_check_diff2(oldlen, newlen, csum_unfold(tcph->check)))); } INDIRECT_CALLABLE_SCOPE int tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct tcphdr *tcph; unsigned int tcphoff = iph->len; bool payload_csum = false; int oldlen; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6 && iph->fragoffs) return 1; #endif oldlen = skb->len - tcphoff; /* csum_check requires unshared skb */ if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph))) return 0; if (unlikely(cp->app != NULL)) { int ret; /* Some checks before mangling */ if (!tcp_csum_check(cp->af, skb, pp)) return 0; /* Call application helper if needed */ if (!(ret = ip_vs_app_pkt_out(cp, skb, iph))) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 1) oldlen = skb->len - tcphoff; else payload_csum = true; } tcph = (void *)skb_network_header(skb) + tcphoff; tcph->source = cp->vport; /* Adjust TCP checksums */ if (skb->ip_summed == CHECKSUM_PARTIAL) { tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, htons(oldlen), htons(skb->len - tcphoff)); } else if (!payload_csum) { /* Only port and addr are changed, do fast csum update */ tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, cp->dport, cp->vport); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = cp->app ? CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ tcph->check = 0; skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) tcph->check = csum_ipv6_magic(&cp->vaddr.in6, &cp->caddr.in6, skb->len - tcphoff, cp->protocol, skb->csum); else #endif tcph->check = csum_tcpudp_magic(cp->vaddr.ip, cp->caddr.ip, skb->len - tcphoff, cp->protocol, skb->csum); skb->ip_summed = CHECKSUM_UNNECESSARY; IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", pp->name, tcph->check, (char*)&(tcph->check) - (char*)tcph); } return 1; } static int tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct tcphdr *tcph; unsigned int tcphoff = iph->len; bool payload_csum = false; int oldlen; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6 && iph->fragoffs) return 1; #endif oldlen = skb->len - tcphoff; /* csum_check requires unshared skb */ if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph))) return 0; if (unlikely(cp->app != NULL)) { int ret; /* Some checks before mangling */ if (!tcp_csum_check(cp->af, skb, pp)) return 0; /* * Attempt ip_vs_app call. * It will fix ip_vs_conn and iph ack_seq stuff */ if (!(ret = ip_vs_app_pkt_in(cp, skb, iph))) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 1) oldlen = skb->len - tcphoff; else payload_csum = true; } tcph = (void *)skb_network_header(skb) + tcphoff; tcph->dest = cp->dport; /* * Adjust TCP checksums */ if (skb->ip_summed == CHECKSUM_PARTIAL) { tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, htons(oldlen), htons(skb->len - tcphoff)); } else if (!payload_csum) { /* Only port and addr are changed, do fast csum update */ tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, cp->vport, cp->dport); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = cp->app ? CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ tcph->check = 0; skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) tcph->check = csum_ipv6_magic(&cp->caddr.in6, &cp->daddr.in6, skb->len - tcphoff, cp->protocol, skb->csum); else #endif tcph->check = csum_tcpudp_magic(cp->caddr.ip, cp->daddr.ip, skb->len - tcphoff, cp->protocol, skb->csum); skb->ip_summed = CHECKSUM_UNNECESSARY; } return 1; } static int tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) { unsigned int tcphoff; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) tcphoff = sizeof(struct ipv6hdr); else #endif tcphoff = ip_hdrlen(skb); switch (skb->ip_summed) { case CHECKSUM_NONE: skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); fallthrough; case CHECKSUM_COMPLETE: #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->len - tcphoff, ipv6_hdr(skb)->nexthdr, skb->csum)) { IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } } else #endif if (csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, skb->len - tcphoff, ip_hdr(skb)->protocol, skb->csum)) { IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } break; default: /* No need to checksum. */ break; } return 1; } #define TCP_DIR_INPUT 0 #define TCP_DIR_OUTPUT 4 #define TCP_DIR_INPUT_ONLY 8 static const int tcp_state_off[IP_VS_DIR_LAST] = { [IP_VS_DIR_INPUT] = TCP_DIR_INPUT, [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT, [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY, }; /* * Timeout table[state] */ static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { [IP_VS_TCP_S_NONE] = 2*HZ, [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ, [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ, [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ, [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ, [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ, [IP_VS_TCP_S_CLOSE] = 10*HZ, [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, [IP_VS_TCP_S_LAST_ACK] = 30*HZ, [IP_VS_TCP_S_LISTEN] = 2*60*HZ, [IP_VS_TCP_S_SYNACK] = 120*HZ, [IP_VS_TCP_S_LAST] = 2*HZ, }; static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { [IP_VS_TCP_S_NONE] = "NONE", [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT", [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV", [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT", [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT", [IP_VS_TCP_S_CLOSE] = "CLOSE", [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT", [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK", [IP_VS_TCP_S_LISTEN] = "LISTEN", [IP_VS_TCP_S_SYNACK] = "SYNACK", [IP_VS_TCP_S_LAST] = "BUG!", }; static const bool tcp_state_active_table[IP_VS_TCP_S_LAST] = { [IP_VS_TCP_S_NONE] = false, [IP_VS_TCP_S_ESTABLISHED] = true, [IP_VS_TCP_S_SYN_SENT] = true, [IP_VS_TCP_S_SYN_RECV] = true, [IP_VS_TCP_S_FIN_WAIT] = false, [IP_VS_TCP_S_TIME_WAIT] = false, [IP_VS_TCP_S_CLOSE] = false, [IP_VS_TCP_S_CLOSE_WAIT] = false, [IP_VS_TCP_S_LAST_ACK] = false, [IP_VS_TCP_S_LISTEN] = false, [IP_VS_TCP_S_SYNACK] = true, }; #define sNO IP_VS_TCP_S_NONE #define sES IP_VS_TCP_S_ESTABLISHED #define sSS IP_VS_TCP_S_SYN_SENT #define sSR IP_VS_TCP_S_SYN_RECV #define sFW IP_VS_TCP_S_FIN_WAIT #define sTW IP_VS_TCP_S_TIME_WAIT #define sCL IP_VS_TCP_S_CLOSE #define sCW IP_VS_TCP_S_CLOSE_WAIT #define sLA IP_VS_TCP_S_LAST_ACK #define sLI IP_VS_TCP_S_LISTEN #define sSA IP_VS_TCP_S_SYNACK struct tcp_states_t { int next_state[IP_VS_TCP_S_LAST]; }; static const char * tcp_state_name(int state) { if (state >= IP_VS_TCP_S_LAST) return "ERR!"; return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?"; } static bool tcp_state_active(int state) { if (state >= IP_VS_TCP_S_LAST) return false; return tcp_state_active_table[state]; } static struct tcp_states_t tcp_states[] = { /* INPUT */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }}, /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }}, /* OUTPUT */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }}, /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, /* INPUT-ONLY */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, }; static struct tcp_states_t tcp_states_dos[] = { /* INPUT */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }}, /*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, /* OUTPUT */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }}, /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, /* INPUT-ONLY */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }}, /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, }; static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags) { int on = (flags & 1); /* secure_tcp */ /* ** FIXME: change secure_tcp to independent sysctl var ** or make it per-service or per-app because it is valid ** for most if not for all of the applications. Something ** like "capabilities" (flags) for each object. */ pd->tcp_state_table = (on ? tcp_states_dos : tcp_states); } static inline int tcp_state_idx(struct tcphdr *th) { if (th->rst) return 3; if (th->syn) return 0; if (th->fin) return 1; if (th->ack) return 2; return -1; } static inline void set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, int direction, struct tcphdr *th) { int state_idx; int new_state = IP_VS_TCP_S_CLOSE; int state_off = tcp_state_off[direction]; /* * Update state offset to INPUT_ONLY if necessary * or delete NO_OUTPUT flag if output packet detected */ if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { if (state_off == TCP_DIR_OUTPUT) cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; else state_off = TCP_DIR_INPUT_ONLY; } if ((state_idx = tcp_state_idx(th)) < 0) { IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx); goto tcp_state_out; } new_state = pd->tcp_state_table[state_off+state_idx].next_state[cp->state]; tcp_state_out: if (new_state != cp->state) { struct ip_vs_dest *dest = cp->dest; IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] c:%s:%d v:%s:%d " "d:%s:%d state: %s->%s conn->refcnt:%d\n", pd->pp->name, ((state_off == TCP_DIR_OUTPUT) ? "output " : "input "), th->syn ? 'S' : '.', th->fin ? 'F' : '.', th->ack ? 'A' : '.', th->rst ? 'R' : '.', IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), tcp_state_name(cp->state), tcp_state_name(new_state), refcount_read(&cp->refcnt)); if (dest) { if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && !tcp_state_active(new_state)) { atomic_dec(&dest->activeconns); atomic_inc(&dest->inactconns); cp->flags |= IP_VS_CONN_F_INACTIVE; } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && tcp_state_active(new_state)) { atomic_inc(&dest->activeconns); atomic_dec(&dest->inactconns); cp->flags &= ~IP_VS_CONN_F_INACTIVE; } } if (new_state == IP_VS_TCP_S_ESTABLISHED) ip_vs_control_assure_ct(cp); } if (likely(pd)) cp->timeout = pd->timeout_table[cp->state = new_state]; else /* What to do ? */ cp->timeout = tcp_timeouts[cp->state = new_state]; } /* * Handle state transitions */ static void tcp_state_transition(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_proto_data *pd) { struct tcphdr _tcph, *th; #ifdef CONFIG_IP_VS_IPV6 int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr); #else int ihl = ip_hdrlen(skb); #endif th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph); if (th == NULL) return; spin_lock_bh(&cp->lock); set_tcp_state(pd, cp, direction, th); spin_unlock_bh(&cp->lock); } static inline __u16 tcp_app_hashkey(__be16 port) { return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port) & TCP_APP_TAB_MASK; } static int tcp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_app *i; __u16 hash; __be16 port = inc->port; int ret = 0; struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); hash = tcp_app_hashkey(port); list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) { if (i->port == port) { ret = -EEXIST; goto out; } } list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]); atomic_inc(&pd->appcnt); out: return ret; } static void tcp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); atomic_dec(&pd->appcnt); list_del_rcu(&inc->p_list); } static int tcp_app_conn_bind(struct ip_vs_conn *cp) { struct netns_ipvs *ipvs = cp->ipvs; int hash; struct ip_vs_app *inc; int result = 0; /* Default binding: bind app only for NAT */ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) return 0; /* Lookup application incarnations and bind the right one */ hash = tcp_app_hashkey(cp->vport); list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" "%s:%u to app %s on port %u\n", __func__, IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), inc->name, ntohs(inc->port)); cp->app = inc; if (inc->init_conn) result = inc->init_conn(inc, cp); break; } } return result; } /* * Set LISTEN timeout. (ip_vs_conn_put will setup timer) */ void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) { struct ip_vs_proto_data *pd = ip_vs_proto_data_get(cp->ipvs, IPPROTO_TCP); spin_lock_bh(&cp->lock); cp->state = IP_VS_TCP_S_LISTEN; cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN] : tcp_timeouts[IP_VS_TCP_S_LISTEN]); spin_unlock_bh(&cp->lock); } /* --------------------------------------------- * timeouts is netns related now. * --------------------------------------------- */ static int __ip_vs_tcp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE); pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts, sizeof(tcp_timeouts)); if (!pd->timeout_table) return -ENOMEM; pd->tcp_state_table = tcp_states; return 0; } static void __ip_vs_tcp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { kfree(pd->timeout_table); } struct ip_vs_protocol ip_vs_protocol_tcp = { .name = "TCP", .protocol = IPPROTO_TCP, .num_states = IP_VS_TCP_S_LAST, .dont_defrag = 0, .init = NULL, .exit = NULL, .init_netns = __ip_vs_tcp_init, .exit_netns = __ip_vs_tcp_exit, .register_app = tcp_register_app, .unregister_app = tcp_unregister_app, .conn_schedule = tcp_conn_schedule, .conn_in_get = ip_vs_conn_in_get_proto, .conn_out_get = ip_vs_conn_out_get_proto, .snat_handler = tcp_snat_handler, .dnat_handler = tcp_dnat_handler, .state_name = tcp_state_name, .state_transition = tcp_state_transition, .app_conn_bind = tcp_app_conn_bind, .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = tcp_timeout_change, };
7 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 // SPDX-License-Identifier: GPL-2.0 #include <linux/anon_inodes.h> #include <linux/exportfs.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/cgroup.h> #include <linux/magic.h> #include <linux/mount.h> #include <linux/pid.h> #include <linux/pidfs.h> #include <linux/pid_namespace.h> #include <linux/poll.h> #include <linux/proc_fs.h> #include <linux/proc_ns.h> #include <linux/pseudo_fs.h> #include <linux/ptrace.h> #include <linux/seq_file.h> #include <uapi/linux/pidfd.h> #include <linux/ipc_namespace.h> #include <linux/time_namespace.h> #include <linux/utsname.h> #include <net/net_namespace.h> #include "internal.h" #include "mount.h" static struct rb_root pidfs_ino_tree = RB_ROOT; #if BITS_PER_LONG == 32 static inline unsigned long pidfs_ino(u64 ino) { return lower_32_bits(ino); } /* On 32 bit the generation number are the upper 32 bits. */ static inline u32 pidfs_gen(u64 ino) { return upper_32_bits(ino); } #else /* On 64 bit simply return ino. */ static inline unsigned long pidfs_ino(u64 ino) { return ino; } /* On 64 bit the generation number is 0. */ static inline u32 pidfs_gen(u64 ino) { return 0; } #endif static int pidfs_ino_cmp(struct rb_node *a, const struct rb_node *b) { struct pid *pid_a = rb_entry(a, struct pid, pidfs_node); struct pid *pid_b = rb_entry(b, struct pid, pidfs_node); u64 pid_ino_a = pid_a->ino; u64 pid_ino_b = pid_b->ino; if (pid_ino_a < pid_ino_b) return -1; if (pid_ino_a > pid_ino_b) return 1; return 0; } void pidfs_add_pid(struct pid *pid) { static u64 pidfs_ino_nr = 2; /* * On 64 bit nothing special happens. The 64bit number assigned * to struct pid is the inode number. * * On 32 bit the 64 bit number assigned to struct pid is split * into two 32 bit numbers. The lower 32 bits are used as the * inode number and the upper 32 bits are used as the inode * generation number. * * On 32 bit pidfs_ino() will return the lower 32 bit. When * pidfs_ino() returns zero a wrap around happened. When a * wraparound happens the 64 bit number will be incremented by 2 * so inode numbering starts at 2 again. * * On 64 bit comparing two pidfds is as simple as comparing * inode numbers. * * When a wraparound happens on 32 bit multiple pidfds with the * same inode number are likely to exist (This isn't a problem * since before pidfs pidfds used the anonymous inode meaning * all pidfds had the same inode number.). Userspace can * reconstruct the 64 bit identifier by retrieving both the * inode number and the inode generation number to compare or * use file handles. */ if (pidfs_ino(pidfs_ino_nr) == 0) pidfs_ino_nr += 2; pid->ino = pidfs_ino_nr; pid->stashed = NULL; pidfs_ino_nr++; write_seqcount_begin(&pidmap_lock_seq); rb_find_add_rcu(&pid->pidfs_node, &pidfs_ino_tree, pidfs_ino_cmp); write_seqcount_end(&pidmap_lock_seq); } void pidfs_remove_pid(struct pid *pid) { write_seqcount_begin(&pidmap_lock_seq); rb_erase(&pid->pidfs_node, &pidfs_ino_tree); write_seqcount_end(&pidmap_lock_seq); } #ifdef CONFIG_PROC_FS /** * pidfd_show_fdinfo - print information about a pidfd * @m: proc fdinfo file * @f: file referencing a pidfd * * Pid: * This function will print the pid that a given pidfd refers to in the * pid namespace of the procfs instance. * If the pid namespace of the process is not a descendant of the pid * namespace of the procfs instance 0 will be shown as its pid. This is * similar to calling getppid() on a process whose parent is outside of * its pid namespace. * * NSpid: * If pid namespaces are supported then this function will also print * the pid of a given pidfd refers to for all descendant pid namespaces * starting from the current pid namespace of the instance, i.e. the * Pid field and the first entry in the NSpid field will be identical. * If the pid namespace of the process is not a descendant of the pid * namespace of the procfs instance 0 will be shown as its first NSpid * entry and no others will be shown. * Note that this differs from the Pid and NSpid fields in * /proc/<pid>/status where Pid and NSpid are always shown relative to * the pid namespace of the procfs instance. The difference becomes * obvious when sending around a pidfd between pid namespaces from a * different branch of the tree, i.e. where no ancestral relation is * present between the pid namespaces: * - create two new pid namespaces ns1 and ns2 in the initial pid * namespace (also take care to create new mount namespaces in the * new pid namespace and mount procfs) * - create a process with a pidfd in ns1 * - send pidfd from ns1 to ns2 * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid * have exactly one entry, which is 0 */ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) { struct pid *pid = pidfd_pid(f); struct pid_namespace *ns; pid_t nr = -1; if (likely(pid_has_task(pid, PIDTYPE_PID))) { ns = proc_pid_ns(file_inode(m->file)->i_sb); nr = pid_nr_ns(pid, ns); } seq_put_decimal_ll(m, "Pid:\t", nr); #ifdef CONFIG_PID_NS seq_put_decimal_ll(m, "\nNSpid:\t", nr); if (nr > 0) { int i; /* If nr is non-zero it means that 'pid' is valid and that * ns, i.e. the pid namespace associated with the procfs * instance, is in the pid namespace hierarchy of pid. * Start at one below the already printed level. */ for (i = ns->level + 1; i <= pid->level; i++) seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); } #endif seq_putc(m, '\n'); } #endif /* * Poll support for process exit notification. */ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) { struct pid *pid = pidfd_pid(file); bool thread = file->f_flags & PIDFD_THREAD; struct task_struct *task; __poll_t poll_flags = 0; poll_wait(file, &pid->wait_pidfd, pts); /* * Depending on PIDFD_THREAD, inform pollers when the thread * or the whole thread-group exits. */ guard(rcu)(); task = pid_task(pid, PIDTYPE_PID); if (!task) poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP; else if (task->exit_state && (thread || thread_group_empty(task))) poll_flags = EPOLLIN | EPOLLRDNORM; return poll_flags; } static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg) { struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg; size_t usize = _IOC_SIZE(cmd); struct pidfd_info kinfo = {}; struct user_namespace *user_ns; const struct cred *c; __u64 mask; #ifdef CONFIG_CGROUPS struct cgroup *cgrp; #endif if (!uinfo) return -EINVAL; if (usize < PIDFD_INFO_SIZE_VER0) return -EINVAL; /* First version, no smaller struct possible */ if (copy_from_user(&mask, &uinfo->mask, sizeof(mask))) return -EFAULT; c = get_task_cred(task); if (!c) return -ESRCH; /* Unconditionally return identifiers and credentials, the rest only on request */ user_ns = current_user_ns(); kinfo.ruid = from_kuid_munged(user_ns, c->uid); kinfo.rgid = from_kgid_munged(user_ns, c->gid); kinfo.euid = from_kuid_munged(user_ns, c->euid); kinfo.egid = from_kgid_munged(user_ns, c->egid); kinfo.suid = from_kuid_munged(user_ns, c->suid); kinfo.sgid = from_kgid_munged(user_ns, c->sgid); kinfo.fsuid = from_kuid_munged(user_ns, c->fsuid); kinfo.fsgid = from_kgid_munged(user_ns, c->fsgid); kinfo.mask |= PIDFD_INFO_CREDS; put_cred(c); #ifdef CONFIG_CGROUPS rcu_read_lock(); cgrp = task_dfl_cgroup(task); kinfo.cgroupid = cgroup_id(cgrp); kinfo.mask |= PIDFD_INFO_CGROUPID; rcu_read_unlock(); #endif /* * Copy pid/tgid last, to reduce the chances the information might be * stale. Note that it is not possible to ensure it will be valid as the * task might return as soon as the copy_to_user finishes, but that's ok * and userspace expects that might happen and can act accordingly, so * this is just best-effort. What we can do however is checking that all * the fields are set correctly, or return ESRCH to avoid providing * incomplete information. */ kinfo.ppid = task_ppid_nr_ns(task, NULL); kinfo.tgid = task_tgid_vnr(task); kinfo.pid = task_pid_vnr(task); kinfo.mask |= PIDFD_INFO_PID; if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1)) return -ESRCH; /* * If userspace and the kernel have the same struct size it can just * be copied. If userspace provides an older struct, only the bits that * userspace knows about will be copied. If userspace provides a new * struct, only the bits that the kernel knows about will be copied. */ if (copy_to_user(uinfo, &kinfo, min(usize, sizeof(kinfo)))) return -EFAULT; return 0; } static bool pidfs_ioctl_valid(unsigned int cmd) { switch (cmd) { case FS_IOC_GETVERSION: case PIDFD_GET_CGROUP_NAMESPACE: case PIDFD_GET_IPC_NAMESPACE: case PIDFD_GET_MNT_NAMESPACE: case PIDFD_GET_NET_NAMESPACE: case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE: case PIDFD_GET_TIME_NAMESPACE: case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE: case PIDFD_GET_UTS_NAMESPACE: case PIDFD_GET_USER_NAMESPACE: case PIDFD_GET_PID_NAMESPACE: return true; } /* Extensible ioctls require some more careful checks. */ switch (_IOC_NR(cmd)) { case _IOC_NR(PIDFD_GET_INFO): /* * Try to prevent performing a pidfd ioctl when someone * erronously mistook the file descriptor for a pidfd. * This is not perfect but will catch most cases. */ return (_IOC_TYPE(cmd) == _IOC_TYPE(PIDFD_GET_INFO)); } return false; } static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct task_struct *task __free(put_task) = NULL; struct nsproxy *nsp __free(put_nsproxy) = NULL; struct pid *pid = pidfd_pid(file); struct ns_common *ns_common = NULL; struct pid_namespace *pid_ns; if (!pidfs_ioctl_valid(cmd)) return -ENOIOCTLCMD; if (cmd == FS_IOC_GETVERSION) { if (!arg) return -EINVAL; __u32 __user *argp = (__u32 __user *)arg; return put_user(file_inode(file)->i_generation, argp); } task = get_pid_task(pid, PIDTYPE_PID); if (!task) return -ESRCH; /* Extensible IOCTL that does not open namespace FDs, take a shortcut */ if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO)) return pidfd_info(task, cmd, arg); if (arg) return -EINVAL; scoped_guard(task_lock, task) { nsp = task->nsproxy; if (nsp) get_nsproxy(nsp); } if (!nsp) return -ESRCH; /* just pretend it didn't exist */ /* * We're trying to open a file descriptor to the namespace so perform a * filesystem cred ptrace check. Also, we mirror nsfs behavior. */ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) return -EACCES; switch (cmd) { /* Namespaces that hang of nsproxy. */ case PIDFD_GET_CGROUP_NAMESPACE: if (IS_ENABLED(CONFIG_CGROUPS)) { get_cgroup_ns(nsp->cgroup_ns); ns_common = to_ns_common(nsp->cgroup_ns); } break; case PIDFD_GET_IPC_NAMESPACE: if (IS_ENABLED(CONFIG_IPC_NS)) { get_ipc_ns(nsp->ipc_ns); ns_common = to_ns_common(nsp->ipc_ns); } break; case PIDFD_GET_MNT_NAMESPACE: get_mnt_ns(nsp->mnt_ns); ns_common = to_ns_common(nsp->mnt_ns); break; case PIDFD_GET_NET_NAMESPACE: if (IS_ENABLED(CONFIG_NET_NS)) { ns_common = to_ns_common(nsp->net_ns); get_net_ns(ns_common); } break; case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE: if (IS_ENABLED(CONFIG_PID_NS)) { get_pid_ns(nsp->pid_ns_for_children); ns_common = to_ns_common(nsp->pid_ns_for_children); } break; case PIDFD_GET_TIME_NAMESPACE: if (IS_ENABLED(CONFIG_TIME_NS)) { get_time_ns(nsp->time_ns); ns_common = to_ns_common(nsp->time_ns); } break; case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE: if (IS_ENABLED(CONFIG_TIME_NS)) { get_time_ns(nsp->time_ns_for_children); ns_common = to_ns_common(nsp->time_ns_for_children); } break; case PIDFD_GET_UTS_NAMESPACE: if (IS_ENABLED(CONFIG_UTS_NS)) { get_uts_ns(nsp->uts_ns); ns_common = to_ns_common(nsp->uts_ns); } break; /* Namespaces that don't hang of nsproxy. */ case PIDFD_GET_USER_NAMESPACE: if (IS_ENABLED(CONFIG_USER_NS)) { rcu_read_lock(); ns_common = to_ns_common(get_user_ns(task_cred_xxx(task, user_ns))); rcu_read_unlock(); } break; case PIDFD_GET_PID_NAMESPACE: if (IS_ENABLED(CONFIG_PID_NS)) { rcu_read_lock(); pid_ns = task_active_pid_ns(task); if (pid_ns) ns_common = to_ns_common(get_pid_ns(pid_ns)); rcu_read_unlock(); } break; default: return -ENOIOCTLCMD; } if (!ns_common) return -EOPNOTSUPP; /* open_namespace() unconditionally consumes the reference */ return open_namespace(ns_common); } static const struct file_operations pidfs_file_operations = { .poll = pidfd_poll, #ifdef CONFIG_PROC_FS .show_fdinfo = pidfd_show_fdinfo, #endif .unlocked_ioctl = pidfd_ioctl, .compat_ioctl = compat_ptr_ioctl, }; struct pid *pidfd_pid(const struct file *file) { if (file->f_op != &pidfs_file_operations) return ERR_PTR(-EBADF); return file_inode(file)->i_private; } static struct vfsmount *pidfs_mnt __ro_after_init; /* * The vfs falls back to simple_setattr() if i_op->setattr() isn't * implemented. Let's reject it completely until we have a clean * permission concept for pidfds. */ static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { return -EOPNOTSUPP; } /* * User space expects pidfs inodes to have no file type in st_mode. * * In particular, 'lsof' has this legacy logic: * * type = s->st_mode & S_IFMT; * switch (type) { * ... * case 0: * if (!strcmp(p, "anon_inode")) * Lf->ntype = Ntype = N_ANON_INODE; * * to detect our old anon_inode logic. * * Rather than mess with our internal sane inode data, just fix it * up here in getattr() by masking off the format bits. */ static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->mode &= ~S_IFMT; return 0; } static const struct inode_operations pidfs_inode_operations = { .getattr = pidfs_getattr, .setattr = pidfs_setattr, }; static void pidfs_evict_inode(struct inode *inode) { struct pid *pid = inode->i_private; clear_inode(inode); put_pid(pid); } static const struct super_operations pidfs_sops = { .drop_inode = generic_delete_inode, .evict_inode = pidfs_evict_inode, .statfs = simple_statfs, }; /* * 'lsof' has knowledge of out historical anon_inode use, and expects * the pidfs dentry name to start with 'anon_inode'. */ static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen) { return dynamic_dname(buffer, buflen, "anon_inode:[pidfd]"); } const struct dentry_operations pidfs_dentry_operations = { .d_dname = pidfs_dname, .d_prune = stashed_dentry_prune, }; static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, struct inode *parent) { const struct pid *pid = inode->i_private; if (*max_len < 2) { *max_len = 2; return FILEID_INVALID; } *max_len = 2; *(u64 *)fh = pid->ino; return FILEID_KERNFS; } static int pidfs_ino_find(const void *key, const struct rb_node *node) { const u64 pid_ino = *(u64 *)key; const struct pid *pid = rb_entry(node, struct pid, pidfs_node); if (pid_ino < pid->ino) return -1; if (pid_ino > pid->ino) return 1; return 0; } /* Find a struct pid based on the inode number. */ static struct pid *pidfs_ino_get_pid(u64 ino) { struct pid *pid; struct rb_node *node; unsigned int seq; guard(rcu)(); do { seq = read_seqcount_begin(&pidmap_lock_seq); node = rb_find_rcu(&ino, &pidfs_ino_tree, pidfs_ino_find); if (node) break; } while (read_seqcount_retry(&pidmap_lock_seq, seq)); if (!node) return NULL; pid = rb_entry(node, struct pid, pidfs_node); /* Within our pid namespace hierarchy? */ if (pid_vnr(pid) == 0) return NULL; return get_pid(pid); } static struct dentry *pidfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { int ret; u64 pid_ino; struct path path; struct pid *pid; if (fh_len < 2) return NULL; switch (fh_type) { case FILEID_KERNFS: pid_ino = *(u64 *)fid; break; default: return NULL; } pid = pidfs_ino_get_pid(pid_ino); if (!pid) return NULL; ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path); if (ret < 0) return ERR_PTR(ret); mntput(path.mnt); return path.dentry; } /* * Make sure that we reject any nonsensical flags that users pass via * open_by_handle_at(). Note that PIDFD_THREAD is defined as O_EXCL, and * PIDFD_NONBLOCK as O_NONBLOCK. */ #define VALID_FILE_HANDLE_OPEN_FLAGS \ (O_RDONLY | O_WRONLY | O_RDWR | O_NONBLOCK | O_CLOEXEC | O_EXCL) static int pidfs_export_permission(struct handle_to_path_ctx *ctx, unsigned int oflags) { if (oflags & ~(VALID_FILE_HANDLE_OPEN_FLAGS | O_LARGEFILE)) return -EINVAL; /* * pidfd_ino_get_pid() will verify that the struct pid is part * of the caller's pid namespace hierarchy. No further * permission checks are needed. */ return 0; } static struct file *pidfs_export_open(struct path *path, unsigned int oflags) { /* * Clear O_LARGEFILE as open_by_handle_at() forces it and raise * O_RDWR as pidfds always are. */ oflags &= ~O_LARGEFILE; return dentry_open(path, oflags | O_RDWR, current_cred()); } static const struct export_operations pidfs_export_operations = { .encode_fh = pidfs_encode_fh, .fh_to_dentry = pidfs_fh_to_dentry, .open = pidfs_export_open, .permission = pidfs_export_permission, }; static int pidfs_init_inode(struct inode *inode, void *data) { const struct pid *pid = data; inode->i_private = data; inode->i_flags |= S_PRIVATE; inode->i_mode |= S_IRWXU; inode->i_op = &pidfs_inode_operations; inode->i_fop = &pidfs_file_operations; inode->i_ino = pidfs_ino(pid->ino); inode->i_generation = pidfs_gen(pid->ino); return 0; } static void pidfs_put_data(void *data) { struct pid *pid = data; put_pid(pid); } static const struct stashed_operations pidfs_stashed_ops = { .init_inode = pidfs_init_inode, .put_data = pidfs_put_data, }; static int pidfs_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx; ctx = init_pseudo(fc, PID_FS_MAGIC); if (!ctx) return -ENOMEM; ctx->ops = &pidfs_sops; ctx->eops = &pidfs_export_operations; ctx->dops = &pidfs_dentry_operations; fc->s_fs_info = (void *)&pidfs_stashed_ops; return 0; } static struct file_system_type pidfs_type = { .name = "pidfs", .init_fs_context = pidfs_init_fs_context, .kill_sb = kill_anon_super, }; struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) { struct file *pidfd_file; struct path path; int ret; ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path); if (ret < 0) return ERR_PTR(ret); pidfd_file = dentry_open(&path, flags, current_cred()); path_put(&path); return pidfd_file; } void __init pidfs_init(void) { pidfs_mnt = kern_mount(&pidfs_type); if (IS_ERR(pidfs_mnt)) panic("Failed to mount pidfs pseudo filesystem"); }
22 2 17 22 94 27 20 1 19 14 19 14 5 25 35 9 1 22 33 25 18 32 29 25 89 25 81 31 34 34 18 24 17 17 11 2 4 100 1 2 31 41 75 93 29 8 24 94 74 36 99 92 23 22 18 22 18 88 15 33 68 25 70 11 2 26 1 29 3 6 37 37 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 /* * linux/fs/hfs/extent.c * * Copyright (C) 1995-1997 Paul H. Hargrove * (C) 2003 Ardis Technologies <roman@ardistech.com> * This file may be distributed under the terms of the GNU General Public License. * * This file contains the functions related to the extents B-tree. */ #include <linux/pagemap.h> #include "hfs_fs.h" #include "btree.h" /*================ File-local functions ================*/ /* * build_key */ static void hfs_ext_build_key(hfs_btree_key *key, u32 cnid, u16 block, u8 type) { key->key_len = 7; key->ext.FkType = type; key->ext.FNum = cpu_to_be32(cnid); key->ext.FABN = cpu_to_be16(block); } /* * hfs_ext_compare() * * Description: * This is the comparison function used for the extents B-tree. In * comparing extent B-tree entries, the file id is the most * significant field (compared as unsigned ints); the fork type is * the second most significant field (compared as unsigned chars); * and the allocation block number field is the least significant * (compared as unsigned ints). * Input Variable(s): * struct hfs_ext_key *key1: pointer to the first key to compare * struct hfs_ext_key *key2: pointer to the second key to compare * Output Variable(s): * NONE * Returns: * int: negative if key1<key2, positive if key1>key2, and 0 if key1==key2 * Preconditions: * key1 and key2 point to "valid" (struct hfs_ext_key)s. * Postconditions: * This function has no side-effects */ int hfs_ext_keycmp(const btree_key *key1, const btree_key *key2) { __be32 fnum1, fnum2; __be16 block1, block2; fnum1 = key1->ext.FNum; fnum2 = key2->ext.FNum; if (fnum1 != fnum2) return be32_to_cpu(fnum1) < be32_to_cpu(fnum2) ? -1 : 1; if (key1->ext.FkType != key2->ext.FkType) return key1->ext.FkType < key2->ext.FkType ? -1 : 1; block1 = key1->ext.FABN; block2 = key2->ext.FABN; if (block1 == block2) return 0; return be16_to_cpu(block1) < be16_to_cpu(block2) ? -1 : 1; } /* * hfs_ext_find_block * * Find a block within an extent record */ static u16 hfs_ext_find_block(struct hfs_extent *ext, u16 off) { int i; u16 count; for (i = 0; i < 3; ext++, i++) { count = be16_to_cpu(ext->count); if (off < count) return be16_to_cpu(ext->block) + off; off -= count; } /* panic? */ return 0; } static int hfs_ext_block_count(struct hfs_extent *ext) { int i; u16 count = 0; for (i = 0; i < 3; ext++, i++) count += be16_to_cpu(ext->count); return count; } static u16 hfs_ext_lastblock(struct hfs_extent *ext) { int i; ext += 2; for (i = 0; i < 2; ext--, i++) if (ext->count) break; return be16_to_cpu(ext->block) + be16_to_cpu(ext->count); } static int __hfs_ext_write_extent(struct inode *inode, struct hfs_find_data *fd) { int res; hfs_ext_build_key(fd->search_key, inode->i_ino, HFS_I(inode)->cached_start, HFS_IS_RSRC(inode) ? HFS_FK_RSRC : HFS_FK_DATA); res = hfs_brec_find(fd); if (HFS_I(inode)->flags & HFS_FLG_EXT_NEW) { if (res != -ENOENT) return res; /* Fail early and avoid ENOSPC during the btree operation */ res = hfs_bmap_reserve(fd->tree, fd->tree->depth + 1); if (res) return res; hfs_brec_insert(fd, HFS_I(inode)->cached_extents, sizeof(hfs_extent_rec)); HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW); } else { if (res) return res; hfs_bnode_write(fd->bnode, HFS_I(inode)->cached_extents, fd->entryoffset, fd->entrylength); HFS_I(inode)->flags &= ~HFS_FLG_EXT_DIRTY; } return 0; } int hfs_ext_write_extent(struct inode *inode) { struct hfs_find_data fd; int res = 0; if (HFS_I(inode)->flags & HFS_FLG_EXT_DIRTY) { res = hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd); if (res) return res; res = __hfs_ext_write_extent(inode, &fd); hfs_find_exit(&fd); } return res; } static inline int __hfs_ext_read_extent(struct hfs_find_data *fd, struct hfs_extent *extent, u32 cnid, u32 block, u8 type) { int res; hfs_ext_build_key(fd->search_key, cnid, block, type); fd->key->ext.FNum = 0; res = hfs_brec_find(fd); if (res && res != -ENOENT) return res; if (fd->key->ext.FNum != fd->search_key->ext.FNum || fd->key->ext.FkType != fd->search_key->ext.FkType) return -ENOENT; if (fd->entrylength != sizeof(hfs_extent_rec)) return -EIO; hfs_bnode_read(fd->bnode, extent, fd->entryoffset, sizeof(hfs_extent_rec)); return 0; } static inline int __hfs_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block) { int res; if (HFS_I(inode)->flags & HFS_FLG_EXT_DIRTY) { res = __hfs_ext_write_extent(inode, fd); if (res) return res; } res = __hfs_ext_read_extent(fd, HFS_I(inode)->cached_extents, inode->i_ino, block, HFS_IS_RSRC(inode) ? HFS_FK_RSRC : HFS_FK_DATA); if (!res) { HFS_I(inode)->cached_start = be16_to_cpu(fd->key->ext.FABN); HFS_I(inode)->cached_blocks = hfs_ext_block_count(HFS_I(inode)->cached_extents); } else { HFS_I(inode)->cached_start = HFS_I(inode)->cached_blocks = 0; HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW); } return res; } static int hfs_ext_read_extent(struct inode *inode, u16 block) { struct hfs_find_data fd; int res; if (block >= HFS_I(inode)->cached_start && block < HFS_I(inode)->cached_start + HFS_I(inode)->cached_blocks) return 0; res = hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd); if (!res) { res = __hfs_ext_cache_extent(&fd, inode, block); hfs_find_exit(&fd); } return res; } static void hfs_dump_extent(struct hfs_extent *extent) { int i; hfs_dbg(EXTENT, " "); for (i = 0; i < 3; i++) hfs_dbg_cont(EXTENT, " %u:%u", be16_to_cpu(extent[i].block), be16_to_cpu(extent[i].count)); hfs_dbg_cont(EXTENT, "\n"); } static int hfs_add_extent(struct hfs_extent *extent, u16 offset, u16 alloc_block, u16 block_count) { u16 count, start; int i; hfs_dump_extent(extent); for (i = 0; i < 3; extent++, i++) { count = be16_to_cpu(extent->count); if (offset == count) { start = be16_to_cpu(extent->block); if (alloc_block != start + count) { if (++i >= 3) return -ENOSPC; extent++; extent->block = cpu_to_be16(alloc_block); } else block_count += count; extent->count = cpu_to_be16(block_count); return 0; } else if (offset < count) break; offset -= count; } /* panic? */ return -EIO; } static int hfs_free_extents(struct super_block *sb, struct hfs_extent *extent, u16 offset, u16 block_nr) { u16 count, start; int i; hfs_dump_extent(extent); for (i = 0; i < 3; extent++, i++) { count = be16_to_cpu(extent->count); if (offset == count) goto found; else if (offset < count) break; offset -= count; } /* panic? */ return -EIO; found: for (;;) { start = be16_to_cpu(extent->block); if (count <= block_nr) { hfs_clear_vbm_bits(sb, start, count); extent->block = 0; extent->count = 0; block_nr -= count; } else { count -= block_nr; hfs_clear_vbm_bits(sb, start + count, block_nr); extent->count = cpu_to_be16(count); block_nr = 0; } if (!block_nr || !i) return 0; i--; extent--; count = be16_to_cpu(extent->count); } } int hfs_free_fork(struct super_block *sb, struct hfs_cat_file *file, int type) { struct hfs_find_data fd; u32 total_blocks, blocks, start; u32 cnid = be32_to_cpu(file->FlNum); struct hfs_extent *extent; int res, i; if (type == HFS_FK_DATA) { total_blocks = be32_to_cpu(file->PyLen); extent = file->ExtRec; } else { total_blocks = be32_to_cpu(file->RPyLen); extent = file->RExtRec; } total_blocks /= HFS_SB(sb)->alloc_blksz; if (!total_blocks) return 0; blocks = 0; for (i = 0; i < 3; i++) blocks += be16_to_cpu(extent[i].count); res = hfs_free_extents(sb, extent, blocks, blocks); if (res) return res; if (total_blocks == blocks) return 0; res = hfs_find_init(HFS_SB(sb)->ext_tree, &fd); if (res) return res; do { res = __hfs_ext_read_extent(&fd, extent, cnid, total_blocks, type); if (res) break; start = be16_to_cpu(fd.key->ext.FABN); hfs_free_extents(sb, extent, total_blocks - start, total_blocks); hfs_brec_remove(&fd); total_blocks = start; } while (total_blocks > blocks); hfs_find_exit(&fd); return res; } /* * hfs_get_block */ int hfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create) { struct super_block *sb; u16 dblock, ablock; int res; sb = inode->i_sb; /* Convert inode block to disk allocation block */ ablock = (u32)block / HFS_SB(sb)->fs_div; if (block >= HFS_I(inode)->fs_blocks) { if (!create) return 0; if (block > HFS_I(inode)->fs_blocks) return -EIO; if (ablock >= HFS_I(inode)->alloc_blocks) { res = hfs_extend_file(inode); if (res) return res; } } else create = 0; if (ablock < HFS_I(inode)->first_blocks) { dblock = hfs_ext_find_block(HFS_I(inode)->first_extents, ablock); goto done; } mutex_lock(&HFS_I(inode)->extents_lock); res = hfs_ext_read_extent(inode, ablock); if (!res) dblock = hfs_ext_find_block(HFS_I(inode)->cached_extents, ablock - HFS_I(inode)->cached_start); else { mutex_unlock(&HFS_I(inode)->extents_lock); return -EIO; } mutex_unlock(&HFS_I(inode)->extents_lock); done: map_bh(bh_result, sb, HFS_SB(sb)->fs_start + dblock * HFS_SB(sb)->fs_div + (u32)block % HFS_SB(sb)->fs_div); if (create) { set_buffer_new(bh_result); HFS_I(inode)->phys_size += sb->s_blocksize; HFS_I(inode)->fs_blocks++; inode_add_bytes(inode, sb->s_blocksize); mark_inode_dirty(inode); } return 0; } int hfs_extend_file(struct inode *inode) { struct super_block *sb = inode->i_sb; u32 start, len, goal; int res; mutex_lock(&HFS_I(inode)->extents_lock); if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks) goal = hfs_ext_lastblock(HFS_I(inode)->first_extents); else { res = hfs_ext_read_extent(inode, HFS_I(inode)->alloc_blocks); if (res) goto out; goal = hfs_ext_lastblock(HFS_I(inode)->cached_extents); } len = HFS_I(inode)->clump_blocks; start = hfs_vbm_search_free(sb, goal, &len); if (!len) { res = -ENOSPC; goto out; } hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks) { if (!HFS_I(inode)->first_blocks) { hfs_dbg(EXTENT, "first extents\n"); /* no extents yet */ HFS_I(inode)->first_extents[0].block = cpu_to_be16(start); HFS_I(inode)->first_extents[0].count = cpu_to_be16(len); res = 0; } else { /* try to append to extents in inode */ res = hfs_add_extent(HFS_I(inode)->first_extents, HFS_I(inode)->alloc_blocks, start, len); if (res == -ENOSPC) goto insert_extent; } if (!res) { hfs_dump_extent(HFS_I(inode)->first_extents); HFS_I(inode)->first_blocks += len; } } else { res = hfs_add_extent(HFS_I(inode)->cached_extents, HFS_I(inode)->alloc_blocks - HFS_I(inode)->cached_start, start, len); if (!res) { hfs_dump_extent(HFS_I(inode)->cached_extents); HFS_I(inode)->flags |= HFS_FLG_EXT_DIRTY; HFS_I(inode)->cached_blocks += len; } else if (res == -ENOSPC) goto insert_extent; } out: mutex_unlock(&HFS_I(inode)->extents_lock); if (!res) { HFS_I(inode)->alloc_blocks += len; mark_inode_dirty(inode); if (inode->i_ino < HFS_FIRSTUSER_CNID) set_bit(HFS_FLG_ALT_MDB_DIRTY, &HFS_SB(sb)->flags); set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); hfs_mark_mdb_dirty(sb); } return res; insert_extent: hfs_dbg(EXTENT, "insert new extent\n"); res = hfs_ext_write_extent(inode); if (res) goto out; memset(HFS_I(inode)->cached_extents, 0, sizeof(hfs_extent_rec)); HFS_I(inode)->cached_extents[0].block = cpu_to_be16(start); HFS_I(inode)->cached_extents[0].count = cpu_to_be16(len); hfs_dump_extent(HFS_I(inode)->cached_extents); HFS_I(inode)->flags |= HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW; HFS_I(inode)->cached_start = HFS_I(inode)->alloc_blocks; HFS_I(inode)->cached_blocks = len; res = 0; goto out; } void hfs_file_truncate(struct inode *inode) { struct super_block *sb = inode->i_sb; struct hfs_find_data fd; u16 blk_cnt, alloc_cnt, start; u32 size; int res; hfs_dbg(INODE, "truncate: %lu, %Lu -> %Lu\n", inode->i_ino, (long long)HFS_I(inode)->phys_size, inode->i_size); if (inode->i_size > HFS_I(inode)->phys_size) { struct address_space *mapping = inode->i_mapping; void *fsdata = NULL; struct folio *folio; /* XXX: Can use generic_cont_expand? */ size = inode->i_size - 1; res = hfs_write_begin(NULL, mapping, size + 1, 0, &folio, &fsdata); if (!res) { res = generic_write_end(NULL, mapping, size + 1, 0, 0, folio, fsdata); } if (res) inode->i_size = HFS_I(inode)->phys_size; return; } else if (inode->i_size == HFS_I(inode)->phys_size) return; size = inode->i_size + HFS_SB(sb)->alloc_blksz - 1; blk_cnt = size / HFS_SB(sb)->alloc_blksz; alloc_cnt = HFS_I(inode)->alloc_blocks; if (blk_cnt == alloc_cnt) goto out; mutex_lock(&HFS_I(inode)->extents_lock); res = hfs_find_init(HFS_SB(sb)->ext_tree, &fd); if (res) { mutex_unlock(&HFS_I(inode)->extents_lock); /* XXX: We lack error handling of hfs_file_truncate() */ return; } while (1) { if (alloc_cnt == HFS_I(inode)->first_blocks) { hfs_free_extents(sb, HFS_I(inode)->first_extents, alloc_cnt, alloc_cnt - blk_cnt); hfs_dump_extent(HFS_I(inode)->first_extents); HFS_I(inode)->first_blocks = blk_cnt; break; } res = __hfs_ext_cache_extent(&fd, inode, alloc_cnt); if (res) break; start = HFS_I(inode)->cached_start; hfs_free_extents(sb, HFS_I(inode)->cached_extents, alloc_cnt - start, alloc_cnt - blk_cnt); hfs_dump_extent(HFS_I(inode)->cached_extents); if (blk_cnt > start) { HFS_I(inode)->flags |= HFS_FLG_EXT_DIRTY; break; } alloc_cnt = start; HFS_I(inode)->cached_start = HFS_I(inode)->cached_blocks = 0; HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW); hfs_brec_remove(&fd); } hfs_find_exit(&fd); mutex_unlock(&HFS_I(inode)->extents_lock); HFS_I(inode)->alloc_blocks = blk_cnt; out: HFS_I(inode)->phys_size = inode->i_size; HFS_I(inode)->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; inode_set_bytes(inode, HFS_I(inode)->fs_blocks << sb->s_blocksize_bits); mark_inode_dirty(inode); }
4 4 4 1 20 11 10 15 2 1 2 2 1 2 3 1 5 2 7 11 11 11 181 181 1 182 181 10 3 7 10 5 5 4 4 4 1 4 4 4 4 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 // SPDX-License-Identifier: GPL-2.0-or-later /* -*- linux-c -*- --------------------------------------------------------- * * * linux/fs/devpts/inode.c * * Copyright 1998-2004 H. Peter Anvin -- All Rights Reserved * * ------------------------------------------------------------------------- */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/sched.h> #include <linux/namei.h> #include <linux/slab.h> #include <linux/mount.h> #include <linux/tty.h> #include <linux/mutex.h> #include <linux/magic.h> #include <linux/idr.h> #include <linux/devpts_fs.h> #include <linux/parser.h> #include <linux/fsnotify.h> #include <linux/seq_file.h> #define DEVPTS_DEFAULT_MODE 0600 /* * ptmx is a new node in /dev/pts and will be unused in legacy (single- * instance) mode. To prevent surprises in user space, set permissions of * ptmx to 0. Use 'chmod' or remount with '-o ptmxmode' to set meaningful * permissions. */ #define DEVPTS_DEFAULT_PTMX_MODE 0000 #define PTMX_MINOR 2 /* * sysctl support for setting limits on the number of Unix98 ptys allocated. * Otherwise one can eat up all kernel memory by opening /dev/ptmx repeatedly. */ static int pty_limit = NR_UNIX98_PTY_DEFAULT; static int pty_reserve = NR_UNIX98_PTY_RESERVE; static int pty_limit_min; static int pty_limit_max = INT_MAX; static atomic_t pty_count = ATOMIC_INIT(0); static const struct ctl_table pty_table[] = { { .procname = "max", .maxlen = sizeof(int), .mode = 0644, .data = &pty_limit, .proc_handler = proc_dointvec_minmax, .extra1 = &pty_limit_min, .extra2 = &pty_limit_max, }, { .procname = "reserve", .maxlen = sizeof(int), .mode = 0644, .data = &pty_reserve, .proc_handler = proc_dointvec_minmax, .extra1 = &pty_limit_min, .extra2 = &pty_limit_max, }, { .procname = "nr", .maxlen = sizeof(int), .mode = 0444, .data = &pty_count, .proc_handler = proc_dointvec, }, }; struct pts_mount_opts { int setuid; int setgid; kuid_t uid; kgid_t gid; umode_t mode; umode_t ptmxmode; int reserve; int max; }; enum { Opt_uid, Opt_gid, Opt_mode, Opt_ptmxmode, Opt_newinstance, Opt_max, Opt_err }; static const match_table_t tokens = { {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, {Opt_mode, "mode=%o"}, {Opt_ptmxmode, "ptmxmode=%o"}, {Opt_newinstance, "newinstance"}, {Opt_max, "max=%d"}, {Opt_err, NULL} }; struct pts_fs_info { struct ida allocated_ptys; struct pts_mount_opts mount_opts; struct super_block *sb; struct dentry *ptmx_dentry; }; static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb) { return sb->s_fs_info; } static int devpts_ptmx_path(struct path *path) { struct super_block *sb; int err; /* Is a devpts filesystem at "pts" in the same directory? */ err = path_pts(path); if (err) return err; /* Is the path the root of a devpts filesystem? */ sb = path->mnt->mnt_sb; if ((sb->s_magic != DEVPTS_SUPER_MAGIC) || (path->mnt->mnt_root != sb->s_root)) return -ENODEV; return 0; } /* * Try to find a suitable devpts filesystem. We support the following * scenarios: * - The ptmx device node is located in the same directory as the devpts * mount where the pts device nodes are located. * This is e.g. the case when calling open on the /dev/pts/ptmx device * node when the devpts filesystem is mounted at /dev/pts. * - The ptmx device node is located outside the devpts filesystem mount * where the pts device nodes are located. For example, the ptmx device * is a symlink, separate device node, or bind-mount. * A supported scenario is bind-mounting /dev/pts/ptmx to /dev/ptmx and * then calling open on /dev/ptmx. In this case a suitable pts * subdirectory can be found in the common parent directory /dev of the * devpts mount and the ptmx bind-mount, after resolving the /dev/ptmx * bind-mount. * If no suitable pts subdirectory can be found this function will fail. * This is e.g. the case when bind-mounting /dev/pts/ptmx to /ptmx. */ struct vfsmount *devpts_mntget(struct file *filp, struct pts_fs_info *fsi) { struct path path; int err = 0; path = filp->f_path; path_get(&path); /* Walk upward while the start point is a bind mount of * a single file. */ while (path.mnt->mnt_root == path.dentry) if (follow_up(&path) == 0) break; /* devpts_ptmx_path() finds a devpts fs or returns an error. */ if ((path.mnt->mnt_sb->s_magic != DEVPTS_SUPER_MAGIC) || (DEVPTS_SB(path.mnt->mnt_sb) != fsi)) err = devpts_ptmx_path(&path); dput(path.dentry); if (!err) { if (DEVPTS_SB(path.mnt->mnt_sb) == fsi) return path.mnt; err = -ENODEV; } mntput(path.mnt); return ERR_PTR(err); } struct pts_fs_info *devpts_acquire(struct file *filp) { struct pts_fs_info *result; struct path path; struct super_block *sb; path = filp->f_path; path_get(&path); /* Has the devpts filesystem already been found? */ if (path.mnt->mnt_sb->s_magic != DEVPTS_SUPER_MAGIC) { int err; err = devpts_ptmx_path(&path); if (err) { result = ERR_PTR(err); goto out; } } /* * pty code needs to hold extra references in case of last /dev/tty close */ sb = path.mnt->mnt_sb; atomic_inc(&sb->s_active); result = DEVPTS_SB(sb); out: path_put(&path); return result; } void devpts_release(struct pts_fs_info *fsi) { deactivate_super(fsi->sb); } #define PARSE_MOUNT 0 #define PARSE_REMOUNT 1 /* * parse_mount_options(): * Set @opts to mount options specified in @data. If an option is not * specified in @data, set it to its default value. * * Note: @data may be NULL (in which case all options are set to default). */ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) { char *p; kuid_t uid; kgid_t gid; opts->setuid = 0; opts->setgid = 0; opts->uid = GLOBAL_ROOT_UID; opts->gid = GLOBAL_ROOT_GID; opts->mode = DEVPTS_DEFAULT_MODE; opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; opts->max = NR_UNIX98_PTY_MAX; /* Only allow instances mounted from the initial mount * namespace to tap the reserve pool of ptys. */ if (op == PARSE_MOUNT) opts->reserve = (current->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns); while ((p = strsep(&data, ",")) != NULL) { substring_t args[MAX_OPT_ARGS]; int token; int option; if (!*p) continue; token = match_token(p, tokens, args); switch (token) { case Opt_uid: if (match_int(&args[0], &option)) return -EINVAL; uid = make_kuid(current_user_ns(), option); if (!uid_valid(uid)) return -EINVAL; opts->uid = uid; opts->setuid = 1; break; case Opt_gid: if (match_int(&args[0], &option)) return -EINVAL; gid = make_kgid(current_user_ns(), option); if (!gid_valid(gid)) return -EINVAL; opts->gid = gid; opts->setgid = 1; break; case Opt_mode: if (match_octal(&args[0], &option)) return -EINVAL; opts->mode = option & S_IALLUGO; break; case Opt_ptmxmode: if (match_octal(&args[0], &option)) return -EINVAL; opts->ptmxmode = option & S_IALLUGO; break; case Opt_newinstance: break; case Opt_max: if (match_int(&args[0], &option) || option < 0 || option > NR_UNIX98_PTY_MAX) return -EINVAL; opts->max = option; break; default: pr_err("called with bogus options\n"); return -EINVAL; } } return 0; } static int mknod_ptmx(struct super_block *sb) { int mode; int rc = -ENOMEM; struct dentry *dentry; struct inode *inode; struct dentry *root = sb->s_root; struct pts_fs_info *fsi = DEVPTS_SB(sb); struct pts_mount_opts *opts = &fsi->mount_opts; kuid_t ptmx_uid = current_fsuid(); kgid_t ptmx_gid = current_fsgid(); inode_lock(d_inode(root)); /* If we have already created ptmx node, return */ if (fsi->ptmx_dentry) { rc = 0; goto out; } dentry = d_alloc_name(root, "ptmx"); if (!dentry) { pr_err("Unable to alloc dentry for ptmx node\n"); goto out; } /* * Create a new 'ptmx' node in this mount of devpts. */ inode = new_inode(sb); if (!inode) { pr_err("Unable to alloc inode for ptmx node\n"); dput(dentry); goto out; } inode->i_ino = 2; simple_inode_init_ts(inode); mode = S_IFCHR|opts->ptmxmode; init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2)); inode->i_uid = ptmx_uid; inode->i_gid = ptmx_gid; d_add(dentry, inode); fsi->ptmx_dentry = dentry; rc = 0; out: inode_unlock(d_inode(root)); return rc; } static void update_ptmx_mode(struct pts_fs_info *fsi) { struct inode *inode; if (fsi->ptmx_dentry) { inode = d_inode(fsi->ptmx_dentry); inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode; } } static int devpts_remount(struct super_block *sb, int *flags, char *data) { int err; struct pts_fs_info *fsi = DEVPTS_SB(sb); struct pts_mount_opts *opts = &fsi->mount_opts; err = parse_mount_options(data, PARSE_REMOUNT, opts); /* * parse_mount_options() restores options to default values * before parsing and may have changed ptmxmode. So, update the * mode in the inode too. Bogus options don't fail the remount, * so do this even on error return. */ update_ptmx_mode(fsi); return err; } static int devpts_show_options(struct seq_file *seq, struct dentry *root) { struct pts_fs_info *fsi = DEVPTS_SB(root->d_sb); struct pts_mount_opts *opts = &fsi->mount_opts; if (opts->setuid) seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, opts->uid)); if (opts->setgid) seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, opts->gid)); seq_printf(seq, ",mode=%03o", opts->mode); seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode); if (opts->max < NR_UNIX98_PTY_MAX) seq_printf(seq, ",max=%d", opts->max); return 0; } static const struct super_operations devpts_sops = { .statfs = simple_statfs, .remount_fs = devpts_remount, .show_options = devpts_show_options, }; static void *new_pts_fs_info(struct super_block *sb) { struct pts_fs_info *fsi; fsi = kzalloc(sizeof(struct pts_fs_info), GFP_KERNEL); if (!fsi) return NULL; ida_init(&fsi->allocated_ptys); fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE; fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; fsi->sb = sb; return fsi; } static int devpts_fill_super(struct super_block *s, void *data, int silent) { struct inode *inode; int error; s->s_iflags &= ~SB_I_NODEV; s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = DEVPTS_SUPER_MAGIC; s->s_op = &devpts_sops; s->s_d_op = &simple_dentry_operations; s->s_time_gran = 1; error = -ENOMEM; s->s_fs_info = new_pts_fs_info(s); if (!s->s_fs_info) goto fail; error = parse_mount_options(data, PARSE_MOUNT, &DEVPTS_SB(s)->mount_opts); if (error) goto fail; error = -ENOMEM; inode = new_inode(s); if (!inode) goto fail; inode->i_ino = 1; simple_inode_init_ts(inode); inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; set_nlink(inode, 2); s->s_root = d_make_root(inode); if (!s->s_root) { pr_err("get root dentry failed\n"); goto fail; } error = mknod_ptmx(s); if (error) goto fail_dput; return 0; fail_dput: dput(s->s_root); s->s_root = NULL; fail: return error; } /* * devpts_mount() * * Mount a new (private) instance of devpts. PTYs created in this * instance are independent of the PTYs in other devpts instances. */ static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { return mount_nodev(fs_type, flags, data, devpts_fill_super); } static void devpts_kill_sb(struct super_block *sb) { struct pts_fs_info *fsi = DEVPTS_SB(sb); if (fsi) ida_destroy(&fsi->allocated_ptys); kfree(fsi); kill_litter_super(sb); } static struct file_system_type devpts_fs_type = { .name = "devpts", .mount = devpts_mount, .kill_sb = devpts_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; /* * The normal naming convention is simply /dev/pts/<number>; this conforms * to the System V naming convention */ int devpts_new_index(struct pts_fs_info *fsi) { int index = -ENOSPC; if (atomic_inc_return(&pty_count) >= (pty_limit - (fsi->mount_opts.reserve ? 0 : pty_reserve))) goto out; index = ida_alloc_max(&fsi->allocated_ptys, fsi->mount_opts.max - 1, GFP_KERNEL); out: if (index < 0) atomic_dec(&pty_count); return index; } void devpts_kill_index(struct pts_fs_info *fsi, int idx) { ida_free(&fsi->allocated_ptys, idx); atomic_dec(&pty_count); } /** * devpts_pty_new -- create a new inode in /dev/pts/ * @fsi: Filesystem info for this instance. * @index: used as a name of the node * @priv: what's given back by devpts_get_priv * * The dentry for the created inode is returned. * Remove it from /dev/pts/ with devpts_pty_kill(). */ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv) { struct dentry *dentry; struct super_block *sb = fsi->sb; struct inode *inode; struct dentry *root; struct pts_mount_opts *opts; char s[12]; root = sb->s_root; opts = &fsi->mount_opts; inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); inode->i_ino = index + 3; inode->i_uid = opts->setuid ? opts->uid : current_fsuid(); inode->i_gid = opts->setgid ? opts->gid : current_fsgid(); simple_inode_init_ts(inode); init_special_inode(inode, S_IFCHR|opts->mode, MKDEV(UNIX98_PTY_SLAVE_MAJOR, index)); sprintf(s, "%d", index); dentry = d_alloc_name(root, s); if (dentry) { dentry->d_fsdata = priv; d_add(dentry, inode); fsnotify_create(d_inode(root), dentry); } else { iput(inode); dentry = ERR_PTR(-ENOMEM); } return dentry; } /** * devpts_get_priv -- get private data for a slave * @dentry: dentry of the slave * * Returns whatever was passed as priv in devpts_pty_new for a given inode. */ void *devpts_get_priv(struct dentry *dentry) { if (dentry->d_sb->s_magic != DEVPTS_SUPER_MAGIC) return NULL; return dentry->d_fsdata; } /** * devpts_pty_kill -- remove inode form /dev/pts/ * @dentry: dentry of the slave to be removed * * This is an inverse operation of devpts_pty_new. */ void devpts_pty_kill(struct dentry *dentry) { WARN_ON_ONCE(dentry->d_sb->s_magic != DEVPTS_SUPER_MAGIC); dentry->d_fsdata = NULL; drop_nlink(dentry->d_inode); d_drop(dentry); fsnotify_unlink(d_inode(dentry->d_parent), dentry); dput(dentry); /* d_alloc_name() in devpts_pty_new() */ } static int __init init_devpts_fs(void) { int err = register_filesystem(&devpts_fs_type); if (!err) { register_sysctl("kernel/pty", pty_table); } return err; } module_init(init_devpts_fs)
6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 // SPDX-License-Identifier: GPL-2.0 #include <linux/slab.h> #include <linux/kernel.h> #include <linux/bitops.h> #include <linux/cpumask.h> #include <linux/export.h> #include <linux/memblock.h> #include <linux/numa.h> /** * cpumask_next_wrap - helper to implement for_each_cpu_wrap * @n: the cpu prior to the place to search * @mask: the cpumask pointer * @start: the start point of the iteration * @wrap: assume @n crossing @start terminates the iteration * * Return: >= nr_cpu_ids on completion * * Note: the @wrap argument is required for the start condition when * we cannot assume @start is set in @mask. */ unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap) { unsigned int next; again: next = cpumask_next(n, mask); if (wrap && n < start && next >= start) { return nr_cpumask_bits; } else if (next >= nr_cpumask_bits) { wrap = true; n = -1; goto again; } return next; } EXPORT_SYMBOL(cpumask_next_wrap); /* These are not inline because of header tangles. */ #ifdef CONFIG_CPUMASK_OFFSTACK /** * alloc_cpumask_var_node - allocate a struct cpumask on a given node * @mask: pointer to cpumask_var_t where the cpumask is returned * @flags: GFP_ flags * @node: memory node from which to allocate or %NUMA_NO_NODE * * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is * a nop returning a constant 1 (in <linux/cpumask.h>). * * Return: TRUE if memory allocation succeeded, FALSE otherwise. * * In addition, mask will be NULL if this fails. Note that gcc is * usually smart enough to know that mask can never be NULL if * CONFIG_CPUMASK_OFFSTACK=n, so does code elimination in that case * too. */ bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { *mask = kmalloc_node(cpumask_size(), flags, node); #ifdef CONFIG_DEBUG_PER_CPU_MAPS if (!*mask) { printk(KERN_ERR "=> alloc_cpumask_var: failed!\n"); dump_stack(); } #endif return *mask != NULL; } EXPORT_SYMBOL(alloc_cpumask_var_node); /** * alloc_bootmem_cpumask_var - allocate a struct cpumask from the bootmem arena. * @mask: pointer to cpumask_var_t where the cpumask is returned * * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is * a nop (in <linux/cpumask.h>). * Either returns an allocated (zero-filled) cpumask, or causes the * system to panic. */ void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask) { *mask = memblock_alloc_or_panic(cpumask_size(), SMP_CACHE_BYTES); } /** * free_cpumask_var - frees memory allocated for a struct cpumask. * @mask: cpumask to free * * This is safe on a NULL mask. */ void free_cpumask_var(cpumask_var_t mask) { kfree(mask); } EXPORT_SYMBOL(free_cpumask_var); /** * free_bootmem_cpumask_var - frees result of alloc_bootmem_cpumask_var * @mask: cpumask to free */ void __init free_bootmem_cpumask_var(cpumask_var_t mask) { memblock_free(mask, cpumask_size()); } #endif /** * cpumask_local_spread - select the i'th cpu based on NUMA distances * @i: index number * @node: local numa_node * * Return: online CPU according to a numa aware policy; local cpus are returned * first, followed by non-local ones, then it wraps around. * * For those who wants to enumerate all CPUs based on their NUMA distances, * i.e. call this function in a loop, like: * * for (i = 0; i < num_online_cpus(); i++) { * cpu = cpumask_local_spread(i, node); * do_something(cpu); * } * * There's a better alternative based on for_each()-like iterators: * * for_each_numa_hop_mask(mask, node) { * for_each_cpu_andnot(cpu, mask, prev) * do_something(cpu); * prev = mask; * } * * It's simpler and more verbose than above. Complexity of iterator-based * enumeration is O(sched_domains_numa_levels * nr_cpu_ids), while * cpumask_local_spread() when called for each cpu is * O(sched_domains_numa_levels * nr_cpu_ids * log(nr_cpu_ids)). */ unsigned int cpumask_local_spread(unsigned int i, int node) { unsigned int cpu; /* Wrap: we always want a cpu. */ i %= num_online_cpus(); cpu = sched_numa_find_nth_cpu(cpu_online_mask, i, node); WARN_ON(cpu >= nr_cpu_ids); return cpu; } EXPORT_SYMBOL(cpumask_local_spread); static DEFINE_PER_CPU(int, distribute_cpu_mask_prev); /** * cpumask_any_and_distribute - Return an arbitrary cpu within src1p & src2p. * @src1p: first &cpumask for intersection * @src2p: second &cpumask for intersection * * Iterated calls using the same srcp1 and srcp2 will be distributed within * their intersection. * * Return: >= nr_cpu_ids if the intersection is empty. */ unsigned int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p) { unsigned int next, prev; /* NOTE: our first selection will skip 0. */ prev = __this_cpu_read(distribute_cpu_mask_prev); next = find_next_and_bit_wrap(cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits, prev + 1); if (next < nr_cpu_ids) __this_cpu_write(distribute_cpu_mask_prev, next); return next; } EXPORT_SYMBOL(cpumask_any_and_distribute); /** * cpumask_any_distribute - Return an arbitrary cpu from srcp * @srcp: &cpumask for selection * * Return: >= nr_cpu_ids if the intersection is empty. */ unsigned int cpumask_any_distribute(const struct cpumask *srcp) { unsigned int next, prev; /* NOTE: our first selection will skip 0. */ prev = __this_cpu_read(distribute_cpu_mask_prev); next = find_next_bit_wrap(cpumask_bits(srcp), nr_cpumask_bits, prev + 1); if (next < nr_cpu_ids) __this_cpu_write(distribute_cpu_mask_prev, next); return next; } EXPORT_SYMBOL(cpumask_any_distribute);
2 2 2 2 2 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. * * This file contains power management functions related to interrupts. */ #include <linux/irq.h> #include <linux/module.h> #include <linux/interrupt.h> #include <linux/suspend.h> #include <linux/syscore_ops.h> #include "internals.h" bool irq_pm_check_wakeup(struct irq_desc *desc) { if (irqd_is_wakeup_armed(&desc->irq_data)) { irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); desc->istate |= IRQS_SUSPENDED | IRQS_PENDING; desc->depth++; irq_disable(desc); pm_system_irq_wakeup(irq_desc_get_irq(desc)); return true; } return false; } /* * Called from __setup_irq() with desc->lock held after @action has * been installed in the action chain. */ void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { desc->nr_actions++; if (action->flags & IRQF_FORCE_RESUME) desc->force_resume_depth++; WARN_ON_ONCE(desc->force_resume_depth && desc->force_resume_depth != desc->nr_actions); if (action->flags & IRQF_NO_SUSPEND) desc->no_suspend_depth++; else if (action->flags & IRQF_COND_SUSPEND) desc->cond_suspend_depth++; WARN_ON_ONCE(desc->no_suspend_depth && (desc->no_suspend_depth + desc->cond_suspend_depth) != desc->nr_actions); } /* * Called from __free_irq() with desc->lock held after @action has * been removed from the action chain. */ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { desc->nr_actions--; if (action->flags & IRQF_FORCE_RESUME) desc->force_resume_depth--; if (action->flags & IRQF_NO_SUSPEND) desc->no_suspend_depth--; else if (action->flags & IRQF_COND_SUSPEND) desc->cond_suspend_depth--; } static bool suspend_device_irq(struct irq_desc *desc) { unsigned long chipflags = irq_desc_get_chip(desc)->flags; struct irq_data *irqd = &desc->irq_data; if (!desc->action || irq_desc_is_chained(desc) || desc->no_suspend_depth) return false; if (irqd_is_wakeup_set(irqd)) { irqd_set(irqd, IRQD_WAKEUP_ARMED); if ((chipflags & IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND) && irqd_irq_disabled(irqd)) { /* * Interrupt marked for wakeup is in disabled state. * Enable interrupt here to unmask/enable in irqchip * to be able to resume with such interrupts. */ __enable_irq(desc); irqd_set(irqd, IRQD_IRQ_ENABLED_ON_SUSPEND); } /* * We return true here to force the caller to issue * synchronize_irq(). We need to make sure that the * IRQD_WAKEUP_ARMED is visible before we return from * suspend_device_irqs(). */ return true; } desc->istate |= IRQS_SUSPENDED; __disable_irq(desc); /* * Hardware which has no wakeup source configuration facility * requires that the non wakeup interrupts are masked at the * chip level. The chip implementation indicates that with * IRQCHIP_MASK_ON_SUSPEND. */ if (chipflags & IRQCHIP_MASK_ON_SUSPEND) mask_irq(desc); return true; } /** * suspend_device_irqs - disable all currently enabled interrupt lines * * During system-wide suspend or hibernation device drivers need to be * prevented from receiving interrupts and this function is provided * for this purpose. * * So we disable all interrupts and mark them IRQS_SUSPENDED except * for those which are unused, those which are marked as not * suspendable via an interrupt request with the flag IRQF_NO_SUSPEND * set and those which are marked as active wakeup sources. * * The active wakeup sources are handled by the flow handler entry * code which checks for the IRQD_WAKEUP_ARMED flag, suspends the * interrupt and notifies the pm core about the wakeup. */ void suspend_device_irqs(void) { struct irq_desc *desc; int irq; for_each_irq_desc(irq, desc) { unsigned long flags; bool sync; if (irq_settings_is_nested_thread(desc)) continue; raw_spin_lock_irqsave(&desc->lock, flags); sync = suspend_device_irq(desc); raw_spin_unlock_irqrestore(&desc->lock, flags); if (sync) synchronize_irq(irq); } } static void resume_irq(struct irq_desc *desc) { struct irq_data *irqd = &desc->irq_data; irqd_clear(irqd, IRQD_WAKEUP_ARMED); if (irqd_is_enabled_on_suspend(irqd)) { /* * Interrupt marked for wakeup was enabled during suspend * entry. Disable such interrupts to restore them back to * original state. */ __disable_irq(desc); irqd_clear(irqd, IRQD_IRQ_ENABLED_ON_SUSPEND); } if (desc->istate & IRQS_SUSPENDED) goto resume; /* Force resume the interrupt? */ if (!desc->force_resume_depth) return; /* Pretend that it got disabled ! */ desc->depth++; irq_state_set_disabled(desc); irq_state_set_masked(desc); resume: desc->istate &= ~IRQS_SUSPENDED; __enable_irq(desc); } static void resume_irqs(bool want_early) { struct irq_desc *desc; int irq; for_each_irq_desc(irq, desc) { unsigned long flags; bool is_early = desc->action && desc->action->flags & IRQF_EARLY_RESUME; if (!is_early && want_early) continue; if (irq_settings_is_nested_thread(desc)) continue; raw_spin_lock_irqsave(&desc->lock, flags); resume_irq(desc); raw_spin_unlock_irqrestore(&desc->lock, flags); } } /** * rearm_wake_irq - rearm a wakeup interrupt line after signaling wakeup * @irq: Interrupt to rearm */ void rearm_wake_irq(unsigned int irq) { unsigned long flags; struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); if (!desc) return; if (!(desc->istate & IRQS_SUSPENDED) || !irqd_is_wakeup_set(&desc->irq_data)) goto unlock; desc->istate &= ~IRQS_SUSPENDED; irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); __enable_irq(desc); unlock: irq_put_desc_busunlock(desc, flags); } /** * irq_pm_syscore_resume - enable interrupt lines early * * Enable all interrupt lines with %IRQF_EARLY_RESUME set. */ static void irq_pm_syscore_resume(void) { resume_irqs(true); } static struct syscore_ops irq_pm_syscore_ops = { .resume = irq_pm_syscore_resume, }; static int __init irq_pm_init_ops(void) { register_syscore_ops(&irq_pm_syscore_ops); return 0; } device_initcall(irq_pm_init_ops); /** * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() * * Enable all non-%IRQF_EARLY_RESUME interrupt lines previously * disabled by suspend_device_irqs() that have the IRQS_SUSPENDED flag * set as well as those with %IRQF_FORCE_RESUME. */ void resume_device_irqs(void) { resume_irqs(false); }
185 185 300 200 295 39 8 32 39 39 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 /* * linux/fs/hfs/string.c * * Copyright (C) 1995-1997 Paul H. Hargrove * (C) 2003 Ardis Technologies <roman@ardistech.com> * This file may be distributed under the terms of the GNU General Public License. * * This file contains the string comparison function for the * Macintosh character set. * * The code in this file is derived from code which is copyright * 1986, 1989, 1990 by Abacus Research and Development, Inc. (ARDI) * It is used here by the permission of ARDI's president Cliff Matthews. */ #include "hfs_fs.h" #include <linux/dcache.h> /*================ File-local variables ================*/ /* * unsigned char caseorder[] * * Defines the lexical ordering of characters on the Macintosh * * Composition of the 'casefold' and 'order' tables from ARDI's code * with the entry for 0x20 changed to match that for 0xCA to remove * special case for those two characters. */ static unsigned char caseorder[256] = { 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, 0x20,0x22,0x23,0x28,0x29,0x2A,0x2B,0x2C,0x2F,0x30,0x31,0x32,0x33,0x34,0x35,0x36, 0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,0x40,0x41,0x42,0x43,0x44,0x45,0x46, 0x47,0x48,0x57,0x59,0x5D,0x5F,0x66,0x68,0x6A,0x6C,0x72,0x74,0x76,0x78,0x7A,0x7E, 0x8C,0x8E,0x90,0x92,0x95,0x97,0x9E,0xA0,0xA2,0xA4,0xA7,0xA9,0xAA,0xAB,0xAC,0xAD, 0x4E,0x48,0x57,0x59,0x5D,0x5F,0x66,0x68,0x6A,0x6C,0x72,0x74,0x76,0x78,0x7A,0x7E, 0x8C,0x8E,0x90,0x92,0x95,0x97,0x9E,0xA0,0xA2,0xA4,0xA7,0xAF,0xB0,0xB1,0xB2,0xB3, 0x4A,0x4C,0x5A,0x60,0x7B,0x7F,0x98,0x4F,0x49,0x51,0x4A,0x4B,0x4C,0x5A,0x60,0x63, 0x64,0x65,0x6E,0x6F,0x70,0x71,0x7B,0x84,0x85,0x86,0x7F,0x80,0x9A,0x9B,0x9C,0x98, 0xB4,0xB5,0xB6,0xB7,0xB8,0xB9,0xBA,0x94,0xBB,0xBC,0xBD,0xBE,0xBF,0xC0,0x4D,0x81, 0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,0xC8,0xC9,0xCA,0xCB,0x55,0x8A,0xCC,0x4D,0x81, 0xCD,0xCE,0xCF,0xD0,0xD1,0xD2,0xD3,0x26,0x27,0xD4,0x20,0x49,0x4B,0x80,0x82,0x82, 0xD5,0xD6,0x24,0x25,0x2D,0x2E,0xD7,0xD8,0xA6,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF, 0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF, 0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF }; /*================ Global functions ================*/ /* * Hash a string to an integer in a case-independent way */ int hfs_hash_dentry(const struct dentry *dentry, struct qstr *this) { const unsigned char *name = this->name; unsigned int hash, len = this->len; if (len > HFS_NAMELEN) len = HFS_NAMELEN; hash = init_name_hash(dentry); for (; len; len--) hash = partial_name_hash(caseorder[*name++], hash); this->hash = end_name_hash(hash); return 0; } /* * Compare two strings in the HFS filename character ordering * Returns positive, negative, or zero, not just 0 or (+/-)1 * * Equivalent to ARDI's call: * ROMlib_RelString(s1+1, s2+1, true, false, (s1[0]<<16) | s2[0]) */ int hfs_strcmp(const unsigned char *s1, unsigned int len1, const unsigned char *s2, unsigned int len2) { int len, tmp; len = (len1 > len2) ? len2 : len1; while (len--) { tmp = (int)caseorder[*(s1++)] - (int)caseorder[*(s2++)]; if (tmp) return tmp; } return len1 - len2; } /* * Test for equality of two strings in the HFS filename character ordering. * return 1 on failure and 0 on success */ int hfs_compare_dentry(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { const unsigned char *n1, *n2; if (len >= HFS_NAMELEN) { if (name->len < HFS_NAMELEN) return 1; len = HFS_NAMELEN; } else if (len != name->len) return 1; n1 = str; n2 = name->name; while (len--) { if (caseorder[*n1++] != caseorder[*n2++]) return 1; } return 0; }
1 1 1 1 2 2 1 1 1 1 1 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/sysv/dir.c * * minix/dir.c * Copyright (C) 1991, 1992 Linus Torvalds * * coh/dir.c * Copyright (C) 1993 Pascal Haible, Bruno Haible * * sysv/dir.c * Copyright (C) 1993 Bruno Haible * * SystemV/Coherent directory handling functions */ #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/swap.h> #include "sysv.h" static int sysv_readdir(struct file *, struct dir_context *); const struct file_operations sysv_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = sysv_readdir, .fsync = generic_file_fsync, }; static void dir_commit_chunk(struct folio *folio, loff_t pos, unsigned len) { struct address_space *mapping = folio->mapping; struct inode *dir = mapping->host; block_write_end(NULL, mapping, pos, len, len, folio, NULL); if (pos+len > dir->i_size) { i_size_write(dir, pos+len); mark_inode_dirty(dir); } folio_unlock(folio); } static int sysv_handle_dirsync(struct inode *dir) { int err; err = filemap_write_and_wait(dir->i_mapping); if (!err) err = sync_inode_metadata(dir, 1); return err; } /* * Calls to dir_get_folio()/folio_release_kmap() must be nested according to the * rules documented in mm/highmem.rst. * * NOTE: sysv_find_entry() and sysv_dotdot() act as calls to dir_get_folio() * and must be treated accordingly for nesting purposes. */ static void *dir_get_folio(struct inode *dir, unsigned long n, struct folio **foliop) { struct folio *folio = read_mapping_folio(dir->i_mapping, n, NULL); if (IS_ERR(folio)) return ERR_CAST(folio); *foliop = folio; return kmap_local_folio(folio, 0); } static int sysv_readdir(struct file *file, struct dir_context *ctx) { unsigned long pos = ctx->pos; struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; unsigned long npages = dir_pages(inode); unsigned offset; unsigned long n; ctx->pos = pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1); if (pos >= inode->i_size) return 0; offset = pos & ~PAGE_MASK; n = pos >> PAGE_SHIFT; for ( ; n < npages; n++, offset = 0) { char *kaddr, *limit; struct sysv_dir_entry *de; struct folio *folio; kaddr = dir_get_folio(inode, n, &folio); if (IS_ERR(kaddr)) continue; de = (struct sysv_dir_entry *)(kaddr+offset); limit = kaddr + PAGE_SIZE - SYSV_DIRSIZE; for ( ;(char*)de <= limit; de++, ctx->pos += sizeof(*de)) { char *name = de->name; if (!de->inode) continue; if (!dir_emit(ctx, name, strnlen(name,SYSV_NAMELEN), fs16_to_cpu(SYSV_SB(sb), de->inode), DT_UNKNOWN)) { folio_release_kmap(folio, kaddr); return 0; } } folio_release_kmap(folio, kaddr); } return 0; } /* compare strings: name[0..len-1] (not zero-terminated) and * buffer[0..] (filled with zeroes up to buffer[0..maxlen-1]) */ static inline int namecompare(int len, int maxlen, const char * name, const char * buffer) { if (len < maxlen && buffer[len]) return 0; return !memcmp(name, buffer, len); } /* * sysv_find_entry() * * finds an entry in the specified directory with the wanted name. * It does NOT read the inode of the * entry - you'll have to do that yourself if you want to. * * On Success folio_release_kmap() should be called on *foliop. * * sysv_find_entry() acts as a call to dir_get_folio() and must be treated * accordingly for nesting purposes. */ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct folio **foliop) { const char * name = dentry->d_name.name; int namelen = dentry->d_name.len; struct inode * dir = d_inode(dentry->d_parent); unsigned long start, n; unsigned long npages = dir_pages(dir); struct sysv_dir_entry *de; start = SYSV_I(dir)->i_dir_start_lookup; if (start >= npages) start = 0; n = start; do { char *kaddr = dir_get_folio(dir, n, foliop); if (!IS_ERR(kaddr)) { de = (struct sysv_dir_entry *)kaddr; kaddr += folio_size(*foliop) - SYSV_DIRSIZE; for ( ; (char *) de <= kaddr ; de++) { if (!de->inode) continue; if (namecompare(namelen, SYSV_NAMELEN, name, de->name)) goto found; } folio_release_kmap(*foliop, kaddr); } if (++n >= npages) n = 0; } while (n != start); return NULL; found: SYSV_I(dir)->i_dir_start_lookup = n; return de; } int sysv_add_link(struct dentry *dentry, struct inode *inode) { struct inode *dir = d_inode(dentry->d_parent); const char * name = dentry->d_name.name; int namelen = dentry->d_name.len; struct folio *folio = NULL; struct sysv_dir_entry * de; unsigned long npages = dir_pages(dir); unsigned long n; char *kaddr; loff_t pos; int err; /* We take care of directory expansion in the same loop */ for (n = 0; n <= npages; n++) { kaddr = dir_get_folio(dir, n, &folio); if (IS_ERR(kaddr)) return PTR_ERR(kaddr); de = (struct sysv_dir_entry *)kaddr; kaddr += PAGE_SIZE - SYSV_DIRSIZE; while ((char *)de <= kaddr) { if (!de->inode) goto got_it; err = -EEXIST; if (namecompare(namelen, SYSV_NAMELEN, name, de->name)) goto out_folio; de++; } folio_release_kmap(folio, kaddr); } BUG(); return -EINVAL; got_it: pos = folio_pos(folio) + offset_in_folio(folio, de); folio_lock(folio); err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE); if (err) goto out_unlock; memcpy (de->name, name, namelen); memset (de->name + namelen, 0, SYSV_DIRSIZE - namelen - 2); de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); dir_commit_chunk(folio, pos, SYSV_DIRSIZE); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); err = sysv_handle_dirsync(dir); out_folio: folio_release_kmap(folio, kaddr); return err; out_unlock: folio_unlock(folio); goto out_folio; } int sysv_delete_entry(struct sysv_dir_entry *de, struct folio *folio) { struct inode *inode = folio->mapping->host; loff_t pos = folio_pos(folio) + offset_in_folio(folio, de); int err; folio_lock(folio); err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE); if (err) { folio_unlock(folio); return err; } de->inode = 0; dir_commit_chunk(folio, pos, SYSV_DIRSIZE); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); return sysv_handle_dirsync(inode); } int sysv_make_empty(struct inode *inode, struct inode *dir) { struct folio *folio = filemap_grab_folio(inode->i_mapping, 0); struct sysv_dir_entry * de; char *kaddr; int err; if (IS_ERR(folio)) return PTR_ERR(folio); err = sysv_prepare_chunk(folio, 0, 2 * SYSV_DIRSIZE); if (err) { folio_unlock(folio); goto fail; } kaddr = kmap_local_folio(folio, 0); memset(kaddr, 0, folio_size(folio)); de = (struct sysv_dir_entry *)kaddr; de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); strcpy(de->name,"."); de++; de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), dir->i_ino); strcpy(de->name,".."); kunmap_local(kaddr); dir_commit_chunk(folio, 0, 2 * SYSV_DIRSIZE); err = sysv_handle_dirsync(inode); fail: folio_put(folio); return err; } /* * routine to check that the specified directory is empty (for rmdir) */ int sysv_empty_dir(struct inode * inode) { struct super_block *sb = inode->i_sb; struct folio *folio = NULL; unsigned long i, npages = dir_pages(inode); char *kaddr; for (i = 0; i < npages; i++) { struct sysv_dir_entry *de; kaddr = dir_get_folio(inode, i, &folio); if (IS_ERR(kaddr)) continue; de = (struct sysv_dir_entry *)kaddr; kaddr += folio_size(folio) - SYSV_DIRSIZE; for ( ;(char *)de <= kaddr; de++) { if (!de->inode) continue; /* check for . and .. */ if (de->name[0] != '.') goto not_empty; if (!de->name[1]) { if (de->inode == cpu_to_fs16(SYSV_SB(sb), inode->i_ino)) continue; goto not_empty; } if (de->name[1] != '.' || de->name[2]) goto not_empty; } folio_release_kmap(folio, kaddr); } return 1; not_empty: folio_release_kmap(folio, kaddr); return 0; } /* Releases the page */ int sysv_set_link(struct sysv_dir_entry *de, struct folio *folio, struct inode *inode) { struct inode *dir = folio->mapping->host; loff_t pos = folio_pos(folio) + offset_in_folio(folio, de); int err; folio_lock(folio); err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE); if (err) { folio_unlock(folio); return err; } de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); dir_commit_chunk(folio, pos, SYSV_DIRSIZE); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); return sysv_handle_dirsync(inode); } /* * Calls to dir_get_folio()/folio_release_kmap() must be nested according to the * rules documented in mm/highmem.rst. * * sysv_dotdot() acts as a call to dir_get_folio() and must be treated * accordingly for nesting purposes. */ struct sysv_dir_entry *sysv_dotdot(struct inode *dir, struct folio **foliop) { struct sysv_dir_entry *de = dir_get_folio(dir, 0, foliop); if (IS_ERR(de)) return NULL; /* ".." is the second directory entry */ return de + 1; } ino_t sysv_inode_by_name(struct dentry *dentry) { struct folio *folio; struct sysv_dir_entry *de = sysv_find_entry (dentry, &folio); ino_t res = 0; if (de) { res = fs16_to_cpu(SYSV_SB(dentry->d_sb), de->inode); folio_release_kmap(folio, de); } return res; }
2 8 1264 1260 1264 1264 9 1256 1258 6 1259 9 7 1256 1263 1254 9 1261 6 1256 7 9 1257 9 1256 9 1254 1263 1252 11 1264 1260 1264 9 1256 1256 1255 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 // SPDX-License-Identifier: GPL-2.0 /* * Functions related to setting various queue properties from drivers */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <linux/bio.h> #include <linux/blk-integrity.h> #include <linux/pagemap.h> #include <linux/backing-dev-defs.h> #include <linux/gcd.h> #include <linux/lcm.h> #include <linux/jiffies.h> #include <linux/gfp.h> #include <linux/dma-mapping.h> #include "blk.h" #include "blk-rq-qos.h" #include "blk-wbt.h" void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout) { q->rq_timeout = timeout; } EXPORT_SYMBOL_GPL(blk_queue_rq_timeout); /** * blk_set_stacking_limits - set default limits for stacking devices * @lim: the queue_limits structure to reset * * Prepare queue limits for applying limits from underlying devices using * blk_stack_limits(). */ void blk_set_stacking_limits(struct queue_limits *lim) { memset(lim, 0, sizeof(*lim)); lim->logical_block_size = SECTOR_SIZE; lim->physical_block_size = SECTOR_SIZE; lim->io_min = SECTOR_SIZE; lim->discard_granularity = SECTOR_SIZE; lim->dma_alignment = SECTOR_SIZE - 1; lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; /* Inherit limits from component devices */ lim->max_segments = USHRT_MAX; lim->max_discard_segments = USHRT_MAX; lim->max_hw_sectors = UINT_MAX; lim->max_segment_size = UINT_MAX; lim->max_sectors = UINT_MAX; lim->max_dev_sectors = UINT_MAX; lim->max_write_zeroes_sectors = UINT_MAX; lim->max_hw_zone_append_sectors = UINT_MAX; lim->max_user_discard_sectors = UINT_MAX; } EXPORT_SYMBOL(blk_set_stacking_limits); void blk_apply_bdi_limits(struct backing_dev_info *bdi, struct queue_limits *lim) { /* * For read-ahead of large files to be effective, we need to read ahead * at least twice the optimal I/O size. */ bdi->ra_pages = max(lim->io_opt * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); bdi->io_pages = lim->max_sectors >> PAGE_SECTORS_SHIFT; } static int blk_validate_zoned_limits(struct queue_limits *lim) { if (!(lim->features & BLK_FEAT_ZONED)) { if (WARN_ON_ONCE(lim->max_open_zones) || WARN_ON_ONCE(lim->max_active_zones) || WARN_ON_ONCE(lim->zone_write_granularity) || WARN_ON_ONCE(lim->max_zone_append_sectors)) return -EINVAL; return 0; } if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED))) return -EINVAL; /* * Given that active zones include open zones, the maximum number of * open zones cannot be larger than the maximum number of active zones. */ if (lim->max_active_zones && lim->max_open_zones > lim->max_active_zones) return -EINVAL; if (lim->zone_write_granularity < lim->logical_block_size) lim->zone_write_granularity = lim->logical_block_size; /* * The Zone Append size is limited by the maximum I/O size and the zone * size given that it can't span zones. * * If no max_hw_zone_append_sectors limit is provided, the block layer * will emulated it, else we're also bound by the hardware limit. */ lim->max_zone_append_sectors = min_not_zero(lim->max_hw_zone_append_sectors, min(lim->chunk_sectors, lim->max_hw_sectors)); return 0; } static int blk_validate_integrity_limits(struct queue_limits *lim) { struct blk_integrity *bi = &lim->integrity; if (!bi->tuple_size) { if (bi->csum_type != BLK_INTEGRITY_CSUM_NONE || bi->tag_size || ((bi->flags & BLK_INTEGRITY_REF_TAG))) { pr_warn("invalid PI settings.\n"); return -EINVAL; } return 0; } if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) { pr_warn("integrity support disabled.\n"); return -EINVAL; } if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE && (bi->flags & BLK_INTEGRITY_REF_TAG)) { pr_warn("ref tag not support without checksum.\n"); return -EINVAL; } if (!bi->interval_exp) bi->interval_exp = ilog2(lim->logical_block_size); return 0; } /* * Returns max guaranteed bytes which we can fit in a bio. * * We request that an atomic_write is ITER_UBUF iov_iter (so a single vector), * so we assume that we can fit in at least PAGE_SIZE in a segment, apart from * the first and last segments. */ static unsigned int blk_queue_max_guaranteed_bio(struct queue_limits *lim) { unsigned int max_segments = min(BIO_MAX_VECS, lim->max_segments); unsigned int length; length = min(max_segments, 2) * lim->logical_block_size; if (max_segments > 2) length += (max_segments - 2) * PAGE_SIZE; return length; } static void blk_atomic_writes_update_limits(struct queue_limits *lim) { unsigned int unit_limit = min(lim->max_hw_sectors << SECTOR_SHIFT, blk_queue_max_guaranteed_bio(lim)); unit_limit = rounddown_pow_of_two(unit_limit); lim->atomic_write_max_sectors = min(lim->atomic_write_hw_max >> SECTOR_SHIFT, lim->max_hw_sectors); lim->atomic_write_unit_min = min(lim->atomic_write_hw_unit_min, unit_limit); lim->atomic_write_unit_max = min(lim->atomic_write_hw_unit_max, unit_limit); lim->atomic_write_boundary_sectors = lim->atomic_write_hw_boundary >> SECTOR_SHIFT; } static void blk_validate_atomic_write_limits(struct queue_limits *lim) { unsigned int boundary_sectors; if (!(lim->features & BLK_FEAT_ATOMIC_WRITES)) goto unsupported; if (!lim->atomic_write_hw_max) goto unsupported; if (WARN_ON_ONCE(!is_power_of_2(lim->atomic_write_hw_unit_min))) goto unsupported; if (WARN_ON_ONCE(!is_power_of_2(lim->atomic_write_hw_unit_max))) goto unsupported; if (WARN_ON_ONCE(lim->atomic_write_hw_unit_min > lim->atomic_write_hw_unit_max)) goto unsupported; if (WARN_ON_ONCE(lim->atomic_write_hw_unit_max > lim->atomic_write_hw_max)) goto unsupported; boundary_sectors = lim->atomic_write_hw_boundary >> SECTOR_SHIFT; if (boundary_sectors) { if (WARN_ON_ONCE(lim->atomic_write_hw_max > lim->atomic_write_hw_boundary)) goto unsupported; /* * A feature of boundary support is that it disallows bios to * be merged which would result in a merged request which * crosses either a chunk sector or atomic write HW boundary, * even though chunk sectors may be just set for performance. * For simplicity, disallow atomic writes for a chunk sector * which is non-zero and smaller than atomic write HW boundary. * Furthermore, chunk sectors must be a multiple of atomic * write HW boundary. Otherwise boundary support becomes * complicated. * Devices which do not conform to these rules can be dealt * with if and when they show up. */ if (WARN_ON_ONCE(lim->chunk_sectors % boundary_sectors)) goto unsupported; /* * The boundary size just needs to be a multiple of unit_max * (and not necessarily a power-of-2), so this following check * could be relaxed in future. * Furthermore, if needed, unit_max could even be reduced so * that it is compliant with a !power-of-2 boundary. */ if (!is_power_of_2(boundary_sectors)) goto unsupported; } blk_atomic_writes_update_limits(lim); return; unsupported: lim->atomic_write_max_sectors = 0; lim->atomic_write_boundary_sectors = 0; lim->atomic_write_unit_min = 0; lim->atomic_write_unit_max = 0; } /* * Check that the limits in lim are valid, initialize defaults for unset * values, and cap values based on others where needed. */ int blk_validate_limits(struct queue_limits *lim) { unsigned int max_hw_sectors; unsigned int logical_block_sectors; unsigned long seg_size; int err; /* * Unless otherwise specified, default to 512 byte logical blocks and a * physical block size equal to the logical block size. */ if (!lim->logical_block_size) lim->logical_block_size = SECTOR_SIZE; else if (blk_validate_block_size(lim->logical_block_size)) { pr_warn("Invalid logical block size (%d)\n", lim->logical_block_size); return -EINVAL; } if (lim->physical_block_size < lim->logical_block_size) lim->physical_block_size = lim->logical_block_size; /* * The minimum I/O size defaults to the physical block size unless * explicitly overridden. */ if (lim->io_min < lim->physical_block_size) lim->io_min = lim->physical_block_size; /* * The optimal I/O size may not be aligned to physical block size * (because it may be limited by dma engines which have no clue about * block size of the disks attached to them), so we round it down here. */ lim->io_opt = round_down(lim->io_opt, lim->physical_block_size); /* * max_hw_sectors has a somewhat weird default for historical reason, * but driver really should set their own instead of relying on this * value. * * The block layer relies on the fact that every driver can * handle at lest a page worth of data per I/O, and needs the value * aligned to the logical block size. */ if (!lim->max_hw_sectors) lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; if (WARN_ON_ONCE(lim->max_hw_sectors < PAGE_SECTORS)) return -EINVAL; logical_block_sectors = lim->logical_block_size >> SECTOR_SHIFT; if (WARN_ON_ONCE(logical_block_sectors > lim->max_hw_sectors)) return -EINVAL; lim->max_hw_sectors = round_down(lim->max_hw_sectors, logical_block_sectors); /* * The actual max_sectors value is a complex beast and also takes the * max_dev_sectors value (set by SCSI ULPs) and a user configurable * value into account. The ->max_sectors value is always calculated * from these, so directly setting it won't have any effect. */ max_hw_sectors = min_not_zero(lim->max_hw_sectors, lim->max_dev_sectors); if (lim->max_user_sectors) { if (lim->max_user_sectors < BLK_MIN_SEGMENT_SIZE / SECTOR_SIZE) return -EINVAL; lim->max_sectors = min(max_hw_sectors, lim->max_user_sectors); } else if (lim->io_opt > (BLK_DEF_MAX_SECTORS_CAP << SECTOR_SHIFT)) { lim->max_sectors = min(max_hw_sectors, lim->io_opt >> SECTOR_SHIFT); } else if (lim->io_min > (BLK_DEF_MAX_SECTORS_CAP << SECTOR_SHIFT)) { lim->max_sectors = min(max_hw_sectors, lim->io_min >> SECTOR_SHIFT); } else { lim->max_sectors = min(max_hw_sectors, BLK_DEF_MAX_SECTORS_CAP); } lim->max_sectors = round_down(lim->max_sectors, logical_block_sectors); /* * Random default for the maximum number of segments. Driver should not * rely on this and set their own. */ if (!lim->max_segments) lim->max_segments = BLK_MAX_SEGMENTS; lim->max_discard_sectors = min(lim->max_hw_discard_sectors, lim->max_user_discard_sectors); if (!lim->max_discard_segments) lim->max_discard_segments = 1; if (lim->discard_granularity < lim->physical_block_size) lim->discard_granularity = lim->physical_block_size; /* * By default there is no limit on the segment boundary alignment, * but if there is one it can't be smaller than the page size as * that would break all the normal I/O patterns. */ if (!lim->seg_boundary_mask) lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; if (WARN_ON_ONCE(lim->seg_boundary_mask < BLK_MIN_SEGMENT_SIZE - 1)) return -EINVAL; /* * Stacking device may have both virtual boundary and max segment * size limit, so allow this setting now, and long-term the two * might need to move out of stacking limits since we have immutable * bvec and lower layer bio splitting is supposed to handle the two * correctly. */ if (lim->virt_boundary_mask) { if (!lim->max_segment_size) lim->max_segment_size = UINT_MAX; } else { /* * The maximum segment size has an odd historic 64k default that * drivers probably should override. Just like the I/O size we * require drivers to at least handle a full page per segment. */ if (!lim->max_segment_size) lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; if (WARN_ON_ONCE(lim->max_segment_size < BLK_MIN_SEGMENT_SIZE)) return -EINVAL; } /* setup min segment size for building new segment in fast path */ if (lim->seg_boundary_mask > lim->max_segment_size - 1) seg_size = lim->max_segment_size; else seg_size = lim->seg_boundary_mask + 1; lim->min_segment_size = min_t(unsigned int, seg_size, PAGE_SIZE); /* * We require drivers to at least do logical block aligned I/O, but * historically could not check for that due to the separate calls * to set the limits. Once the transition is finished the check * below should be narrowed down to check the logical block size. */ if (!lim->dma_alignment) lim->dma_alignment = SECTOR_SIZE - 1; if (WARN_ON_ONCE(lim->dma_alignment > PAGE_SIZE)) return -EINVAL; if (lim->alignment_offset) { lim->alignment_offset &= (lim->physical_block_size - 1); lim->flags &= ~BLK_FLAG_MISALIGNED; } if (!(lim->features & BLK_FEAT_WRITE_CACHE)) lim->features &= ~BLK_FEAT_FUA; blk_validate_atomic_write_limits(lim); err = blk_validate_integrity_limits(lim); if (err) return err; return blk_validate_zoned_limits(lim); } EXPORT_SYMBOL_GPL(blk_validate_limits); /* * Set the default limits for a newly allocated queue. @lim contains the * initial limits set by the driver, which could be no limit in which case * all fields are cleared to zero. */ int blk_set_default_limits(struct queue_limits *lim) { /* * Most defaults are set by capping the bounds in blk_validate_limits, * but max_user_discard_sectors is special and needs an explicit * initialization to the max value here. */ lim->max_user_discard_sectors = UINT_MAX; return blk_validate_limits(lim); } /** * queue_limits_commit_update - commit an atomic update of queue limits * @q: queue to update * @lim: limits to apply * * Apply the limits in @lim that were obtained from queue_limits_start_update() * and updated by the caller to @q. The caller must have frozen the queue or * ensure that there are no outstanding I/Os by other means. * * Returns 0 if successful, else a negative error code. */ int queue_limits_commit_update(struct request_queue *q, struct queue_limits *lim) { int error; error = blk_validate_limits(lim); if (error) goto out_unlock; #ifdef CONFIG_BLK_INLINE_ENCRYPTION if (q->crypto_profile && lim->integrity.tag_size) { pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together.\n"); error = -EINVAL; goto out_unlock; } #endif q->limits = *lim; if (q->disk) blk_apply_bdi_limits(q->disk->bdi, lim); out_unlock: mutex_unlock(&q->limits_lock); return error; } EXPORT_SYMBOL_GPL(queue_limits_commit_update); /** * queue_limits_commit_update_frozen - commit an atomic update of queue limits * @q: queue to update * @lim: limits to apply * * Apply the limits in @lim that were obtained from queue_limits_start_update() * and updated with the new values by the caller to @q. Freezes the queue * before the update and unfreezes it after. * * Returns 0 if successful, else a negative error code. */ int queue_limits_commit_update_frozen(struct request_queue *q, struct queue_limits *lim) { unsigned int memflags; int ret; memflags = blk_mq_freeze_queue(q); ret = queue_limits_commit_update(q, lim); blk_mq_unfreeze_queue(q, memflags); return ret; } EXPORT_SYMBOL_GPL(queue_limits_commit_update_frozen); /** * queue_limits_set - apply queue limits to queue * @q: queue to update * @lim: limits to apply * * Apply the limits in @lim that were freshly initialized to @q. * To update existing limits use queue_limits_start_update() and * queue_limits_commit_update() instead. * * Returns 0 if successful, else a negative error code. */ int queue_limits_set(struct request_queue *q, struct queue_limits *lim) { mutex_lock(&q->limits_lock); return queue_limits_commit_update(q, lim); } EXPORT_SYMBOL_GPL(queue_limits_set); static int queue_limit_alignment_offset(const struct queue_limits *lim, sector_t sector) { unsigned int granularity = max(lim->physical_block_size, lim->io_min); unsigned int alignment = sector_div(sector, granularity >> SECTOR_SHIFT) << SECTOR_SHIFT; return (granularity + lim->alignment_offset - alignment) % granularity; } static unsigned int queue_limit_discard_alignment( const struct queue_limits *lim, sector_t sector) { unsigned int alignment, granularity, offset; if (!lim->max_discard_sectors) return 0; /* Why are these in bytes, not sectors? */ alignment = lim->discard_alignment >> SECTOR_SHIFT; granularity = lim->discard_granularity >> SECTOR_SHIFT; /* Offset of the partition start in 'granularity' sectors */ offset = sector_div(sector, granularity); /* And why do we do this modulus *again* in blkdev_issue_discard()? */ offset = (granularity + alignment - offset) % granularity; /* Turn it back into bytes, gaah */ return offset << SECTOR_SHIFT; } static unsigned int blk_round_down_sectors(unsigned int sectors, unsigned int lbs) { sectors = round_down(sectors, lbs >> SECTOR_SHIFT); if (sectors < PAGE_SIZE >> SECTOR_SHIFT) sectors = PAGE_SIZE >> SECTOR_SHIFT; return sectors; } /* Check if second and later bottom devices are compliant */ static bool blk_stack_atomic_writes_tail(struct queue_limits *t, struct queue_limits *b) { /* We're not going to support different boundary sizes.. yet */ if (t->atomic_write_hw_boundary != b->atomic_write_hw_boundary) return false; /* Can't support this */ if (t->atomic_write_hw_unit_min > b->atomic_write_hw_unit_max) return false; /* Or this */ if (t->atomic_write_hw_unit_max < b->atomic_write_hw_unit_min) return false; t->atomic_write_hw_max = min(t->atomic_write_hw_max, b->atomic_write_hw_max); t->atomic_write_hw_unit_min = max(t->atomic_write_hw_unit_min, b->atomic_write_hw_unit_min); t->atomic_write_hw_unit_max = min(t->atomic_write_hw_unit_max, b->atomic_write_hw_unit_max); return true; } /* Check for valid boundary of first bottom device */ static bool blk_stack_atomic_writes_boundary_head(struct queue_limits *t, struct queue_limits *b) { /* * Ensure atomic write boundary is aligned with chunk sectors. Stacked * devices store chunk sectors in t->io_min. */ if (b->atomic_write_hw_boundary > t->io_min && b->atomic_write_hw_boundary % t->io_min) return false; if (t->io_min > b->atomic_write_hw_boundary && t->io_min % b->atomic_write_hw_boundary) return false; t->atomic_write_hw_boundary = b->atomic_write_hw_boundary; return true; } /* Check stacking of first bottom device */ static bool blk_stack_atomic_writes_head(struct queue_limits *t, struct queue_limits *b) { if (b->atomic_write_hw_boundary && !blk_stack_atomic_writes_boundary_head(t, b)) return false; if (t->io_min <= SECTOR_SIZE) { /* No chunk sectors, so use bottom device values directly */ t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max; t->atomic_write_hw_unit_min = b->atomic_write_hw_unit_min; t->atomic_write_hw_max = b->atomic_write_hw_max; return true; } /* * Find values for limits which work for chunk size. * b->atomic_write_hw_unit_{min, max} may not be aligned with chunk * size (t->io_min), as chunk size is not restricted to a power-of-2. * So we need to find highest power-of-2 which works for the chunk * size. * As an example scenario, we could have b->unit_max = 16K and * t->io_min = 24K. For this case, reduce t->unit_max to a value * aligned with both limits, i.e. 8K in this example. */ t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max; while (t->io_min % t->atomic_write_hw_unit_max) t->atomic_write_hw_unit_max /= 2; t->atomic_write_hw_unit_min = min(b->atomic_write_hw_unit_min, t->atomic_write_hw_unit_max); t->atomic_write_hw_max = min(b->atomic_write_hw_max, t->io_min); return true; } static void blk_stack_atomic_writes_limits(struct queue_limits *t, struct queue_limits *b, sector_t start) { if (!(b->features & BLK_FEAT_ATOMIC_WRITES)) goto unsupported; if (!b->atomic_write_hw_unit_min) goto unsupported; if (!blk_atomic_write_start_sect_aligned(start, b)) goto unsupported; /* * If atomic_write_hw_max is set, we have already stacked 1x bottom * device, so check for compliance. */ if (t->atomic_write_hw_max) { if (!blk_stack_atomic_writes_tail(t, b)) goto unsupported; return; } if (!blk_stack_atomic_writes_head(t, b)) goto unsupported; return; unsupported: t->atomic_write_hw_max = 0; t->atomic_write_hw_unit_max = 0; t->atomic_write_hw_unit_min = 0; t->atomic_write_hw_boundary = 0; } /** * blk_stack_limits - adjust queue_limits for stacked devices * @t: the stacking driver limits (top device) * @b: the underlying queue limits (bottom, component device) * @start: first data sector within component device * * Description: * This function is used by stacking drivers like MD and DM to ensure * that all component devices have compatible block sizes and * alignments. The stacking driver must provide a queue_limits * struct (top) and then iteratively call the stacking function for * all component (bottom) devices. The stacking function will * attempt to combine the values and ensure proper alignment. * * Returns 0 if the top and bottom queue_limits are compatible. The * top device's block sizes and alignment offsets may be adjusted to * ensure alignment with the bottom device. If no compatible sizes * and alignments exist, -1 is returned and the resulting top * queue_limits will have the misaligned flag set to indicate that * the alignment_offset is undefined. */ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t start) { unsigned int top, bottom, alignment, ret = 0; t->features |= (b->features & BLK_FEAT_INHERIT_MASK); /* * Some feaures need to be supported both by the stacking driver and all * underlying devices. The stacking driver sets these flags before * stacking the limits, and this will clear the flags if any of the * underlying devices does not support it. */ if (!(b->features & BLK_FEAT_NOWAIT)) t->features &= ~BLK_FEAT_NOWAIT; if (!(b->features & BLK_FEAT_POLL)) t->features &= ~BLK_FEAT_POLL; t->flags |= (b->flags & BLK_FLAG_MISALIGNED); t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_user_sectors = min_not_zero(t->max_user_sectors, b->max_user_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors, b->max_write_zeroes_sectors); t->max_hw_zone_append_sectors = min(t->max_hw_zone_append_sectors, b->max_hw_zone_append_sectors); t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask); t->virt_boundary_mask = min_not_zero(t->virt_boundary_mask, b->virt_boundary_mask); t->max_segments = min_not_zero(t->max_segments, b->max_segments); t->max_discard_segments = min_not_zero(t->max_discard_segments, b->max_discard_segments); t->max_integrity_segments = min_not_zero(t->max_integrity_segments, b->max_integrity_segments); t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); alignment = queue_limit_alignment_offset(b, start); /* Bottom device has different alignment. Check that it is * compatible with the current top alignment. */ if (t->alignment_offset != alignment) { top = max(t->physical_block_size, t->io_min) + t->alignment_offset; bottom = max(b->physical_block_size, b->io_min) + alignment; /* Verify that top and bottom intervals line up */ if (max(top, bottom) % min(top, bottom)) { t->flags |= BLK_FLAG_MISALIGNED; ret = -1; } } t->logical_block_size = max(t->logical_block_size, b->logical_block_size); t->physical_block_size = max(t->physical_block_size, b->physical_block_size); t->io_min = max(t->io_min, b->io_min); t->io_opt = lcm_not_zero(t->io_opt, b->io_opt); t->dma_alignment = max(t->dma_alignment, b->dma_alignment); /* Set non-power-of-2 compatible chunk_sectors boundary */ if (b->chunk_sectors) t->chunk_sectors = gcd(t->chunk_sectors, b->chunk_sectors); /* Physical block size a multiple of the logical block size? */ if (t->physical_block_size & (t->logical_block_size - 1)) { t->physical_block_size = t->logical_block_size; t->flags |= BLK_FLAG_MISALIGNED; ret = -1; } /* Minimum I/O a multiple of the physical block size? */ if (t->io_min & (t->physical_block_size - 1)) { t->io_min = t->physical_block_size; t->flags |= BLK_FLAG_MISALIGNED; ret = -1; } /* Optimal I/O a multiple of the physical block size? */ if (t->io_opt & (t->physical_block_size - 1)) { t->io_opt = 0; t->flags |= BLK_FLAG_MISALIGNED; ret = -1; } /* chunk_sectors a multiple of the physical block size? */ if ((t->chunk_sectors << 9) & (t->physical_block_size - 1)) { t->chunk_sectors = 0; t->flags |= BLK_FLAG_MISALIGNED; ret = -1; } /* Find lowest common alignment_offset */ t->alignment_offset = lcm_not_zero(t->alignment_offset, alignment) % max(t->physical_block_size, t->io_min); /* Verify that new alignment_offset is on a logical block boundary */ if (t->alignment_offset & (t->logical_block_size - 1)) { t->flags |= BLK_FLAG_MISALIGNED; ret = -1; } t->max_sectors = blk_round_down_sectors(t->max_sectors, t->logical_block_size); t->max_hw_sectors = blk_round_down_sectors(t->max_hw_sectors, t->logical_block_size); t->max_dev_sectors = blk_round_down_sectors(t->max_dev_sectors, t->logical_block_size); /* Discard alignment and granularity */ if (b->discard_granularity) { alignment = queue_limit_discard_alignment(b, start); t->max_discard_sectors = min_not_zero(t->max_discard_sectors, b->max_discard_sectors); t->max_hw_discard_sectors = min_not_zero(t->max_hw_discard_sectors, b->max_hw_discard_sectors); t->discard_granularity = max(t->discard_granularity, b->discard_granularity); t->discard_alignment = lcm_not_zero(t->discard_alignment, alignment) % t->discard_granularity; } t->max_secure_erase_sectors = min_not_zero(t->max_secure_erase_sectors, b->max_secure_erase_sectors); t->zone_write_granularity = max(t->zone_write_granularity, b->zone_write_granularity); if (!(t->features & BLK_FEAT_ZONED)) { t->zone_write_granularity = 0; t->max_zone_append_sectors = 0; } blk_stack_atomic_writes_limits(t, b, start); return ret; } EXPORT_SYMBOL(blk_stack_limits); /** * queue_limits_stack_bdev - adjust queue_limits for stacked devices * @t: the stacking driver limits (top device) * @bdev: the underlying block device (bottom) * @offset: offset to beginning of data within component device * @pfx: prefix to use for warnings logged * * Description: * This function is used by stacking drivers like MD and DM to ensure * that all component devices have compatible block sizes and * alignments. The stacking driver must provide a queue_limits * struct (top) and then iteratively call the stacking function for * all component (bottom) devices. The stacking function will * attempt to combine the values and ensure proper alignment. */ void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, sector_t offset, const char *pfx) { if (blk_stack_limits(t, bdev_limits(bdev), get_start_sect(bdev) + offset)) pr_notice("%s: Warning: Device %pg is misaligned\n", pfx, bdev); } EXPORT_SYMBOL_GPL(queue_limits_stack_bdev); /** * queue_limits_stack_integrity - stack integrity profile * @t: target queue limits * @b: base queue limits * * Check if the integrity profile in the @b can be stacked into the * target @t. Stacking is possible if either: * * a) does not have any integrity information stacked into it yet * b) the integrity profile in @b is identical to the one in @t * * If @b can be stacked into @t, return %true. Else return %false and clear the * integrity information in @t. */ bool queue_limits_stack_integrity(struct queue_limits *t, struct queue_limits *b) { struct blk_integrity *ti = &t->integrity; struct blk_integrity *bi = &b->integrity; if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) return true; if (!ti->tuple_size) { /* inherit the settings from the first underlying device */ if (!(ti->flags & BLK_INTEGRITY_STACKED)) { ti->flags = BLK_INTEGRITY_DEVICE_CAPABLE | (bi->flags & BLK_INTEGRITY_REF_TAG); ti->csum_type = bi->csum_type; ti->tuple_size = bi->tuple_size; ti->pi_offset = bi->pi_offset; ti->interval_exp = bi->interval_exp; ti->tag_size = bi->tag_size; goto done; } if (!bi->tuple_size) goto done; } if (ti->tuple_size != bi->tuple_size) goto incompatible; if (ti->interval_exp != bi->interval_exp) goto incompatible; if (ti->tag_size != bi->tag_size) goto incompatible; if (ti->csum_type != bi->csum_type) goto incompatible; if ((ti->flags & BLK_INTEGRITY_REF_TAG) != (bi->flags & BLK_INTEGRITY_REF_TAG)) goto incompatible; done: ti->flags |= BLK_INTEGRITY_STACKED; return true; incompatible: memset(ti, 0, sizeof(*ti)); return false; } EXPORT_SYMBOL_GPL(queue_limits_stack_integrity); /** * blk_set_queue_depth - tell the block layer about the device queue depth * @q: the request queue for the device * @depth: queue depth * */ void blk_set_queue_depth(struct request_queue *q, unsigned int depth) { q->queue_depth = depth; rq_qos_queue_depth_changed(q); } EXPORT_SYMBOL(blk_set_queue_depth); int bdev_alignment_offset(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); if (q->limits.flags & BLK_FLAG_MISALIGNED) return -1; if (bdev_is_partition(bdev)) return queue_limit_alignment_offset(&q->limits, bdev->bd_start_sect); return q->limits.alignment_offset; } EXPORT_SYMBOL_GPL(bdev_alignment_offset); unsigned int bdev_discard_alignment(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); if (bdev_is_partition(bdev)) return queue_limit_discard_alignment(&q->limits, bdev->bd_start_sect); return q->limits.discard_alignment; } EXPORT_SYMBOL_GPL(bdev_discard_alignment);
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 // SPDX-License-Identifier: GPL-2.0 /* Generic nexthop implementation * * Copyright (c) 2017-19 Cumulus Networks * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com> */ #include <linux/nexthop.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <net/arp.h> #include <net/ipv6_stubs.h> #include <net/lwtunnel.h> #include <net/ndisc.h> #include <net/nexthop.h> #include <net/route.h> #include <net/sock.h> #define NH_RES_DEFAULT_IDLE_TIMER (120 * HZ) #define NH_RES_DEFAULT_UNBALANCED_TIMER 0 /* No forced rebalancing. */ static void remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo); #define NH_DEV_HASHBITS 8 #define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS) #define NHA_OP_FLAGS_DUMP_ALL (NHA_OP_FLAG_DUMP_STATS | \ NHA_OP_FLAG_DUMP_HW_STATS) static const struct nla_policy rtm_nh_policy_new[] = { [NHA_ID] = { .type = NLA_U32 }, [NHA_GROUP] = { .type = NLA_BINARY }, [NHA_GROUP_TYPE] = { .type = NLA_U16 }, [NHA_BLACKHOLE] = { .type = NLA_FLAG }, [NHA_OIF] = { .type = NLA_U32 }, [NHA_GATEWAY] = { .type = NLA_BINARY }, [NHA_ENCAP_TYPE] = { .type = NLA_U16 }, [NHA_ENCAP] = { .type = NLA_NESTED }, [NHA_FDB] = { .type = NLA_FLAG }, [NHA_RES_GROUP] = { .type = NLA_NESTED }, [NHA_HW_STATS_ENABLE] = NLA_POLICY_MAX(NLA_U32, true), }; static const struct nla_policy rtm_nh_policy_get[] = { [NHA_ID] = { .type = NLA_U32 }, [NHA_OP_FLAGS] = NLA_POLICY_MASK(NLA_U32, NHA_OP_FLAGS_DUMP_ALL), }; static const struct nla_policy rtm_nh_policy_del[] = { [NHA_ID] = { .type = NLA_U32 }, }; static const struct nla_policy rtm_nh_policy_dump[] = { [NHA_OIF] = { .type = NLA_U32 }, [NHA_GROUPS] = { .type = NLA_FLAG }, [NHA_MASTER] = { .type = NLA_U32 }, [NHA_FDB] = { .type = NLA_FLAG }, [NHA_OP_FLAGS] = NLA_POLICY_MASK(NLA_U32, NHA_OP_FLAGS_DUMP_ALL), }; static const struct nla_policy rtm_nh_res_policy_new[] = { [NHA_RES_GROUP_BUCKETS] = { .type = NLA_U16 }, [NHA_RES_GROUP_IDLE_TIMER] = { .type = NLA_U32 }, [NHA_RES_GROUP_UNBALANCED_TIMER] = { .type = NLA_U32 }, }; static const struct nla_policy rtm_nh_policy_dump_bucket[] = { [NHA_ID] = { .type = NLA_U32 }, [NHA_OIF] = { .type = NLA_U32 }, [NHA_MASTER] = { .type = NLA_U32 }, [NHA_RES_BUCKET] = { .type = NLA_NESTED }, }; static const struct nla_policy rtm_nh_res_bucket_policy_dump[] = { [NHA_RES_BUCKET_NH_ID] = { .type = NLA_U32 }, }; static const struct nla_policy rtm_nh_policy_get_bucket[] = { [NHA_ID] = { .type = NLA_U32 }, [NHA_RES_BUCKET] = { .type = NLA_NESTED }, }; static const struct nla_policy rtm_nh_res_bucket_policy_get[] = { [NHA_RES_BUCKET_INDEX] = { .type = NLA_U16 }, }; static bool nexthop_notifiers_is_empty(struct net *net) { return !net->nexthop.notifier_chain.head; } static void __nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info, const struct nh_info *nhi) { nh_info->dev = nhi->fib_nhc.nhc_dev; nh_info->gw_family = nhi->fib_nhc.nhc_gw_family; if (nh_info->gw_family == AF_INET) nh_info->ipv4 = nhi->fib_nhc.nhc_gw.ipv4; else if (nh_info->gw_family == AF_INET6) nh_info->ipv6 = nhi->fib_nhc.nhc_gw.ipv6; nh_info->id = nhi->nh_parent->id; nh_info->is_reject = nhi->reject_nh; nh_info->is_fdb = nhi->fdb_nh; nh_info->has_encap = !!nhi->fib_nhc.nhc_lwtstate; } static int nh_notifier_single_info_init(struct nh_notifier_info *info, const struct nexthop *nh) { struct nh_info *nhi = rtnl_dereference(nh->nh_info); info->type = NH_NOTIFIER_INFO_TYPE_SINGLE; info->nh = kzalloc(sizeof(*info->nh), GFP_KERNEL); if (!info->nh) return -ENOMEM; __nh_notifier_single_info_init(info->nh, nhi); return 0; } static void nh_notifier_single_info_fini(struct nh_notifier_info *info) { kfree(info->nh); } static int nh_notifier_mpath_info_init(struct nh_notifier_info *info, struct nh_group *nhg) { u16 num_nh = nhg->num_nh; int i; info->type = NH_NOTIFIER_INFO_TYPE_GRP; info->nh_grp = kzalloc(struct_size(info->nh_grp, nh_entries, num_nh), GFP_KERNEL); if (!info->nh_grp) return -ENOMEM; info->nh_grp->num_nh = num_nh; info->nh_grp->is_fdb = nhg->fdb_nh; info->nh_grp->hw_stats = nhg->hw_stats; for (i = 0; i < num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; struct nh_info *nhi; nhi = rtnl_dereference(nhge->nh->nh_info); info->nh_grp->nh_entries[i].weight = nhge->weight; __nh_notifier_single_info_init(&info->nh_grp->nh_entries[i].nh, nhi); } return 0; } static int nh_notifier_res_table_info_init(struct nh_notifier_info *info, struct nh_group *nhg) { struct nh_res_table *res_table = rtnl_dereference(nhg->res_table); u16 num_nh_buckets = res_table->num_nh_buckets; unsigned long size; u16 i; info->type = NH_NOTIFIER_INFO_TYPE_RES_TABLE; size = struct_size(info->nh_res_table, nhs, num_nh_buckets); info->nh_res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); if (!info->nh_res_table) return -ENOMEM; info->nh_res_table->num_nh_buckets = num_nh_buckets; info->nh_res_table->hw_stats = nhg->hw_stats; for (i = 0; i < num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; struct nh_grp_entry *nhge; struct nh_info *nhi; nhge = rtnl_dereference(bucket->nh_entry); nhi = rtnl_dereference(nhge->nh->nh_info); __nh_notifier_single_info_init(&info->nh_res_table->nhs[i], nhi); } return 0; } static int nh_notifier_grp_info_init(struct nh_notifier_info *info, const struct nexthop *nh) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); if (nhg->hash_threshold) return nh_notifier_mpath_info_init(info, nhg); else if (nhg->resilient) return nh_notifier_res_table_info_init(info, nhg); return -EINVAL; } static void nh_notifier_grp_info_fini(struct nh_notifier_info *info, const struct nexthop *nh) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); if (nhg->hash_threshold) kfree(info->nh_grp); else if (nhg->resilient) vfree(info->nh_res_table); } static int nh_notifier_info_init(struct nh_notifier_info *info, const struct nexthop *nh) { info->id = nh->id; if (nh->is_group) return nh_notifier_grp_info_init(info, nh); else return nh_notifier_single_info_init(info, nh); } static void nh_notifier_info_fini(struct nh_notifier_info *info, const struct nexthop *nh) { if (nh->is_group) nh_notifier_grp_info_fini(info, nh); else nh_notifier_single_info_fini(info); } static int call_nexthop_notifiers(struct net *net, enum nexthop_event_type event_type, struct nexthop *nh, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, }; int err; ASSERT_RTNL(); if (nexthop_notifiers_is_empty(net)) return 0; err = nh_notifier_info_init(&info, nh); if (err) { NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info"); return err; } err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, event_type, &info); nh_notifier_info_fini(&info, nh); return notifier_to_errno(err); } static int nh_notifier_res_bucket_idle_timer_get(const struct nh_notifier_info *info, bool force, unsigned int *p_idle_timer_ms) { struct nh_res_table *res_table; struct nh_group *nhg; struct nexthop *nh; int err = 0; /* When 'force' is false, nexthop bucket replacement is performed * because the bucket was deemed to be idle. In this case, capable * listeners can choose to perform an atomic replacement: The bucket is * only replaced if it is inactive. However, if the idle timer interval * is smaller than the interval in which a listener is querying * buckets' activity from the device, then atomic replacement should * not be tried. Pass the idle timer value to listeners, so that they * could determine which type of replacement to perform. */ if (force) { *p_idle_timer_ms = 0; return 0; } rcu_read_lock(); nh = nexthop_find_by_id(info->net, info->id); if (!nh) { err = -EINVAL; goto out; } nhg = rcu_dereference(nh->nh_grp); res_table = rcu_dereference(nhg->res_table); *p_idle_timer_ms = jiffies_to_msecs(res_table->idle_timer); out: rcu_read_unlock(); return err; } static int nh_notifier_res_bucket_info_init(struct nh_notifier_info *info, u16 bucket_index, bool force, struct nh_info *oldi, struct nh_info *newi) { unsigned int idle_timer_ms; int err; err = nh_notifier_res_bucket_idle_timer_get(info, force, &idle_timer_ms); if (err) return err; info->type = NH_NOTIFIER_INFO_TYPE_RES_BUCKET; info->nh_res_bucket = kzalloc(sizeof(*info->nh_res_bucket), GFP_KERNEL); if (!info->nh_res_bucket) return -ENOMEM; info->nh_res_bucket->bucket_index = bucket_index; info->nh_res_bucket->idle_timer_ms = idle_timer_ms; info->nh_res_bucket->force = force; __nh_notifier_single_info_init(&info->nh_res_bucket->old_nh, oldi); __nh_notifier_single_info_init(&info->nh_res_bucket->new_nh, newi); return 0; } static void nh_notifier_res_bucket_info_fini(struct nh_notifier_info *info) { kfree(info->nh_res_bucket); } static int __call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id, u16 bucket_index, bool force, struct nh_info *oldi, struct nh_info *newi, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, .id = nhg_id, }; int err; if (nexthop_notifiers_is_empty(net)) return 0; err = nh_notifier_res_bucket_info_init(&info, bucket_index, force, oldi, newi); if (err) return err; err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, NEXTHOP_EVENT_BUCKET_REPLACE, &info); nh_notifier_res_bucket_info_fini(&info); return notifier_to_errno(err); } /* There are three users of RES_TABLE, and NHs etc. referenced from there: * * 1) a collection of callbacks for NH maintenance. This operates under * RTNL, * 2) the delayed work that gradually balances the resilient table, * 3) and nexthop_select_path(), operating under RCU. * * Both the delayed work and the RTNL block are writers, and need to * maintain mutual exclusion. Since there are only two and well-known * writers for each table, the RTNL code can make sure it has exclusive * access thus: * * - Have the DW operate without locking; * - synchronously cancel the DW; * - do the writing; * - if the write was not actually a delete, call upkeep, which schedules * DW again if necessary. * * The functions that are always called from the RTNL context use * rtnl_dereference(). The functions that can also be called from the DW do * a raw dereference and rely on the above mutual exclusion scheme. */ #define nh_res_dereference(p) (rcu_dereference_raw(p)) static int call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id, u16 bucket_index, bool force, struct nexthop *old_nh, struct nexthop *new_nh, struct netlink_ext_ack *extack) { struct nh_info *oldi = nh_res_dereference(old_nh->nh_info); struct nh_info *newi = nh_res_dereference(new_nh->nh_info); return __call_nexthop_res_bucket_notifiers(net, nhg_id, bucket_index, force, oldi, newi, extack); } static int call_nexthop_res_table_notifiers(struct net *net, struct nexthop *nh, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, .id = nh->id, }; struct nh_group *nhg; int err; ASSERT_RTNL(); if (nexthop_notifiers_is_empty(net)) return 0; /* At this point, the nexthop buckets are still not populated. Only * emit a notification with the logical nexthops, so that a listener * could potentially veto it in case of unsupported configuration. */ nhg = rtnl_dereference(nh->nh_grp); err = nh_notifier_mpath_info_init(&info, nhg); if (err) { NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info"); return err; } err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE, &info); kfree(info.nh_grp); return notifier_to_errno(err); } static int call_nexthop_notifier(struct notifier_block *nb, struct net *net, enum nexthop_event_type event_type, struct nexthop *nh, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, }; int err; err = nh_notifier_info_init(&info, nh); if (err) return err; err = nb->notifier_call(nb, event_type, &info); nh_notifier_info_fini(&info, nh); return notifier_to_errno(err); } static unsigned int nh_dev_hashfn(unsigned int val) { unsigned int mask = NH_DEV_HASHSIZE - 1; return (val ^ (val >> NH_DEV_HASHBITS) ^ (val >> (NH_DEV_HASHBITS * 2))) & mask; } static void nexthop_devhash_add(struct net *net, struct nh_info *nhi) { struct net_device *dev = nhi->fib_nhc.nhc_dev; struct hlist_head *head; unsigned int hash; WARN_ON(!dev); hash = nh_dev_hashfn(dev->ifindex); head = &net->nexthop.devhash[hash]; hlist_add_head(&nhi->dev_hash, head); } static void nexthop_free_group(struct nexthop *nh) { struct nh_group *nhg; int i; nhg = rcu_dereference_raw(nh->nh_grp); for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; WARN_ON(!list_empty(&nhge->nh_list)); free_percpu(nhge->stats); nexthop_put(nhge->nh); } WARN_ON(nhg->spare == nhg); if (nhg->resilient) vfree(rcu_dereference_raw(nhg->res_table)); kfree(nhg->spare); kfree(nhg); } static void nexthop_free_single(struct nexthop *nh) { struct nh_info *nhi; nhi = rcu_dereference_raw(nh->nh_info); switch (nhi->family) { case AF_INET: fib_nh_release(nh->net, &nhi->fib_nh); break; case AF_INET6: ipv6_stub->fib6_nh_release(&nhi->fib6_nh); break; } kfree(nhi); } void nexthop_free_rcu(struct rcu_head *head) { struct nexthop *nh = container_of(head, struct nexthop, rcu); if (nh->is_group) nexthop_free_group(nh); else nexthop_free_single(nh); kfree(nh); } EXPORT_SYMBOL_GPL(nexthop_free_rcu); static struct nexthop *nexthop_alloc(void) { struct nexthop *nh; nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL); if (nh) { INIT_LIST_HEAD(&nh->fi_list); INIT_LIST_HEAD(&nh->f6i_list); INIT_LIST_HEAD(&nh->grp_list); INIT_LIST_HEAD(&nh->fdb_list); } return nh; } static struct nh_group *nexthop_grp_alloc(u16 num_nh) { struct nh_group *nhg; nhg = kzalloc(struct_size(nhg, nh_entries, num_nh), GFP_KERNEL); if (nhg) nhg->num_nh = num_nh; return nhg; } static void nh_res_table_upkeep_dw(struct work_struct *work); static struct nh_res_table * nexthop_res_table_alloc(struct net *net, u32 nhg_id, struct nh_config *cfg) { const u16 num_nh_buckets = cfg->nh_grp_res_num_buckets; struct nh_res_table *res_table; unsigned long size; size = struct_size(res_table, nh_buckets, num_nh_buckets); res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); if (!res_table) return NULL; res_table->net = net; res_table->nhg_id = nhg_id; INIT_DELAYED_WORK(&res_table->upkeep_dw, &nh_res_table_upkeep_dw); INIT_LIST_HEAD(&res_table->uw_nh_entries); res_table->idle_timer = cfg->nh_grp_res_idle_timer; res_table->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer; res_table->num_nh_buckets = num_nh_buckets; return res_table; } static void nh_base_seq_inc(struct net *net) { while (++net->nexthop.seq == 0) ; } /* no reference taken; rcu lock or rtnl must be held */ struct nexthop *nexthop_find_by_id(struct net *net, u32 id) { struct rb_node **pp, *parent = NULL, *next; pp = &net->nexthop.rb_root.rb_node; while (1) { struct nexthop *nh; next = rcu_dereference_raw(*pp); if (!next) break; parent = next; nh = rb_entry(parent, struct nexthop, rb_node); if (id < nh->id) pp = &next->rb_left; else if (id > nh->id) pp = &next->rb_right; else return nh; } return NULL; } EXPORT_SYMBOL_GPL(nexthop_find_by_id); /* used for auto id allocation; called with rtnl held */ static u32 nh_find_unused_id(struct net *net) { u32 id_start = net->nexthop.last_id_allocated; while (1) { net->nexthop.last_id_allocated++; if (net->nexthop.last_id_allocated == id_start) break; if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated)) return net->nexthop.last_id_allocated; } return 0; } static void nh_res_time_set_deadline(unsigned long next_time, unsigned long *deadline) { if (time_before(next_time, *deadline)) *deadline = next_time; } static clock_t nh_res_table_unbalanced_time(struct nh_res_table *res_table) { if (list_empty(&res_table->uw_nh_entries)) return 0; return jiffies_delta_to_clock_t(jiffies - res_table->unbalanced_since); } static int nla_put_nh_group_res(struct sk_buff *skb, struct nh_group *nhg) { struct nh_res_table *res_table = rtnl_dereference(nhg->res_table); struct nlattr *nest; nest = nla_nest_start(skb, NHA_RES_GROUP); if (!nest) return -EMSGSIZE; if (nla_put_u16(skb, NHA_RES_GROUP_BUCKETS, res_table->num_nh_buckets) || nla_put_u32(skb, NHA_RES_GROUP_IDLE_TIMER, jiffies_to_clock_t(res_table->idle_timer)) || nla_put_u32(skb, NHA_RES_GROUP_UNBALANCED_TIMER, jiffies_to_clock_t(res_table->unbalanced_timer)) || nla_put_u64_64bit(skb, NHA_RES_GROUP_UNBALANCED_TIME, nh_res_table_unbalanced_time(res_table), NHA_RES_GROUP_PAD)) goto nla_put_failure; nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static void nh_grp_entry_stats_inc(struct nh_grp_entry *nhge) { struct nh_grp_entry_stats *cpu_stats; cpu_stats = get_cpu_ptr(nhge->stats); u64_stats_update_begin(&cpu_stats->syncp); u64_stats_inc(&cpu_stats->packets); u64_stats_update_end(&cpu_stats->syncp); put_cpu_ptr(cpu_stats); } static void nh_grp_entry_stats_read(struct nh_grp_entry *nhge, u64 *ret_packets) { int i; *ret_packets = 0; for_each_possible_cpu(i) { struct nh_grp_entry_stats *cpu_stats; unsigned int start; u64 packets; cpu_stats = per_cpu_ptr(nhge->stats, i); do { start = u64_stats_fetch_begin(&cpu_stats->syncp); packets = u64_stats_read(&cpu_stats->packets); } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); *ret_packets += packets; } } static int nh_notifier_grp_hw_stats_init(struct nh_notifier_info *info, const struct nexthop *nh) { struct nh_group *nhg; int i; ASSERT_RTNL(); nhg = rtnl_dereference(nh->nh_grp); info->id = nh->id; info->type = NH_NOTIFIER_INFO_TYPE_GRP_HW_STATS; info->nh_grp_hw_stats = kzalloc(struct_size(info->nh_grp_hw_stats, stats, nhg->num_nh), GFP_KERNEL); if (!info->nh_grp_hw_stats) return -ENOMEM; info->nh_grp_hw_stats->num_nh = nhg->num_nh; for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; info->nh_grp_hw_stats->stats[i].id = nhge->nh->id; } return 0; } static void nh_notifier_grp_hw_stats_fini(struct nh_notifier_info *info) { kfree(info->nh_grp_hw_stats); } void nh_grp_hw_stats_report_delta(struct nh_notifier_grp_hw_stats_info *info, unsigned int nh_idx, u64 delta_packets) { info->hw_stats_used = true; info->stats[nh_idx].packets += delta_packets; } EXPORT_SYMBOL(nh_grp_hw_stats_report_delta); static void nh_grp_hw_stats_apply_update(struct nexthop *nh, struct nh_notifier_info *info) { struct nh_group *nhg; int i; ASSERT_RTNL(); nhg = rtnl_dereference(nh->nh_grp); for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; nhge->packets_hw += info->nh_grp_hw_stats->stats[i].packets; } } static int nh_grp_hw_stats_update(struct nexthop *nh, bool *hw_stats_used) { struct nh_notifier_info info = { .net = nh->net, }; struct net *net = nh->net; int err; if (nexthop_notifiers_is_empty(net)) { *hw_stats_used = false; return 0; } err = nh_notifier_grp_hw_stats_init(&info, nh); if (err) return err; err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, NEXTHOP_EVENT_HW_STATS_REPORT_DELTA, &info); /* Cache whatever we got, even if there was an error, otherwise the * successful stats retrievals would get lost. */ nh_grp_hw_stats_apply_update(nh, &info); *hw_stats_used = info.nh_grp_hw_stats->hw_stats_used; nh_notifier_grp_hw_stats_fini(&info); return notifier_to_errno(err); } static int nla_put_nh_group_stats_entry(struct sk_buff *skb, struct nh_grp_entry *nhge, u32 op_flags) { struct nlattr *nest; u64 packets; nh_grp_entry_stats_read(nhge, &packets); nest = nla_nest_start(skb, NHA_GROUP_STATS_ENTRY); if (!nest) return -EMSGSIZE; if (nla_put_u32(skb, NHA_GROUP_STATS_ENTRY_ID, nhge->nh->id) || nla_put_uint(skb, NHA_GROUP_STATS_ENTRY_PACKETS, packets + nhge->packets_hw)) goto nla_put_failure; if (op_flags & NHA_OP_FLAG_DUMP_HW_STATS && nla_put_uint(skb, NHA_GROUP_STATS_ENTRY_PACKETS_HW, nhge->packets_hw)) goto nla_put_failure; nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int nla_put_nh_group_stats(struct sk_buff *skb, struct nexthop *nh, u32 op_flags) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); struct nlattr *nest; bool hw_stats_used; int err; int i; if (nla_put_u32(skb, NHA_HW_STATS_ENABLE, nhg->hw_stats)) goto err_out; if (op_flags & NHA_OP_FLAG_DUMP_HW_STATS && nhg->hw_stats) { err = nh_grp_hw_stats_update(nh, &hw_stats_used); if (err) goto out; if (nla_put_u32(skb, NHA_HW_STATS_USED, hw_stats_used)) goto err_out; } nest = nla_nest_start(skb, NHA_GROUP_STATS); if (!nest) goto err_out; for (i = 0; i < nhg->num_nh; i++) if (nla_put_nh_group_stats_entry(skb, &nhg->nh_entries[i], op_flags)) goto cancel_out; nla_nest_end(skb, nest); return 0; cancel_out: nla_nest_cancel(skb, nest); err_out: err = -EMSGSIZE; out: return err; } static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh, u32 op_flags, u32 *resp_op_flags) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); struct nexthop_grp *p; size_t len = nhg->num_nh * sizeof(*p); struct nlattr *nla; u16 group_type = 0; u16 weight; int i; *resp_op_flags |= NHA_OP_FLAG_RESP_GRP_RESVD_0; if (nhg->hash_threshold) group_type = NEXTHOP_GRP_TYPE_MPATH; else if (nhg->resilient) group_type = NEXTHOP_GRP_TYPE_RES; if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type)) goto nla_put_failure; nla = nla_reserve(skb, NHA_GROUP, len); if (!nla) goto nla_put_failure; p = nla_data(nla); for (i = 0; i < nhg->num_nh; ++i) { weight = nhg->nh_entries[i].weight - 1; *p++ = (struct nexthop_grp) { .id = nhg->nh_entries[i].nh->id, .weight = weight, .weight_high = weight >> 8, }; } if (nhg->resilient && nla_put_nh_group_res(skb, nhg)) goto nla_put_failure; if (op_flags & NHA_OP_FLAG_DUMP_STATS && (nla_put_u32(skb, NHA_HW_STATS_ENABLE, nhg->hw_stats) || nla_put_nh_group_stats(skb, nh, op_flags))) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, int event, u32 portid, u32 seq, unsigned int nlflags, u32 op_flags) { struct fib6_nh *fib6_nh; struct fib_nh *fib_nh; struct nlmsghdr *nlh; struct nh_info *nhi; struct nhmsg *nhm; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags); if (!nlh) return -EMSGSIZE; nhm = nlmsg_data(nlh); nhm->nh_family = AF_UNSPEC; nhm->nh_flags = nh->nh_flags; nhm->nh_protocol = nh->protocol; nhm->nh_scope = 0; nhm->resvd = 0; if (nla_put_u32(skb, NHA_ID, nh->id)) goto nla_put_failure; if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); u32 resp_op_flags = 0; if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB)) goto nla_put_failure; if (nla_put_nh_group(skb, nh, op_flags, &resp_op_flags) || nla_put_u32(skb, NHA_OP_FLAGS, resp_op_flags)) goto nla_put_failure; goto out; } nhi = rtnl_dereference(nh->nh_info); nhm->nh_family = nhi->family; if (nhi->reject_nh) { if (nla_put_flag(skb, NHA_BLACKHOLE)) goto nla_put_failure; goto out; } else if (nhi->fdb_nh) { if (nla_put_flag(skb, NHA_FDB)) goto nla_put_failure; } else { const struct net_device *dev; dev = nhi->fib_nhc.nhc_dev; if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex)) goto nla_put_failure; } nhm->nh_scope = nhi->fib_nhc.nhc_scope; switch (nhi->family) { case AF_INET: fib_nh = &nhi->fib_nh; if (fib_nh->fib_nh_gw_family && nla_put_be32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4)) goto nla_put_failure; break; case AF_INET6: fib6_nh = &nhi->fib6_nh; if (fib6_nh->fib_nh_gw_family && nla_put_in6_addr(skb, NHA_GATEWAY, &fib6_nh->fib_nh_gw6)) goto nla_put_failure; break; } if (nhi->fib_nhc.nhc_lwtstate && lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate, NHA_ENCAP, NHA_ENCAP_TYPE) < 0) goto nla_put_failure; out: nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static size_t nh_nlmsg_size_grp_res(struct nh_group *nhg) { return nla_total_size(0) + /* NHA_RES_GROUP */ nla_total_size(2) + /* NHA_RES_GROUP_BUCKETS */ nla_total_size(4) + /* NHA_RES_GROUP_IDLE_TIMER */ nla_total_size(4) + /* NHA_RES_GROUP_UNBALANCED_TIMER */ nla_total_size_64bit(8);/* NHA_RES_GROUP_UNBALANCED_TIME */ } static size_t nh_nlmsg_size_grp(struct nexthop *nh) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh; size_t tot = nla_total_size(sz) + nla_total_size(2); /* NHA_GROUP_TYPE */ if (nhg->resilient) tot += nh_nlmsg_size_grp_res(nhg); return tot; } static size_t nh_nlmsg_size_single(struct nexthop *nh) { struct nh_info *nhi = rtnl_dereference(nh->nh_info); size_t sz; /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE * are mutually exclusive */ sz = nla_total_size(4); /* NHA_OIF */ switch (nhi->family) { case AF_INET: if (nhi->fib_nh.fib_nh_gw_family) sz += nla_total_size(4); /* NHA_GATEWAY */ break; case AF_INET6: /* NHA_GATEWAY */ if (nhi->fib6_nh.fib_nh_gw_family) sz += nla_total_size(sizeof(const struct in6_addr)); break; } if (nhi->fib_nhc.nhc_lwtstate) { sz += lwtunnel_get_encap_size(nhi->fib_nhc.nhc_lwtstate); sz += nla_total_size(2); /* NHA_ENCAP_TYPE */ } return sz; } static size_t nh_nlmsg_size(struct nexthop *nh) { size_t sz = NLMSG_ALIGN(sizeof(struct nhmsg)); sz += nla_total_size(4); /* NHA_ID */ if (nh->is_group) sz += nh_nlmsg_size_grp(nh) + nla_total_size(4) + /* NHA_OP_FLAGS */ 0; else sz += nh_nlmsg_size_single(nh); return sz; } static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info) { unsigned int nlflags = info->nlh ? info->nlh->nlmsg_flags : 0; u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any()); if (!skb) goto errout; err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags, 0); if (err < 0) { /* -EMSGSIZE implies BUG in nh_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_NEXTHOP, info->nlh, gfp_any()); return; errout: rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err); } static unsigned long nh_res_bucket_used_time(const struct nh_res_bucket *bucket) { return (unsigned long)atomic_long_read(&bucket->used_time); } static unsigned long nh_res_bucket_idle_point(const struct nh_res_table *res_table, const struct nh_res_bucket *bucket, unsigned long now) { unsigned long time = nh_res_bucket_used_time(bucket); /* Bucket was not used since it was migrated. The idle time is now. */ if (time == bucket->migrated_time) return now; return time + res_table->idle_timer; } static unsigned long nh_res_table_unb_point(const struct nh_res_table *res_table) { return res_table->unbalanced_since + res_table->unbalanced_timer; } static void nh_res_bucket_set_idle(const struct nh_res_table *res_table, struct nh_res_bucket *bucket) { unsigned long now = jiffies; atomic_long_set(&bucket->used_time, (long)now); bucket->migrated_time = now; } static void nh_res_bucket_set_busy(struct nh_res_bucket *bucket) { atomic_long_set(&bucket->used_time, (long)jiffies); } static clock_t nh_res_bucket_idle_time(const struct nh_res_bucket *bucket) { unsigned long used_time = nh_res_bucket_used_time(bucket); return jiffies_delta_to_clock_t(jiffies - used_time); } static int nh_fill_res_bucket(struct sk_buff *skb, struct nexthop *nh, struct nh_res_bucket *bucket, u16 bucket_index, int event, u32 portid, u32 seq, unsigned int nlflags, struct netlink_ext_ack *extack) { struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry); struct nlmsghdr *nlh; struct nlattr *nest; struct nhmsg *nhm; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags); if (!nlh) return -EMSGSIZE; nhm = nlmsg_data(nlh); nhm->nh_family = AF_UNSPEC; nhm->nh_flags = bucket->nh_flags; nhm->nh_protocol = nh->protocol; nhm->nh_scope = 0; nhm->resvd = 0; if (nla_put_u32(skb, NHA_ID, nh->id)) goto nla_put_failure; nest = nla_nest_start(skb, NHA_RES_BUCKET); if (!nest) goto nla_put_failure; if (nla_put_u16(skb, NHA_RES_BUCKET_INDEX, bucket_index) || nla_put_u32(skb, NHA_RES_BUCKET_NH_ID, nhge->nh->id) || nla_put_u64_64bit(skb, NHA_RES_BUCKET_IDLE_TIME, nh_res_bucket_idle_time(bucket), NHA_RES_BUCKET_PAD)) goto nla_put_failure_nest; nla_nest_end(skb, nest); nlmsg_end(skb, nlh); return 0; nla_put_failure_nest: nla_nest_cancel(skb, nest); nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static void nexthop_bucket_notify(struct nh_res_table *res_table, u16 bucket_index) { struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index]; struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry); struct nexthop *nh = nhge->nh_parent; struct sk_buff *skb; int err = -ENOBUFS; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) goto errout; err = nh_fill_res_bucket(skb, nh, bucket, bucket_index, RTM_NEWNEXTHOPBUCKET, 0, 0, NLM_F_REPLACE, NULL); if (err < 0) { kfree_skb(skb); goto errout; } rtnl_notify(skb, nh->net, 0, RTNLGRP_NEXTHOP, NULL, GFP_KERNEL); return; errout: rtnl_set_sk_err(nh->net, RTNLGRP_NEXTHOP, err); } static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, bool *is_fdb, struct netlink_ext_ack *extack) { if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); /* Nesting groups within groups is not supported. */ if (nhg->hash_threshold) { NL_SET_ERR_MSG(extack, "Hash-threshold group can not be a nexthop within a group"); return false; } if (nhg->resilient) { NL_SET_ERR_MSG(extack, "Resilient group can not be a nexthop within a group"); return false; } *is_fdb = nhg->fdb_nh; } else { struct nh_info *nhi = rtnl_dereference(nh->nh_info); if (nhi->reject_nh && npaths > 1) { NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be used in a group with more than 1 path"); return false; } *is_fdb = nhi->fdb_nh; } return true; } static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family, struct netlink_ext_ack *extack) { struct nh_info *nhi; nhi = rtnl_dereference(nh->nh_info); if (!nhi->fdb_nh) { NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops"); return -EINVAL; } if (*nh_family == AF_UNSPEC) { *nh_family = nhi->family; } else if (*nh_family != nhi->family) { NL_SET_ERR_MSG(extack, "FDB nexthop group cannot have mixed family nexthops"); return -EINVAL; } return 0; } static int nh_check_attr_group(struct net *net, struct nlattr *tb[], size_t tb_size, u16 nh_grp_type, struct netlink_ext_ack *extack) { unsigned int len = nla_len(tb[NHA_GROUP]); u8 nh_family = AF_UNSPEC; struct nexthop_grp *nhg; unsigned int i, j; u8 nhg_fdb = 0; if (!len || len & (sizeof(struct nexthop_grp) - 1)) { NL_SET_ERR_MSG(extack, "Invalid length for nexthop group attribute"); return -EINVAL; } /* convert len to number of nexthop ids */ len /= sizeof(*nhg); nhg = nla_data(tb[NHA_GROUP]); for (i = 0; i < len; ++i) { if (nhg[i].resvd2) { NL_SET_ERR_MSG(extack, "Reserved field in nexthop_grp must be 0"); return -EINVAL; } if (nexthop_grp_weight(&nhg[i]) == 0) { /* 0xffff got passed in, representing weight of 0x10000, * which is too heavy. */ NL_SET_ERR_MSG(extack, "Invalid value for weight"); return -EINVAL; } for (j = i + 1; j < len; ++j) { if (nhg[i].id == nhg[j].id) { NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group"); return -EINVAL; } } } if (tb[NHA_FDB]) nhg_fdb = 1; nhg = nla_data(tb[NHA_GROUP]); for (i = 0; i < len; ++i) { struct nexthop *nh; bool is_fdb_nh; nh = nexthop_find_by_id(net, nhg[i].id); if (!nh) { NL_SET_ERR_MSG(extack, "Invalid nexthop id"); return -EINVAL; } if (!valid_group_nh(nh, len, &is_fdb_nh, extack)) return -EINVAL; if (nhg_fdb && nh_check_attr_fdb_group(nh, &nh_family, extack)) return -EINVAL; if (!nhg_fdb && is_fdb_nh) { NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops"); return -EINVAL; } } for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) { if (!tb[i]) continue; switch (i) { case NHA_HW_STATS_ENABLE: case NHA_FDB: continue; case NHA_RES_GROUP: if (nh_grp_type == NEXTHOP_GRP_TYPE_RES) continue; break; } NL_SET_ERR_MSG(extack, "No other attributes can be set in nexthop groups"); return -EINVAL; } return 0; } static bool ipv6_good_nh(const struct fib6_nh *nh) { int state = NUD_REACHABLE; struct neighbour *n; rcu_read_lock(); n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6); if (n) state = READ_ONCE(n->nud_state); rcu_read_unlock(); return !!(state & NUD_VALID); } static bool ipv4_good_nh(const struct fib_nh *nh) { int state = NUD_REACHABLE; struct neighbour *n; rcu_read_lock(); n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev, (__force u32)nh->fib_nh_gw4); if (n) state = READ_ONCE(n->nud_state); rcu_read_unlock(); return !!(state & NUD_VALID); } static bool nexthop_is_good_nh(const struct nexthop *nh) { struct nh_info *nhi = rcu_dereference(nh->nh_info); switch (nhi->family) { case AF_INET: return ipv4_good_nh(&nhi->fib_nh); case AF_INET6: return ipv6_good_nh(&nhi->fib6_nh); } return false; } static struct nexthop *nexthop_select_path_fdb(struct nh_group *nhg, int hash) { int i; for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; if (hash > atomic_read(&nhge->hthr.upper_bound)) continue; nh_grp_entry_stats_inc(nhge); return nhge->nh; } WARN_ON_ONCE(1); return NULL; } static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash) { struct nh_grp_entry *nhge0 = NULL; int i; if (nhg->fdb_nh) return nexthop_select_path_fdb(nhg, hash); for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; /* nexthops always check if it is good and does * not rely on a sysctl for this behavior */ if (!nexthop_is_good_nh(nhge->nh)) continue; if (!nhge0) nhge0 = nhge; if (hash > atomic_read(&nhge->hthr.upper_bound)) continue; nh_grp_entry_stats_inc(nhge); return nhge->nh; } if (!nhge0) nhge0 = &nhg->nh_entries[0]; nh_grp_entry_stats_inc(nhge0); return nhge0->nh; } static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash) { struct nh_res_table *res_table = rcu_dereference(nhg->res_table); u16 bucket_index = hash % res_table->num_nh_buckets; struct nh_res_bucket *bucket; struct nh_grp_entry *nhge; /* nexthop_select_path() is expected to return a non-NULL value, so * skip protocol validation and just hand out whatever there is. */ bucket = &res_table->nh_buckets[bucket_index]; nh_res_bucket_set_busy(bucket); nhge = rcu_dereference(bucket->nh_entry); nh_grp_entry_stats_inc(nhge); return nhge->nh; } struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) { struct nh_group *nhg; if (!nh->is_group) return nh; nhg = rcu_dereference(nh->nh_grp); if (nhg->hash_threshold) return nexthop_select_path_hthr(nhg, hash); else if (nhg->resilient) return nexthop_select_path_res(nhg, hash); /* Unreachable. */ return NULL; } EXPORT_SYMBOL_GPL(nexthop_select_path); int nexthop_for_each_fib6_nh(struct nexthop *nh, int (*cb)(struct fib6_nh *nh, void *arg), void *arg) { struct nh_info *nhi; int err; if (nh->is_group) { struct nh_group *nhg; int i; nhg = rcu_dereference_rtnl(nh->nh_grp); for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; nhi = rcu_dereference_rtnl(nhge->nh->nh_info); err = cb(&nhi->fib6_nh, arg); if (err) return err; } } else { nhi = rcu_dereference_rtnl(nh->nh_info); err = cb(&nhi->fib6_nh, arg); if (err) return err; } return 0; } EXPORT_SYMBOL_GPL(nexthop_for_each_fib6_nh); static int check_src_addr(const struct in6_addr *saddr, struct netlink_ext_ack *extack) { if (!ipv6_addr_any(saddr)) { NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects"); return -EINVAL; } return 0; } int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, struct netlink_ext_ack *extack) { struct nh_info *nhi; bool is_fdb_nh; /* fib6_src is unique to a fib6_info and limits the ability to cache * routes in fib6_nh within a nexthop that is potentially shared * across multiple fib entries. If the config wants to use source * routing it can not use nexthop objects. mlxsw also does not allow * fib6_src on routes. */ if (cfg && check_src_addr(&cfg->fc_src, extack) < 0) return -EINVAL; if (nh->is_group) { struct nh_group *nhg; nhg = rtnl_dereference(nh->nh_grp); if (nhg->has_v4) goto no_v4_nh; is_fdb_nh = nhg->fdb_nh; } else { nhi = rtnl_dereference(nh->nh_info); if (nhi->family == AF_INET) goto no_v4_nh; is_fdb_nh = nhi->fdb_nh; } if (is_fdb_nh) { NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); return -EINVAL; } return 0; no_v4_nh: NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop"); return -EINVAL; } EXPORT_SYMBOL_GPL(fib6_check_nexthop); /* if existing nexthop has ipv6 routes linked to it, need * to verify this new spec works with ipv6 */ static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) { struct fib6_info *f6i; if (list_empty(&old->f6i_list)) return 0; list_for_each_entry(f6i, &old->f6i_list, nh_list) { if (check_src_addr(&f6i->fib6_src.addr, extack) < 0) return -EINVAL; } return fib6_check_nexthop(new, NULL, extack); } static int nexthop_check_scope(struct nh_info *nhi, u8 scope, struct netlink_ext_ack *extack) { if (scope == RT_SCOPE_HOST && nhi->fib_nhc.nhc_gw_family) { NL_SET_ERR_MSG(extack, "Route with host scope can not have a gateway"); return -EINVAL; } if (nhi->fib_nhc.nhc_flags & RTNH_F_ONLINK && scope >= RT_SCOPE_LINK) { NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop"); return -EINVAL; } return 0; } /* Invoked by fib add code to verify nexthop by id is ok with * config for prefix; parts of fib_check_nh not done when nexthop * object is used. */ int fib_check_nexthop(struct nexthop *nh, u8 scope, struct netlink_ext_ack *extack) { struct nh_info *nhi; int err = 0; if (nh->is_group) { struct nh_group *nhg; nhg = rtnl_dereference(nh->nh_grp); if (nhg->fdb_nh) { NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); err = -EINVAL; goto out; } if (scope == RT_SCOPE_HOST) { NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops"); err = -EINVAL; goto out; } /* all nexthops in a group have the same scope */ nhi = rtnl_dereference(nhg->nh_entries[0].nh->nh_info); err = nexthop_check_scope(nhi, scope, extack); } else { nhi = rtnl_dereference(nh->nh_info); if (nhi->fdb_nh) { NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); err = -EINVAL; goto out; } err = nexthop_check_scope(nhi, scope, extack); } out: return err; } static int fib_check_nh_list(struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) { struct fib_info *fi; list_for_each_entry(fi, &old->fi_list, nh_list) { int err; err = fib_check_nexthop(new, fi->fib_scope, extack); if (err) return err; } return 0; } static bool nh_res_nhge_is_balanced(const struct nh_grp_entry *nhge) { return nhge->res.count_buckets == nhge->res.wants_buckets; } static bool nh_res_nhge_is_ow(const struct nh_grp_entry *nhge) { return nhge->res.count_buckets > nhge->res.wants_buckets; } static bool nh_res_nhge_is_uw(const struct nh_grp_entry *nhge) { return nhge->res.count_buckets < nhge->res.wants_buckets; } static bool nh_res_table_is_balanced(const struct nh_res_table *res_table) { return list_empty(&res_table->uw_nh_entries); } static void nh_res_bucket_unset_nh(struct nh_res_bucket *bucket) { struct nh_grp_entry *nhge; if (bucket->occupied) { nhge = nh_res_dereference(bucket->nh_entry); nhge->res.count_buckets--; bucket->occupied = false; } } static void nh_res_bucket_set_nh(struct nh_res_bucket *bucket, struct nh_grp_entry *nhge) { nh_res_bucket_unset_nh(bucket); bucket->occupied = true; rcu_assign_pointer(bucket->nh_entry, nhge); nhge->res.count_buckets++; } static bool nh_res_bucket_should_migrate(struct nh_res_table *res_table, struct nh_res_bucket *bucket, unsigned long *deadline, bool *force) { unsigned long now = jiffies; struct nh_grp_entry *nhge; unsigned long idle_point; if (!bucket->occupied) { /* The bucket is not occupied, its NHGE pointer is either * NULL or obsolete. We _have to_ migrate: set force. */ *force = true; return true; } nhge = nh_res_dereference(bucket->nh_entry); /* If the bucket is populated by an underweight or balanced * nexthop, do not migrate. */ if (!nh_res_nhge_is_ow(nhge)) return false; /* At this point we know that the bucket is populated with an * overweight nexthop. It needs to be migrated to a new nexthop if * the idle timer of unbalanced timer expired. */ idle_point = nh_res_bucket_idle_point(res_table, bucket, now); if (time_after_eq(now, idle_point)) { /* The bucket is idle. We _can_ migrate: unset force. */ *force = false; return true; } /* Unbalanced timer of 0 means "never force". */ if (res_table->unbalanced_timer) { unsigned long unb_point; unb_point = nh_res_table_unb_point(res_table); if (time_after(now, unb_point)) { /* The bucket is not idle, but the unbalanced timer * expired. We _can_ migrate, but set force anyway, * so that drivers know to ignore activity reports * from the HW. */ *force = true; return true; } nh_res_time_set_deadline(unb_point, deadline); } nh_res_time_set_deadline(idle_point, deadline); return false; } static bool nh_res_bucket_migrate(struct nh_res_table *res_table, u16 bucket_index, bool notify, bool notify_nl, bool force) { struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index]; struct nh_grp_entry *new_nhge; struct netlink_ext_ack extack; int err; new_nhge = list_first_entry_or_null(&res_table->uw_nh_entries, struct nh_grp_entry, res.uw_nh_entry); if (WARN_ON_ONCE(!new_nhge)) /* If this function is called, "bucket" is either not * occupied, or it belongs to a next hop that is * overweight. In either case, there ought to be a * corresponding underweight next hop. */ return false; if (notify) { struct nh_grp_entry *old_nhge; old_nhge = nh_res_dereference(bucket->nh_entry); err = call_nexthop_res_bucket_notifiers(res_table->net, res_table->nhg_id, bucket_index, force, old_nhge->nh, new_nhge->nh, &extack); if (err) { pr_err_ratelimited("%s\n", extack._msg); if (!force) return false; /* It is not possible to veto a forced replacement, so * just clear the hardware flags from the nexthop * bucket to indicate to user space that this bucket is * not correctly populated in hardware. */ bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); } } nh_res_bucket_set_nh(bucket, new_nhge); nh_res_bucket_set_idle(res_table, bucket); if (notify_nl) nexthop_bucket_notify(res_table, bucket_index); if (nh_res_nhge_is_balanced(new_nhge)) list_del(&new_nhge->res.uw_nh_entry); return true; } #define NH_RES_UPKEEP_DW_MINIMUM_INTERVAL (HZ / 2) static void nh_res_table_upkeep(struct nh_res_table *res_table, bool notify, bool notify_nl) { unsigned long now = jiffies; unsigned long deadline; u16 i; /* Deadline is the next time that upkeep should be run. It is the * earliest time at which one of the buckets might be migrated. * Start at the most pessimistic estimate: either unbalanced_timer * from now, or if there is none, idle_timer from now. For each * encountered time point, call nh_res_time_set_deadline() to * refine the estimate. */ if (res_table->unbalanced_timer) deadline = now + res_table->unbalanced_timer; else deadline = now + res_table->idle_timer; for (i = 0; i < res_table->num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; bool force; if (nh_res_bucket_should_migrate(res_table, bucket, &deadline, &force)) { if (!nh_res_bucket_migrate(res_table, i, notify, notify_nl, force)) { unsigned long idle_point; /* A driver can override the migration * decision if the HW reports that the * bucket is actually not idle. Therefore * remark the bucket as busy again and * update the deadline. */ nh_res_bucket_set_busy(bucket); idle_point = nh_res_bucket_idle_point(res_table, bucket, now); nh_res_time_set_deadline(idle_point, &deadline); } } } /* If the group is still unbalanced, schedule the next upkeep to * either the deadline computed above, or the minimum deadline, * whichever comes later. */ if (!nh_res_table_is_balanced(res_table)) { unsigned long now = jiffies; unsigned long min_deadline; min_deadline = now + NH_RES_UPKEEP_DW_MINIMUM_INTERVAL; if (time_before(deadline, min_deadline)) deadline = min_deadline; queue_delayed_work(system_power_efficient_wq, &res_table->upkeep_dw, deadline - now); } } static void nh_res_table_upkeep_dw(struct work_struct *work) { struct delayed_work *dw = to_delayed_work(work); struct nh_res_table *res_table; res_table = container_of(dw, struct nh_res_table, upkeep_dw); nh_res_table_upkeep(res_table, true, true); } static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table) { cancel_delayed_work_sync(&res_table->upkeep_dw); } static void nh_res_group_rebalance(struct nh_group *nhg, struct nh_res_table *res_table) { u16 prev_upper_bound = 0; u32 total = 0; u32 w = 0; int i; INIT_LIST_HEAD(&res_table->uw_nh_entries); for (i = 0; i < nhg->num_nh; ++i) total += nhg->nh_entries[i].weight; for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; u16 upper_bound; u64 btw; w += nhge->weight; btw = ((u64)res_table->num_nh_buckets) * w; upper_bound = DIV_ROUND_CLOSEST_ULL(btw, total); nhge->res.wants_buckets = upper_bound - prev_upper_bound; prev_upper_bound = upper_bound; if (nh_res_nhge_is_uw(nhge)) { if (list_empty(&res_table->uw_nh_entries)) res_table->unbalanced_since = jiffies; list_add(&nhge->res.uw_nh_entry, &res_table->uw_nh_entries); } } } /* Migrate buckets in res_table so that they reference NHGE's from NHG with * the right NH ID. Set those buckets that do not have a corresponding NHGE * entry in NHG as not occupied. */ static void nh_res_table_migrate_buckets(struct nh_res_table *res_table, struct nh_group *nhg) { u16 i; for (i = 0; i < res_table->num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; u32 id = rtnl_dereference(bucket->nh_entry)->nh->id; bool found = false; int j; for (j = 0; j < nhg->num_nh; j++) { struct nh_grp_entry *nhge = &nhg->nh_entries[j]; if (nhge->nh->id == id) { nh_res_bucket_set_nh(bucket, nhge); found = true; break; } } if (!found) nh_res_bucket_unset_nh(bucket); } } static void replace_nexthop_grp_res(struct nh_group *oldg, struct nh_group *newg) { /* For NH group replacement, the new NHG might only have a stub * hash table with 0 buckets, because the number of buckets was not * specified. For NH removal, oldg and newg both reference the same * res_table. So in any case, in the following, we want to work * with oldg->res_table. */ struct nh_res_table *old_res_table = rtnl_dereference(oldg->res_table); unsigned long prev_unbalanced_since = old_res_table->unbalanced_since; bool prev_has_uw = !list_empty(&old_res_table->uw_nh_entries); nh_res_table_cancel_upkeep(old_res_table); nh_res_table_migrate_buckets(old_res_table, newg); nh_res_group_rebalance(newg, old_res_table); if (prev_has_uw && !list_empty(&old_res_table->uw_nh_entries)) old_res_table->unbalanced_since = prev_unbalanced_since; nh_res_table_upkeep(old_res_table, true, false); } static void nh_hthr_group_rebalance(struct nh_group *nhg) { u32 total = 0; u32 w = 0; int i; for (i = 0; i < nhg->num_nh; ++i) total += nhg->nh_entries[i].weight; for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; u32 upper_bound; w += nhge->weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; atomic_set(&nhge->hthr.upper_bound, upper_bound); } } static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, struct nl_info *nlinfo) { struct nh_grp_entry *nhges, *new_nhges; struct nexthop *nhp = nhge->nh_parent; struct netlink_ext_ack extack; struct nexthop *nh = nhge->nh; struct nh_group *nhg, *newg; int i, j, err; WARN_ON(!nh); nhg = rtnl_dereference(nhp->nh_grp); newg = nhg->spare; /* last entry, keep it visible and remove the parent */ if (nhg->num_nh == 1) { remove_nexthop(net, nhp, nlinfo); return; } newg->has_v4 = false; newg->is_multipath = nhg->is_multipath; newg->hash_threshold = nhg->hash_threshold; newg->resilient = nhg->resilient; newg->fdb_nh = nhg->fdb_nh; newg->num_nh = nhg->num_nh; /* copy old entries to new except the one getting removed */ nhges = nhg->nh_entries; new_nhges = newg->nh_entries; for (i = 0, j = 0; i < nhg->num_nh; ++i) { struct nh_info *nhi; /* current nexthop getting removed */ if (nhg->nh_entries[i].nh == nh) { newg->num_nh--; continue; } nhi = rtnl_dereference(nhges[i].nh->nh_info); if (nhi->family == AF_INET) newg->has_v4 = true; list_del(&nhges[i].nh_list); new_nhges[j].stats = nhges[i].stats; new_nhges[j].nh_parent = nhges[i].nh_parent; new_nhges[j].nh = nhges[i].nh; new_nhges[j].weight = nhges[i].weight; list_add(&new_nhges[j].nh_list, &new_nhges[j].nh->grp_list); j++; } if (newg->hash_threshold) nh_hthr_group_rebalance(newg); else if (newg->resilient) replace_nexthop_grp_res(nhg, newg); rcu_assign_pointer(nhp->nh_grp, newg); list_del(&nhge->nh_list); free_percpu(nhge->stats); nexthop_put(nhge->nh); /* Removal of a NH from a resilient group is notified through * bucket notifications. */ if (newg->hash_threshold) { err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, &extack); if (err) pr_err("%s\n", extack._msg); } if (nlinfo) nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo); } static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { struct nh_grp_entry *nhge, *tmp; list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) remove_nh_grp_entry(net, nhge, nlinfo); /* make sure all see the newly published array before releasing rtnl */ synchronize_net(); } static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) { struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp); struct nh_res_table *res_table; int i, num_nh = nhg->num_nh; for (i = 0; i < num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; if (WARN_ON(!nhge->nh)) continue; list_del_init(&nhge->nh_list); } if (nhg->resilient) { res_table = rtnl_dereference(nhg->res_table); nh_res_table_cancel_upkeep(res_table); } } /* not called for nexthop replace */ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) { struct fib6_info *f6i, *tmp; bool do_flush = false; struct fib_info *fi; list_for_each_entry(fi, &nh->fi_list, nh_list) { fi->fib_flags |= RTNH_F_DEAD; do_flush = true; } if (do_flush) fib_flush(net); /* ip6_del_rt removes the entry from this list hence the _safe */ list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) { /* __ip6_del_rt does a release, so do a hold here */ fib6_info_hold(f6i); ipv6_stub->ip6_del_rt(net, f6i, !READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode)); } } static void __remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { __remove_nexthop_fib(net, nh); if (nh->is_group) { remove_nexthop_group(nh, nlinfo); } else { struct nh_info *nhi; nhi = rtnl_dereference(nh->nh_info); if (nhi->fib_nhc.nhc_dev) hlist_del(&nhi->dev_hash); remove_nexthop_from_groups(net, nh, nlinfo); } } static void remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh, NULL); /* remove from the tree */ rb_erase(&nh->rb_node, &net->nexthop.rb_root); if (nlinfo) nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo); __remove_nexthop(net, nh, nlinfo); nh_base_seq_inc(net); nexthop_put(nh); } /* if any FIB entries reference this nexthop, any dst entries * need to be regenerated */ static void nh_rt_cache_flush(struct net *net, struct nexthop *nh, struct nexthop *replaced_nh) { struct fib6_info *f6i; struct nh_group *nhg; int i; if (!list_empty(&nh->fi_list)) rt_cache_flush(net); list_for_each_entry(f6i, &nh->f6i_list, nh_list) ipv6_stub->fib6_update_sernum(net, f6i); /* if an IPv6 group was replaced, we have to release all old * dsts to make sure all refcounts are released */ if (!replaced_nh->is_group) return; nhg = rtnl_dereference(replaced_nh->nh_grp); for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; struct nh_info *nhi = rtnl_dereference(nhge->nh->nh_info); if (nhi->family == AF_INET6) ipv6_stub->fib6_nh_release_dsts(&nhi->fib6_nh); } } static int replace_nexthop_grp(struct net *net, struct nexthop *old, struct nexthop *new, const struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nh_res_table *tmp_table = NULL; struct nh_res_table *new_res_table; struct nh_res_table *old_res_table; struct nh_group *oldg, *newg; int i, err; if (!new->is_group) { NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop."); return -EINVAL; } oldg = rtnl_dereference(old->nh_grp); newg = rtnl_dereference(new->nh_grp); if (newg->hash_threshold != oldg->hash_threshold) { NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with one of a different type."); return -EINVAL; } if (newg->hash_threshold) { err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack); if (err) return err; } else if (newg->resilient) { new_res_table = rtnl_dereference(newg->res_table); old_res_table = rtnl_dereference(oldg->res_table); /* Accept if num_nh_buckets was not given, but if it was * given, demand that the value be correct. */ if (cfg->nh_grp_res_has_num_buckets && cfg->nh_grp_res_num_buckets != old_res_table->num_nh_buckets) { NL_SET_ERR_MSG(extack, "Can not change number of buckets of a resilient nexthop group."); return -EINVAL; } /* Emit a pre-replace notification so that listeners could veto * a potentially unsupported configuration. Otherwise, * individual bucket replacement notifications would need to be * vetoed, which is something that should only happen if the * bucket is currently active. */ err = call_nexthop_res_table_notifiers(net, new, extack); if (err) return err; if (cfg->nh_grp_res_has_idle_timer) old_res_table->idle_timer = cfg->nh_grp_res_idle_timer; if (cfg->nh_grp_res_has_unbalanced_timer) old_res_table->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer; replace_nexthop_grp_res(oldg, newg); tmp_table = new_res_table; rcu_assign_pointer(newg->res_table, old_res_table); rcu_assign_pointer(newg->spare->res_table, old_res_table); } /* update parents - used by nexthop code for cleanup */ for (i = 0; i < newg->num_nh; i++) newg->nh_entries[i].nh_parent = old; rcu_assign_pointer(old->nh_grp, newg); /* Make sure concurrent readers are not using 'oldg' anymore. */ synchronize_net(); if (newg->resilient) { rcu_assign_pointer(oldg->res_table, tmp_table); rcu_assign_pointer(oldg->spare->res_table, tmp_table); } for (i = 0; i < oldg->num_nh; i++) oldg->nh_entries[i].nh_parent = new; rcu_assign_pointer(new->nh_grp, oldg); return 0; } static void nh_group_v4_update(struct nh_group *nhg) { struct nh_grp_entry *nhges; bool has_v4 = false; int i; nhges = nhg->nh_entries; for (i = 0; i < nhg->num_nh; i++) { struct nh_info *nhi; nhi = rtnl_dereference(nhges[i].nh->nh_info); if (nhi->family == AF_INET) has_v4 = true; } nhg->has_v4 = has_v4; } static int replace_nexthop_single_notify_res(struct net *net, struct nh_res_table *res_table, struct nexthop *old, struct nh_info *oldi, struct nh_info *newi, struct netlink_ext_ack *extack) { u32 nhg_id = res_table->nhg_id; int err; u16 i; for (i = 0; i < res_table->num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; struct nh_grp_entry *nhge; nhge = rtnl_dereference(bucket->nh_entry); if (nhge->nh == old) { err = __call_nexthop_res_bucket_notifiers(net, nhg_id, i, true, oldi, newi, extack); if (err) goto err_notify; } } return 0; err_notify: while (i-- > 0) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; struct nh_grp_entry *nhge; nhge = rtnl_dereference(bucket->nh_entry); if (nhge->nh == old) __call_nexthop_res_bucket_notifiers(net, nhg_id, i, true, newi, oldi, extack); } return err; } static int replace_nexthop_single_notify(struct net *net, struct nexthop *group_nh, struct nexthop *old, struct nh_info *oldi, struct nh_info *newi, struct netlink_ext_ack *extack) { struct nh_group *nhg = rtnl_dereference(group_nh->nh_grp); struct nh_res_table *res_table; if (nhg->hash_threshold) { return call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, group_nh, extack); } else if (nhg->resilient) { res_table = rtnl_dereference(nhg->res_table); return replace_nexthop_single_notify_res(net, res_table, old, oldi, newi, extack); } return -EINVAL; } static int replace_nexthop_single(struct net *net, struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) { u8 old_protocol, old_nh_flags; struct nh_info *oldi, *newi; struct nh_grp_entry *nhge; int err; if (new->is_group) { NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group."); return -EINVAL; } err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack); if (err) return err; /* Hardware flags were set on 'old' as 'new' is not in the red-black * tree. Therefore, inherit the flags from 'old' to 'new'. */ new->nh_flags |= old->nh_flags & (RTNH_F_OFFLOAD | RTNH_F_TRAP); oldi = rtnl_dereference(old->nh_info); newi = rtnl_dereference(new->nh_info); newi->nh_parent = old; oldi->nh_parent = new; old_protocol = old->protocol; old_nh_flags = old->nh_flags; old->protocol = new->protocol; old->nh_flags = new->nh_flags; rcu_assign_pointer(old->nh_info, newi); rcu_assign_pointer(new->nh_info, oldi); /* Send a replace notification for all the groups using the nexthop. */ list_for_each_entry(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; err = replace_nexthop_single_notify(net, nhp, old, oldi, newi, extack); if (err) goto err_notify; } /* When replacing an IPv4 nexthop with an IPv6 nexthop, potentially * update IPv4 indication in all the groups using the nexthop. */ if (oldi->family == AF_INET && newi->family == AF_INET6) { list_for_each_entry(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; struct nh_group *nhg; nhg = rtnl_dereference(nhp->nh_grp); nh_group_v4_update(nhg); } } return 0; err_notify: rcu_assign_pointer(new->nh_info, newi); rcu_assign_pointer(old->nh_info, oldi); old->nh_flags = old_nh_flags; old->protocol = old_protocol; oldi->nh_parent = old; newi->nh_parent = new; list_for_each_entry_continue_reverse(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; replace_nexthop_single_notify(net, nhp, old, newi, oldi, NULL); } call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, old, extack); return err; } static void __nexthop_replace_notify(struct net *net, struct nexthop *nh, struct nl_info *info) { struct fib6_info *f6i; if (!list_empty(&nh->fi_list)) { struct fib_info *fi; /* expectation is a few fib_info per nexthop and then * a lot of routes per fib_info. So mark the fib_info * and then walk the fib tables once */ list_for_each_entry(fi, &nh->fi_list, nh_list) fi->nh_updated = true; fib_info_notify_update(net, info); list_for_each_entry(fi, &nh->fi_list, nh_list) fi->nh_updated = false; } list_for_each_entry(f6i, &nh->f6i_list, nh_list) ipv6_stub->fib6_rt_update(net, f6i, info); } /* send RTM_NEWROUTE with REPLACE flag set for all FIB entries * linked to this nexthop and for all groups that the nexthop * is a member of */ static void nexthop_replace_notify(struct net *net, struct nexthop *nh, struct nl_info *info) { struct nh_grp_entry *nhge; __nexthop_replace_notify(net, nh, info); list_for_each_entry(nhge, &nh->grp_list, nh_list) __nexthop_replace_notify(net, nhge->nh_parent, info); } static int replace_nexthop(struct net *net, struct nexthop *old, struct nexthop *new, const struct nh_config *cfg, struct netlink_ext_ack *extack) { bool new_is_reject = false; struct nh_grp_entry *nhge; int err; /* check that existing FIB entries are ok with the * new nexthop definition */ err = fib_check_nh_list(old, new, extack); if (err) return err; err = fib6_check_nh_list(old, new, extack); if (err) return err; if (!new->is_group) { struct nh_info *nhi = rtnl_dereference(new->nh_info); new_is_reject = nhi->reject_nh; } list_for_each_entry(nhge, &old->grp_list, nh_list) { /* if new nexthop is a blackhole, any groups using this * nexthop cannot have more than 1 path */ if (new_is_reject && nexthop_num_path(nhge->nh_parent) > 1) { NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be a member of a group with more than one path"); return -EINVAL; } err = fib_check_nh_list(nhge->nh_parent, new, extack); if (err) return err; err = fib6_check_nh_list(nhge->nh_parent, new, extack); if (err) return err; } if (old->is_group) err = replace_nexthop_grp(net, old, new, cfg, extack); else err = replace_nexthop_single(net, old, new, extack); if (!err) { nh_rt_cache_flush(net, old, new); __remove_nexthop(net, new, NULL); nexthop_put(new); } return err; } /* called with rtnl_lock held */ static int insert_nexthop(struct net *net, struct nexthop *new_nh, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct rb_node **pp, *parent = NULL, *next; struct rb_root *root = &net->nexthop.rb_root; bool replace = !!(cfg->nlflags & NLM_F_REPLACE); bool create = !!(cfg->nlflags & NLM_F_CREATE); u32 new_id = new_nh->id; int replace_notify = 0; int rc = -EEXIST; pp = &root->rb_node; while (1) { struct nexthop *nh; next = *pp; if (!next) break; parent = next; nh = rb_entry(parent, struct nexthop, rb_node); if (new_id < nh->id) { pp = &next->rb_left; } else if (new_id > nh->id) { pp = &next->rb_right; } else if (replace) { rc = replace_nexthop(net, nh, new_nh, cfg, extack); if (!rc) { new_nh = nh; /* send notification with old nh */ replace_notify = 1; } goto out; } else { /* id already exists and not a replace */ goto out; } } if (replace && !create) { NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists"); rc = -ENOENT; goto out; } if (new_nh->is_group) { struct nh_group *nhg = rtnl_dereference(new_nh->nh_grp); struct nh_res_table *res_table; if (nhg->resilient) { res_table = rtnl_dereference(nhg->res_table); /* Not passing the number of buckets is OK when * replacing, but not when creating a new group. */ if (!cfg->nh_grp_res_has_num_buckets) { NL_SET_ERR_MSG(extack, "Number of buckets not specified for nexthop group insertion"); rc = -EINVAL; goto out; } nh_res_group_rebalance(nhg, res_table); /* Do not send bucket notifications, we do full * notification below. */ nh_res_table_upkeep(res_table, false, false); } } rb_link_node_rcu(&new_nh->rb_node, parent, pp); rb_insert_color(&new_nh->rb_node, root); /* The initial insertion is a full notification for hash-threshold as * well as resilient groups. */ rc = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new_nh, extack); if (rc) rb_erase(&new_nh->rb_node, &net->nexthop.rb_root); out: if (!rc) { nh_base_seq_inc(net); nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo); if (replace_notify && READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode)) nexthop_replace_notify(net, new_nh, &cfg->nlinfo); } return rc; } /* rtnl */ /* remove all nexthops tied to a device being deleted */ static void nexthop_flush_dev(struct net_device *dev, unsigned long event) { unsigned int hash = nh_dev_hashfn(dev->ifindex); struct net *net = dev_net(dev); struct hlist_head *head = &net->nexthop.devhash[hash]; struct hlist_node *n; struct nh_info *nhi; hlist_for_each_entry_safe(nhi, n, head, dev_hash) { if (nhi->fib_nhc.nhc_dev != dev) continue; if (nhi->reject_nh && (event == NETDEV_DOWN || event == NETDEV_CHANGE)) continue; remove_nexthop(net, nhi->nh_parent, NULL); } } /* rtnl; called when net namespace is deleted */ static void flush_all_nexthops(struct net *net) { struct rb_root *root = &net->nexthop.rb_root; struct rb_node *node; struct nexthop *nh; while ((node = rb_first(root))) { nh = rb_entry(node, struct nexthop, rb_node); remove_nexthop(net, nh, NULL); cond_resched(); } } static struct nexthop *nexthop_create_group(struct net *net, struct nh_config *cfg) { struct nlattr *grps_attr = cfg->nh_grp; struct nexthop_grp *entry = nla_data(grps_attr); u16 num_nh = nla_len(grps_attr) / sizeof(*entry); struct nh_group *nhg; struct nexthop *nh; int err; int i; if (WARN_ON(!num_nh)) return ERR_PTR(-EINVAL); nh = nexthop_alloc(); if (!nh) return ERR_PTR(-ENOMEM); nh->is_group = 1; nhg = nexthop_grp_alloc(num_nh); if (!nhg) { kfree(nh); return ERR_PTR(-ENOMEM); } /* spare group used for removals */ nhg->spare = nexthop_grp_alloc(num_nh); if (!nhg->spare) { kfree(nhg); kfree(nh); return ERR_PTR(-ENOMEM); } nhg->spare->spare = nhg; for (i = 0; i < nhg->num_nh; ++i) { struct nexthop *nhe; struct nh_info *nhi; nhe = nexthop_find_by_id(net, entry[i].id); if (!nexthop_get(nhe)) { err = -ENOENT; goto out_no_nh; } nhi = rtnl_dereference(nhe->nh_info); if (nhi->family == AF_INET) nhg->has_v4 = true; nhg->nh_entries[i].stats = netdev_alloc_pcpu_stats(struct nh_grp_entry_stats); if (!nhg->nh_entries[i].stats) { err = -ENOMEM; nexthop_put(nhe); goto out_no_nh; } nhg->nh_entries[i].nh = nhe; nhg->nh_entries[i].weight = nexthop_grp_weight(&entry[i]); list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list); nhg->nh_entries[i].nh_parent = nh; } if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) { nhg->hash_threshold = 1; nhg->is_multipath = true; } else if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) { struct nh_res_table *res_table; res_table = nexthop_res_table_alloc(net, cfg->nh_id, cfg); if (!res_table) { err = -ENOMEM; goto out_no_nh; } rcu_assign_pointer(nhg->spare->res_table, res_table); rcu_assign_pointer(nhg->res_table, res_table); nhg->resilient = true; nhg->is_multipath = true; } WARN_ON_ONCE(nhg->hash_threshold + nhg->resilient != 1); if (nhg->hash_threshold) nh_hthr_group_rebalance(nhg); if (cfg->nh_fdb) nhg->fdb_nh = 1; if (cfg->nh_hw_stats) nhg->hw_stats = true; rcu_assign_pointer(nh->nh_grp, nhg); return nh; out_no_nh: for (i--; i >= 0; --i) { list_del(&nhg->nh_entries[i].nh_list); free_percpu(nhg->nh_entries[i].stats); nexthop_put(nhg->nh_entries[i].nh); } kfree(nhg->spare); kfree(nhg); kfree(nh); return ERR_PTR(err); } static int nh_create_ipv4(struct net *net, struct nexthop *nh, struct nh_info *nhi, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct fib_nh *fib_nh = &nhi->fib_nh; struct fib_config fib_cfg = { .fc_oif = cfg->nh_ifindex, .fc_gw4 = cfg->gw.ipv4, .fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0, .fc_flags = cfg->nh_flags, .fc_nlinfo = cfg->nlinfo, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, }; u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN); int err; err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack); if (err) { fib_nh_release(net, fib_nh); goto out; } if (nhi->fdb_nh) goto out; /* sets nh_dev if successful */ err = fib_check_nh(net, fib_nh, tb_id, 0, extack); if (!err) { nh->nh_flags = fib_nh->fib_nh_flags; fib_info_update_nhc_saddr(net, &fib_nh->nh_common, !fib_nh->fib_nh_scope ? 0 : fib_nh->fib_nh_scope - 1); } else { fib_nh_release(net, fib_nh); } out: return err; } static int nh_create_ipv6(struct net *net, struct nexthop *nh, struct nh_info *nhi, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct fib6_nh *fib6_nh = &nhi->fib6_nh; struct fib6_config fib6_cfg = { .fc_table = l3mdev_fib_table(cfg->dev), .fc_ifindex = cfg->nh_ifindex, .fc_gateway = cfg->gw.ipv6, .fc_flags = cfg->nh_flags, .fc_nlinfo = cfg->nlinfo, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, .fc_is_fdb = cfg->nh_fdb, }; int err; if (!ipv6_addr_any(&cfg->gw.ipv6)) fib6_cfg.fc_flags |= RTF_GATEWAY; /* sets nh_dev if successful */ err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL, extack); if (err) { /* IPv6 is not enabled, don't call fib6_nh_release */ if (err == -EAFNOSUPPORT) goto out; ipv6_stub->fib6_nh_release(fib6_nh); } else { nh->nh_flags = fib6_nh->fib_nh_flags; } out: return err; } static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nh_info *nhi; struct nexthop *nh; int err = 0; nh = nexthop_alloc(); if (!nh) return ERR_PTR(-ENOMEM); nhi = kzalloc(sizeof(*nhi), GFP_KERNEL); if (!nhi) { kfree(nh); return ERR_PTR(-ENOMEM); } nh->nh_flags = cfg->nh_flags; nh->net = net; nhi->nh_parent = nh; nhi->family = cfg->nh_family; nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK; if (cfg->nh_fdb) nhi->fdb_nh = 1; if (cfg->nh_blackhole) { nhi->reject_nh = 1; cfg->nh_ifindex = net->loopback_dev->ifindex; } switch (cfg->nh_family) { case AF_INET: err = nh_create_ipv4(net, nh, nhi, cfg, extack); break; case AF_INET6: err = nh_create_ipv6(net, nh, nhi, cfg, extack); break; } if (err) { kfree(nhi); kfree(nh); return ERR_PTR(err); } /* add the entry to the device based hash */ if (!nhi->fdb_nh) nexthop_devhash_add(net, nhi); rcu_assign_pointer(nh->nh_info, nhi); return nh; } /* called with rtnl lock held */ static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nexthop *nh; int err; if (cfg->nlflags & NLM_F_REPLACE && !cfg->nh_id) { NL_SET_ERR_MSG(extack, "Replace requires nexthop id"); return ERR_PTR(-EINVAL); } if (!cfg->nh_id) { cfg->nh_id = nh_find_unused_id(net); if (!cfg->nh_id) { NL_SET_ERR_MSG(extack, "No unused id"); return ERR_PTR(-EINVAL); } } if (cfg->nh_grp) nh = nexthop_create_group(net, cfg); else nh = nexthop_create(net, cfg, extack); if (IS_ERR(nh)) return nh; refcount_set(&nh->refcnt, 1); nh->id = cfg->nh_id; nh->protocol = cfg->nh_protocol; nh->net = net; err = insert_nexthop(net, nh, cfg, extack); if (err) { __remove_nexthop(net, nh, NULL); nexthop_put(nh); nh = ERR_PTR(err); } return nh; } static int rtm_nh_get_timer(struct nlattr *attr, unsigned long fallback, unsigned long *timer_p, bool *has_p, struct netlink_ext_ack *extack) { unsigned long timer; u32 value; if (!attr) { *timer_p = fallback; *has_p = false; return 0; } value = nla_get_u32(attr); timer = clock_t_to_jiffies(value); if (timer == ~0UL) { NL_SET_ERR_MSG(extack, "Timer value too large"); return -EINVAL; } *timer_p = timer; *has_p = true; return 0; } static int rtm_to_nh_config_grp_res(struct nlattr *res, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_policy_new)] = {}; int err; if (res) { err = nla_parse_nested(tb, ARRAY_SIZE(rtm_nh_res_policy_new) - 1, res, rtm_nh_res_policy_new, extack); if (err < 0) return err; } if (tb[NHA_RES_GROUP_BUCKETS]) { cfg->nh_grp_res_num_buckets = nla_get_u16(tb[NHA_RES_GROUP_BUCKETS]); cfg->nh_grp_res_has_num_buckets = true; if (!cfg->nh_grp_res_num_buckets) { NL_SET_ERR_MSG(extack, "Number of buckets needs to be non-0"); return -EINVAL; } } err = rtm_nh_get_timer(tb[NHA_RES_GROUP_IDLE_TIMER], NH_RES_DEFAULT_IDLE_TIMER, &cfg->nh_grp_res_idle_timer, &cfg->nh_grp_res_has_idle_timer, extack); if (err) return err; return rtm_nh_get_timer(tb[NHA_RES_GROUP_UNBALANCED_TIMER], NH_RES_DEFAULT_UNBALANCED_TIMER, &cfg->nh_grp_res_unbalanced_timer, &cfg->nh_grp_res_has_unbalanced_timer, extack); } static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nhmsg *nhm = nlmsg_data(nlh); struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_new)]; int err; err = nlmsg_parse(nlh, sizeof(*nhm), tb, ARRAY_SIZE(rtm_nh_policy_new) - 1, rtm_nh_policy_new, extack); if (err < 0) return err; err = -EINVAL; if (nhm->resvd || nhm->nh_scope) { NL_SET_ERR_MSG(extack, "Invalid values in ancillary header"); goto out; } if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) { NL_SET_ERR_MSG(extack, "Invalid nexthop flags in ancillary header"); goto out; } switch (nhm->nh_family) { case AF_INET: case AF_INET6: break; case AF_UNSPEC: if (tb[NHA_GROUP]) break; fallthrough; default: NL_SET_ERR_MSG(extack, "Invalid address family"); goto out; } memset(cfg, 0, sizeof(*cfg)); cfg->nlflags = nlh->nlmsg_flags; cfg->nlinfo.portid = NETLINK_CB(skb).portid; cfg->nlinfo.nlh = nlh; cfg->nlinfo.nl_net = net; cfg->nh_family = nhm->nh_family; cfg->nh_protocol = nhm->nh_protocol; cfg->nh_flags = nhm->nh_flags; if (tb[NHA_ID]) cfg->nh_id = nla_get_u32(tb[NHA_ID]); if (tb[NHA_FDB]) { if (tb[NHA_OIF] || tb[NHA_BLACKHOLE] || tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) { NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap, oif or blackhole"); goto out; } if (nhm->nh_flags) { NL_SET_ERR_MSG(extack, "Unsupported nexthop flags in ancillary header"); goto out; } cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]); } if (tb[NHA_GROUP]) { if (nhm->nh_family != AF_UNSPEC) { NL_SET_ERR_MSG(extack, "Invalid family for group"); goto out; } cfg->nh_grp = tb[NHA_GROUP]; cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH; if (tb[NHA_GROUP_TYPE]) cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]); if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) { NL_SET_ERR_MSG(extack, "Invalid group type"); goto out; } err = nh_check_attr_group(net, tb, ARRAY_SIZE(tb), cfg->nh_grp_type, extack); if (err) goto out; if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) err = rtm_to_nh_config_grp_res(tb[NHA_RES_GROUP], cfg, extack); if (tb[NHA_HW_STATS_ENABLE]) cfg->nh_hw_stats = nla_get_u32(tb[NHA_HW_STATS_ENABLE]); /* no other attributes should be set */ goto out; } if (tb[NHA_BLACKHOLE]) { if (tb[NHA_GATEWAY] || tb[NHA_OIF] || tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE] || tb[NHA_FDB]) { NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway, oif, encap or fdb"); goto out; } cfg->nh_blackhole = 1; err = 0; goto out; } if (!cfg->nh_fdb && !tb[NHA_OIF]) { NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops"); goto out; } if (!cfg->nh_fdb && tb[NHA_OIF]) { cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]); if (cfg->nh_ifindex) cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex); if (!cfg->dev) { NL_SET_ERR_MSG(extack, "Invalid device index"); goto out; } else if (!(cfg->dev->flags & IFF_UP)) { NL_SET_ERR_MSG(extack, "Nexthop device is not up"); err = -ENETDOWN; goto out; } else if (!netif_carrier_ok(cfg->dev)) { NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down"); err = -ENETDOWN; goto out; } } err = -EINVAL; if (tb[NHA_GATEWAY]) { struct nlattr *gwa = tb[NHA_GATEWAY]; switch (cfg->nh_family) { case AF_INET: if (nla_len(gwa) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid gateway"); goto out; } cfg->gw.ipv4 = nla_get_be32(gwa); break; case AF_INET6: if (nla_len(gwa) != sizeof(struct in6_addr)) { NL_SET_ERR_MSG(extack, "Invalid gateway"); goto out; } cfg->gw.ipv6 = nla_get_in6_addr(gwa); break; default: NL_SET_ERR_MSG(extack, "Unknown address family for gateway"); goto out; } } else { /* device only nexthop (no gateway) */ if (cfg->nh_flags & RTNH_F_ONLINK) { NL_SET_ERR_MSG(extack, "ONLINK flag can not be set for nexthop without a gateway"); goto out; } } if (tb[NHA_ENCAP]) { cfg->nh_encap = tb[NHA_ENCAP]; if (!tb[NHA_ENCAP_TYPE]) { NL_SET_ERR_MSG(extack, "LWT encapsulation type is missing"); goto out; } cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]); err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack); if (err < 0) goto out; } else if (tb[NHA_ENCAP_TYPE]) { NL_SET_ERR_MSG(extack, "LWT encapsulation attribute is missing"); goto out; } if (tb[NHA_HW_STATS_ENABLE]) { NL_SET_ERR_MSG(extack, "Cannot enable nexthop hardware statistics for non-group nexthops"); goto out; } err = 0; out: return err; } /* rtnl */ static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nh_config cfg; struct nexthop *nh; int err; err = rtm_to_nh_config(net, skb, nlh, &cfg, extack); if (!err) { nh = nexthop_add(net, &cfg, extack); if (IS_ERR(nh)) err = PTR_ERR(nh); } return err; } static int nh_valid_get_del_req(const struct nlmsghdr *nlh, struct nlattr **tb, u32 *id, u32 *op_flags, struct netlink_ext_ack *extack) { struct nhmsg *nhm = nlmsg_data(nlh); if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) { NL_SET_ERR_MSG(extack, "Invalid values in header"); return -EINVAL; } if (!tb[NHA_ID]) { NL_SET_ERR_MSG(extack, "Nexthop id is missing"); return -EINVAL; } *id = nla_get_u32(tb[NHA_ID]); if (!(*id)) { NL_SET_ERR_MSG(extack, "Invalid nexthop id"); return -EINVAL; } if (op_flags) *op_flags = nla_get_u32_default(tb[NHA_OP_FLAGS], 0); return 0; } /* rtnl */ static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_del)]; struct net *net = sock_net(skb->sk); struct nl_info nlinfo = { .nlh = nlh, .nl_net = net, .portid = NETLINK_CB(skb).portid, }; struct nexthop *nh; int err; u32 id; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_del) - 1, rtm_nh_policy_del, extack); if (err < 0) return err; err = nh_valid_get_del_req(nlh, tb, &id, NULL, extack); if (err) return err; nh = nexthop_find_by_id(net, id); if (!nh) return -ENOENT; remove_nexthop(net, nh, &nlinfo); return 0; } /* rtnl */ static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get)]; struct net *net = sock_net(in_skb->sk); struct sk_buff *skb = NULL; struct nexthop *nh; u32 op_flags; int err; u32 id; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_get) - 1, rtm_nh_policy_get, extack); if (err < 0) return err; err = nh_valid_get_del_req(nlh, tb, &id, &op_flags, extack); if (err) return err; err = -ENOBUFS; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) goto out; err = -ENOENT; nh = nexthop_find_by_id(net, id); if (!nh) goto errout_free; err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0, op_flags); if (err < 0) { WARN_ON(err == -EMSGSIZE); goto errout_free; } err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); out: return err; errout_free: kfree_skb(skb); goto out; } struct nh_dump_filter { u32 nh_id; int dev_idx; int master_idx; bool group_filter; bool fdb_filter; u32 res_bucket_nh_id; u32 op_flags; }; static bool nh_dump_filtered(struct nexthop *nh, struct nh_dump_filter *filter, u8 family) { const struct net_device *dev; const struct nh_info *nhi; if (filter->group_filter && !nh->is_group) return true; if (!filter->dev_idx && !filter->master_idx && !family) return false; if (nh->is_group) return true; nhi = rtnl_dereference(nh->nh_info); if (family && nhi->family != family) return true; dev = nhi->fib_nhc.nhc_dev; if (filter->dev_idx && (!dev || dev->ifindex != filter->dev_idx)) return true; if (filter->master_idx) { struct net_device *master; if (!dev) return true; master = netdev_master_upper_dev_get((struct net_device *)dev); if (!master || master->ifindex != filter->master_idx) return true; } return false; } static int __nh_valid_dump_req(const struct nlmsghdr *nlh, struct nlattr **tb, struct nh_dump_filter *filter, struct netlink_ext_ack *extack) { struct nhmsg *nhm; u32 idx; if (tb[NHA_OIF]) { idx = nla_get_u32(tb[NHA_OIF]); if (idx > INT_MAX) { NL_SET_ERR_MSG(extack, "Invalid device index"); return -EINVAL; } filter->dev_idx = idx; } if (tb[NHA_MASTER]) { idx = nla_get_u32(tb[NHA_MASTER]); if (idx > INT_MAX) { NL_SET_ERR_MSG(extack, "Invalid master device index"); return -EINVAL; } filter->master_idx = idx; } filter->group_filter = nla_get_flag(tb[NHA_GROUPS]); filter->fdb_filter = nla_get_flag(tb[NHA_FDB]); nhm = nlmsg_data(nlh); if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) { NL_SET_ERR_MSG(extack, "Invalid values in header for nexthop dump request"); return -EINVAL; } return 0; } static int nh_valid_dump_req(const struct nlmsghdr *nlh, struct nh_dump_filter *filter, struct netlink_callback *cb) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump)]; int err; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_dump) - 1, rtm_nh_policy_dump, cb->extack); if (err < 0) return err; filter->op_flags = nla_get_u32_default(tb[NHA_OP_FLAGS], 0); return __nh_valid_dump_req(nlh, tb, filter, cb->extack); } struct rtm_dump_nh_ctx { u32 idx; }; static struct rtm_dump_nh_ctx * rtm_dump_nh_ctx(struct netlink_callback *cb) { struct rtm_dump_nh_ctx *ctx = (void *)cb->ctx; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); return ctx; } static int rtm_dump_walk_nexthops(struct sk_buff *skb, struct netlink_callback *cb, struct rb_root *root, struct rtm_dump_nh_ctx *ctx, int (*nh_cb)(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, void *data), void *data) { struct rb_node *node; int s_idx; int err; s_idx = ctx->idx; for (node = rb_first(root); node; node = rb_next(node)) { struct nexthop *nh; nh = rb_entry(node, struct nexthop, rb_node); if (nh->id < s_idx) continue; ctx->idx = nh->id; err = nh_cb(skb, cb, nh, data); if (err) return err; } return 0; } static int rtm_dump_nexthop_cb(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, void *data) { struct nhmsg *nhm = nlmsg_data(cb->nlh); struct nh_dump_filter *filter = data; if (nh_dump_filtered(nh, filter, nhm->nh_family)) return 0; return nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, filter->op_flags); } /* rtnl */ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) { struct rtm_dump_nh_ctx *ctx = rtm_dump_nh_ctx(cb); struct net *net = sock_net(skb->sk); struct rb_root *root = &net->nexthop.rb_root; struct nh_dump_filter filter = {}; int err; err = nh_valid_dump_req(cb->nlh, &filter, cb); if (err < 0) return err; err = rtm_dump_walk_nexthops(skb, cb, root, ctx, &rtm_dump_nexthop_cb, &filter); cb->seq = net->nexthop.seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); return err; } static struct nexthop * nexthop_find_group_resilient(struct net *net, u32 id, struct netlink_ext_ack *extack) { struct nh_group *nhg; struct nexthop *nh; nh = nexthop_find_by_id(net, id); if (!nh) return ERR_PTR(-ENOENT); if (!nh->is_group) { NL_SET_ERR_MSG(extack, "Not a nexthop group"); return ERR_PTR(-EINVAL); } nhg = rtnl_dereference(nh->nh_grp); if (!nhg->resilient) { NL_SET_ERR_MSG(extack, "Nexthop group not of type resilient"); return ERR_PTR(-EINVAL); } return nh; } static int nh_valid_dump_nhid(struct nlattr *attr, u32 *nh_id_p, struct netlink_ext_ack *extack) { u32 idx; if (attr) { idx = nla_get_u32(attr); if (!idx) { NL_SET_ERR_MSG(extack, "Invalid nexthop id"); return -EINVAL; } *nh_id_p = idx; } else { *nh_id_p = 0; } return 0; } static int nh_valid_dump_bucket_req(const struct nlmsghdr *nlh, struct nh_dump_filter *filter, struct netlink_callback *cb) { struct nlattr *res_tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_dump)]; struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump_bucket)]; int err; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_dump_bucket) - 1, rtm_nh_policy_dump_bucket, NULL); if (err < 0) return err; err = nh_valid_dump_nhid(tb[NHA_ID], &filter->nh_id, cb->extack); if (err) return err; if (tb[NHA_RES_BUCKET]) { size_t max = ARRAY_SIZE(rtm_nh_res_bucket_policy_dump) - 1; err = nla_parse_nested(res_tb, max, tb[NHA_RES_BUCKET], rtm_nh_res_bucket_policy_dump, cb->extack); if (err < 0) return err; err = nh_valid_dump_nhid(res_tb[NHA_RES_BUCKET_NH_ID], &filter->res_bucket_nh_id, cb->extack); if (err) return err; } return __nh_valid_dump_req(nlh, tb, filter, cb->extack); } struct rtm_dump_res_bucket_ctx { struct rtm_dump_nh_ctx nh; u16 bucket_index; }; static struct rtm_dump_res_bucket_ctx * rtm_dump_res_bucket_ctx(struct netlink_callback *cb) { struct rtm_dump_res_bucket_ctx *ctx = (void *)cb->ctx; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); return ctx; } struct rtm_dump_nexthop_bucket_data { struct rtm_dump_res_bucket_ctx *ctx; struct nh_dump_filter filter; }; static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, struct rtm_dump_nexthop_bucket_data *dd) { u32 portid = NETLINK_CB(cb->skb).portid; struct nhmsg *nhm = nlmsg_data(cb->nlh); struct nh_res_table *res_table; struct nh_group *nhg; u16 bucket_index; int err; nhg = rtnl_dereference(nh->nh_grp); res_table = rtnl_dereference(nhg->res_table); for (bucket_index = dd->ctx->bucket_index; bucket_index < res_table->num_nh_buckets; bucket_index++) { struct nh_res_bucket *bucket; struct nh_grp_entry *nhge; bucket = &res_table->nh_buckets[bucket_index]; nhge = rtnl_dereference(bucket->nh_entry); if (nh_dump_filtered(nhge->nh, &dd->filter, nhm->nh_family)) continue; if (dd->filter.res_bucket_nh_id && dd->filter.res_bucket_nh_id != nhge->nh->id) continue; dd->ctx->bucket_index = bucket_index; err = nh_fill_res_bucket(skb, nh, bucket, bucket_index, RTM_NEWNEXTHOPBUCKET, portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->extack); if (err) return err; } dd->ctx->bucket_index = 0; return 0; } static int rtm_dump_nexthop_bucket_cb(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, void *data) { struct rtm_dump_nexthop_bucket_data *dd = data; struct nh_group *nhg; if (!nh->is_group) return 0; nhg = rtnl_dereference(nh->nh_grp); if (!nhg->resilient) return 0; return rtm_dump_nexthop_bucket_nh(skb, cb, nh, dd); } /* rtnl */ static int rtm_dump_nexthop_bucket(struct sk_buff *skb, struct netlink_callback *cb) { struct rtm_dump_res_bucket_ctx *ctx = rtm_dump_res_bucket_ctx(cb); struct rtm_dump_nexthop_bucket_data dd = { .ctx = ctx }; struct net *net = sock_net(skb->sk); struct nexthop *nh; int err; err = nh_valid_dump_bucket_req(cb->nlh, &dd.filter, cb); if (err) return err; if (dd.filter.nh_id) { nh = nexthop_find_group_resilient(net, dd.filter.nh_id, cb->extack); if (IS_ERR(nh)) return PTR_ERR(nh); err = rtm_dump_nexthop_bucket_nh(skb, cb, nh, &dd); } else { struct rb_root *root = &net->nexthop.rb_root; err = rtm_dump_walk_nexthops(skb, cb, root, &ctx->nh, &rtm_dump_nexthop_bucket_cb, &dd); } cb->seq = net->nexthop.seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); return err; } static int nh_valid_get_bucket_req_res_bucket(struct nlattr *res, u16 *bucket_index, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_get)]; int err; err = nla_parse_nested(tb, ARRAY_SIZE(rtm_nh_res_bucket_policy_get) - 1, res, rtm_nh_res_bucket_policy_get, extack); if (err < 0) return err; if (!tb[NHA_RES_BUCKET_INDEX]) { NL_SET_ERR_MSG(extack, "Bucket index is missing"); return -EINVAL; } *bucket_index = nla_get_u16(tb[NHA_RES_BUCKET_INDEX]); return 0; } static int nh_valid_get_bucket_req(const struct nlmsghdr *nlh, u32 *id, u16 *bucket_index, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get_bucket)]; int err; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_get_bucket) - 1, rtm_nh_policy_get_bucket, extack); if (err < 0) return err; err = nh_valid_get_del_req(nlh, tb, id, NULL, extack); if (err) return err; if (!tb[NHA_RES_BUCKET]) { NL_SET_ERR_MSG(extack, "Bucket information is missing"); return -EINVAL; } err = nh_valid_get_bucket_req_res_bucket(tb[NHA_RES_BUCKET], bucket_index, extack); if (err) return err; return 0; } /* rtnl */ static int rtm_get_nexthop_bucket(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nh_res_table *res_table; struct sk_buff *skb = NULL; struct nh_group *nhg; struct nexthop *nh; u16 bucket_index; int err; u32 id; err = nh_valid_get_bucket_req(nlh, &id, &bucket_index, extack); if (err) return err; nh = nexthop_find_group_resilient(net, id, extack); if (IS_ERR(nh)) return PTR_ERR(nh); nhg = rtnl_dereference(nh->nh_grp); res_table = rtnl_dereference(nhg->res_table); if (bucket_index >= res_table->num_nh_buckets) { NL_SET_ERR_MSG(extack, "Bucket index out of bounds"); return -ENOENT; } skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; err = nh_fill_res_bucket(skb, nh, &res_table->nh_buckets[bucket_index], bucket_index, RTM_NEWNEXTHOPBUCKET, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0, extack); if (err < 0) { WARN_ON(err == -EMSGSIZE); goto errout_free; } return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout_free: kfree_skb(skb); return err; } static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu) { unsigned int hash = nh_dev_hashfn(dev->ifindex); struct net *net = dev_net(dev); struct hlist_head *head = &net->nexthop.devhash[hash]; struct hlist_node *n; struct nh_info *nhi; hlist_for_each_entry_safe(nhi, n, head, dev_hash) { if (nhi->fib_nhc.nhc_dev == dev) { if (nhi->family == AF_INET) fib_nhc_update_mtu(&nhi->fib_nhc, dev->mtu, orig_mtu); } } } /* rtnl */ static int nh_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_info_ext *info_ext; switch (event) { case NETDEV_DOWN: case NETDEV_UNREGISTER: nexthop_flush_dev(dev, event); break; case NETDEV_CHANGE: if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP))) nexthop_flush_dev(dev, event); break; case NETDEV_CHANGEMTU: info_ext = ptr; nexthop_sync_mtu(dev, info_ext->ext.mtu); rt_cache_flush(dev_net(dev)); break; } return NOTIFY_DONE; } static struct notifier_block nh_netdev_notifier = { .notifier_call = nh_netdev_event, }; static int nexthops_dump(struct net *net, struct notifier_block *nb, enum nexthop_event_type event_type, struct netlink_ext_ack *extack) { struct rb_root *root = &net->nexthop.rb_root; struct rb_node *node; int err = 0; for (node = rb_first(root); node; node = rb_next(node)) { struct nexthop *nh; nh = rb_entry(node, struct nexthop, rb_node); err = call_nexthop_notifier(nb, net, event_type, nh, extack); if (err) break; } return err; } int register_nexthop_notifier(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { int err; rtnl_lock(); err = nexthops_dump(net, nb, NEXTHOP_EVENT_REPLACE, extack); if (err) goto unlock; err = blocking_notifier_chain_register(&net->nexthop.notifier_chain, nb); unlock: rtnl_unlock(); return err; } EXPORT_SYMBOL(register_nexthop_notifier); int __unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) { int err; err = blocking_notifier_chain_unregister(&net->nexthop.notifier_chain, nb); if (!err) nexthops_dump(net, nb, NEXTHOP_EVENT_DEL, NULL); return err; } EXPORT_SYMBOL(__unregister_nexthop_notifier); int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) { int err; rtnl_lock(); err = __unregister_nexthop_notifier(net, nb); rtnl_unlock(); return err; } EXPORT_SYMBOL(unregister_nexthop_notifier); void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap) { struct nexthop *nexthop; rcu_read_lock(); nexthop = nexthop_find_by_id(net, id); if (!nexthop) goto out; nexthop->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); if (offload) nexthop->nh_flags |= RTNH_F_OFFLOAD; if (trap) nexthop->nh_flags |= RTNH_F_TRAP; out: rcu_read_unlock(); } EXPORT_SYMBOL(nexthop_set_hw_flags); void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index, bool offload, bool trap) { struct nh_res_table *res_table; struct nh_res_bucket *bucket; struct nexthop *nexthop; struct nh_group *nhg; rcu_read_lock(); nexthop = nexthop_find_by_id(net, id); if (!nexthop || !nexthop->is_group) goto out; nhg = rcu_dereference(nexthop->nh_grp); if (!nhg->resilient) goto out; if (bucket_index >= nhg->res_table->num_nh_buckets) goto out; res_table = rcu_dereference(nhg->res_table); bucket = &res_table->nh_buckets[bucket_index]; bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); if (offload) bucket->nh_flags |= RTNH_F_OFFLOAD; if (trap) bucket->nh_flags |= RTNH_F_TRAP; out: rcu_read_unlock(); } EXPORT_SYMBOL(nexthop_bucket_set_hw_flags); void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets, unsigned long *activity) { struct nh_res_table *res_table; struct nexthop *nexthop; struct nh_group *nhg; u16 i; rcu_read_lock(); nexthop = nexthop_find_by_id(net, id); if (!nexthop || !nexthop->is_group) goto out; nhg = rcu_dereference(nexthop->nh_grp); if (!nhg->resilient) goto out; /* Instead of silently ignoring some buckets, demand that the sizes * be the same. */ res_table = rcu_dereference(nhg->res_table); if (num_buckets != res_table->num_nh_buckets) goto out; for (i = 0; i < num_buckets; i++) { if (test_bit(i, activity)) nh_res_bucket_set_busy(&res_table->nh_buckets[i]); } out: rcu_read_unlock(); } EXPORT_SYMBOL(nexthop_res_grp_activity_update); static void __net_exit nexthop_net_exit_batch_rtnl(struct list_head *net_list, struct list_head *dev_to_kill) { struct net *net; ASSERT_RTNL(); list_for_each_entry(net, net_list, exit_list) flush_all_nexthops(net); } static void __net_exit nexthop_net_exit(struct net *net) { kfree(net->nexthop.devhash); net->nexthop.devhash = NULL; } static int __net_init nexthop_net_init(struct net *net) { size_t sz = sizeof(struct hlist_head) * NH_DEV_HASHSIZE; net->nexthop.rb_root = RB_ROOT; net->nexthop.devhash = kzalloc(sz, GFP_KERNEL); if (!net->nexthop.devhash) return -ENOMEM; BLOCKING_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain); return 0; } static struct pernet_operations nexthop_net_ops = { .init = nexthop_net_init, .exit = nexthop_net_exit, .exit_batch_rtnl = nexthop_net_exit_batch_rtnl, }; static const struct rtnl_msg_handler nexthop_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_NEWNEXTHOP, .doit = rtm_new_nexthop}, {.msgtype = RTM_DELNEXTHOP, .doit = rtm_del_nexthop}, {.msgtype = RTM_GETNEXTHOP, .doit = rtm_get_nexthop, .dumpit = rtm_dump_nexthop}, {.msgtype = RTM_GETNEXTHOPBUCKET, .doit = rtm_get_nexthop_bucket, .dumpit = rtm_dump_nexthop_bucket}, {.protocol = PF_INET, .msgtype = RTM_NEWNEXTHOP, .doit = rtm_new_nexthop}, {.protocol = PF_INET, .msgtype = RTM_GETNEXTHOP, .dumpit = rtm_dump_nexthop}, {.protocol = PF_INET6, .msgtype = RTM_NEWNEXTHOP, .doit = rtm_new_nexthop}, {.protocol = PF_INET6, .msgtype = RTM_GETNEXTHOP, .dumpit = rtm_dump_nexthop}, }; static int __init nexthop_init(void) { register_pernet_subsys(&nexthop_net_ops); register_netdevice_notifier(&nh_netdev_notifier); rtnl_register_many(nexthop_rtnl_msg_handlers); return 0; } subsys_initcall(nexthop_init);
1 314 309 300 266 13 159 157 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 /* SPDX-License-Identifier: GPL-2.0-only */ /* * * Copyright (C) 2011 Novell Inc. * Copyright (C) 2016 Red Hat, Inc. */ struct ovl_config { char *upperdir; char *workdir; char **lowerdirs; bool default_permissions; int redirect_mode; int verity_mode; bool index; int uuid; bool nfs_export; int xino; bool metacopy; bool userxattr; bool ovl_volatile; }; struct ovl_sb { struct super_block *sb; dev_t pseudo_dev; /* Unusable (conflicting) uuid */ bool bad_uuid; /* Used as a lower layer (but maybe also as upper) */ bool is_lower; }; struct ovl_layer { /* ovl_free_fs() relies on @mnt being the first member! */ struct vfsmount *mnt; /* Trap in ovl inode cache */ struct inode *trap; struct ovl_sb *fs; /* Index of this layer in fs root (upper idx == 0) */ int idx; /* One fsid per unique underlying sb (upper fsid == 0) */ int fsid; /* xwhiteouts were found on this layer */ bool has_xwhiteouts; }; struct ovl_path { const struct ovl_layer *layer; struct dentry *dentry; }; struct ovl_entry { unsigned int __numlower; struct ovl_path __lowerstack[]; }; /* private information held for overlayfs's superblock */ struct ovl_fs { unsigned int numlayer; /* Number of unique fs among layers including upper fs */ unsigned int numfs; /* Number of data-only lower layers */ unsigned int numdatalayer; struct ovl_layer *layers; struct ovl_sb *fs; /* workbasedir is the path at workdir= mount option */ struct dentry *workbasedir; /* workdir is the 'work' or 'index' directory under workbasedir */ struct dentry *workdir; long namelen; /* pathnames of lower and upper dirs, for show_options */ struct ovl_config config; /* creds of process who forced instantiation of super block */ const struct cred *creator_cred; bool tmpfile; bool noxattr; bool nofh; /* Did we take the inuse lock? */ bool upperdir_locked; bool workdir_locked; /* Traps in ovl inode cache */ struct inode *workbasedir_trap; struct inode *workdir_trap; /* -1: disabled, 0: same fs, 1..32: number of unused ino bits */ int xino_mode; /* For allocation of non-persistent inode numbers */ atomic_long_t last_ino; /* Shared whiteout cache */ struct dentry *whiteout; bool no_shared_whiteout; /* r/o snapshot of upperdir sb's only taken on volatile mounts */ errseq_t errseq; }; /* Number of lower layers, not including data-only layers */ static inline unsigned int ovl_numlowerlayer(struct ovl_fs *ofs) { return ofs->numlayer - ofs->numdatalayer - 1; } static inline struct vfsmount *ovl_upper_mnt(struct ovl_fs *ofs) { return ofs->layers[0].mnt; } static inline struct mnt_idmap *ovl_upper_mnt_idmap(struct ovl_fs *ofs) { return mnt_idmap(ovl_upper_mnt(ofs)); } extern struct file_system_type ovl_fs_type; static inline struct ovl_fs *OVL_FS(struct super_block *sb) { if (IS_ENABLED(CONFIG_OVERLAY_FS_DEBUG)) WARN_ON_ONCE(sb->s_type != &ovl_fs_type); return (struct ovl_fs *)sb->s_fs_info; } static inline bool ovl_should_sync(struct ovl_fs *ofs) { return !ofs->config.ovl_volatile; } static inline unsigned int ovl_numlower(struct ovl_entry *oe) { return oe ? oe->__numlower : 0; } static inline struct ovl_path *ovl_lowerstack(struct ovl_entry *oe) { return ovl_numlower(oe) ? oe->__lowerstack : NULL; } static inline struct ovl_path *ovl_lowerpath(struct ovl_entry *oe) { return ovl_lowerstack(oe); } static inline struct ovl_path *ovl_lowerdata(struct ovl_entry *oe) { struct ovl_path *lowerstack = ovl_lowerstack(oe); return lowerstack ? &lowerstack[oe->__numlower - 1] : NULL; } /* May return NULL if lazy lookup of lowerdata is needed */ static inline struct dentry *ovl_lowerdata_dentry(struct ovl_entry *oe) { struct ovl_path *lowerdata = ovl_lowerdata(oe); return lowerdata ? READ_ONCE(lowerdata->dentry) : NULL; } /* private information held for every overlayfs dentry */ static inline unsigned long *OVL_E_FLAGS(struct dentry *dentry) { return (unsigned long *) &dentry->d_fsdata; } struct ovl_inode { union { struct ovl_dir_cache *cache; /* directory */ const char *lowerdata_redirect; /* regular file */ }; const char *redirect; u64 version; unsigned long flags; struct inode vfs_inode; struct dentry *__upperdentry; struct ovl_entry *oe; /* synchronize copy up and more */ struct mutex lock; }; static inline struct ovl_inode *OVL_I(struct inode *inode) { return container_of(inode, struct ovl_inode, vfs_inode); } static inline struct ovl_entry *OVL_I_E(struct inode *inode) { return inode ? OVL_I(inode)->oe : NULL; } static inline struct ovl_entry *OVL_E(struct dentry *dentry) { return OVL_I_E(d_inode(dentry)); } static inline struct dentry *ovl_upperdentry_dereference(struct ovl_inode *oi) { return READ_ONCE(oi->__upperdentry); }
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 /* * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "core_priv.h" #include <linux/in.h> #include <linux/in6.h> /* For in6_dev_get/in6_dev_put */ #include <net/addrconf.h> #include <net/bonding.h> #include <rdma/ib_cache.h> #include <rdma/ib_addr.h> static struct workqueue_struct *gid_cache_wq; enum gid_op_type { GID_DEL = 0, GID_ADD }; struct update_gid_event_work { struct work_struct work; union ib_gid gid; struct ib_gid_attr gid_attr; enum gid_op_type gid_op; }; #define ROCE_NETDEV_CALLBACK_SZ 3 struct netdev_event_work_cmd { roce_netdev_callback cb; roce_netdev_filter filter; struct net_device *ndev; struct net_device *filter_ndev; }; struct netdev_event_work { struct work_struct work; struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ]; }; static const struct { bool (*is_supported)(const struct ib_device *device, u32 port_num); enum ib_gid_type gid_type; } PORT_CAP_TO_GID_TYPE[] = { {rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE}, {rdma_protocol_roce_udp_encap, IB_GID_TYPE_ROCE_UDP_ENCAP}, }; #define CAP_TO_GID_TABLE_SIZE ARRAY_SIZE(PORT_CAP_TO_GID_TYPE) unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u32 port) { int i; unsigned int ret_flags = 0; if (!rdma_protocol_roce(ib_dev, port)) return 1UL << IB_GID_TYPE_IB; for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++) if (PORT_CAP_TO_GID_TYPE[i].is_supported(ib_dev, port)) ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type; return ret_flags; } EXPORT_SYMBOL(roce_gid_type_mask_support); static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *gid_attr) { int i; unsigned long gid_type_mask = roce_gid_type_mask_support(ib_dev, port); for (i = 0; i < IB_GID_TYPE_SIZE; i++) { if ((1UL << i) & gid_type_mask) { gid_attr->gid_type = i; switch (gid_op) { case GID_ADD: ib_cache_gid_add(ib_dev, port, gid, gid_attr); break; case GID_DEL: ib_cache_gid_del(ib_dev, port, gid, gid_attr); break; } } } } enum bonding_slave_state { BONDING_SLAVE_STATE_ACTIVE = 1UL << 0, BONDING_SLAVE_STATE_INACTIVE = 1UL << 1, /* No primary slave or the device isn't a slave in bonding */ BONDING_SLAVE_STATE_NA = 1UL << 2, }; static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_device *dev, struct net_device *upper) { if (upper && netif_is_bond_master(upper)) { struct net_device *pdev = bond_option_active_slave_get_rcu(netdev_priv(upper)); if (pdev) return dev == pdev ? BONDING_SLAVE_STATE_ACTIVE : BONDING_SLAVE_STATE_INACTIVE; } return BONDING_SLAVE_STATE_NA; } #define REQUIRED_BOND_STATES (BONDING_SLAVE_STATE_ACTIVE | \ BONDING_SLAVE_STATE_NA) static bool is_eth_port_of_netdev_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *real_dev; bool res; if (!rdma_ndev) return false; rcu_read_lock(); real_dev = rdma_vlan_dev_real_dev(cookie); if (!real_dev) real_dev = cookie; res = ((rdma_is_upper_dev_rcu(rdma_ndev, cookie) && (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) & REQUIRED_BOND_STATES)) || real_dev == rdma_ndev); rcu_read_unlock(); return res; } static bool is_eth_port_inactive_slave_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *master_dev; bool res; if (!rdma_ndev) return false; rcu_read_lock(); master_dev = netdev_master_upper_dev_get_rcu(rdma_ndev); res = is_eth_active_slave_of_bonding_rcu(rdma_ndev, master_dev) == BONDING_SLAVE_STATE_INACTIVE; rcu_read_unlock(); return res; } /** * is_ndev_for_default_gid_filter - Check if a given netdevice * can be considered for default GIDs or not. * @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: rdma netdevice pointer * @cookie: Netdevice to consider to form a default GID * * is_ndev_for_default_gid_filter() returns true if a given netdevice can be * considered for deriving default RoCE GID, returns false otherwise. */ static bool is_ndev_for_default_gid_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; bool res; if (!rdma_ndev) return false; rcu_read_lock(); /* * When rdma netdevice is used in bonding, bonding master netdevice * should be considered for default GIDs. Therefore, ignore slave rdma * netdevices when bonding is considered. * Additionally when event(cookie) netdevice is bond master device, * make sure that it the upper netdevice of rdma netdevice. */ res = ((cookie_ndev == rdma_ndev && !netif_is_bond_slave(rdma_ndev)) || (netif_is_bond_master(cookie_ndev) && rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev))); rcu_read_unlock(); return res; } static bool pass_all_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { return true; } static bool upper_device_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { bool res; if (!rdma_ndev) return false; if (rdma_ndev == cookie) return true; rcu_read_lock(); res = rdma_is_upper_dev_rcu(rdma_ndev, cookie); rcu_read_unlock(); return res; } /** * is_upper_ndev_bond_master_filter - Check if a given netdevice * is bond master device of netdevice of the RDMA device of port. * @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: Pointer to rdma netdevice * @cookie: Netdevice to consider to form a default GID * * is_upper_ndev_bond_master_filter() returns true if a cookie_netdev * is bond master device and rdma_ndev is its lower netdevice. It might * not have been established as slave device yet. */ static bool is_upper_ndev_bond_master_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; bool match = false; if (!rdma_ndev) return false; rcu_read_lock(); if (netif_is_bond_master(cookie_ndev) && rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev)) match = true; rcu_read_unlock(); return match; } static void update_gid_ip(enum gid_op_type gid_op, struct ib_device *ib_dev, u32 port, struct net_device *ndev, struct sockaddr *addr) { union ib_gid gid; struct ib_gid_attr gid_attr; rdma_ip2gid(addr, &gid); memset(&gid_attr, 0, sizeof(gid_attr)); gid_attr.ndev = ndev; update_gid(gid_op, ib_dev, port, &gid, &gid_attr); } static void bond_delete_netdev_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, struct net_device *event_ndev) { struct net_device *real_dev = rdma_vlan_dev_real_dev(event_ndev); unsigned long gid_type_mask; if (!rdma_ndev) return; if (!real_dev) real_dev = event_ndev; rcu_read_lock(); if (((rdma_ndev != event_ndev && !rdma_is_upper_dev_rcu(rdma_ndev, event_ndev)) || is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) == BONDING_SLAVE_STATE_INACTIVE)) { rcu_read_unlock(); return; } rcu_read_unlock(); gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_DELETE); } static void enum_netdev_ipv4_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { const struct in_ifaddr *ifa; struct in_device *in_dev; struct sin_list { struct list_head list; struct sockaddr_in ip; }; struct sin_list *sin_iter; struct sin_list *sin_temp; LIST_HEAD(sin_list); if (ndev->reg_state >= NETREG_UNREGISTERING) return; rcu_read_lock(); in_dev = __in_dev_get_rcu(ndev); if (!in_dev) { rcu_read_unlock(); return; } in_dev_for_each_ifa_rcu(ifa, in_dev) { struct sin_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) continue; entry->ip.sin_family = AF_INET; entry->ip.sin_addr.s_addr = ifa->ifa_address; list_add_tail(&entry->list, &sin_list); } rcu_read_unlock(); list_for_each_entry_safe(sin_iter, sin_temp, &sin_list, list) { update_gid_ip(GID_ADD, ib_dev, port, ndev, (struct sockaddr *)&sin_iter->ip); list_del(&sin_iter->list); kfree(sin_iter); } } static void enum_netdev_ipv6_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { struct inet6_ifaddr *ifp; struct inet6_dev *in6_dev; struct sin6_list { struct list_head list; struct sockaddr_in6 sin6; }; struct sin6_list *sin6_iter; struct sin6_list *sin6_temp; struct ib_gid_attr gid_attr = {.ndev = ndev}; LIST_HEAD(sin6_list); if (ndev->reg_state >= NETREG_UNREGISTERING) return; in6_dev = in6_dev_get(ndev); if (!in6_dev) return; read_lock_bh(&in6_dev->lock); list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { struct sin6_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) continue; entry->sin6.sin6_family = AF_INET6; entry->sin6.sin6_addr = ifp->addr; list_add_tail(&entry->list, &sin6_list); } read_unlock_bh(&in6_dev->lock); in6_dev_put(in6_dev); list_for_each_entry_safe(sin6_iter, sin6_temp, &sin6_list, list) { union ib_gid gid; rdma_ip2gid((struct sockaddr *)&sin6_iter->sin6, &gid); update_gid(GID_ADD, ib_dev, port, &gid, &gid_attr); list_del(&sin6_iter->list); kfree(sin6_iter); } } static void _add_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { enum_netdev_ipv4_ips(ib_dev, port, ndev); if (IS_ENABLED(CONFIG_IPV6)) enum_netdev_ipv6_ips(ib_dev, port, ndev); } static void add_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { _add_netdev_ips(ib_dev, port, cookie); } static void del_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { ib_cache_gid_del_all_netdev_gids(ib_dev, port, cookie); } /** * del_default_gids - Delete default GIDs of the event/cookie netdevice * @ib_dev: RDMA device pointer * @port: Port of the RDMA device whose GID table to consider * @rdma_ndev: Unused rdma netdevice * @cookie: Pointer to event netdevice * * del_default_gids() deletes the default GIDs of the event/cookie netdevice. */ static void del_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; unsigned long gid_type_mask; gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, cookie_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_DELETE); } static void add_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *event_ndev = cookie; unsigned long gid_type_mask; gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, event_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_SET); } static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net *net; struct net_device *ndev; /* Lock the rtnl to make sure the netdevs does not move under * our feet */ rtnl_lock(); down_read(&net_rwsem); for_each_net(net) for_each_netdev(net, ndev) { /* * Filter and add default GIDs of the primary netdevice * when not in bonding mode, or add default GIDs * of bond master device, when in bonding mode. */ if (is_ndev_for_default_gid_filter(ib_dev, port, rdma_ndev, ndev)) add_default_gids(ib_dev, port, rdma_ndev, ndev); if (is_eth_port_of_netdev_filter(ib_dev, port, rdma_ndev, ndev)) _add_netdev_ips(ib_dev, port, ndev); } up_read(&net_rwsem); rtnl_unlock(); } /** * rdma_roce_rescan_device - Rescan all of the network devices in the system * and add their gids, as needed, to the relevant RoCE devices. * * @ib_dev: the rdma device */ void rdma_roce_rescan_device(struct ib_device *ib_dev) { ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL, enum_all_gids_of_dev_cb, NULL); } EXPORT_SYMBOL(rdma_roce_rescan_device); /** * rdma_roce_rescan_port - Rescan all of the network devices in the system * and add their gids if relevant to the port of the RoCE device. * * @ib_dev: IB device * @port: Port number */ void rdma_roce_rescan_port(struct ib_device *ib_dev, u32 port) { struct net_device *ndev = NULL; if (rdma_protocol_roce(ib_dev, port)) { ndev = ib_device_get_netdev(ib_dev, port); if (!ndev) return; enum_all_gids_of_dev_cb(ib_dev, port, ndev, ndev); dev_put(ndev); } } EXPORT_SYMBOL(rdma_roce_rescan_port); static void callback_for_addr_gid_device_scan(struct ib_device *device, u32 port, struct net_device *rdma_ndev, void *cookie) { struct update_gid_event_work *parsed = cookie; return update_gid(parsed->gid_op, device, port, &parsed->gid, &parsed->gid_attr); } struct upper_list { struct list_head list; struct net_device *upper; }; static int netdev_upper_walk(struct net_device *upper, struct netdev_nested_priv *priv) { struct upper_list *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); struct list_head *upper_list = (struct list_head *)priv->data; if (!entry) return 0; list_add_tail(&entry->list, upper_list); dev_hold(upper); entry->upper = upper; return 0; } static void handle_netdev_upper(struct ib_device *ib_dev, u32 port, void *cookie, void (*handle_netdev)(struct ib_device *ib_dev, u32 port, struct net_device *ndev)) { struct net_device *ndev = cookie; struct netdev_nested_priv priv; struct upper_list *upper_iter; struct upper_list *upper_temp; LIST_HEAD(upper_list); priv.data = &upper_list; rcu_read_lock(); netdev_walk_all_upper_dev_rcu(ndev, netdev_upper_walk, &priv); rcu_read_unlock(); handle_netdev(ib_dev, port, ndev); list_for_each_entry_safe(upper_iter, upper_temp, &upper_list, list) { handle_netdev(ib_dev, port, upper_iter->upper); dev_put(upper_iter->upper); list_del(&upper_iter->list); kfree(upper_iter); } } void roce_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { ib_cache_gid_del_all_netdev_gids(ib_dev, port, ndev); } EXPORT_SYMBOL(roce_del_all_netdev_gids); static void del_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { handle_netdev_upper(ib_dev, port, cookie, roce_del_all_netdev_gids); } static void add_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { handle_netdev_upper(ib_dev, port, cookie, _add_netdev_ips); } static void del_netdev_default_ips_join(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *master_ndev; rcu_read_lock(); master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev); dev_hold(master_ndev); rcu_read_unlock(); if (master_ndev) { bond_delete_netdev_default_gids(ib_dev, port, rdma_ndev, master_ndev); dev_put(master_ndev); } } /* The following functions operate on all IB devices. netdevice_event and * addr_event execute ib_enum_all_roce_netdevs through a work. * ib_enum_all_roce_netdevs iterates through all IB devices. */ static void netdevice_event_work_handler(struct work_struct *_work) { struct netdev_event_work *work = container_of(_work, struct netdev_event_work, work); unsigned int i; for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) { ib_enum_all_roce_netdevs(work->cmds[i].filter, work->cmds[i].filter_ndev, work->cmds[i].cb, work->cmds[i].ndev); dev_put(work->cmds[i].ndev); dev_put(work->cmds[i].filter_ndev); } kfree(work); } static int netdevice_queue_work(struct netdev_event_work_cmd *cmds, struct net_device *ndev) { unsigned int i; struct netdev_event_work *ndev_work = kmalloc(sizeof(*ndev_work), GFP_KERNEL); if (!ndev_work) return NOTIFY_DONE; memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds)); for (i = 0; i < ARRAY_SIZE(ndev_work->cmds) && ndev_work->cmds[i].cb; i++) { if (!ndev_work->cmds[i].ndev) ndev_work->cmds[i].ndev = ndev; if (!ndev_work->cmds[i].filter_ndev) ndev_work->cmds[i].filter_ndev = ndev; dev_hold(ndev_work->cmds[i].ndev); dev_hold(ndev_work->cmds[i].filter_ndev); } INIT_WORK(&ndev_work->work, netdevice_event_work_handler); queue_work(gid_cache_wq, &ndev_work->work); return NOTIFY_DONE; } static const struct netdev_event_work_cmd add_cmd = { .cb = add_netdev_ips, .filter = is_eth_port_of_netdev_filter }; static const struct netdev_event_work_cmd add_cmd_upper_ips = { .cb = add_netdev_upper_ips, .filter = is_eth_port_of_netdev_filter }; static void ndev_event_unlink(struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { static const struct netdev_event_work_cmd upper_ips_del_cmd = { .cb = del_netdev_upper_ips, .filter = upper_device_filter }; cmds[0] = upper_ips_del_cmd; cmds[0].ndev = changeupper_info->upper_dev; cmds[1] = add_cmd; } static const struct netdev_event_work_cmd bonding_default_add_cmd = { .cb = add_default_gids, .filter = is_upper_ndev_bond_master_filter }; static void ndev_event_link(struct net_device *event_ndev, struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { static const struct netdev_event_work_cmd bonding_default_del_cmd = { .cb = del_default_gids, .filter = is_upper_ndev_bond_master_filter }; /* * When a lower netdev is linked to its upper bonding * netdev, delete lower slave netdev's default GIDs. */ cmds[0] = bonding_default_del_cmd; cmds[0].ndev = event_ndev; cmds[0].filter_ndev = changeupper_info->upper_dev; /* Now add bonding upper device default GIDs */ cmds[1] = bonding_default_add_cmd; cmds[1].ndev = changeupper_info->upper_dev; cmds[1].filter_ndev = changeupper_info->upper_dev; /* Now add bonding upper device IP based GIDs */ cmds[2] = add_cmd_upper_ips; cmds[2].ndev = changeupper_info->upper_dev; cmds[2].filter_ndev = changeupper_info->upper_dev; } static void netdevice_event_changeupper(struct net_device *event_ndev, struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { if (changeupper_info->linking) ndev_event_link(event_ndev, changeupper_info, cmds); else ndev_event_unlink(changeupper_info, cmds); } static const struct netdev_event_work_cmd add_default_gid_cmd = { .cb = add_default_gids, .filter = is_ndev_for_default_gid_filter, }; static int netdevice_event(struct notifier_block *this, unsigned long event, void *ptr) { static const struct netdev_event_work_cmd del_cmd = { .cb = del_netdev_ips, .filter = pass_all_filter}; static const struct netdev_event_work_cmd bonding_default_del_cmd_join = { .cb = del_netdev_default_ips_join, .filter = is_eth_port_inactive_slave_filter }; static const struct netdev_event_work_cmd netdev_del_cmd = { .cb = del_netdev_ips, .filter = is_eth_port_of_netdev_filter }; static const struct netdev_event_work_cmd bonding_event_ips_del_cmd = { .cb = del_netdev_upper_ips, .filter = upper_device_filter}; struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} }; if (ndev->type != ARPHRD_ETHER) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: case NETDEV_UP: cmds[0] = bonding_default_del_cmd_join; cmds[1] = add_default_gid_cmd; cmds[2] = add_cmd; break; case NETDEV_UNREGISTER: if (ndev->reg_state < NETREG_UNREGISTERED) cmds[0] = del_cmd; else return NOTIFY_DONE; break; case NETDEV_CHANGEADDR: cmds[0] = netdev_del_cmd; if (ndev->reg_state == NETREG_REGISTERED) { cmds[1] = add_default_gid_cmd; cmds[2] = add_cmd; } break; case NETDEV_CHANGEUPPER: netdevice_event_changeupper(ndev, container_of(ptr, struct netdev_notifier_changeupper_info, info), cmds); break; case NETDEV_BONDING_FAILOVER: cmds[0] = bonding_event_ips_del_cmd; /* Add default GIDs of the bond device */ cmds[1] = bonding_default_add_cmd; /* Add IP based GIDs of the bond device */ cmds[2] = add_cmd_upper_ips; break; default: return NOTIFY_DONE; } return netdevice_queue_work(cmds, ndev); } static void update_gid_event_work_handler(struct work_struct *_work) { struct update_gid_event_work *work = container_of(_work, struct update_gid_event_work, work); ib_enum_all_roce_netdevs(is_eth_port_of_netdev_filter, work->gid_attr.ndev, callback_for_addr_gid_device_scan, work); dev_put(work->gid_attr.ndev); kfree(work); } static int addr_event(struct notifier_block *this, unsigned long event, struct sockaddr *sa, struct net_device *ndev) { struct update_gid_event_work *work; enum gid_op_type gid_op; if (ndev->type != ARPHRD_ETHER) return NOTIFY_DONE; switch (event) { case NETDEV_UP: gid_op = GID_ADD; break; case NETDEV_DOWN: gid_op = GID_DEL; break; default: return NOTIFY_DONE; } work = kmalloc(sizeof(*work), GFP_ATOMIC); if (!work) return NOTIFY_DONE; INIT_WORK(&work->work, update_gid_event_work_handler); rdma_ip2gid(sa, &work->gid); work->gid_op = gid_op; memset(&work->gid_attr, 0, sizeof(work->gid_attr)); dev_hold(ndev); work->gid_attr.ndev = ndev; queue_work(gid_cache_wq, &work->work); return NOTIFY_DONE; } static int inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct sockaddr_in in; struct net_device *ndev; struct in_ifaddr *ifa = ptr; in.sin_family = AF_INET; in.sin_addr.s_addr = ifa->ifa_address; ndev = ifa->ifa_dev->dev; return addr_event(this, event, (struct sockaddr *)&in, ndev); } static int inet6addr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct sockaddr_in6 in6; struct net_device *ndev; struct inet6_ifaddr *ifa6 = ptr; in6.sin6_family = AF_INET6; in6.sin6_addr = ifa6->addr; ndev = ifa6->idev->dev; return addr_event(this, event, (struct sockaddr *)&in6, ndev); } static struct notifier_block nb_netdevice = { .notifier_call = netdevice_event }; static struct notifier_block nb_inetaddr = { .notifier_call = inetaddr_event }; static struct notifier_block nb_inet6addr = { .notifier_call = inet6addr_event }; int __init roce_gid_mgmt_init(void) { gid_cache_wq = alloc_ordered_workqueue("gid-cache-wq", 0); if (!gid_cache_wq) return -ENOMEM; register_inetaddr_notifier(&nb_inetaddr); if (IS_ENABLED(CONFIG_IPV6)) register_inet6addr_notifier(&nb_inet6addr); /* We relay on the netdevice notifier to enumerate all * existing devices in the system. Register to this notifier * last to make sure we will not miss any IP add/del * callbacks. */ register_netdevice_notifier(&nb_netdevice); return 0; } void __exit roce_gid_mgmt_cleanup(void) { if (IS_ENABLED(CONFIG_IPV6)) unregister_inet6addr_notifier(&nb_inet6addr); unregister_inetaddr_notifier(&nb_inetaddr); unregister_netdevice_notifier(&nb_netdevice); /* Ensure all gid deletion tasks complete before we go down, * to avoid any reference to free'd memory. By the time * ib-core is removed, all physical devices have been removed, * so no issue with remaining hardware contexts. */ destroy_workqueue(gid_cache_wq); }
1 1 2232 2232 2232 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002 Andi Kleen, SuSE Labs. * Thanks to Ben LaHaise for precious feedback. */ #include <linux/highmem.h> #include <linux/memblock.h> #include <linux/sched.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/debugfs.h> #include <linux/pfn.h> #include <linux/percpu.h> #include <linux/gfp.h> #include <linux/pci.h> #include <linux/vmalloc.h> #include <linux/libnvdimm.h> #include <linux/vmstat.h> #include <linux/kernel.h> #include <linux/cc_platform.h> #include <linux/set_memory.h> #include <linux/memregion.h> #include <asm/e820/api.h> #include <asm/processor.h> #include <asm/tlbflush.h> #include <asm/sections.h> #include <asm/setup.h> #include <linux/uaccess.h> #include <asm/pgalloc.h> #include <asm/proto.h> #include <asm/memtype.h> #include "../mm_internal.h" /* * The current flushing context - we pass it instead of 5 arguments: */ struct cpa_data { unsigned long *vaddr; pgd_t *pgd; pgprot_t mask_set; pgprot_t mask_clr; unsigned long numpages; unsigned long curpage; unsigned long pfn; unsigned int flags; unsigned int force_split : 1, force_static_prot : 1, force_flush_all : 1; struct page **pages; }; enum cpa_warn { CPA_CONFLICT, CPA_PROTECT, CPA_DETECT, }; static const int cpa_warn_level = CPA_PROTECT; /* * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings) * using cpa_lock. So that we don't allow any other cpu, with stale large tlb * entries change the page attribute in parallel to some other cpu * splitting a large page entry along with changing the attribute. */ static DEFINE_SPINLOCK(cpa_lock); #define CPA_FLUSHTLB 1 #define CPA_ARRAY 2 #define CPA_PAGES_ARRAY 4 #define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */ static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm) { return __pgprot(cachemode2protval(pcm)); } #ifdef CONFIG_PROC_FS static unsigned long direct_pages_count[PG_LEVEL_NUM]; void update_page_count(int level, unsigned long pages) { /* Protect against CPA */ spin_lock(&pgd_lock); direct_pages_count[level] += pages; spin_unlock(&pgd_lock); } static void split_page_count(int level) { if (direct_pages_count[level] == 0) return; direct_pages_count[level]--; if (system_state == SYSTEM_RUNNING) { if (level == PG_LEVEL_2M) count_vm_event(DIRECT_MAP_LEVEL2_SPLIT); else if (level == PG_LEVEL_1G) count_vm_event(DIRECT_MAP_LEVEL3_SPLIT); } direct_pages_count[level - 1] += PTRS_PER_PTE; } void arch_report_meminfo(struct seq_file *m) { seq_printf(m, "DirectMap4k: %8lu kB\n", direct_pages_count[PG_LEVEL_4K] << 2); #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) seq_printf(m, "DirectMap2M: %8lu kB\n", direct_pages_count[PG_LEVEL_2M] << 11); #else seq_printf(m, "DirectMap4M: %8lu kB\n", direct_pages_count[PG_LEVEL_2M] << 12); #endif if (direct_gbpages) seq_printf(m, "DirectMap1G: %8lu kB\n", direct_pages_count[PG_LEVEL_1G] << 20); } #else static inline void split_page_count(int level) { } #endif #ifdef CONFIG_X86_CPA_STATISTICS static unsigned long cpa_1g_checked; static unsigned long cpa_1g_sameprot; static unsigned long cpa_1g_preserved; static unsigned long cpa_2m_checked; static unsigned long cpa_2m_sameprot; static unsigned long cpa_2m_preserved; static unsigned long cpa_4k_install; static inline void cpa_inc_1g_checked(void) { cpa_1g_checked++; } static inline void cpa_inc_2m_checked(void) { cpa_2m_checked++; } static inline void cpa_inc_4k_install(void) { data_race(cpa_4k_install++); } static inline void cpa_inc_lp_sameprot(int level) { if (level == PG_LEVEL_1G) cpa_1g_sameprot++; else cpa_2m_sameprot++; } static inline void cpa_inc_lp_preserved(int level) { if (level == PG_LEVEL_1G) cpa_1g_preserved++; else cpa_2m_preserved++; } static int cpastats_show(struct seq_file *m, void *p) { seq_printf(m, "1G pages checked: %16lu\n", cpa_1g_checked); seq_printf(m, "1G pages sameprot: %16lu\n", cpa_1g_sameprot); seq_printf(m, "1G pages preserved: %16lu\n", cpa_1g_preserved); seq_printf(m, "2M pages checked: %16lu\n", cpa_2m_checked); seq_printf(m, "2M pages sameprot: %16lu\n", cpa_2m_sameprot); seq_printf(m, "2M pages preserved: %16lu\n", cpa_2m_preserved); seq_printf(m, "4K pages set-checked: %16lu\n", cpa_4k_install); return 0; } static int cpastats_open(struct inode *inode, struct file *file) { return single_open(file, cpastats_show, NULL); } static const struct file_operations cpastats_fops = { .open = cpastats_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; static int __init cpa_stats_init(void) { debugfs_create_file("cpa_stats", S_IRUSR, arch_debugfs_dir, NULL, &cpastats_fops); return 0; } late_initcall(cpa_stats_init); #else static inline void cpa_inc_1g_checked(void) { } static inline void cpa_inc_2m_checked(void) { } static inline void cpa_inc_4k_install(void) { } static inline void cpa_inc_lp_sameprot(int level) { } static inline void cpa_inc_lp_preserved(int level) { } #endif static inline int within(unsigned long addr, unsigned long start, unsigned long end) { return addr >= start && addr < end; } static inline int within_inclusive(unsigned long addr, unsigned long start, unsigned long end) { return addr >= start && addr <= end; } #ifdef CONFIG_X86_64 /* * The kernel image is mapped into two places in the virtual address space * (addresses without KASLR, of course): * * 1. The kernel direct map (0xffff880000000000) * 2. The "high kernel map" (0xffffffff81000000) * * We actually execute out of #2. If we get the address of a kernel symbol, it * points to #2, but almost all physical-to-virtual translations point to #1. * * This is so that we can have both a directmap of all physical memory *and* * take full advantage of the limited (s32) immediate addressing range (2G) * of x86_64. * * See Documentation/arch/x86/x86_64/mm.rst for more detail. */ static inline unsigned long highmap_start_pfn(void) { return __pa_symbol(_text) >> PAGE_SHIFT; } static inline unsigned long highmap_end_pfn(void) { /* Do not reference physical address outside the kernel. */ return __pa_symbol(roundup(_brk_end, PMD_SIZE) - 1) >> PAGE_SHIFT; } static bool __cpa_pfn_in_highmap(unsigned long pfn) { /* * Kernel text has an alias mapping at a high address, known * here as "highmap". */ return within_inclusive(pfn, highmap_start_pfn(), highmap_end_pfn()); } #else static bool __cpa_pfn_in_highmap(unsigned long pfn) { /* There is no highmap on 32-bit */ return false; } #endif /* * See set_mce_nospec(). * * Machine check recovery code needs to change cache mode of poisoned pages to * UC to avoid speculative access logging another error. But passing the * address of the 1:1 mapping to set_memory_uc() is a fine way to encourage a * speculative access. So we cheat and flip the top bit of the address. This * works fine for the code that updates the page tables. But at the end of the * process we need to flush the TLB and cache and the non-canonical address * causes a #GP fault when used by the INVLPG and CLFLUSH instructions. * * But in the common case we already have a canonical address. This code * will fix the top bit if needed and is a no-op otherwise. */ static inline unsigned long fix_addr(unsigned long addr) { #ifdef CONFIG_X86_64 return (long)(addr << 1) >> 1; #else return addr; #endif } static unsigned long __cpa_addr(struct cpa_data *cpa, unsigned long idx) { if (cpa->flags & CPA_PAGES_ARRAY) { struct page *page = cpa->pages[idx]; if (unlikely(PageHighMem(page))) return 0; return (unsigned long)page_address(page); } if (cpa->flags & CPA_ARRAY) return cpa->vaddr[idx]; return *cpa->vaddr + idx * PAGE_SIZE; } /* * Flushing functions */ static void clflush_cache_range_opt(void *vaddr, unsigned int size) { const unsigned long clflush_size = boot_cpu_data.x86_clflush_size; void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1)); void *vend = vaddr + size; if (p >= vend) return; for (; p < vend; p += clflush_size) clflushopt(p); } /** * clflush_cache_range - flush a cache range with clflush * @vaddr: virtual start address * @size: number of bytes to flush * * CLFLUSHOPT is an unordered instruction which needs fencing with MFENCE or * SFENCE to avoid ordering issues. */ void clflush_cache_range(void *vaddr, unsigned int size) { mb(); clflush_cache_range_opt(vaddr, size); mb(); } EXPORT_SYMBOL_GPL(clflush_cache_range); #ifdef CONFIG_ARCH_HAS_PMEM_API void arch_invalidate_pmem(void *addr, size_t size) { clflush_cache_range(addr, size); } EXPORT_SYMBOL_GPL(arch_invalidate_pmem); #endif #ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION bool cpu_cache_has_invalidate_memregion(void) { return !cpu_feature_enabled(X86_FEATURE_HYPERVISOR); } EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, "DEVMEM"); int cpu_cache_invalidate_memregion(int res_desc) { if (WARN_ON_ONCE(!cpu_cache_has_invalidate_memregion())) return -ENXIO; wbinvd_on_all_cpus(); return 0; } EXPORT_SYMBOL_NS_GPL(cpu_cache_invalidate_memregion, "DEVMEM"); #endif static void __cpa_flush_all(void *arg) { unsigned long cache = (unsigned long)arg; /* * Flush all to work around Errata in early athlons regarding * large page flushing. */ __flush_tlb_all(); if (cache && boot_cpu_data.x86 >= 4) wbinvd(); } static void cpa_flush_all(unsigned long cache) { BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); on_each_cpu(__cpa_flush_all, (void *) cache, 1); } static void __cpa_flush_tlb(void *data) { struct cpa_data *cpa = data; unsigned int i; for (i = 0; i < cpa->numpages; i++) flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i))); } static void cpa_flush(struct cpa_data *data, int cache) { struct cpa_data *cpa = data; unsigned int i; BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) { cpa_flush_all(cache); return; } if (cpa->force_flush_all || cpa->numpages > tlb_single_page_flush_ceiling) flush_tlb_all(); else on_each_cpu(__cpa_flush_tlb, cpa, 1); if (!cache) return; mb(); for (i = 0; i < cpa->numpages; i++) { unsigned long addr = __cpa_addr(cpa, i); unsigned int level; pte_t *pte = lookup_address(addr, &level); /* * Only flush present addresses: */ if (pte && (pte_val(*pte) & _PAGE_PRESENT)) clflush_cache_range_opt((void *)fix_addr(addr), PAGE_SIZE); } mb(); } static bool overlaps(unsigned long r1_start, unsigned long r1_end, unsigned long r2_start, unsigned long r2_end) { return (r1_start <= r2_end && r1_end >= r2_start) || (r2_start <= r1_end && r2_end >= r1_start); } #ifdef CONFIG_PCI_BIOS /* * The BIOS area between 640k and 1Mb needs to be executable for PCI BIOS * based config access (CONFIG_PCI_GOBIOS) support. */ #define BIOS_PFN PFN_DOWN(BIOS_BEGIN) #define BIOS_PFN_END PFN_DOWN(BIOS_END - 1) static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn) { if (pcibios_enabled && overlaps(spfn, epfn, BIOS_PFN, BIOS_PFN_END)) return _PAGE_NX; return 0; } #else static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn) { return 0; } #endif /* * The .rodata section needs to be read-only. Using the pfn catches all * aliases. This also includes __ro_after_init, so do not enforce until * kernel_set_to_readonly is true. */ static pgprotval_t protect_rodata(unsigned long spfn, unsigned long epfn) { unsigned long epfn_ro, spfn_ro = PFN_DOWN(__pa_symbol(__start_rodata)); /* * Note: __end_rodata is at page aligned and not inclusive, so * subtract 1 to get the last enforced PFN in the rodata area. */ epfn_ro = PFN_DOWN(__pa_symbol(__end_rodata)) - 1; if (kernel_set_to_readonly && overlaps(spfn, epfn, spfn_ro, epfn_ro)) return _PAGE_RW; return 0; } /* * Protect kernel text against becoming non executable by forbidding * _PAGE_NX. This protects only the high kernel mapping (_text -> _etext) * out of which the kernel actually executes. Do not protect the low * mapping. * * This does not cover __inittext since that is gone after boot. */ static pgprotval_t protect_kernel_text(unsigned long start, unsigned long end) { unsigned long t_end = (unsigned long)_etext - 1; unsigned long t_start = (unsigned long)_text; if (overlaps(start, end, t_start, t_end)) return _PAGE_NX; return 0; } #if defined(CONFIG_X86_64) /* * Once the kernel maps the text as RO (kernel_set_to_readonly is set), * kernel text mappings for the large page aligned text, rodata sections * will be always read-only. For the kernel identity mappings covering the * holes caused by this alignment can be anything that user asks. * * This will preserve the large page mappings for kernel text/data at no * extra cost. */ static pgprotval_t protect_kernel_text_ro(unsigned long start, unsigned long end) { unsigned long t_end = (unsigned long)__end_rodata_hpage_align - 1; unsigned long t_start = (unsigned long)_text; unsigned int level; if (!kernel_set_to_readonly || !overlaps(start, end, t_start, t_end)) return 0; /* * Don't enforce the !RW mapping for the kernel text mapping, if * the current mapping is already using small page mapping. No * need to work hard to preserve large page mappings in this case. * * This also fixes the Linux Xen paravirt guest boot failure caused * by unexpected read-only mappings for kernel identity * mappings. In this paravirt guest case, the kernel text mapping * and the kernel identity mapping share the same page-table pages, * so the protections for kernel text and identity mappings have to * be the same. */ if (lookup_address(start, &level) && (level != PG_LEVEL_4K)) return _PAGE_RW; return 0; } #else static pgprotval_t protect_kernel_text_ro(unsigned long start, unsigned long end) { return 0; } #endif static inline bool conflicts(pgprot_t prot, pgprotval_t val) { return (pgprot_val(prot) & ~val) != pgprot_val(prot); } static inline void check_conflict(int warnlvl, pgprot_t prot, pgprotval_t val, unsigned long start, unsigned long end, unsigned long pfn, const char *txt) { static const char *lvltxt[] = { [CPA_CONFLICT] = "conflict", [CPA_PROTECT] = "protect", [CPA_DETECT] = "detect", }; if (warnlvl > cpa_warn_level || !conflicts(prot, val)) return; pr_warn("CPA %8s %10s: 0x%016lx - 0x%016lx PFN %lx req %016llx prevent %016llx\n", lvltxt[warnlvl], txt, start, end, pfn, (unsigned long long)pgprot_val(prot), (unsigned long long)val); } /* * Certain areas of memory on x86 require very specific protection flags, * for example the BIOS area or kernel text. Callers don't always get this * right (again, ioremap() on BIOS memory is not uncommon) so this function * checks and fixes these known static required protection bits. */ static inline pgprot_t static_protections(pgprot_t prot, unsigned long start, unsigned long pfn, unsigned long npg, unsigned long lpsize, int warnlvl) { pgprotval_t forbidden, res; unsigned long end; /* * There is no point in checking RW/NX conflicts when the requested * mapping is setting the page !PRESENT. */ if (!(pgprot_val(prot) & _PAGE_PRESENT)) return prot; /* Operate on the virtual address */ end = start + npg * PAGE_SIZE - 1; res = protect_kernel_text(start, end); check_conflict(warnlvl, prot, res, start, end, pfn, "Text NX"); forbidden = res; /* * Special case to preserve a large page. If the change spawns the * full large page mapping then there is no point to split it * up. Happens with ftrace and is going to be removed once ftrace * switched to text_poke(). */ if (lpsize != (npg * PAGE_SIZE) || (start & (lpsize - 1))) { res = protect_kernel_text_ro(start, end); check_conflict(warnlvl, prot, res, start, end, pfn, "Text RO"); forbidden |= res; } /* Check the PFN directly */ res = protect_pci_bios(pfn, pfn + npg - 1); check_conflict(warnlvl, prot, res, start, end, pfn, "PCIBIOS NX"); forbidden |= res; res = protect_rodata(pfn, pfn + npg - 1); check_conflict(warnlvl, prot, res, start, end, pfn, "Rodata RO"); forbidden |= res; return __pgprot(pgprot_val(prot) & ~forbidden); } /* * Validate strict W^X semantics. */ static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long start, unsigned long pfn, unsigned long npg, bool nx, bool rw) { unsigned long end; /* * 32-bit has some unfixable W+X issues, like EFI code * and writeable data being in the same page. Disable * detection and enforcement there. */ if (IS_ENABLED(CONFIG_X86_32)) return new; /* Only verify when NX is supported: */ if (!(__supported_pte_mask & _PAGE_NX)) return new; if (!((pgprot_val(old) ^ pgprot_val(new)) & (_PAGE_RW | _PAGE_NX))) return new; if ((pgprot_val(new) & (_PAGE_RW | _PAGE_NX)) != _PAGE_RW) return new; /* Non-leaf translation entries can disable writing or execution. */ if (!rw || nx) return new; end = start + npg * PAGE_SIZE - 1; WARN_ONCE(1, "CPA detected W^X violation: %016llx -> %016llx range: 0x%016lx - 0x%016lx PFN %lx\n", (unsigned long long)pgprot_val(old), (unsigned long long)pgprot_val(new), start, end, pfn); /* * For now, allow all permission change attempts by returning the * attempted permissions. This can 'return old' to actively * refuse the permission change at a later time. */ return new; } /* * Lookup the page table entry for a virtual address in a specific pgd. * Return a pointer to the entry (or NULL if the entry does not exist), * the level of the entry, and the effective NX and RW bits of all * page table levels. */ pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address, unsigned int *level, bool *nx, bool *rw) { p4d_t *p4d; pud_t *pud; pmd_t *pmd; *level = PG_LEVEL_256T; *nx = false; *rw = true; if (pgd_none(*pgd)) return NULL; *level = PG_LEVEL_512G; *nx |= pgd_flags(*pgd) & _PAGE_NX; *rw &= pgd_flags(*pgd) & _PAGE_RW; p4d = p4d_offset(pgd, address); if (p4d_none(*p4d)) return NULL; if (p4d_leaf(*p4d) || !p4d_present(*p4d)) return (pte_t *)p4d; *level = PG_LEVEL_1G; *nx |= p4d_flags(*p4d) & _PAGE_NX; *rw &= p4d_flags(*p4d) & _PAGE_RW; pud = pud_offset(p4d, address); if (pud_none(*pud)) return NULL; if (pud_leaf(*pud) || !pud_present(*pud)) return (pte_t *)pud; *level = PG_LEVEL_2M; *nx |= pud_flags(*pud) & _PAGE_NX; *rw &= pud_flags(*pud) & _PAGE_RW; pmd = pmd_offset(pud, address); if (pmd_none(*pmd)) return NULL; if (pmd_leaf(*pmd) || !pmd_present(*pmd)) return (pte_t *)pmd; *level = PG_LEVEL_4K; *nx |= pmd_flags(*pmd) & _PAGE_NX; *rw &= pmd_flags(*pmd) & _PAGE_RW; return pte_offset_kernel(pmd, address); } /* * Lookup the page table entry for a virtual address in a specific pgd. * Return a pointer to the entry and the level of the mapping. */ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, unsigned int *level) { bool nx, rw; return lookup_address_in_pgd_attr(pgd, address, level, &nx, &rw); } /* * Lookup the page table entry for a virtual address. Return a pointer * to the entry and the level of the mapping. * * Note: the function returns p4d, pud or pmd either when the entry is marked * large or when the present bit is not set. Otherwise it returns NULL. */ pte_t *lookup_address(unsigned long address, unsigned int *level) { return lookup_address_in_pgd(pgd_offset_k(address), address, level); } EXPORT_SYMBOL_GPL(lookup_address); static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, unsigned int *level, bool *nx, bool *rw) { pgd_t *pgd; if (!cpa->pgd) pgd = pgd_offset_k(address); else pgd = cpa->pgd + pgd_index(address); return lookup_address_in_pgd_attr(pgd, address, level, nx, rw); } /* * Lookup the PMD entry for a virtual address. Return a pointer to the entry * or NULL if not present. */ pmd_t *lookup_pmd_address(unsigned long address) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pgd = pgd_offset_k(address); if (pgd_none(*pgd)) return NULL; p4d = p4d_offset(pgd, address); if (p4d_none(*p4d) || p4d_leaf(*p4d) || !p4d_present(*p4d)) return NULL; pud = pud_offset(p4d, address); if (pud_none(*pud) || pud_leaf(*pud) || !pud_present(*pud)) return NULL; return pmd_offset(pud, address); } /* * This is necessary because __pa() does not work on some * kinds of memory, like vmalloc() or the alloc_remap() * areas on 32-bit NUMA systems. The percpu areas can * end up in this kind of memory, for instance. * * Note that as long as the PTEs are well-formed with correct PFNs, this * works without checking the PRESENT bit in the leaf PTE. This is unlike * the similar vmalloc_to_page() and derivatives. Callers may depend on * this behavior. * * This could be optimized, but it is only used in paths that are not perf * sensitive, and keeping it unoptimized should increase the testing coverage * for the more obscure platforms. */ phys_addr_t slow_virt_to_phys(void *__virt_addr) { unsigned long virt_addr = (unsigned long)__virt_addr; phys_addr_t phys_addr; unsigned long offset; enum pg_level level; pte_t *pte; pte = lookup_address(virt_addr, &level); BUG_ON(!pte); /* * pXX_pfn() returns unsigned long, which must be cast to phys_addr_t * before being left-shifted PAGE_SHIFT bits -- this trick is to * make 32-PAE kernel work correctly. */ switch (level) { case PG_LEVEL_1G: phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT; offset = virt_addr & ~PUD_MASK; break; case PG_LEVEL_2M: phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT; offset = virt_addr & ~PMD_MASK; break; default: phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; offset = virt_addr & ~PAGE_MASK; } return (phys_addr_t)(phys_addr | offset); } EXPORT_SYMBOL_GPL(slow_virt_to_phys); /* * Set the new pmd in all the pgds we know about: */ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) { /* change init_mm */ set_pte_atomic(kpte, pte); #ifdef CONFIG_X86_32 if (!SHARED_KERNEL_PMD) { struct page *page; list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgd = (pgd_t *)page_address(page) + pgd_index(address); p4d = p4d_offset(pgd, address); pud = pud_offset(p4d, address); pmd = pmd_offset(pud, address); set_pte_atomic((pte_t *)pmd, pte); } } #endif } static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot) { /* * _PAGE_GLOBAL means "global page" for present PTEs. * But, it is also used to indicate _PAGE_PROTNONE * for non-present PTEs. * * This ensures that a _PAGE_GLOBAL PTE going from * present to non-present is not confused as * _PAGE_PROTNONE. */ if (!(pgprot_val(prot) & _PAGE_PRESENT)) pgprot_val(prot) &= ~_PAGE_GLOBAL; return prot; } static int __should_split_large_page(pte_t *kpte, unsigned long address, struct cpa_data *cpa) { unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn; pgprot_t old_prot, new_prot, req_prot, chk_prot; pte_t new_pte, *tmp; enum pg_level level; bool nx, rw; /* * Check for races, another CPU might have split this page * up already: */ tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw); if (tmp != kpte) return 1; switch (level) { case PG_LEVEL_2M: old_prot = pmd_pgprot(*(pmd_t *)kpte); old_pfn = pmd_pfn(*(pmd_t *)kpte); cpa_inc_2m_checked(); break; case PG_LEVEL_1G: old_prot = pud_pgprot(*(pud_t *)kpte); old_pfn = pud_pfn(*(pud_t *)kpte); cpa_inc_1g_checked(); break; default: return -EINVAL; } psize = page_level_size(level); pmask = page_level_mask(level); /* * Calculate the number of pages, which fit into this large * page starting at address: */ lpaddr = (address + psize) & pmask; numpages = (lpaddr - address) >> PAGE_SHIFT; if (numpages < cpa->numpages) cpa->numpages = numpages; /* * We are safe now. Check whether the new pgprot is the same: * Convert protection attributes to 4k-format, as cpa->mask* are set * up accordingly. */ /* Clear PSE (aka _PAGE_PAT) and move PAT bit to correct position */ req_prot = pgprot_large_2_4k(old_prot); pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); /* * req_prot is in format of 4k pages. It must be converted to large * page format: the caching mode includes the PAT bit located at * different bit positions in the two formats. */ req_prot = pgprot_4k_2_large(req_prot); req_prot = pgprot_clear_protnone_bits(req_prot); if (pgprot_val(req_prot) & _PAGE_PRESENT) pgprot_val(req_prot) |= _PAGE_PSE; /* * old_pfn points to the large page base pfn. So we need to add the * offset of the virtual address: */ pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT); cpa->pfn = pfn; /* * Calculate the large page base address and the number of 4K pages * in the large page */ lpaddr = address & pmask; numpages = psize >> PAGE_SHIFT; /* * Sanity check that the existing mapping is correct versus the static * protections. static_protections() guards against !PRESENT, so no * extra conditional required here. */ chk_prot = static_protections(old_prot, lpaddr, old_pfn, numpages, psize, CPA_CONFLICT); if (WARN_ON_ONCE(pgprot_val(chk_prot) != pgprot_val(old_prot))) { /* * Split the large page and tell the split code to * enforce static protections. */ cpa->force_static_prot = 1; return 1; } /* * Optimization: If the requested pgprot is the same as the current * pgprot, then the large page can be preserved and no updates are * required independent of alignment and length of the requested * range. The above already established that the current pgprot is * correct, which in consequence makes the requested pgprot correct * as well if it is the same. The static protection scan below will * not come to a different conclusion. */ if (pgprot_val(req_prot) == pgprot_val(old_prot)) { cpa_inc_lp_sameprot(level); return 0; } /* * If the requested range does not cover the full page, split it up */ if (address != lpaddr || cpa->numpages != numpages) return 1; /* * Check whether the requested pgprot is conflicting with a static * protection requirement in the large page. */ new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages, psize, CPA_DETECT); new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages, nx, rw); /* * If there is a conflict, split the large page. * * There used to be a 4k wise evaluation trying really hard to * preserve the large pages, but experimentation has shown, that this * does not help at all. There might be corner cases which would * preserve one large page occasionally, but it's really not worth the * extra code and cycles for the common case. */ if (pgprot_val(req_prot) != pgprot_val(new_prot)) return 1; /* All checks passed. Update the large page mapping. */ new_pte = pfn_pte(old_pfn, new_prot); __set_pmd_pte(kpte, address, new_pte); cpa->flags |= CPA_FLUSHTLB; cpa_inc_lp_preserved(level); return 0; } static int should_split_large_page(pte_t *kpte, unsigned long address, struct cpa_data *cpa) { int do_split; if (cpa->force_split) return 1; spin_lock(&pgd_lock); do_split = __should_split_large_page(kpte, address, cpa); spin_unlock(&pgd_lock); return do_split; } static void split_set_pte(struct cpa_data *cpa, pte_t *pte, unsigned long pfn, pgprot_t ref_prot, unsigned long address, unsigned long size) { unsigned int npg = PFN_DOWN(size); pgprot_t prot; /* * If should_split_large_page() discovered an inconsistent mapping, * remove the invalid protection in the split mapping. */ if (!cpa->force_static_prot) goto set; /* Hand in lpsize = 0 to enforce the protection mechanism */ prot = static_protections(ref_prot, address, pfn, npg, 0, CPA_PROTECT); if (pgprot_val(prot) == pgprot_val(ref_prot)) goto set; /* * If this is splitting a PMD, fix it up. PUD splits cannot be * fixed trivially as that would require to rescan the newly * installed PMD mappings after returning from split_large_page() * so an eventual further split can allocate the necessary PTE * pages. Warn for now and revisit it in case this actually * happens. */ if (size == PAGE_SIZE) ref_prot = prot; else pr_warn_once("CPA: Cannot fixup static protections for PUD split\n"); set: set_pte(pte, pfn_pte(pfn, ref_prot)); } static int __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, struct page *base) { unsigned long lpaddr, lpinc, ref_pfn, pfn, pfninc = 1; pte_t *pbase = (pte_t *)page_address(base); unsigned int i, level; pgprot_t ref_prot; bool nx, rw; pte_t *tmp; spin_lock(&pgd_lock); /* * Check for races, another CPU might have split this page * up for us already: */ tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw); if (tmp != kpte) { spin_unlock(&pgd_lock); return 1; } paravirt_alloc_pte(&init_mm, page_to_pfn(base)); switch (level) { case PG_LEVEL_2M: ref_prot = pmd_pgprot(*(pmd_t *)kpte); /* * Clear PSE (aka _PAGE_PAT) and move * PAT bit to correct position. */ ref_prot = pgprot_large_2_4k(ref_prot); ref_pfn = pmd_pfn(*(pmd_t *)kpte); lpaddr = address & PMD_MASK; lpinc = PAGE_SIZE; break; case PG_LEVEL_1G: ref_prot = pud_pgprot(*(pud_t *)kpte); ref_pfn = pud_pfn(*(pud_t *)kpte); pfninc = PMD_SIZE >> PAGE_SHIFT; lpaddr = address & PUD_MASK; lpinc = PMD_SIZE; /* * Clear the PSE flags if the PRESENT flag is not set * otherwise pmd_present() will return true even on a non * present pmd. */ if (!(pgprot_val(ref_prot) & _PAGE_PRESENT)) pgprot_val(ref_prot) &= ~_PAGE_PSE; break; default: spin_unlock(&pgd_lock); return 1; } ref_prot = pgprot_clear_protnone_bits(ref_prot); /* * Get the target pfn from the original entry: */ pfn = ref_pfn; for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc, lpaddr += lpinc) split_set_pte(cpa, pbase + i, pfn, ref_prot, lpaddr, lpinc); if (virt_addr_valid(address)) { unsigned long pfn = PFN_DOWN(__pa(address)); if (pfn_range_is_mapped(pfn, pfn + 1)) split_page_count(level); } /* * Install the new, split up pagetable. * * We use the standard kernel pagetable protections for the new * pagetable protections, the actual ptes set above control the * primary protection behavior: */ __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE))); /* * Do a global flush tlb after splitting the large page * and before we do the actual change page attribute in the PTE. * * Without this, we violate the TLB application note, that says: * "The TLBs may contain both ordinary and large-page * translations for a 4-KByte range of linear addresses. This * may occur if software modifies the paging structures so that * the page size used for the address range changes. If the two * translations differ with respect to page frame or attributes * (e.g., permissions), processor behavior is undefined and may * be implementation-specific." * * We do this global tlb flush inside the cpa_lock, so that we * don't allow any other cpu, with stale tlb entries change the * page attribute in parallel, that also falls into the * just split large page entry. */ flush_tlb_all(); spin_unlock(&pgd_lock); return 0; } static int split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address) { struct page *base; if (!debug_pagealloc_enabled()) spin_unlock(&cpa_lock); base = alloc_pages(GFP_KERNEL, 0); if (!debug_pagealloc_enabled()) spin_lock(&cpa_lock); if (!base) return -ENOMEM; if (__split_large_page(cpa, kpte, address, base)) __free_page(base); return 0; } static bool try_to_free_pte_page(pte_t *pte) { int i; for (i = 0; i < PTRS_PER_PTE; i++) if (!pte_none(pte[i])) return false; free_page((unsigned long)pte); return true; } static bool try_to_free_pmd_page(pmd_t *pmd) { int i; for (i = 0; i < PTRS_PER_PMD; i++) if (!pmd_none(pmd[i])) return false; free_page((unsigned long)pmd); return true; } static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) { pte_t *pte = pte_offset_kernel(pmd, start); while (start < end) { set_pte(pte, __pte(0)); start += PAGE_SIZE; pte++; } if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { pmd_clear(pmd); return true; } return false; } static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, unsigned long start, unsigned long end) { if (unmap_pte_range(pmd, start, end)) if (try_to_free_pmd_page(pud_pgtable(*pud))) pud_clear(pud); } static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) { pmd_t *pmd = pmd_offset(pud, start); /* * Not on a 2MB page boundary? */ if (start & (PMD_SIZE - 1)) { unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; unsigned long pre_end = min_t(unsigned long, end, next_page); __unmap_pmd_range(pud, pmd, start, pre_end); start = pre_end; pmd++; } /* * Try to unmap in 2M chunks. */ while (end - start >= PMD_SIZE) { if (pmd_leaf(*pmd)) pmd_clear(pmd); else __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); start += PMD_SIZE; pmd++; } /* * 4K leftovers? */ if (start < end) return __unmap_pmd_range(pud, pmd, start, end); /* * Try again to free the PMD page if haven't succeeded above. */ if (!pud_none(*pud)) if (try_to_free_pmd_page(pud_pgtable(*pud))) pud_clear(pud); } static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end) { pud_t *pud = pud_offset(p4d, start); /* * Not on a GB page boundary? */ if (start & (PUD_SIZE - 1)) { unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; unsigned long pre_end = min_t(unsigned long, end, next_page); unmap_pmd_range(pud, start, pre_end); start = pre_end; pud++; } /* * Try to unmap in 1G chunks? */ while (end - start >= PUD_SIZE) { if (pud_leaf(*pud)) pud_clear(pud); else unmap_pmd_range(pud, start, start + PUD_SIZE); start += PUD_SIZE; pud++; } /* * 2M leftovers? */ if (start < end) unmap_pmd_range(pud, start, end); /* * No need to try to free the PUD page because we'll free it in * populate_pgd's error path */ } static int alloc_pte_page(pmd_t *pmd) { pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL); if (!pte) return -1; set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); return 0; } static int alloc_pmd_page(pud_t *pud) { pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); if (!pmd) return -1; set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); return 0; } static void populate_pte(struct cpa_data *cpa, unsigned long start, unsigned long end, unsigned num_pages, pmd_t *pmd, pgprot_t pgprot) { pte_t *pte; pte = pte_offset_kernel(pmd, start); pgprot = pgprot_clear_protnone_bits(pgprot); while (num_pages-- && start < end) { set_pte(pte, pfn_pte(cpa->pfn, pgprot)); start += PAGE_SIZE; cpa->pfn++; pte++; } } static long populate_pmd(struct cpa_data *cpa, unsigned long start, unsigned long end, unsigned num_pages, pud_t *pud, pgprot_t pgprot) { long cur_pages = 0; pmd_t *pmd; pgprot_t pmd_pgprot; /* * Not on a 2M boundary? */ if (start & (PMD_SIZE - 1)) { unsigned long pre_end = start + (num_pages << PAGE_SHIFT); unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; pre_end = min_t(unsigned long, pre_end, next_page); cur_pages = (pre_end - start) >> PAGE_SHIFT; cur_pages = min_t(unsigned int, num_pages, cur_pages); /* * Need a PTE page? */ pmd = pmd_offset(pud, start); if (pmd_none(*pmd)) if (alloc_pte_page(pmd)) return -1; populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot); start = pre_end; } /* * We mapped them all? */ if (num_pages == cur_pages) return cur_pages; pmd_pgprot = pgprot_4k_2_large(pgprot); while (end - start >= PMD_SIZE) { /* * We cannot use a 1G page so allocate a PMD page if needed. */ if (pud_none(*pud)) if (alloc_pmd_page(pud)) return -1; pmd = pmd_offset(pud, start); set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn, canon_pgprot(pmd_pgprot)))); start += PMD_SIZE; cpa->pfn += PMD_SIZE >> PAGE_SHIFT; cur_pages += PMD_SIZE >> PAGE_SHIFT; } /* * Map trailing 4K pages. */ if (start < end) { pmd = pmd_offset(pud, start); if (pmd_none(*pmd)) if (alloc_pte_page(pmd)) return -1; populate_pte(cpa, start, end, num_pages - cur_pages, pmd, pgprot); } return num_pages; } static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d, pgprot_t pgprot) { pud_t *pud; unsigned long end; long cur_pages = 0; pgprot_t pud_pgprot; end = start + (cpa->numpages << PAGE_SHIFT); /* * Not on a Gb page boundary? => map everything up to it with * smaller pages. */ if (start & (PUD_SIZE - 1)) { unsigned long pre_end; unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; pre_end = min_t(unsigned long, end, next_page); cur_pages = (pre_end - start) >> PAGE_SHIFT; cur_pages = min_t(int, (int)cpa->numpages, cur_pages); pud = pud_offset(p4d, start); /* * Need a PMD page? */ if (pud_none(*pud)) if (alloc_pmd_page(pud)) return -1; cur_pages = populate_pmd(cpa, start, pre_end, cur_pages, pud, pgprot); if (cur_pages < 0) return cur_pages; start = pre_end; } /* We mapped them all? */ if (cpa->numpages == cur_pages) return cur_pages; pud = pud_offset(p4d, start); pud_pgprot = pgprot_4k_2_large(pgprot); /* * Map everything starting from the Gb boundary, possibly with 1G pages */ while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) { set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn, canon_pgprot(pud_pgprot)))); start += PUD_SIZE; cpa->pfn += PUD_SIZE >> PAGE_SHIFT; cur_pages += PUD_SIZE >> PAGE_SHIFT; pud++; } /* Map trailing leftover */ if (start < end) { long tmp; pud = pud_offset(p4d, start); if (pud_none(*pud)) if (alloc_pmd_page(pud)) return -1; tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages, pud, pgprot); if (tmp < 0) return cur_pages; cur_pages += tmp; } return cur_pages; } /* * Restrictions for kernel page table do not necessarily apply when mapping in * an alternate PGD. */ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) { pgprot_t pgprot = __pgprot(_KERNPG_TABLE); pud_t *pud = NULL; /* shut up gcc */ p4d_t *p4d; pgd_t *pgd_entry; long ret; pgd_entry = cpa->pgd + pgd_index(addr); if (pgd_none(*pgd_entry)) { p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); if (!p4d) return -1; set_pgd(pgd_entry, __pgd(__pa(p4d) | _KERNPG_TABLE)); } /* * Allocate a PUD page and hand it down for mapping. */ p4d = p4d_offset(pgd_entry, addr); if (p4d_none(*p4d)) { pud = (pud_t *)get_zeroed_page(GFP_KERNEL); if (!pud) return -1; set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); } pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(pgprot) |= pgprot_val(cpa->mask_set); ret = populate_pud(cpa, addr, p4d, pgprot); if (ret < 0) { /* * Leave the PUD page in place in case some other CPU or thread * already found it, but remove any useless entries we just * added to it. */ unmap_pud_range(p4d, addr, addr + (cpa->numpages << PAGE_SHIFT)); return ret; } cpa->numpages = ret; return 0; } static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, int primary) { if (cpa->pgd) { /* * Right now, we only execute this code path when mapping * the EFI virtual memory map regions, no other users * provide a ->pgd value. This may change in the future. */ return populate_pgd(cpa, vaddr); } /* * Ignore all non primary paths. */ if (!primary) { cpa->numpages = 1; return 0; } /* * Ignore the NULL PTE for kernel identity mapping, as it is expected * to have holes. * Also set numpages to '1' indicating that we processed cpa req for * one virtual address page and its pfn. TBD: numpages can be set based * on the initial value and the level returned by lookup_address(). */ if (within(vaddr, PAGE_OFFSET, PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { cpa->numpages = 1; cpa->pfn = __pa(vaddr) >> PAGE_SHIFT; return 0; } else if (__cpa_pfn_in_highmap(cpa->pfn)) { /* Faults in the highmap are OK, so do not warn: */ return -EFAULT; } else { WARN(1, KERN_WARNING "CPA: called for zero pte. " "vaddr = %lx cpa->vaddr = %lx\n", vaddr, *cpa->vaddr); return -EFAULT; } } static int __change_page_attr(struct cpa_data *cpa, int primary) { unsigned long address; int do_split, err; unsigned int level; pte_t *kpte, old_pte; bool nx, rw; address = __cpa_addr(cpa, cpa->curpage); repeat: kpte = _lookup_address_cpa(cpa, address, &level, &nx, &rw); if (!kpte) return __cpa_process_fault(cpa, address, primary); old_pte = *kpte; if (pte_none(old_pte)) return __cpa_process_fault(cpa, address, primary); if (level == PG_LEVEL_4K) { pte_t new_pte; pgprot_t old_prot = pte_pgprot(old_pte); pgprot_t new_prot = pte_pgprot(old_pte); unsigned long pfn = pte_pfn(old_pte); pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); cpa_inc_4k_install(); /* Hand in lpsize = 0 to enforce the protection mechanism */ new_prot = static_protections(new_prot, address, pfn, 1, 0, CPA_PROTECT); new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1, nx, rw); new_prot = pgprot_clear_protnone_bits(new_prot); /* * We need to keep the pfn from the existing PTE, * after all we're only going to change its attributes * not the memory it points to */ new_pte = pfn_pte(pfn, new_prot); cpa->pfn = pfn; /* * Do we really change anything ? */ if (pte_val(old_pte) != pte_val(new_pte)) { set_pte_atomic(kpte, new_pte); cpa->flags |= CPA_FLUSHTLB; } cpa->numpages = 1; return 0; } /* * Check, whether we can keep the large page intact * and just change the pte: */ do_split = should_split_large_page(kpte, address, cpa); /* * When the range fits into the existing large page, * return. cp->numpages and cpa->tlbflush have been updated in * try_large_page: */ if (do_split <= 0) return do_split; /* * We have to split the large page: */ err = split_large_page(cpa, kpte, address); if (!err) goto repeat; return err; } static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary); /* * Check the directmap and "high kernel map" 'aliases'. */ static int cpa_process_alias(struct cpa_data *cpa) { struct cpa_data alias_cpa; unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); unsigned long vaddr; int ret; if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1)) return 0; /* * No need to redo, when the primary call touched the direct * mapping already: */ vaddr = __cpa_addr(cpa, cpa->curpage); if (!(within(vaddr, PAGE_OFFSET, PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { alias_cpa = *cpa; alias_cpa.vaddr = &laddr; alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); alias_cpa.curpage = 0; /* Directmap always has NX set, do not modify. */ if (__supported_pte_mask & _PAGE_NX) { alias_cpa.mask_clr.pgprot &= ~_PAGE_NX; alias_cpa.mask_set.pgprot &= ~_PAGE_NX; } cpa->force_flush_all = 1; ret = __change_page_attr_set_clr(&alias_cpa, 0); if (ret) return ret; } #ifdef CONFIG_X86_64 /* * If the primary call didn't touch the high mapping already * and the physical address is inside the kernel map, we need * to touch the high mapped kernel as well: */ if (!within(vaddr, (unsigned long)_text, _brk_end) && __cpa_pfn_in_highmap(cpa->pfn)) { unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; alias_cpa = *cpa; alias_cpa.vaddr = &temp_cpa_vaddr; alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); alias_cpa.curpage = 0; /* * [_text, _brk_end) also covers data, do not modify NX except * in cases where the highmap is the primary target. */ if (__supported_pte_mask & _PAGE_NX) { alias_cpa.mask_clr.pgprot &= ~_PAGE_NX; alias_cpa.mask_set.pgprot &= ~_PAGE_NX; } cpa->force_flush_all = 1; /* * The high mapping range is imprecise, so ignore the * return value. */ __change_page_attr_set_clr(&alias_cpa, 0); } #endif return 0; } static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary) { unsigned long numpages = cpa->numpages; unsigned long rempages = numpages; int ret = 0; /* * No changes, easy! */ if (!(pgprot_val(cpa->mask_set) | pgprot_val(cpa->mask_clr)) && !cpa->force_split) return ret; while (rempages) { /* * Store the remaining nr of pages for the large page * preservation check. */ cpa->numpages = rempages; /* for array changes, we can't use large page */ if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY)) cpa->numpages = 1; if (!debug_pagealloc_enabled()) spin_lock(&cpa_lock); ret = __change_page_attr(cpa, primary); if (!debug_pagealloc_enabled()) spin_unlock(&cpa_lock); if (ret) goto out; if (primary && !(cpa->flags & CPA_NO_CHECK_ALIAS)) { ret = cpa_process_alias(cpa); if (ret) goto out; } /* * Adjust the number of pages with the result of the * CPA operation. Either a large page has been * preserved or a single page update happened. */ BUG_ON(cpa->numpages > rempages || !cpa->numpages); rempages -= cpa->numpages; cpa->curpage += cpa->numpages; } out: /* Restore the original numpages */ cpa->numpages = numpages; return ret; } static int change_page_attr_set_clr(unsigned long *addr, int numpages, pgprot_t mask_set, pgprot_t mask_clr, int force_split, int in_flag, struct page **pages) { struct cpa_data cpa; int ret, cache; memset(&cpa, 0, sizeof(cpa)); /* * Check, if we are requested to set a not supported * feature. Clearing non-supported features is OK. */ mask_set = canon_pgprot(mask_set); if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split) return 0; /* Ensure we are PAGE_SIZE aligned */ if (in_flag & CPA_ARRAY) { int i; for (i = 0; i < numpages; i++) { if (addr[i] & ~PAGE_MASK) { addr[i] &= PAGE_MASK; WARN_ON_ONCE(1); } } } else if (!(in_flag & CPA_PAGES_ARRAY)) { /* * in_flag of CPA_PAGES_ARRAY implies it is aligned. * No need to check in that case */ if (*addr & ~PAGE_MASK) { *addr &= PAGE_MASK; /* * People should not be passing in unaligned addresses: */ WARN_ON_ONCE(1); } } /* Must avoid aliasing mappings in the highmem code */ kmap_flush_unused(); vm_unmap_aliases(); cpa.vaddr = addr; cpa.pages = pages; cpa.numpages = numpages; cpa.mask_set = mask_set; cpa.mask_clr = mask_clr; cpa.flags = in_flag; cpa.curpage = 0; cpa.force_split = force_split; ret = __change_page_attr_set_clr(&cpa, 1); /* * Check whether we really changed something: */ if (!(cpa.flags & CPA_FLUSHTLB)) goto out; /* * No need to flush, when we did not set any of the caching * attributes: */ cache = !!pgprot2cachemode(mask_set); /* * On error; flush everything to be sure. */ if (ret) { cpa_flush_all(cache); goto out; } cpa_flush(&cpa, cache); out: return ret; } static inline int change_page_attr_set(unsigned long *addr, int numpages, pgprot_t mask, int array) { return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0, (array ? CPA_ARRAY : 0), NULL); } static inline int change_page_attr_clear(unsigned long *addr, int numpages, pgprot_t mask, int array) { return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0, (array ? CPA_ARRAY : 0), NULL); } static inline int cpa_set_pages_array(struct page **pages, int numpages, pgprot_t mask) { return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0, CPA_PAGES_ARRAY, pages); } static inline int cpa_clear_pages_array(struct page **pages, int numpages, pgprot_t mask) { return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0, CPA_PAGES_ARRAY, pages); } /* * __set_memory_prot is an internal helper for callers that have been passed * a pgprot_t value from upper layers and a reservation has already been taken. * If you want to set the pgprot to a specific page protocol, use the * set_memory_xx() functions. */ int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot) { return change_page_attr_set_clr(&addr, numpages, prot, __pgprot(~pgprot_val(prot)), 0, 0, NULL); } int _set_memory_uc(unsigned long addr, int numpages) { /* * for now UC MINUS. see comments in ioremap() * If you really need strong UC use ioremap_uc(), but note * that you cannot override IO areas with set_memory_*() as * these helpers cannot work with IO memory. */ return change_page_attr_set(&addr, numpages, cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), 0); } int set_memory_uc(unsigned long addr, int numpages) { int ret; /* * for now UC MINUS. see comments in ioremap() */ ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, _PAGE_CACHE_MODE_UC_MINUS, NULL); if (ret) goto out_err; ret = _set_memory_uc(addr, numpages); if (ret) goto out_free; return 0; out_free: memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); out_err: return ret; } EXPORT_SYMBOL(set_memory_uc); int _set_memory_wc(unsigned long addr, int numpages) { int ret; ret = change_page_attr_set(&addr, numpages, cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), 0); if (!ret) { ret = change_page_attr_set_clr(&addr, numpages, cachemode2pgprot(_PAGE_CACHE_MODE_WC), __pgprot(_PAGE_CACHE_MASK), 0, 0, NULL); } return ret; } int set_memory_wc(unsigned long addr, int numpages) { int ret; ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, _PAGE_CACHE_MODE_WC, NULL); if (ret) return ret; ret = _set_memory_wc(addr, numpages); if (ret) memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); return ret; } EXPORT_SYMBOL(set_memory_wc); int _set_memory_wt(unsigned long addr, int numpages) { return change_page_attr_set(&addr, numpages, cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0); } int _set_memory_wb(unsigned long addr, int numpages) { /* WB cache mode is hard wired to all cache attribute bits being 0 */ return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_CACHE_MASK), 0); } int set_memory_wb(unsigned long addr, int numpages) { int ret; ret = _set_memory_wb(addr, numpages); if (ret) return ret; memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); return 0; } EXPORT_SYMBOL(set_memory_wb); /* Prevent speculative access to a page by marking it not-present */ #ifdef CONFIG_X86_64 int set_mce_nospec(unsigned long pfn) { unsigned long decoy_addr; int rc; /* SGX pages are not in the 1:1 map */ if (arch_is_platform_page(pfn << PAGE_SHIFT)) return 0; /* * We would like to just call: * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1); * but doing that would radically increase the odds of a * speculative access to the poison page because we'd have * the virtual address of the kernel 1:1 mapping sitting * around in registers. * Instead we get tricky. We create a non-canonical address * that looks just like the one we want, but has bit 63 flipped. * This relies on set_memory_XX() properly sanitizing any __pa() * results with __PHYSICAL_MASK or PTE_PFN_MASK. */ decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); rc = set_memory_np(decoy_addr, 1); if (rc) pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); return rc; } /* Restore full speculative operation to the pfn. */ int clear_mce_nospec(unsigned long pfn) { unsigned long addr = (unsigned long) pfn_to_kaddr(pfn); return set_memory_p(addr, 1); } EXPORT_SYMBOL_GPL(clear_mce_nospec); #endif /* CONFIG_X86_64 */ int set_memory_x(unsigned long addr, int numpages) { if (!(__supported_pte_mask & _PAGE_NX)) return 0; return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); } int set_memory_nx(unsigned long addr, int numpages) { if (!(__supported_pte_mask & _PAGE_NX)) return 0; return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); } int set_memory_ro(unsigned long addr, int numpages) { return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW | _PAGE_DIRTY), 0); } int set_memory_rox(unsigned long addr, int numpages) { pgprot_t clr = __pgprot(_PAGE_RW | _PAGE_DIRTY); if (__supported_pte_mask & _PAGE_NX) clr.pgprot |= _PAGE_NX; return change_page_attr_clear(&addr, numpages, clr, 0); } int set_memory_rw(unsigned long addr, int numpages) { return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0); } int set_memory_np(unsigned long addr, int numpages) { return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); } int set_memory_np_noalias(unsigned long addr, int numpages) { return change_page_attr_set_clr(&addr, numpages, __pgprot(0), __pgprot(_PAGE_PRESENT), 0, CPA_NO_CHECK_ALIAS, NULL); } int set_memory_p(unsigned long addr, int numpages) { return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); } int set_memory_4k(unsigned long addr, int numpages) { return change_page_attr_set_clr(&addr, numpages, __pgprot(0), __pgprot(0), 1, 0, NULL); } int set_memory_nonglobal(unsigned long addr, int numpages) { return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_GLOBAL), 0); } int set_memory_global(unsigned long addr, int numpages) { return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_GLOBAL), 0); } /* * __set_memory_enc_pgtable() is used for the hypervisors that get * informed about "encryption" status via page tables. */ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc) { pgprot_t empty = __pgprot(0); struct cpa_data cpa; int ret; /* Should not be working on unaligned addresses */ if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr)) addr &= PAGE_MASK; memset(&cpa, 0, sizeof(cpa)); cpa.vaddr = &addr; cpa.numpages = numpages; cpa.mask_set = enc ? pgprot_encrypted(empty) : pgprot_decrypted(empty); cpa.mask_clr = enc ? pgprot_decrypted(empty) : pgprot_encrypted(empty); cpa.pgd = init_mm.pgd; /* Must avoid aliasing mappings in the highmem code */ kmap_flush_unused(); vm_unmap_aliases(); /* Flush the caches as needed before changing the encryption attribute. */ if (x86_platform.guest.enc_tlb_flush_required(enc)) cpa_flush(&cpa, x86_platform.guest.enc_cache_flush_required()); /* Notify hypervisor that we are about to set/clr encryption attribute. */ ret = x86_platform.guest.enc_status_change_prepare(addr, numpages, enc); if (ret) goto vmm_fail; ret = __change_page_attr_set_clr(&cpa, 1); /* * After changing the encryption attribute, we need to flush TLBs again * in case any speculative TLB caching occurred (but no need to flush * caches again). We could just use cpa_flush_all(), but in case TLB * flushing gets optimized in the cpa_flush() path use the same logic * as above. */ cpa_flush(&cpa, 0); if (ret) return ret; /* Notify hypervisor that we have successfully set/clr encryption attribute. */ ret = x86_platform.guest.enc_status_change_finish(addr, numpages, enc); if (ret) goto vmm_fail; return 0; vmm_fail: WARN_ONCE(1, "CPA VMM failure to convert memory (addr=%p, numpages=%d) to %s: %d\n", (void *)addr, numpages, enc ? "private" : "shared", ret); return ret; } /* * The lock serializes conversions between private and shared memory. * * It is taken for read on conversion. A write lock guarantees that no * concurrent conversions are in progress. */ static DECLARE_RWSEM(mem_enc_lock); /* * Stop new private<->shared conversions. * * Taking the exclusive mem_enc_lock waits for in-flight conversions to complete. * The lock is not released to prevent new conversions from being started. */ bool set_memory_enc_stop_conversion(void) { /* * In a crash scenario, sleep is not allowed. Try to take the lock. * Failure indicates that there is a race with the conversion. */ if (oops_in_progress) return down_write_trylock(&mem_enc_lock); down_write(&mem_enc_lock); return true; } static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) { int ret = 0; if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) { if (!down_read_trylock(&mem_enc_lock)) return -EBUSY; ret = __set_memory_enc_pgtable(addr, numpages, enc); up_read(&mem_enc_lock); } return ret; } int set_memory_encrypted(unsigned long addr, int numpages) { return __set_memory_enc_dec(addr, numpages, true); } EXPORT_SYMBOL_GPL(set_memory_encrypted); int set_memory_decrypted(unsigned long addr, int numpages) { return __set_memory_enc_dec(addr, numpages, false); } EXPORT_SYMBOL_GPL(set_memory_decrypted); int set_pages_uc(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); return set_memory_uc(addr, numpages); } EXPORT_SYMBOL(set_pages_uc); static int _set_pages_array(struct page **pages, int numpages, enum page_cache_mode new_type) { unsigned long start; unsigned long end; enum page_cache_mode set_type; int i; int free_idx; int ret; for (i = 0; i < numpages; i++) { if (PageHighMem(pages[i])) continue; start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; if (memtype_reserve(start, end, new_type, NULL)) goto err_out; } /* If WC, set to UC- first and then WC */ set_type = (new_type == _PAGE_CACHE_MODE_WC) ? _PAGE_CACHE_MODE_UC_MINUS : new_type; ret = cpa_set_pages_array(pages, numpages, cachemode2pgprot(set_type)); if (!ret && new_type == _PAGE_CACHE_MODE_WC) ret = change_page_attr_set_clr(NULL, numpages, cachemode2pgprot( _PAGE_CACHE_MODE_WC), __pgprot(_PAGE_CACHE_MASK), 0, CPA_PAGES_ARRAY, pages); if (ret) goto err_out; return 0; /* Success */ err_out: free_idx = i; for (i = 0; i < free_idx; i++) { if (PageHighMem(pages[i])) continue; start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; memtype_free(start, end); } return -EINVAL; } int set_pages_array_uc(struct page **pages, int numpages) { return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_UC_MINUS); } EXPORT_SYMBOL(set_pages_array_uc); int set_pages_array_wc(struct page **pages, int numpages) { return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_WC); } EXPORT_SYMBOL(set_pages_array_wc); int set_pages_wb(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); return set_memory_wb(addr, numpages); } EXPORT_SYMBOL(set_pages_wb); int set_pages_array_wb(struct page **pages, int numpages) { int retval; unsigned long start; unsigned long end; int i; /* WB cache mode is hard wired to all cache attribute bits being 0 */ retval = cpa_clear_pages_array(pages, numpages, __pgprot(_PAGE_CACHE_MASK)); if (retval) return retval; for (i = 0; i < numpages; i++) { if (PageHighMem(pages[i])) continue; start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; memtype_free(start, end); } return 0; } EXPORT_SYMBOL(set_pages_array_wb); int set_pages_ro(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); return set_memory_ro(addr, numpages); } int set_pages_rw(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); return set_memory_rw(addr, numpages); } static int __set_pages_p(struct page *page, int numpages) { unsigned long tempaddr = (unsigned long) page_address(page); struct cpa_data cpa = { .vaddr = &tempaddr, .pgd = NULL, .numpages = numpages, .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), .mask_clr = __pgprot(0), .flags = CPA_NO_CHECK_ALIAS }; /* * No alias checking needed for setting present flag. otherwise, * we may need to break large pages for 64-bit kernel text * mappings (this adds to complexity if we want to do this from * atomic context especially). Let's keep it simple! */ return __change_page_attr_set_clr(&cpa, 1); } static int __set_pages_np(struct page *page, int numpages) { unsigned long tempaddr = (unsigned long) page_address(page); struct cpa_data cpa = { .vaddr = &tempaddr, .pgd = NULL, .numpages = numpages, .mask_set = __pgprot(0), .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), .flags = CPA_NO_CHECK_ALIAS }; /* * No alias checking needed for setting not present flag. otherwise, * we may need to break large pages for 64-bit kernel text * mappings (this adds to complexity if we want to do this from * atomic context especially). Let's keep it simple! */ return __change_page_attr_set_clr(&cpa, 1); } int set_direct_map_invalid_noflush(struct page *page) { return __set_pages_np(page, 1); } int set_direct_map_default_noflush(struct page *page) { return __set_pages_p(page, 1); } int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid) { if (valid) return __set_pages_p(page, nr); return __set_pages_np(page, nr); } #ifdef CONFIG_DEBUG_PAGEALLOC void __kernel_map_pages(struct page *page, int numpages, int enable) { if (PageHighMem(page)) return; if (!enable) { debug_check_no_locks_freed(page_address(page), numpages * PAGE_SIZE); } /* * The return value is ignored as the calls cannot fail. * Large pages for identity mappings are not used at boot time * and hence no memory allocations during large page split. */ if (enable) __set_pages_p(page, numpages); else __set_pages_np(page, numpages); /* * We should perform an IPI and flush all tlbs, * but that can deadlock->flush only current cpu. * Preemption needs to be disabled around __flush_tlb_all() due to * CR3 reload in __native_flush_tlb(). */ preempt_disable(); __flush_tlb_all(); preempt_enable(); arch_flush_lazy_mmu_mode(); } #endif /* CONFIG_DEBUG_PAGEALLOC */ bool kernel_page_present(struct page *page) { unsigned int level; pte_t *pte; if (PageHighMem(page)) return false; pte = lookup_address((unsigned long)page_address(page), &level); return (pte_val(*pte) & _PAGE_PRESENT); } int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, unsigned numpages, unsigned long page_flags) { int retval = -EINVAL; struct cpa_data cpa = { .vaddr = &address, .pfn = pfn, .pgd = pgd, .numpages = numpages, .mask_set = __pgprot(0), .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW)), .flags = CPA_NO_CHECK_ALIAS, }; WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP"); if (!(__supported_pte_mask & _PAGE_NX)) goto out; if (!(page_flags & _PAGE_ENC)) cpa.mask_clr = pgprot_encrypted(cpa.mask_clr); cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); retval = __change_page_attr_set_clr(&cpa, 1); __flush_tlb_all(); out: return retval; } /* * __flush_tlb_all() flushes mappings only on current CPU and hence this * function shouldn't be used in an SMP environment. Presently, it's used only * during boot (way before smp_init()) by EFI subsystem and hence is ok. */ int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address, unsigned long numpages) { int retval; /* * The typical sequence for unmapping is to find a pte through * lookup_address_in_pgd() (ideally, it should never return NULL because * the address is already mapped) and change its protections. As pfn is * the *target* of a mapping, it's not useful while unmapping. */ struct cpa_data cpa = { .vaddr = &address, .pfn = 0, .pgd = pgd, .numpages = numpages, .mask_set = __pgprot(0), .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), .flags = CPA_NO_CHECK_ALIAS, }; WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP"); retval = __change_page_attr_set_clr(&cpa, 1); __flush_tlb_all(); return retval; } /* * The testcases use internal knowledge of the implementation that shouldn't * be exposed to the rest of the kernel. Include these directly here. */ #ifdef CONFIG_CPA_DEBUG #include "cpa-test.c" #endif
77 75 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 // SPDX-License-Identifier: GPL-2.0 /* * linux/mm/madvise.c * * Copyright (C) 1999 Linus Torvalds * Copyright (C) 2002 Christoph Hellwig */ #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/mempolicy.h> #include <linux/page-isolation.h> #include <linux/page_idle.h> #include <linux/userfaultfd_k.h> #include <linux/hugetlb.h> #include <linux/falloc.h> #include <linux/fadvise.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/mm_inline.h> #include <linux/string.h> #include <linux/uio.h> #include <linux/ksm.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/pagewalk.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/shmem_fs.h> #include <linux/mmu_notifier.h> #include <asm/tlb.h> #include "internal.h" #include "swap.h" /* * Maximum number of attempts we make to install guard pages before we give up * and return -ERESTARTNOINTR to have userspace try again. */ #define MAX_MADVISE_GUARD_RETRIES 3 struct madvise_walk_private { struct mmu_gather *tlb; bool pageout; }; /* * Any behaviour which results in changes to the vma->vm_flags needs to * take mmap_lock for writing. Others, which simply traverse vmas, need * to only take it for reading. */ static int madvise_need_mmap_write(int behavior) { switch (behavior) { case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: case MADV_COLD: case MADV_PAGEOUT: case MADV_FREE: case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: case MADV_COLLAPSE: case MADV_GUARD_INSTALL: case MADV_GUARD_REMOVE: return 0; default: /* be safe, default to 1. list exceptions explicitly */ return 1; } } #ifdef CONFIG_ANON_VMA_NAME struct anon_vma_name *anon_vma_name_alloc(const char *name) { struct anon_vma_name *anon_name; size_t count; /* Add 1 for NUL terminator at the end of the anon_name->name */ count = strlen(name) + 1; anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL); if (anon_name) { kref_init(&anon_name->kref); memcpy(anon_name->name, name, count); } return anon_name; } void anon_vma_name_free(struct kref *kref) { struct anon_vma_name *anon_name = container_of(kref, struct anon_vma_name, kref); kfree(anon_name); } struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) { mmap_assert_locked(vma->vm_mm); return vma->anon_name; } /* mmap_lock should be write-locked */ static int replace_anon_vma_name(struct vm_area_struct *vma, struct anon_vma_name *anon_name) { struct anon_vma_name *orig_name = anon_vma_name(vma); if (!anon_name) { vma->anon_name = NULL; anon_vma_name_put(orig_name); return 0; } if (anon_vma_name_eq(orig_name, anon_name)) return 0; vma->anon_name = anon_vma_name_reuse(anon_name); anon_vma_name_put(orig_name); return 0; } #else /* CONFIG_ANON_VMA_NAME */ static int replace_anon_vma_name(struct vm_area_struct *vma, struct anon_vma_name *anon_name) { if (anon_name) return -EINVAL; return 0; } #endif /* CONFIG_ANON_VMA_NAME */ /* * Update the vm_flags on region of a vma, splitting it or merging it as * necessary. Must be called with mmap_lock held for writing; * Caller should ensure anon_name stability by raising its refcount even when * anon_name belongs to a valid vma because this function might free that vma. */ static int madvise_update_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, unsigned long new_flags, struct anon_vma_name *anon_name) { struct mm_struct *mm = vma->vm_mm; int error; VMA_ITERATOR(vmi, mm, start); if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) { *prev = vma; return 0; } vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags, anon_name); if (IS_ERR(vma)) return PTR_ERR(vma); *prev = vma; /* vm_flags is protected by the mmap_lock held in write mode. */ vma_start_write(vma); vm_flags_reset(vma, new_flags); if (!vma->vm_file || vma_is_anon_shmem(vma)) { error = replace_anon_vma_name(vma, anon_name); if (error) return error; } return 0; } #ifdef CONFIG_SWAP static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->private; struct swap_iocb *splug = NULL; pte_t *ptep = NULL; spinlock_t *ptl; unsigned long addr; for (addr = start; addr < end; addr += PAGE_SIZE) { pte_t pte; swp_entry_t entry; struct folio *folio; if (!ptep++) { ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (!ptep) break; } pte = ptep_get(ptep); if (!is_swap_pte(pte)) continue; entry = pte_to_swp_entry(pte); if (unlikely(non_swap_entry(entry))) continue; pte_unmap_unlock(ptep, ptl); ptep = NULL; folio = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, vma, addr, &splug); if (folio) folio_put(folio); } if (ptep) pte_unmap_unlock(ptep, ptl); swap_read_unplug(splug); cond_resched(); return 0; } static const struct mm_walk_ops swapin_walk_ops = { .pmd_entry = swapin_walk_pmd_entry, .walk_lock = PGWALK_RDLOCK, }; static void shmem_swapin_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct address_space *mapping) { XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start)); pgoff_t end_index = linear_page_index(vma, end) - 1; struct folio *folio; struct swap_iocb *splug = NULL; rcu_read_lock(); xas_for_each(&xas, folio, end_index) { unsigned long addr; swp_entry_t entry; if (!xa_is_value(folio)) continue; entry = radix_to_swp_entry(folio); /* There might be swapin error entries in shmem mapping. */ if (non_swap_entry(entry)) continue; addr = vma->vm_start + ((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT); xas_pause(&xas); rcu_read_unlock(); folio = read_swap_cache_async(entry, mapping_gfp_mask(mapping), vma, addr, &splug); if (folio) folio_put(folio); rcu_read_lock(); } rcu_read_unlock(); swap_read_unplug(splug); } #endif /* CONFIG_SWAP */ /* * Schedule all required I/O operations. Do not wait for completion. */ static long madvise_willneed(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; loff_t offset; *prev = vma; #ifdef CONFIG_SWAP if (!file) { walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma); lru_add_drain(); /* Push any new pages onto the LRU now */ return 0; } if (shmem_mapping(file->f_mapping)) { shmem_swapin_range(vma, start, end, file->f_mapping); lru_add_drain(); /* Push any new pages onto the LRU now */ return 0; } #else if (!file) return -EBADF; #endif if (IS_DAX(file_inode(file))) { /* no bad return value, but ignore advice */ return 0; } /* * Filesystem's fadvise may need to take various locks. We need to * explicitly grab a reference because the vma (and hence the * vma's reference to the file) can go away as soon as we drop * mmap_lock. */ *prev = NULL; /* tell sys_madvise we drop mmap_lock */ get_file(file); offset = (loff_t)(start - vma->vm_start) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); mmap_read_unlock(mm); vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED); fput(file); mmap_read_lock(mm); return 0; } static inline bool can_do_file_pageout(struct vm_area_struct *vma) { if (!vma->vm_file) return false; /* * paging out pagecache only for non-anonymous mappings that correspond * to the files the calling process could (if tried) open for writing; * otherwise we'd be including shared non-exclusive mappings, which * opens a side channel. */ return inode_owner_or_capable(&nop_mnt_idmap, file_inode(vma->vm_file)) || file_permission(vma->vm_file, MAY_WRITE) == 0; } static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end, struct folio *folio, pte_t *ptep, pte_t pte, bool *any_young, bool *any_dirty) { const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; int max_nr = (end - addr) / PAGE_SIZE; return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL, any_young, any_dirty); } static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct madvise_walk_private *private = walk->private; struct mmu_gather *tlb = private->tlb; bool pageout = private->pageout; struct mm_struct *mm = tlb->mm; struct vm_area_struct *vma = walk->vma; pte_t *start_pte, *pte, ptent; spinlock_t *ptl; struct folio *folio = NULL; LIST_HEAD(folio_list); bool pageout_anon_only_filter; unsigned int batch_count = 0; int nr; if (fatal_signal_pending(current)) return -EINTR; pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) && !can_do_file_pageout(vma); #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (pmd_trans_huge(*pmd)) { pmd_t orig_pmd; unsigned long next = pmd_addr_end(addr, end); tlb_change_page_size(tlb, HPAGE_PMD_SIZE); ptl = pmd_trans_huge_lock(pmd, vma); if (!ptl) return 0; orig_pmd = *pmd; if (is_huge_zero_pmd(orig_pmd)) goto huge_unlock; if (unlikely(!pmd_present(orig_pmd))) { VM_BUG_ON(thp_migration_supported() && !is_pmd_migration_entry(orig_pmd)); goto huge_unlock; } folio = pmd_folio(orig_pmd); /* Do not interfere with other mappings of this folio */ if (folio_likely_mapped_shared(folio)) goto huge_unlock; if (pageout_anon_only_filter && !folio_test_anon(folio)) goto huge_unlock; if (next - addr != HPAGE_PMD_SIZE) { int err; folio_get(folio); spin_unlock(ptl); folio_lock(folio); err = split_folio(folio); folio_unlock(folio); folio_put(folio); if (!err) goto regular_folio; return 0; } if (!pageout && pmd_young(orig_pmd)) { pmdp_invalidate(vma, addr, pmd); orig_pmd = pmd_mkold(orig_pmd); set_pmd_at(mm, addr, pmd, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); } folio_clear_referenced(folio); folio_test_clear_young(folio); if (folio_test_active(folio)) folio_set_workingset(folio); if (pageout) { if (folio_isolate_lru(folio)) { if (folio_test_unevictable(folio)) folio_putback_lru(folio); else list_add(&folio->lru, &folio_list); } } else folio_deactivate(folio); huge_unlock: spin_unlock(ptl); if (pageout) reclaim_pages(&folio_list); return 0; } regular_folio: #endif tlb_change_page_size(tlb, PAGE_SIZE); restart: start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (!start_pte) return 0; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) { nr = 1; ptent = ptep_get(pte); if (++batch_count == SWAP_CLUSTER_MAX) { batch_count = 0; if (need_resched()) { arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); cond_resched(); goto restart; } } if (pte_none(ptent)) continue; if (!pte_present(ptent)) continue; folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; /* * If we encounter a large folio, only split it if it is not * fully mapped within the range we are operating on. Otherwise * leave it as is so that it can be swapped out whole. If we * fail to split a folio, leave it in place and advance to the * next pte in the range. */ if (folio_test_large(folio)) { bool any_young; nr = madvise_folio_pte_batch(addr, end, folio, pte, ptent, &any_young, NULL); if (any_young) ptent = pte_mkyoung(ptent); if (nr < folio_nr_pages(folio)) { int err; if (folio_likely_mapped_shared(folio)) continue; if (pageout_anon_only_filter && !folio_test_anon(folio)) continue; if (!folio_trylock(folio)) continue; folio_get(folio); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); start_pte = NULL; err = split_folio(folio); folio_unlock(folio); folio_put(folio); start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!start_pte) break; arch_enter_lazy_mmu_mode(); if (!err) nr = 0; continue; } } /* * Do not interfere with other mappings of this folio and * non-LRU folio. If we have a large folio at this point, we * know it is fully mapped so if its mapcount is the same as its * number of pages, it must be exclusive. */ if (!folio_test_lru(folio) || folio_mapcount(folio) != folio_nr_pages(folio)) continue; if (pageout_anon_only_filter && !folio_test_anon(folio)) continue; if (!pageout && pte_young(ptent)) { clear_young_dirty_ptes(vma, addr, pte, nr, CYDP_CLEAR_YOUNG); tlb_remove_tlb_entries(tlb, pte, nr, addr); } /* * We are deactivating a folio for accelerating reclaiming. * VM couldn't reclaim the folio unless we clear PG_young. * As a side effect, it makes confuse idle-page tracking * because they will miss recent referenced history. */ folio_clear_referenced(folio); folio_test_clear_young(folio); if (folio_test_active(folio)) folio_set_workingset(folio); if (pageout) { if (folio_isolate_lru(folio)) { if (folio_test_unevictable(folio)) folio_putback_lru(folio); else list_add(&folio->lru, &folio_list); } } else folio_deactivate(folio); } if (start_pte) { arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); } if (pageout) reclaim_pages(&folio_list); cond_resched(); return 0; } static const struct mm_walk_ops cold_walk_ops = { .pmd_entry = madvise_cold_or_pageout_pte_range, .walk_lock = PGWALK_RDLOCK, }; static void madvise_cold_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { struct madvise_walk_private walk_private = { .pageout = false, .tlb = tlb, }; tlb_start_vma(tlb, vma); walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private); tlb_end_vma(tlb, vma); } static inline bool can_madv_lru_vma(struct vm_area_struct *vma) { return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB)); } static long madvise_cold(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start_addr, unsigned long end_addr) { struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; *prev = vma; if (!can_madv_lru_vma(vma)) return -EINVAL; lru_add_drain(); tlb_gather_mmu(&tlb, mm); madvise_cold_page_range(&tlb, vma, start_addr, end_addr); tlb_finish_mmu(&tlb); return 0; } static void madvise_pageout_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { struct madvise_walk_private walk_private = { .pageout = true, .tlb = tlb, }; tlb_start_vma(tlb, vma); walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private); tlb_end_vma(tlb, vma); } static long madvise_pageout(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start_addr, unsigned long end_addr) { struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; *prev = vma; if (!can_madv_lru_vma(vma)) return -EINVAL; /* * If the VMA belongs to a private file mapping, there can be private * dirty pages which can be paged out if even this process is neither * owner nor write capable of the file. We allow private file mappings * further to pageout dirty anon pages. */ if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) && (vma->vm_flags & VM_MAYSHARE))) return 0; lru_add_drain(); tlb_gather_mmu(&tlb, mm); madvise_pageout_page_range(&tlb, vma, start_addr, end_addr); tlb_finish_mmu(&tlb); return 0; } static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY; struct mmu_gather *tlb = walk->private; struct mm_struct *mm = tlb->mm; struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; pte_t *start_pte, *pte, ptent; struct folio *folio; int nr_swap = 0; unsigned long next; int nr, max_nr; next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd)) if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next)) return 0; tlb_change_page_size(tlb, PAGE_SIZE); start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!start_pte) return 0; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) { nr = 1; ptent = ptep_get(pte); if (pte_none(ptent)) continue; /* * If the pte has swp_entry, just clear page table to * prevent swap-in which is more expensive rather than * (page allocation + zeroing). */ if (!pte_present(ptent)) { swp_entry_t entry; entry = pte_to_swp_entry(ptent); if (!non_swap_entry(entry)) { max_nr = (end - addr) / PAGE_SIZE; nr = swap_pte_batch(pte, max_nr, ptent); nr_swap -= nr; free_swap_and_cache_nr(entry, nr); clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) { pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } continue; } folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; /* * If we encounter a large folio, only split it if it is not * fully mapped within the range we are operating on. Otherwise * leave it as is so that it can be marked as lazyfree. If we * fail to split a folio, leave it in place and advance to the * next pte in the range. */ if (folio_test_large(folio)) { bool any_young, any_dirty; nr = madvise_folio_pte_batch(addr, end, folio, pte, ptent, &any_young, &any_dirty); if (nr < folio_nr_pages(folio)) { int err; if (folio_likely_mapped_shared(folio)) continue; if (!folio_trylock(folio)) continue; folio_get(folio); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); start_pte = NULL; err = split_folio(folio); folio_unlock(folio); folio_put(folio); pte = pte_offset_map_lock(mm, pmd, addr, &ptl); start_pte = pte; if (!start_pte) break; arch_enter_lazy_mmu_mode(); if (!err) nr = 0; continue; } if (any_young) ptent = pte_mkyoung(ptent); if (any_dirty) ptent = pte_mkdirty(ptent); } if (folio_test_swapcache(folio) || folio_test_dirty(folio)) { if (!folio_trylock(folio)) continue; /* * If we have a large folio at this point, we know it is * fully mapped so if its mapcount is the same as its * number of pages, it must be exclusive. */ if (folio_mapcount(folio) != folio_nr_pages(folio)) { folio_unlock(folio); continue; } if (folio_test_swapcache(folio) && !folio_free_swap(folio)) { folio_unlock(folio); continue; } folio_clear_dirty(folio); folio_unlock(folio); } if (pte_young(ptent) || pte_dirty(ptent)) { clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags); tlb_remove_tlb_entries(tlb, pte, nr, addr); } folio_mark_lazyfree(folio); } if (nr_swap) add_mm_counter(mm, MM_SWAPENTS, nr_swap); if (start_pte) { arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); } cond_resched(); return 0; } static const struct mm_walk_ops madvise_free_walk_ops = { .pmd_entry = madvise_free_pte_range, .walk_lock = PGWALK_RDLOCK, }; static int madvise_free_single_vma(struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr) { struct mm_struct *mm = vma->vm_mm; struct mmu_notifier_range range; struct mmu_gather tlb; /* MADV_FREE works for only anon vma at the moment */ if (!vma_is_anonymous(vma)) return -EINVAL; range.start = max(vma->vm_start, start_addr); if (range.start >= vma->vm_end) return -EINVAL; range.end = min(vma->vm_end, end_addr); if (range.end <= vma->vm_start) return -EINVAL; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, range.start, range.end); lru_add_drain(); tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(&range); tlb_start_vma(&tlb, vma); walk_page_range(vma->vm_mm, range.start, range.end, &madvise_free_walk_ops, &tlb); tlb_end_vma(&tlb, vma); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); return 0; } /* * Application no longer needs these pages. If the pages are dirty, * it's OK to just throw them away. The app will be more careful about * data it wants to keep. Be sure to free swap resources too. The * zap_page_range_single call sets things up for shrink_active_list to actually * free these pages later if no one else has touched them in the meantime, * although we could add these pages to a global reuse list for * shrink_active_list to pick up before reclaiming other pages. * * NB: This interface discards data rather than pushes it out to swap, * as some implementations do. This has performance implications for * applications like large transactional databases which want to discard * pages in anonymous maps after committing to backing store the data * that was kept in them. There is no reason to write this data out to * the swap area if the application is discarding it. * * An interface that causes the system to free clean pages and flush * dirty pages is already available as msync(MS_INVALIDATE). */ static long madvise_dontneed_single_vma(struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct zap_details details = { .reclaim_pt = true, .even_cows = true, }; zap_page_range_single(vma, start, end - start, &details); return 0; } static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, unsigned long start, unsigned long *end, int behavior) { if (!is_vm_hugetlb_page(vma)) { unsigned int forbidden = VM_PFNMAP; if (behavior != MADV_DONTNEED_LOCKED) forbidden |= VM_LOCKED; return !(vma->vm_flags & forbidden); } if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED) return false; if (start & ~huge_page_mask(hstate_vma(vma))) return false; /* * Madvise callers expect the length to be rounded up to PAGE_SIZE * boundaries, and may be unaware that this VMA uses huge pages. * Avoid unexpected data loss by rounding down the number of * huge pages freed. */ *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma))); return true; } static long madvise_dontneed_free(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, int behavior) { struct mm_struct *mm = vma->vm_mm; *prev = vma; if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior)) return -EINVAL; if (start == end) return 0; if (!userfaultfd_remove(vma, start, end)) { *prev = NULL; /* mmap_lock has been dropped, prev is stale */ mmap_read_lock(mm); vma = vma_lookup(mm, start); if (!vma) return -ENOMEM; /* * Potential end adjustment for hugetlb vma is OK as * the check below keeps end within vma. */ if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior)) return -EINVAL; if (end > vma->vm_end) { /* * Don't fail if end > vma->vm_end. If the old * vma was split while the mmap_lock was * released the effect of the concurrent * operation may not cause madvise() to * have an undefined result. There may be an * adjacent next vma that we'll walk * next. userfaultfd_remove() will generate an * UFFD_EVENT_REMOVE repetition on the * end-vma->vm_end range, but the manager can * handle a repetition fine. */ end = vma->vm_end; } /* * If the memory region between start and end was * originally backed by 4kB pages and then remapped to * be backed by hugepages while mmap_lock was dropped, * the adjustment for hugetlb vma above may have rounded * end down to the start address. */ if (start == end) return 0; VM_WARN_ON(start > end); } if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED) return madvise_dontneed_single_vma(vma, start, end); else if (behavior == MADV_FREE) return madvise_free_single_vma(vma, start, end); else return -EINVAL; } static long madvise_populate(struct mm_struct *mm, unsigned long start, unsigned long end, int behavior) { const bool write = behavior == MADV_POPULATE_WRITE; int locked = 1; long pages; while (start < end) { /* Populate (prefault) page tables readable/writable. */ pages = faultin_page_range(mm, start, end, write, &locked); if (!locked) { mmap_read_lock(mm); locked = 1; } if (pages < 0) { switch (pages) { case -EINTR: return -EINTR; case -EINVAL: /* Incompatible mappings / permissions. */ return -EINVAL; case -EHWPOISON: return -EHWPOISON; case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */ return -EFAULT; default: pr_warn_once("%s: unhandled return value: %ld\n", __func__, pages); fallthrough; case -ENOMEM: /* No VMA or out of memory. */ return -ENOMEM; } } start += pages * PAGE_SIZE; } return 0; } /* * Application wants to free up the pages and associated backing store. * This is effectively punching a hole into the middle of a file. */ static long madvise_remove(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { loff_t offset; int error; struct file *f; struct mm_struct *mm = vma->vm_mm; *prev = NULL; /* tell sys_madvise we drop mmap_lock */ if (vma->vm_flags & VM_LOCKED) return -EINVAL; f = vma->vm_file; if (!f || !f->f_mapping || !f->f_mapping->host) { return -EINVAL; } if (!vma_is_shared_maywrite(vma)) return -EACCES; offset = (loff_t)(start - vma->vm_start) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); /* * Filesystem's fallocate may need to take i_rwsem. We need to * explicitly grab a reference because the vma (and hence the * vma's reference to the file) can go away as soon as we drop * mmap_lock. */ get_file(f); if (userfaultfd_remove(vma, start, end)) { /* mmap_lock was not released by userfaultfd_remove() */ mmap_read_unlock(mm); } error = vfs_fallocate(f, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, end - start); fput(f); mmap_read_lock(mm); return error; } static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked) { vm_flags_t disallowed = VM_SPECIAL | VM_HUGETLB; /* * A user could lock after setting a guard range but that's fine, as * they'd not be able to fault in. The issue arises when we try to zap * existing locked VMAs. We don't want to do that. */ if (!allow_locked) disallowed |= VM_LOCKED; if (!vma_is_anonymous(vma)) return false; if ((vma->vm_flags & (VM_MAYWRITE | disallowed)) != VM_MAYWRITE) return false; return true; } static bool is_guard_pte_marker(pte_t ptent) { return is_pte_marker(ptent) && is_guard_swp_entry(pte_to_swp_entry(ptent)); } static int guard_install_pud_entry(pud_t *pud, unsigned long addr, unsigned long next, struct mm_walk *walk) { pud_t pudval = pudp_get(pud); /* If huge return >0 so we abort the operation + zap. */ return pud_trans_huge(pudval) || pud_devmap(pudval); } static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { pmd_t pmdval = pmdp_get(pmd); /* If huge return >0 so we abort the operation + zap. */ return pmd_trans_huge(pmdval) || pmd_devmap(pmdval); } static int guard_install_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { pte_t pteval = ptep_get(pte); unsigned long *nr_pages = (unsigned long *)walk->private; /* If there is already a guard page marker, we have nothing to do. */ if (is_guard_pte_marker(pteval)) { (*nr_pages)++; return 0; } /* If populated return >0 so we abort the operation + zap. */ return 1; } static int guard_install_set_pte(unsigned long addr, unsigned long next, pte_t *ptep, struct mm_walk *walk) { unsigned long *nr_pages = (unsigned long *)walk->private; /* Simply install a PTE marker, this causes segfault on access. */ *ptep = make_pte_marker(PTE_MARKER_GUARD); (*nr_pages)++; return 0; } static const struct mm_walk_ops guard_install_walk_ops = { .pud_entry = guard_install_pud_entry, .pmd_entry = guard_install_pmd_entry, .pte_entry = guard_install_pte_entry, .install_pte = guard_install_set_pte, .walk_lock = PGWALK_RDLOCK, }; static long madvise_guard_install(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { long err; int i; *prev = vma; if (!is_valid_guard_vma(vma, /* allow_locked = */false)) return -EINVAL; /* * If we install guard markers, then the range is no longer * empty from a page table perspective and therefore it's * appropriate to have an anon_vma. * * This ensures that on fork, we copy page tables correctly. */ err = anon_vma_prepare(vma); if (err) return err; /* * Optimistically try to install the guard marker pages first. If any * non-guard pages are encountered, give up and zap the range before * trying again. * * We try a few times before giving up and releasing back to userland to * loop around, releasing locks in the process to avoid contention. This * would only happen if there was a great many racing page faults. * * In most cases we should simply install the guard markers immediately * with no zap or looping. */ for (i = 0; i < MAX_MADVISE_GUARD_RETRIES; i++) { unsigned long nr_pages = 0; /* Returns < 0 on error, == 0 if success, > 0 if zap needed. */ err = walk_page_range_mm(vma->vm_mm, start, end, &guard_install_walk_ops, &nr_pages); if (err < 0) return err; if (err == 0) { unsigned long nr_expected_pages = PHYS_PFN(end - start); VM_WARN_ON(nr_pages != nr_expected_pages); return 0; } /* * OK some of the range have non-guard pages mapped, zap * them. This leaves existing guard pages in place. */ zap_page_range_single(vma, start, end - start, NULL); } /* * We were unable to install the guard pages due to being raced by page * faults. This should not happen ordinarily. We return to userspace and * immediately retry, relieving lock contention. */ return restart_syscall(); } static int guard_remove_pud_entry(pud_t *pud, unsigned long addr, unsigned long next, struct mm_walk *walk) { pud_t pudval = pudp_get(pud); /* If huge, cannot have guard pages present, so no-op - skip. */ if (pud_trans_huge(pudval) || pud_devmap(pudval)) walk->action = ACTION_CONTINUE; return 0; } static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { pmd_t pmdval = pmdp_get(pmd); /* If huge, cannot have guard pages present, so no-op - skip. */ if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval)) walk->action = ACTION_CONTINUE; return 0; } static int guard_remove_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { pte_t ptent = ptep_get(pte); if (is_guard_pte_marker(ptent)) { /* Simply clear the PTE marker. */ pte_clear_not_present_full(walk->mm, addr, pte, false); update_mmu_cache(walk->vma, addr, pte); } return 0; } static const struct mm_walk_ops guard_remove_walk_ops = { .pud_entry = guard_remove_pud_entry, .pmd_entry = guard_remove_pmd_entry, .pte_entry = guard_remove_pte_entry, .walk_lock = PGWALK_RDLOCK, }; static long madvise_guard_remove(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { *prev = vma; /* * We're ok with removing guards in mlock()'d ranges, as this is a * non-destructive action. */ if (!is_valid_guard_vma(vma, /* allow_locked = */true)) return -EINVAL; return walk_page_range(vma->vm_mm, start, end, &guard_remove_walk_ops, NULL); } /* * Apply an madvise behavior to a region of a vma. madvise_update_vma * will handle splitting a vm area into separate areas, each area with its own * behavior. */ static int madvise_vma_behavior(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, unsigned long behavior) { int error; struct anon_vma_name *anon_name; unsigned long new_flags = vma->vm_flags; if (unlikely(!can_modify_vma_madv(vma, behavior))) return -EPERM; switch (behavior) { case MADV_REMOVE: return madvise_remove(vma, prev, start, end); case MADV_WILLNEED: return madvise_willneed(vma, prev, start, end); case MADV_COLD: return madvise_cold(vma, prev, start, end); case MADV_PAGEOUT: return madvise_pageout(vma, prev, start, end); case MADV_FREE: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: return madvise_dontneed_free(vma, prev, start, end, behavior); case MADV_NORMAL: new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; break; case MADV_SEQUENTIAL: new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; break; case MADV_RANDOM: new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; break; case MADV_DONTFORK: new_flags |= VM_DONTCOPY; break; case MADV_DOFORK: if (vma->vm_flags & VM_IO) return -EINVAL; new_flags &= ~VM_DONTCOPY; break; case MADV_WIPEONFORK: /* MADV_WIPEONFORK is only supported on anonymous memory. */ if (vma->vm_file || vma->vm_flags & VM_SHARED) return -EINVAL; new_flags |= VM_WIPEONFORK; break; case MADV_KEEPONFORK: if (vma->vm_flags & VM_DROPPABLE) return -EINVAL; new_flags &= ~VM_WIPEONFORK; break; case MADV_DONTDUMP: new_flags |= VM_DONTDUMP; break; case MADV_DODUMP: if ((!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) || (vma->vm_flags & VM_DROPPABLE)) return -EINVAL; new_flags &= ~VM_DONTDUMP; break; case MADV_MERGEABLE: case MADV_UNMERGEABLE: error = ksm_madvise(vma, start, end, behavior, &new_flags); if (error) goto out; break; case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: error = hugepage_madvise(vma, &new_flags, behavior); if (error) goto out; break; case MADV_COLLAPSE: return madvise_collapse(vma, prev, start, end); case MADV_GUARD_INSTALL: return madvise_guard_install(vma, prev, start, end); case MADV_GUARD_REMOVE: return madvise_guard_remove(vma, prev, start, end); } anon_name = anon_vma_name(vma); anon_vma_name_get(anon_name); error = madvise_update_vma(vma, prev, start, end, new_flags, anon_name); anon_vma_name_put(anon_name); out: /* * madvise() returns EAGAIN if kernel resources, such as * slab, are temporarily unavailable. */ if (error == -ENOMEM) error = -EAGAIN; return error; } #ifdef CONFIG_MEMORY_FAILURE /* * Error injection support for memory error handling. */ static int madvise_inject_error(int behavior, unsigned long start, unsigned long end) { unsigned long size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; for (; start < end; start += size) { unsigned long pfn; struct page *page; int ret; ret = get_user_pages_fast(start, 1, 0, &page); if (ret != 1) return ret; pfn = page_to_pfn(page); /* * When soft offlining hugepages, after migrating the page * we dissolve it, therefore in the second loop "page" will * no longer be a compound page. */ size = page_size(compound_head(page)); if (behavior == MADV_SOFT_OFFLINE) { pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", pfn, start); ret = soft_offline_page(pfn, MF_COUNT_INCREASED); } else { pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", pfn, start); ret = memory_failure(pfn, MF_ACTION_REQUIRED | MF_COUNT_INCREASED | MF_SW_SIMULATED); if (ret == -EOPNOTSUPP) ret = 0; } if (ret) return ret; } return 0; } #endif static bool madvise_behavior_valid(int behavior) { switch (behavior) { case MADV_DOFORK: case MADV_DONTFORK: case MADV_NORMAL: case MADV_SEQUENTIAL: case MADV_RANDOM: case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: case MADV_FREE: case MADV_COLD: case MADV_PAGEOUT: case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: #ifdef CONFIG_KSM case MADV_MERGEABLE: case MADV_UNMERGEABLE: #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: case MADV_COLLAPSE: #endif case MADV_DONTDUMP: case MADV_DODUMP: case MADV_WIPEONFORK: case MADV_KEEPONFORK: case MADV_GUARD_INSTALL: case MADV_GUARD_REMOVE: #ifdef CONFIG_MEMORY_FAILURE case MADV_SOFT_OFFLINE: case MADV_HWPOISON: #endif return true; default: return false; } } /* Can we invoke process_madvise() on a remote mm for the specified behavior? */ static bool process_madvise_remote_valid(int behavior) { switch (behavior) { case MADV_COLD: case MADV_PAGEOUT: case MADV_WILLNEED: case MADV_COLLAPSE: return true; default: return false; } } /* * Walk the vmas in range [start,end), and call the visit function on each one. * The visit function will get start and end parameters that cover the overlap * between the current vma and the original range. Any unmapped regions in the * original range will result in this function returning -ENOMEM while still * calling the visit function on all of the existing vmas in the range. * Must be called with the mmap_lock held for reading or writing. */ static int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long arg, int (*visit)(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, unsigned long arg)) { struct vm_area_struct *vma; struct vm_area_struct *prev; unsigned long tmp; int unmapped_error = 0; /* * If the interval [start,end) covers some unmapped address * ranges, just ignore them, but return -ENOMEM at the end. * - different from the way of handling in mlock etc. */ vma = find_vma_prev(mm, start, &prev); if (vma && start > vma->vm_start) prev = vma; for (;;) { int error; /* Still start < end. */ if (!vma) return -ENOMEM; /* Here start < (end|vma->vm_end). */ if (start < vma->vm_start) { unmapped_error = -ENOMEM; start = vma->vm_start; if (start >= end) break; } /* Here vma->vm_start <= start < (end|vma->vm_end) */ tmp = vma->vm_end; if (end < tmp) tmp = end; /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ error = visit(vma, &prev, start, tmp, arg); if (error) return error; start = tmp; if (prev && start < prev->vm_end) start = prev->vm_end; if (start >= end) break; if (prev) vma = find_vma(mm, prev->vm_end); else /* madvise_remove dropped mmap_lock */ vma = find_vma(mm, start); } return unmapped_error; } #ifdef CONFIG_ANON_VMA_NAME static int madvise_vma_anon_name(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, unsigned long anon_name) { int error; /* Only anonymous mappings can be named */ if (vma->vm_file && !vma_is_anon_shmem(vma)) return -EBADF; error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, (struct anon_vma_name *)anon_name); /* * madvise() returns EAGAIN if kernel resources, such as * slab, are temporarily unavailable. */ if (error == -ENOMEM) error = -EAGAIN; return error; } int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, struct anon_vma_name *anon_name) { unsigned long end; unsigned long len; if (start & ~PAGE_MASK) return -EINVAL; len = (len_in + ~PAGE_MASK) & PAGE_MASK; /* Check to see whether len was rounded up from small -ve to zero */ if (len_in && !len) return -EINVAL; end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name, madvise_vma_anon_name); } #endif /* CONFIG_ANON_VMA_NAME */ /* * The madvise(2) system call. * * Applications can use madvise() to advise the kernel how it should * handle paging I/O in this VM area. The idea is to help the kernel * use appropriate read-ahead and caching techniques. The information * provided is advisory only, and can be safely disregarded by the * kernel without affecting the correct operation of the application. * * behavior values: * MADV_NORMAL - the default behavior is to read clusters. This * results in some read-ahead and read-behind. * MADV_RANDOM - the system should read the minimum amount of data * on any access, since it is unlikely that the appli- * cation will need more than what it asks for. * MADV_SEQUENTIAL - pages in the given range will probably be accessed * once, so they can be aggressively read ahead, and * can be freed soon after they are accessed. * MADV_WILLNEED - the application is notifying the system to read * some pages ahead. * MADV_DONTNEED - the application is finished with the given range, * so the kernel can free resources associated with it. * MADV_FREE - the application marks pages in the given range as lazy free, * where actual purges are postponed until memory pressure happens. * MADV_REMOVE - the application wants to free up the given range of * pages and associated backing store. * MADV_DONTFORK - omit this area from child's address space when forking: * typically, to avoid COWing pages pinned by get_user_pages(). * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. * MADV_WIPEONFORK - present the child process with zero-filled memory in this * range after a fork. * MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK * MADV_HWPOISON - trigger memory error handler as if the given memory range * were corrupted by unrecoverable hardware memory failure. * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory. * MADV_MERGEABLE - the application recommends that KSM try to merge pages in * this area with pages of identical content from other such areas. * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. * MADV_HUGEPAGE - the application wants to back the given range by transparent * huge pages in the future. Existing pages might be coalesced and * new pages might be allocated as THP. * MADV_NOHUGEPAGE - mark the given range as not worth being backed by * transparent huge pages so the existing pages will not be * coalesced into THP and new pages will not be allocated as THP. * MADV_COLLAPSE - synchronously coalesce pages into new THP. * MADV_DONTDUMP - the application wants to prevent pages in the given range * from being included in its core dump. * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. * MADV_COLD - the application is not expected to use this memory soon, * deactivate pages in this range so that they can be reclaimed * easily if memory pressure happens. * MADV_PAGEOUT - the application is not expected to use this memory soon, * page out the pages in this range immediately. * MADV_POPULATE_READ - populate (prefault) page tables readable by * triggering read faults if required * MADV_POPULATE_WRITE - populate (prefault) page tables writable by * triggering write faults if required * * return values: * zero - success * -EINVAL - start + len < 0, start is not page-aligned, * "behavior" is not a valid value, or application * is attempting to release locked or shared pages, * or the specified address range includes file, Huge TLB, * MAP_SHARED or VMPFNMAP range. * -ENOMEM - addresses in the specified range are not currently * mapped, or are outside the AS of the process. * -EIO - an I/O error occurred while paging in data. * -EBADF - map exists, but area maps something that isn't a file. * -EAGAIN - a kernel resource was temporarily unavailable. * -EPERM - memory is sealed. */ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) { unsigned long end; int error; int write; size_t len; struct blk_plug plug; if (!madvise_behavior_valid(behavior)) return -EINVAL; if (!PAGE_ALIGNED(start)) return -EINVAL; len = PAGE_ALIGN(len_in); /* Check to see whether len was rounded up from small -ve to zero */ if (len_in && !len) return -EINVAL; end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; #ifdef CONFIG_MEMORY_FAILURE if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) return madvise_inject_error(behavior, start, start + len_in); #endif write = madvise_need_mmap_write(behavior); if (write) { if (mmap_write_lock_killable(mm)) return -EINTR; } else { mmap_read_lock(mm); } start = untagged_addr_remote(mm, start); end = start + len; blk_start_plug(&plug); switch (behavior) { case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: error = madvise_populate(mm, start, end, behavior); break; default: error = madvise_walk_vmas(mm, start, end, behavior, madvise_vma_behavior); break; } blk_finish_plug(&plug); if (write) mmap_write_unlock(mm); else mmap_read_unlock(mm); return error; } SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) { return do_madvise(current->mm, start, len_in, behavior); } /* Perform an madvise operation over a vector of addresses and lengths. */ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, int behavior) { ssize_t ret = 0; size_t total_len; total_len = iov_iter_count(iter); while (iov_iter_count(iter)) { ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter), iter_iov_len(iter), behavior); /* * An madvise operation is attempting to restart the syscall, * but we cannot proceed as it would not be correct to repeat * the operation in aggregate, and would be surprising to the * user. * * As we have already dropped locks, it is safe to just loop and * try again. We check for fatal signals in case we need exit * early anyway. */ if (ret == -ERESTARTNOINTR) { if (fatal_signal_pending(current)) { ret = -EINTR; break; } continue; } if (ret < 0) break; iov_iter_advance(iter, iter_iov_len(iter)); } ret = (total_len - iov_iter_count(iter)) ? : ret; return ret; } SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, size_t, vlen, int, behavior, unsigned int, flags) { ssize_t ret; struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; struct task_struct *task; struct mm_struct *mm; unsigned int f_flags; if (flags != 0) { ret = -EINVAL; goto out; } ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) goto out; task = pidfd_get_task(pidfd, &f_flags); if (IS_ERR(task)) { ret = PTR_ERR(task); goto free_iov; } /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (IS_ERR(mm)) { ret = PTR_ERR(mm); goto release_task; } /* * We need only perform this check if we are attempting to manipulate a * remote process's address space. */ if (mm != current->mm && !process_madvise_remote_valid(behavior)) { ret = -EINVAL; goto release_mm; } /* * Require CAP_SYS_NICE for influencing process performance. Note that * only non-destructive hints are currently supported for remote * processes. */ if (mm != current->mm && !capable(CAP_SYS_NICE)) { ret = -EPERM; goto release_mm; } ret = vector_madvise(mm, &iter, behavior); release_mm: mmput(mm); release_task: put_task_struct(task); free_iov: kfree(iov); out: return ret; }
17 14 17 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 /* * linux/fs/nls/mac-turkish.c * * Charset macturkish translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ /* * COPYRIGHT AND PERMISSION NOTICE * * Copyright 1991-2012 Unicode, Inc. All rights reserved. Distributed under * the Terms of Use in http://www.unicode.org/copyright.html. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of the Unicode data files and any associated documentation (the "Data * Files") or Unicode software and any associated documentation (the * "Software") to deal in the Data Files or Software without restriction, * including without limitation the rights to use, copy, modify, merge, * publish, distribute, and/or sell copies of the Data Files or Software, and * to permit persons to whom the Data Files or Software are furnished to do * so, provided that (a) the above copyright notice(s) and this permission * notice appear with all copies of the Data Files or Software, (b) both the * above copyright notice(s) and this permission notice appear in associated * documentation, and (c) there is clear notice in each modified Data File or * in the Software as well as in the documentation associated with the Data * File(s) or Software that the data or software has been modified. * * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF * THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR * PERFORMANCE OF THE DATA FILES OR SOFTWARE. * * Except as contained in this notice, the name of a copyright holder shall * not be used in advertising or otherwise to promote the sale, use or other * dealings in these Data Files or Software without prior written * authorization of the copyright holder. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00 */ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10 */ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20 */ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30 */ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40 */ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50 */ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60 */ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70 */ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80 */ 0x00c4, 0x00c5, 0x00c7, 0x00c9, 0x00d1, 0x00d6, 0x00dc, 0x00e1, 0x00e0, 0x00e2, 0x00e4, 0x00e3, 0x00e5, 0x00e7, 0x00e9, 0x00e8, /* 0x90 */ 0x00ea, 0x00eb, 0x00ed, 0x00ec, 0x00ee, 0x00ef, 0x00f1, 0x00f3, 0x00f2, 0x00f4, 0x00f6, 0x00f5, 0x00fa, 0x00f9, 0x00fb, 0x00fc, /* 0xa0 */ 0x2020, 0x00b0, 0x00a2, 0x00a3, 0x00a7, 0x2022, 0x00b6, 0x00df, 0x00ae, 0x00a9, 0x2122, 0x00b4, 0x00a8, 0x2260, 0x00c6, 0x00d8, /* 0xb0 */ 0x221e, 0x00b1, 0x2264, 0x2265, 0x00a5, 0x00b5, 0x2202, 0x2211, 0x220f, 0x03c0, 0x222b, 0x00aa, 0x00ba, 0x03a9, 0x00e6, 0x00f8, /* 0xc0 */ 0x00bf, 0x00a1, 0x00ac, 0x221a, 0x0192, 0x2248, 0x2206, 0x00ab, 0x00bb, 0x2026, 0x00a0, 0x00c0, 0x00c3, 0x00d5, 0x0152, 0x0153, /* 0xd0 */ 0x2013, 0x2014, 0x201c, 0x201d, 0x2018, 0x2019, 0x00f7, 0x25ca, 0x00ff, 0x0178, 0x011e, 0x011f, 0x0130, 0x0131, 0x015e, 0x015f, /* 0xe0 */ 0x2021, 0x00b7, 0x201a, 0x201e, 0x2030, 0x00c2, 0x00ca, 0x00c1, 0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf, 0x00cc, 0x00d3, 0x00d4, /* 0xf0 */ 0xf8ff, 0x00d2, 0x00da, 0x00db, 0x00d9, 0xf8a0, 0x02c6, 0x02dc, 0x00af, 0x02d8, 0x02d9, 0x02da, 0x00b8, 0x02dd, 0x02db, 0x02c7, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xca, 0xc1, 0xa2, 0xa3, 0x00, 0xb4, 0x00, 0xa4, /* 0xa0-0xa7 */ 0xac, 0xa9, 0xbb, 0xc7, 0xc2, 0x00, 0xa8, 0xf8, /* 0xa8-0xaf */ 0xa1, 0xb1, 0x00, 0x00, 0xab, 0xb5, 0xa6, 0xe1, /* 0xb0-0xb7 */ 0xfc, 0x00, 0xbc, 0xc8, 0x00, 0x00, 0x00, 0xc0, /* 0xb8-0xbf */ 0xcb, 0xe7, 0xe5, 0xcc, 0x80, 0x81, 0xae, 0x82, /* 0xc0-0xc7 */ 0xe9, 0x83, 0xe6, 0xe8, 0xed, 0xea, 0xeb, 0xec, /* 0xc8-0xcf */ 0x00, 0x84, 0xf1, 0xee, 0xef, 0xcd, 0x85, 0x00, /* 0xd0-0xd7 */ 0xaf, 0xf4, 0xf2, 0xf3, 0x86, 0x00, 0x00, 0xa7, /* 0xd8-0xdf */ 0x88, 0x87, 0x89, 0x8b, 0x8a, 0x8c, 0xbe, 0x8d, /* 0xe0-0xe7 */ 0x8f, 0x8e, 0x90, 0x91, 0x93, 0x92, 0x94, 0x95, /* 0xe8-0xef */ 0x00, 0x96, 0x98, 0x97, 0x99, 0x9b, 0x9a, 0xd6, /* 0xf0-0xf7 */ 0xbf, 0x9d, 0x9c, 0x9e, 0x9f, 0x00, 0x00, 0xd8, /* 0xf8-0xff */ }; static const unsigned char page01[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xda, 0xdb, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0xdc, 0xdd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0xce, 0xcf, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xde, 0xdf, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0xd9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page02[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0xff, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0xf9, 0xfa, 0xfb, 0xfe, 0xf7, 0xfd, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page03[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0xbd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page20[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0xd0, 0xd1, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0xd4, 0xd5, 0xe2, 0x00, 0xd2, 0xd3, 0xe3, 0x00, /* 0x18-0x1f */ 0xa0, 0xe0, 0xa5, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0xe4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page21[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page22[256] = { 0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, 0xc6, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb8, /* 0x08-0x0f */ 0x00, 0xb7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, 0xb0, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0xba, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0xad, 0x00, 0x00, 0x00, 0xb2, 0xb3, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page25[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0xd7, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char pagef8[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, /* 0xf8-0xff */ }; static const unsigned char *const page_uni2charset[256] = { page00, page01, page02, page03, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page20, page21, page22, NULL, NULL, page25, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, pagef8, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "macturkish", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_macturkish(void) { return register_nls(&table); } static void __exit exit_nls_macturkish(void) { unregister_nls(&table); } module_init(init_nls_macturkish) module_exit(exit_nls_macturkish) MODULE_DESCRIPTION("NLS Codepage macturkish"); MODULE_LICENSE("Dual BSD/GPL");
116 116 116 241 241 116 179 178 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 // SPDX-License-Identifier: GPL-2.0 /* * This file contains functions which manage clock event devices. * * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner */ #include <linux/clockchips.h> #include <linux/hrtimer.h> #include <linux/init.h> #include <linux/module.h> #include <linux/smp.h> #include <linux/device.h> #include "tick-internal.h" /* The registered clock event devices */ static LIST_HEAD(clockevent_devices); static LIST_HEAD(clockevents_released); /* Protection for the above */ static DEFINE_RAW_SPINLOCK(clockevents_lock); /* Protection for unbind operations */ static DEFINE_MUTEX(clockevents_mutex); struct ce_unbind { struct clock_event_device *ce; int res; }; static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, bool ismax) { u64 clc = (u64) latch << evt->shift; u64 rnd; if (WARN_ON(!evt->mult)) evt->mult = 1; rnd = (u64) evt->mult - 1; /* * Upper bound sanity check. If the backwards conversion is * not equal latch, we know that the above shift overflowed. */ if ((clc >> evt->shift) != (u64)latch) clc = ~0ULL; /* * Scaled math oddities: * * For mult <= (1 << shift) we can safely add mult - 1 to * prevent integer rounding loss. So the backwards conversion * from nsec to device ticks will be correct. * * For mult > (1 << shift), i.e. device frequency is > 1GHz we * need to be careful. Adding mult - 1 will result in a value * which when converted back to device ticks can be larger * than latch by up to (mult - 1) >> shift. For the min_delta * calculation we still want to apply this in order to stay * above the minimum device ticks limit. For the upper limit * we would end up with a latch value larger than the upper * limit of the device, so we omit the add to stay below the * device upper boundary. * * Also omit the add if it would overflow the u64 boundary. */ if ((~0ULL - clc > rnd) && (!ismax || evt->mult <= (1ULL << evt->shift))) clc += rnd; do_div(clc, evt->mult); /* Deltas less than 1usec are pointless noise */ return clc > 1000 ? clc : 1000; } /** * clockevent_delta2ns - Convert a latch value (device ticks) to nanoseconds * @latch: value to convert * @evt: pointer to clock event device descriptor * * Math helper, returns latch value converted to nanoseconds (bound checked) */ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) { return cev_delta2ns(latch, evt, false); } EXPORT_SYMBOL_GPL(clockevent_delta2ns); static int __clockevents_switch_state(struct clock_event_device *dev, enum clock_event_state state) { if (dev->features & CLOCK_EVT_FEAT_DUMMY) return 0; /* Transition with new state-specific callbacks */ switch (state) { case CLOCK_EVT_STATE_DETACHED: /* The clockevent device is getting replaced. Shut it down. */ case CLOCK_EVT_STATE_SHUTDOWN: if (dev->set_state_shutdown) return dev->set_state_shutdown(dev); return 0; case CLOCK_EVT_STATE_PERIODIC: /* Core internal bug */ if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC)) return -ENOSYS; if (dev->set_state_periodic) return dev->set_state_periodic(dev); return 0; case CLOCK_EVT_STATE_ONESHOT: /* Core internal bug */ if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) return -ENOSYS; if (dev->set_state_oneshot) return dev->set_state_oneshot(dev); return 0; case CLOCK_EVT_STATE_ONESHOT_STOPPED: /* Core internal bug */ if (WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n", clockevent_get_state(dev))) return -EINVAL; if (dev->set_state_oneshot_stopped) return dev->set_state_oneshot_stopped(dev); else return -ENOSYS; default: return -ENOSYS; } } /** * clockevents_switch_state - set the operating state of a clock event device * @dev: device to modify * @state: new state * * Must be called with interrupts disabled ! */ void clockevents_switch_state(struct clock_event_device *dev, enum clock_event_state state) { if (clockevent_get_state(dev) != state) { if (__clockevents_switch_state(dev, state)) return; clockevent_set_state(dev, state); /* * A nsec2cyc multiplicator of 0 is invalid and we'd crash * on it, so fix it up and emit a warning: */ if (clockevent_state_oneshot(dev)) { if (WARN_ON(!dev->mult)) dev->mult = 1; } } } /** * clockevents_shutdown - shutdown the device and clear next_event * @dev: device to shutdown */ void clockevents_shutdown(struct clock_event_device *dev) { clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN); dev->next_event = KTIME_MAX; } /** * clockevents_tick_resume - Resume the tick device before using it again * @dev: device to resume */ int clockevents_tick_resume(struct clock_event_device *dev) { int ret = 0; if (dev->tick_resume) ret = dev->tick_resume(dev); return ret; } #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST /* Limit min_delta to a jiffy */ #define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) /** * clockevents_increase_min_delta - raise minimum delta of a clock event device * @dev: device to increase the minimum delta * * Returns 0 on success, -ETIME when the minimum delta reached the limit. */ static int clockevents_increase_min_delta(struct clock_event_device *dev) { /* Nothing to do if we already reached the limit */ if (dev->min_delta_ns >= MIN_DELTA_LIMIT) { printk_deferred(KERN_WARNING "CE: Reprogramming failure. Giving up\n"); dev->next_event = KTIME_MAX; return -ETIME; } if (dev->min_delta_ns < 5000) dev->min_delta_ns = 5000; else dev->min_delta_ns += dev->min_delta_ns >> 1; if (dev->min_delta_ns > MIN_DELTA_LIMIT) dev->min_delta_ns = MIN_DELTA_LIMIT; printk_deferred(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", dev->name ? dev->name : "?", (unsigned long long) dev->min_delta_ns); return 0; } /** * clockevents_program_min_delta - Set clock event device to the minimum delay. * @dev: device to program * * Returns 0 on success, -ETIME when the retry loop failed. */ static int clockevents_program_min_delta(struct clock_event_device *dev) { unsigned long long clc; int64_t delta; int i; for (i = 0;;) { delta = dev->min_delta_ns; dev->next_event = ktime_add_ns(ktime_get(), delta); if (clockevent_state_shutdown(dev)) return 0; dev->retries++; clc = ((unsigned long long) delta * dev->mult) >> dev->shift; if (dev->set_next_event((unsigned long) clc, dev) == 0) return 0; if (++i > 2) { /* * We tried 3 times to program the device with the * given min_delta_ns. Try to increase the minimum * delta, if that fails as well get out of here. */ if (clockevents_increase_min_delta(dev)) return -ETIME; i = 0; } } } #else /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ /** * clockevents_program_min_delta - Set clock event device to the minimum delay. * @dev: device to program * * Returns 0 on success, -ETIME when the retry loop failed. */ static int clockevents_program_min_delta(struct clock_event_device *dev) { unsigned long long clc; int64_t delta = 0; int i; for (i = 0; i < 10; i++) { delta += dev->min_delta_ns; dev->next_event = ktime_add_ns(ktime_get(), delta); if (clockevent_state_shutdown(dev)) return 0; dev->retries++; clc = ((unsigned long long) delta * dev->mult) >> dev->shift; if (dev->set_next_event((unsigned long) clc, dev) == 0) return 0; } return -ETIME; } #endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ /** * clockevents_program_event - Reprogram the clock event device. * @dev: device to program * @expires: absolute expiry time (monotonic clock) * @force: program minimum delay if expires can not be set * * Returns 0 on success, -ETIME when the event is in the past. */ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, bool force) { unsigned long long clc; int64_t delta; int rc; if (WARN_ON_ONCE(expires < 0)) return -ETIME; dev->next_event = expires; if (clockevent_state_shutdown(dev)) return 0; /* We must be in ONESHOT state here */ WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n", clockevent_get_state(dev)); /* Shortcut for clockevent devices that can deal with ktime. */ if (dev->features & CLOCK_EVT_FEAT_KTIME) return dev->set_next_ktime(expires, dev); delta = ktime_to_ns(ktime_sub(expires, ktime_get())); if (delta <= 0) return force ? clockevents_program_min_delta(dev) : -ETIME; delta = min(delta, (int64_t) dev->max_delta_ns); delta = max(delta, (int64_t) dev->min_delta_ns); clc = ((unsigned long long) delta * dev->mult) >> dev->shift; rc = dev->set_next_event((unsigned long) clc, dev); return (rc && force) ? clockevents_program_min_delta(dev) : rc; } /* * Called after a clockevent has been added which might * have replaced a current regular or broadcast device. A * released normal device might be a suitable replacement * for the current broadcast device. Similarly a released * broadcast device might be a suitable replacement for a * normal device. */ static void clockevents_notify_released(void) { struct clock_event_device *dev; /* * Keep iterating as long as tick_check_new_device() * replaces a device. */ while (!list_empty(&clockevents_released)) { dev = list_entry(clockevents_released.next, struct clock_event_device, list); list_move(&dev->list, &clockevent_devices); tick_check_new_device(dev); } } /* * Try to install a replacement clock event device */ static int clockevents_replace(struct clock_event_device *ced) { struct clock_event_device *dev, *newdev = NULL; list_for_each_entry(dev, &clockevent_devices, list) { if (dev == ced || !clockevent_state_detached(dev)) continue; if (!tick_check_replacement(newdev, dev)) continue; if (!try_module_get(dev->owner)) continue; if (newdev) module_put(newdev->owner); newdev = dev; } if (newdev) { tick_install_replacement(newdev); list_del_init(&ced->list); } return newdev ? 0 : -EBUSY; } /* * Called with clockevents_mutex and clockevents_lock held */ static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) { /* Fast track. Device is unused */ if (clockevent_state_detached(ced)) { list_del_init(&ced->list); return 0; } return ced == per_cpu(tick_cpu_device, cpu).evtdev ? -EAGAIN : -EBUSY; } /* * SMP function call to unbind a device */ static void __clockevents_unbind(void *arg) { struct ce_unbind *cu = arg; int res; raw_spin_lock(&clockevents_lock); res = __clockevents_try_unbind(cu->ce, smp_processor_id()); if (res == -EAGAIN) res = clockevents_replace(cu->ce); cu->res = res; raw_spin_unlock(&clockevents_lock); } /* * Issues smp function call to unbind a per cpu device. Called with * clockevents_mutex held. */ static int clockevents_unbind(struct clock_event_device *ced, int cpu) { struct ce_unbind cu = { .ce = ced, .res = -ENODEV }; smp_call_function_single(cpu, __clockevents_unbind, &cu, 1); return cu.res; } /* * Unbind a clockevents device. */ int clockevents_unbind_device(struct clock_event_device *ced, int cpu) { int ret; mutex_lock(&clockevents_mutex); ret = clockevents_unbind(ced, cpu); mutex_unlock(&clockevents_mutex); return ret; } EXPORT_SYMBOL_GPL(clockevents_unbind_device); /** * clockevents_register_device - register a clock event device * @dev: device to register */ void clockevents_register_device(struct clock_event_device *dev) { unsigned long flags; /* Initialize state to DETACHED */ clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); if (!dev->cpumask) { WARN_ON(num_possible_cpus() > 1); dev->cpumask = cpumask_of(smp_processor_id()); } if (dev->cpumask == cpu_all_mask) { WARN(1, "%s cpumask == cpu_all_mask, using cpu_possible_mask instead\n", dev->name); dev->cpumask = cpu_possible_mask; } raw_spin_lock_irqsave(&clockevents_lock, flags); list_add(&dev->list, &clockevent_devices); tick_check_new_device(dev); clockevents_notify_released(); raw_spin_unlock_irqrestore(&clockevents_lock, flags); } EXPORT_SYMBOL_GPL(clockevents_register_device); static void clockevents_config(struct clock_event_device *dev, u32 freq) { u64 sec; if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) return; /* * Calculate the maximum number of seconds we can sleep. Limit * to 10 minutes for hardware which can program more than * 32bit ticks so we still get reasonable conversion values. */ sec = dev->max_delta_ticks; do_div(sec, freq); if (!sec) sec = 1; else if (sec > 600 && dev->max_delta_ticks > UINT_MAX) sec = 600; clockevents_calc_mult_shift(dev, freq, sec); dev->min_delta_ns = cev_delta2ns(dev->min_delta_ticks, dev, false); dev->max_delta_ns = cev_delta2ns(dev->max_delta_ticks, dev, true); } /** * clockevents_config_and_register - Configure and register a clock event device * @dev: device to register * @freq: The clock frequency * @min_delta: The minimum clock ticks to program in oneshot mode * @max_delta: The maximum clock ticks to program in oneshot mode * * min/max_delta can be 0 for devices which do not support oneshot mode. */ void clockevents_config_and_register(struct clock_event_device *dev, u32 freq, unsigned long min_delta, unsigned long max_delta) { dev->min_delta_ticks = min_delta; dev->max_delta_ticks = max_delta; clockevents_config(dev, freq); clockevents_register_device(dev); } EXPORT_SYMBOL_GPL(clockevents_config_and_register); int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) { clockevents_config(dev, freq); if (clockevent_state_oneshot(dev)) return clockevents_program_event(dev, dev->next_event, false); if (clockevent_state_periodic(dev)) return __clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC); return 0; } /** * clockevents_update_freq - Update frequency and reprogram a clock event device. * @dev: device to modify * @freq: new device frequency * * Reconfigure and reprogram a clock event device in oneshot * mode. Must be called on the cpu for which the device delivers per * cpu timer events. If called for the broadcast device the core takes * care of serialization. * * Returns 0 on success, -ETIME when the event is in the past. */ int clockevents_update_freq(struct clock_event_device *dev, u32 freq) { unsigned long flags; int ret; local_irq_save(flags); ret = tick_broadcast_update_freq(dev, freq); if (ret == -ENODEV) ret = __clockevents_update_freq(dev, freq); local_irq_restore(flags); return ret; } /* * Noop handler when we shut down an event device */ void clockevents_handle_noop(struct clock_event_device *dev) { } /** * clockevents_exchange_device - release and request clock devices * @old: device to release (can be NULL) * @new: device to request (can be NULL) * * Called from various tick functions with clockevents_lock held and * interrupts disabled. */ void clockevents_exchange_device(struct clock_event_device *old, struct clock_event_device *new) { /* * Caller releases a clock event device. We queue it into the * released list and do a notify add later. */ if (old) { module_put(old->owner); clockevents_switch_state(old, CLOCK_EVT_STATE_DETACHED); list_move(&old->list, &clockevents_released); } if (new) { BUG_ON(!clockevent_state_detached(new)); clockevents_shutdown(new); } } /** * clockevents_suspend - suspend clock devices */ void clockevents_suspend(void) { struct clock_event_device *dev; list_for_each_entry_reverse(dev, &clockevent_devices, list) if (dev->suspend && !clockevent_state_detached(dev)) dev->suspend(dev); } /** * clockevents_resume - resume clock devices */ void clockevents_resume(void) { struct clock_event_device *dev; list_for_each_entry(dev, &clockevent_devices, list) if (dev->resume && !clockevent_state_detached(dev)) dev->resume(dev); } #ifdef CONFIG_HOTPLUG_CPU /** * tick_offline_cpu - Shutdown all clock events related * to this CPU and take it out of the * broadcast mechanism. * @cpu: The outgoing CPU * * Called by the dying CPU during teardown. */ void tick_offline_cpu(unsigned int cpu) { struct clock_event_device *dev, *tmp; raw_spin_lock(&clockevents_lock); tick_broadcast_offline(cpu); tick_shutdown(cpu); /* * Unregister the clock event devices which were * released above. */ list_for_each_entry_safe(dev, tmp, &clockevents_released, list) list_del(&dev->list); /* * Now check whether the CPU has left unused per cpu devices */ list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { if (cpumask_test_cpu(cpu, dev->cpumask) && cpumask_weight(dev->cpumask) == 1 && !tick_is_broadcast_device(dev)) { BUG_ON(!clockevent_state_detached(dev)); list_del(&dev->list); } } raw_spin_unlock(&clockevents_lock); } #endif #ifdef CONFIG_SYSFS static const struct bus_type clockevents_subsys = { .name = "clockevents", .dev_name = "clockevent", }; static DEFINE_PER_CPU(struct device, tick_percpu_dev); static struct tick_device *tick_get_tick_dev(struct device *dev); static ssize_t current_device_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tick_device *td; ssize_t count = 0; raw_spin_lock_irq(&clockevents_lock); td = tick_get_tick_dev(dev); if (td && td->evtdev) count = sysfs_emit(buf, "%s\n", td->evtdev->name); raw_spin_unlock_irq(&clockevents_lock); return count; } static DEVICE_ATTR_RO(current_device); /* We don't support the abomination of removable broadcast devices */ static ssize_t unbind_device_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { char name[CS_NAME_LEN]; ssize_t ret = sysfs_get_uname(buf, name, count); struct clock_event_device *ce = NULL, *iter; if (ret < 0) return ret; ret = -ENODEV; mutex_lock(&clockevents_mutex); raw_spin_lock_irq(&clockevents_lock); list_for_each_entry(iter, &clockevent_devices, list) { if (!strcmp(iter->name, name)) { ret = __clockevents_try_unbind(iter, dev->id); ce = iter; break; } } raw_spin_unlock_irq(&clockevents_lock); /* * We hold clockevents_mutex, so ce can't go away */ if (ret == -EAGAIN) ret = clockevents_unbind(ce, dev->id); mutex_unlock(&clockevents_mutex); return ret ? ret : count; } static DEVICE_ATTR_WO(unbind_device); #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST static struct device tick_bc_dev = { .init_name = "broadcast", .id = 0, .bus = &clockevents_subsys, }; static struct tick_device *tick_get_tick_dev(struct device *dev) { return dev == &tick_bc_dev ? tick_get_broadcast_device() : &per_cpu(tick_cpu_device, dev->id); } static __init int tick_broadcast_init_sysfs(void) { int err = device_register(&tick_bc_dev); if (!err) err = device_create_file(&tick_bc_dev, &dev_attr_current_device); return err; } #else static struct tick_device *tick_get_tick_dev(struct device *dev) { return &per_cpu(tick_cpu_device, dev->id); } static inline int tick_broadcast_init_sysfs(void) { return 0; } #endif static int __init tick_init_sysfs(void) { int cpu; for_each_possible_cpu(cpu) { struct device *dev = &per_cpu(tick_percpu_dev, cpu); int err; dev->id = cpu; dev->bus = &clockevents_subsys; err = device_register(dev); if (!err) err = device_create_file(dev, &dev_attr_current_device); if (!err) err = device_create_file(dev, &dev_attr_unbind_device); if (err) return err; } return tick_broadcast_init_sysfs(); } static int __init clockevents_init_sysfs(void) { int err = subsys_system_register(&clockevents_subsys, NULL); if (!err) err = tick_init_sysfs(); return err; } device_initcall(clockevents_init_sysfs); #endif /* SYSFS */
205 3 59 59 59 61 61 61 2 59 2 1 57 45 58 59 7 59 59 59 66 68 62 61 8 58 15 68 13 13 2 12 64 64 180 136 29 6 240 231 14 178 4 31 5 115 1 20 324 245 7 16 15 27 58 57 41 41 41 41 58 58 58 58 58 58 35 35 31 31 7 31 91 92 92 92 192 189 101 253 253 253 1 341 103 330 339 342 342 342 306 307 139 139 310 305 307 311 229 136 17 162 259 259 164 142 6 137 254 256 1 256 65 2 2 2 1 234 7 231 232 232 6 55 56 54 4 3 7 3 7 7 56 56 10 3 4 166 73 136 214 202 58 4 211 44 211 166 166 211 3 154 209 211 154 146 13 3 4 201 1 130 4 128 139 11 129 131 131 131 58 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. */ #include <linux/slab.h> #include <linux/compat.h> #include <linux/bio.h> #include <linux/buffer_head.h> #include "exfat_raw.h" #include "exfat_fs.h" static int exfat_extract_uni_name(struct exfat_dentry *ep, unsigned short *uniname) { int i, len = 0; for (i = 0; i < EXFAT_FILE_NAME_LEN; i++) { *uniname = le16_to_cpu(ep->dentry.name.unicode_0_14[i]); if (*uniname == 0x0) return len; uniname++; len++; } *uniname = 0x0; return len; } static int exfat_get_uniname_from_ext_entry(struct super_block *sb, struct exfat_chain *p_dir, int entry, unsigned short *uniname) { int i, err; struct exfat_entry_set_cache es; unsigned int uni_len = 0, len; err = exfat_get_dentry_set(&es, sb, p_dir, entry, ES_ALL_ENTRIES); if (err) return err; /* * First entry : file entry * Second entry : stream-extension entry * Third entry : first file-name entry * So, the index of first file-name dentry should start from 2. */ for (i = ES_IDX_FIRST_FILENAME; i < es.num_entries; i++) { struct exfat_dentry *ep = exfat_get_dentry_cached(&es, i); /* end of name entry */ if (exfat_get_entry_type(ep) != TYPE_EXTEND) break; len = exfat_extract_uni_name(ep, uniname); uni_len += len; if (len != EXFAT_FILE_NAME_LEN || uni_len >= MAX_NAME_LENGTH) break; uniname += EXFAT_FILE_NAME_LEN; } exfat_put_dentry_set(&es, false); return 0; } /* read a directory entry from the opened directory */ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_entry *dir_entry) { int i, dentries_per_clu, num_ext, err; unsigned int type, clu_offset, max_dentries; struct exfat_chain dir, clu; struct exfat_uni_name uni_name; struct exfat_dentry *ep; struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(inode); unsigned int dentry = EXFAT_B_TO_DEN(*cpos) & 0xFFFFFFFF; struct buffer_head *bh; /* check if the given file ID is opened */ if (ei->type != TYPE_DIR) return -EPERM; exfat_chain_set(&dir, ei->start_clu, EXFAT_B_TO_CLU(i_size_read(inode), sbi), ei->flags); dentries_per_clu = sbi->dentries_per_clu; max_dentries = (unsigned int)min_t(u64, MAX_EXFAT_DENTRIES, (u64)EXFAT_CLU_TO_DEN(sbi->num_clusters, sbi)); clu_offset = EXFAT_DEN_TO_CLU(dentry, sbi); exfat_chain_dup(&clu, &dir); if (clu.flags == ALLOC_NO_FAT_CHAIN) { clu.dir += clu_offset; clu.size -= clu_offset; } else { /* hint_information */ if (clu_offset > 0 && ei->hint_bmap.off != EXFAT_EOF_CLUSTER && ei->hint_bmap.off > 0 && clu_offset >= ei->hint_bmap.off) { clu_offset -= ei->hint_bmap.off; clu.dir = ei->hint_bmap.clu; } while (clu_offset > 0 && clu.dir != EXFAT_EOF_CLUSTER) { if (exfat_get_next_cluster(sb, &(clu.dir))) return -EIO; clu_offset--; } } while (clu.dir != EXFAT_EOF_CLUSTER && dentry < max_dentries) { i = dentry & (dentries_per_clu - 1); for ( ; i < dentries_per_clu; i++, dentry++) { ep = exfat_get_dentry(sb, &clu, i, &bh); if (!ep) return -EIO; type = exfat_get_entry_type(ep); if (type == TYPE_UNUSED) { brelse(bh); goto out; } if (type != TYPE_FILE && type != TYPE_DIR) { brelse(bh); continue; } num_ext = ep->dentry.file.num_ext; dir_entry->attr = le16_to_cpu(ep->dentry.file.attr); *uni_name.name = 0x0; err = exfat_get_uniname_from_ext_entry(sb, &clu, i, uni_name.name); if (err) { brelse(bh); continue; } exfat_utf16_to_nls(sb, &uni_name, dir_entry->namebuf.lfn, dir_entry->namebuf.lfnbuf_len); brelse(bh); ep = exfat_get_dentry(sb, &clu, i + 1, &bh); if (!ep) return -EIO; dir_entry->entry = i; dir_entry->dir = clu; brelse(bh); ei->hint_bmap.off = EXFAT_DEN_TO_CLU(dentry, sbi); ei->hint_bmap.clu = clu.dir; *cpos = EXFAT_DEN_TO_B(dentry + 1 + num_ext); return 0; } if (clu.flags == ALLOC_NO_FAT_CHAIN) { if (--clu.size > 0) clu.dir++; else clu.dir = EXFAT_EOF_CLUSTER; } else { if (exfat_get_next_cluster(sb, &(clu.dir))) return -EIO; } } out: dir_entry->namebuf.lfn[0] = '\0'; *cpos = EXFAT_DEN_TO_B(dentry); return 0; } static void exfat_init_namebuf(struct exfat_dentry_namebuf *nb) { nb->lfn = NULL; nb->lfnbuf_len = 0; } static int exfat_alloc_namebuf(struct exfat_dentry_namebuf *nb) { nb->lfn = __getname(); if (!nb->lfn) return -ENOMEM; nb->lfnbuf_len = MAX_VFSNAME_BUF_SIZE; return 0; } static void exfat_free_namebuf(struct exfat_dentry_namebuf *nb) { if (!nb->lfn) return; __putname(nb->lfn); exfat_init_namebuf(nb); } /* * Before calling dir_emit*(), sbi->s_lock should be released * because page fault can occur in dir_emit*(). */ #define ITER_POS_FILLED_DOTS (2) static int exfat_iterate(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct inode *tmp; struct exfat_dir_entry de; struct exfat_dentry_namebuf *nb = &(de.namebuf); struct exfat_inode_info *ei = EXFAT_I(inode); unsigned long inum; loff_t cpos, i_pos; int err = 0, fake_offset = 0; exfat_init_namebuf(nb); cpos = ctx->pos; if (!dir_emit_dots(file, ctx)) goto out; if (ctx->pos == ITER_POS_FILLED_DOTS) { cpos = 0; fake_offset = 1; } cpos = round_up(cpos, DENTRY_SIZE); /* name buffer should be allocated before use */ err = exfat_alloc_namebuf(nb); if (err) goto out; get_new: mutex_lock(&EXFAT_SB(sb)->s_lock); if (ei->flags == ALLOC_NO_FAT_CHAIN && cpos >= i_size_read(inode)) goto end_of_dir; err = exfat_readdir(inode, &cpos, &de); if (err) { /* * At least we tried to read a sector. * Move cpos to next sector position (should be aligned). */ if (err == -EIO) { cpos += 1 << (sb->s_blocksize_bits); cpos &= ~(sb->s_blocksize - 1); } err = -EIO; goto end_of_dir; } if (!nb->lfn[0]) goto end_of_dir; i_pos = ((loff_t)de.dir.dir << 32) | (de.entry & 0xffffffff); tmp = exfat_iget(sb, i_pos); if (tmp) { inum = tmp->i_ino; iput(tmp); } else { inum = iunique(sb, EXFAT_ROOT_INO); } mutex_unlock(&EXFAT_SB(sb)->s_lock); if (!dir_emit(ctx, nb->lfn, strlen(nb->lfn), inum, (de.attr & EXFAT_ATTR_SUBDIR) ? DT_DIR : DT_REG)) goto out; ctx->pos = cpos; goto get_new; end_of_dir: if (!cpos && fake_offset) cpos = ITER_POS_FILLED_DOTS; ctx->pos = cpos; mutex_unlock(&EXFAT_SB(sb)->s_lock); out: /* * To improve performance, free namebuf after unlock sb_lock. * If namebuf is not allocated, this function do nothing */ exfat_free_namebuf(nb); return err; } WRAP_DIR_ITER(exfat_iterate) // FIXME! const struct file_operations exfat_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = shared_exfat_iterate, .unlocked_ioctl = exfat_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = exfat_compat_ioctl, #endif .fsync = exfat_file_fsync, }; int exfat_alloc_new_dir(struct inode *inode, struct exfat_chain *clu) { int ret; exfat_chain_set(clu, EXFAT_EOF_CLUSTER, 0, ALLOC_NO_FAT_CHAIN); ret = exfat_alloc_cluster(inode, 1, clu, IS_DIRSYNC(inode)); if (ret) return ret; return exfat_zeroed_cluster(inode, clu->dir); } int exfat_calc_num_entries(struct exfat_uni_name *p_uniname) { int len; len = p_uniname->name_len; if (len == 0) return -EINVAL; /* 1 file entry + 1 stream entry + name entries */ return ES_ENTRY_NUM(len); } unsigned int exfat_get_entry_type(struct exfat_dentry *ep) { if (ep->type == EXFAT_UNUSED) return TYPE_UNUSED; if (IS_EXFAT_DELETED(ep->type)) return TYPE_DELETED; if (ep->type == EXFAT_INVAL) return TYPE_INVALID; if (IS_EXFAT_CRITICAL_PRI(ep->type)) { if (ep->type == EXFAT_BITMAP) return TYPE_BITMAP; if (ep->type == EXFAT_UPCASE) return TYPE_UPCASE; if (ep->type == EXFAT_VOLUME) return TYPE_VOLUME; if (ep->type == EXFAT_FILE) { if (le16_to_cpu(ep->dentry.file.attr) & EXFAT_ATTR_SUBDIR) return TYPE_DIR; return TYPE_FILE; } return TYPE_CRITICAL_PRI; } if (IS_EXFAT_BENIGN_PRI(ep->type)) { if (ep->type == EXFAT_GUID) return TYPE_GUID; if (ep->type == EXFAT_PADDING) return TYPE_PADDING; if (ep->type == EXFAT_ACLTAB) return TYPE_ACLTAB; return TYPE_BENIGN_PRI; } if (IS_EXFAT_CRITICAL_SEC(ep->type)) { if (ep->type == EXFAT_STREAM) return TYPE_STREAM; if (ep->type == EXFAT_NAME) return TYPE_EXTEND; if (ep->type == EXFAT_ACL) return TYPE_ACL; return TYPE_CRITICAL_SEC; } if (ep->type == EXFAT_VENDOR_EXT) return TYPE_VENDOR_EXT; if (ep->type == EXFAT_VENDOR_ALLOC) return TYPE_VENDOR_ALLOC; return TYPE_BENIGN_SEC; } static void exfat_set_entry_type(struct exfat_dentry *ep, unsigned int type) { if (type == TYPE_UNUSED) { ep->type = EXFAT_UNUSED; } else if (type == TYPE_DELETED) { ep->type &= EXFAT_DELETE; } else if (type == TYPE_STREAM) { ep->type = EXFAT_STREAM; } else if (type == TYPE_EXTEND) { ep->type = EXFAT_NAME; } else if (type == TYPE_BITMAP) { ep->type = EXFAT_BITMAP; } else if (type == TYPE_UPCASE) { ep->type = EXFAT_UPCASE; } else if (type == TYPE_VOLUME) { ep->type = EXFAT_VOLUME; } else if (type == TYPE_DIR) { ep->type = EXFAT_FILE; ep->dentry.file.attr = cpu_to_le16(EXFAT_ATTR_SUBDIR); } else if (type == TYPE_FILE) { ep->type = EXFAT_FILE; ep->dentry.file.attr = cpu_to_le16(EXFAT_ATTR_ARCHIVE); } } static void exfat_init_stream_entry(struct exfat_dentry *ep, unsigned int start_clu, unsigned long long size) { memset(ep, 0, sizeof(*ep)); exfat_set_entry_type(ep, TYPE_STREAM); if (size == 0) ep->dentry.stream.flags = ALLOC_FAT_CHAIN; else ep->dentry.stream.flags = ALLOC_NO_FAT_CHAIN; ep->dentry.stream.start_clu = cpu_to_le32(start_clu); ep->dentry.stream.valid_size = cpu_to_le64(size); ep->dentry.stream.size = cpu_to_le64(size); } static void exfat_init_name_entry(struct exfat_dentry *ep, unsigned short *uniname) { int i; exfat_set_entry_type(ep, TYPE_EXTEND); ep->dentry.name.flags = 0x0; for (i = 0; i < EXFAT_FILE_NAME_LEN; i++) { if (*uniname != 0x0) { ep->dentry.name.unicode_0_14[i] = cpu_to_le16(*uniname); uniname++; } else { ep->dentry.name.unicode_0_14[i] = 0x0; } } } void exfat_init_dir_entry(struct exfat_entry_set_cache *es, unsigned int type, unsigned int start_clu, unsigned long long size, struct timespec64 *ts) { struct super_block *sb = es->sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_dentry *ep; ep = exfat_get_dentry_cached(es, ES_IDX_FILE); memset(ep, 0, sizeof(*ep)); exfat_set_entry_type(ep, type); exfat_set_entry_time(sbi, ts, &ep->dentry.file.create_tz, &ep->dentry.file.create_time, &ep->dentry.file.create_date, &ep->dentry.file.create_time_cs); exfat_set_entry_time(sbi, ts, &ep->dentry.file.modify_tz, &ep->dentry.file.modify_time, &ep->dentry.file.modify_date, &ep->dentry.file.modify_time_cs); exfat_set_entry_time(sbi, ts, &ep->dentry.file.access_tz, &ep->dentry.file.access_time, &ep->dentry.file.access_date, NULL); ep = exfat_get_dentry_cached(es, ES_IDX_STREAM); exfat_init_stream_entry(ep, start_clu, size); } static void exfat_free_benign_secondary_clusters(struct inode *inode, struct exfat_dentry *ep) { struct super_block *sb = inode->i_sb; struct exfat_chain dir; unsigned int start_clu = le32_to_cpu(ep->dentry.generic_secondary.start_clu); u64 size = le64_to_cpu(ep->dentry.generic_secondary.size); unsigned char flags = ep->dentry.generic_secondary.flags; if (!(flags & ALLOC_POSSIBLE) || !start_clu || !size) return; exfat_chain_set(&dir, start_clu, EXFAT_B_TO_CLU_ROUND_UP(size, EXFAT_SB(sb)), flags); exfat_free_cluster(inode, &dir); } void exfat_init_ext_entry(struct exfat_entry_set_cache *es, int num_entries, struct exfat_uni_name *p_uniname) { int i; unsigned short *uniname = p_uniname->name; struct exfat_dentry *ep; ep = exfat_get_dentry_cached(es, ES_IDX_FILE); ep->dentry.file.num_ext = (unsigned char)(num_entries - 1); ep = exfat_get_dentry_cached(es, ES_IDX_STREAM); ep->dentry.stream.name_len = p_uniname->name_len; ep->dentry.stream.name_hash = cpu_to_le16(p_uniname->name_hash); for (i = ES_IDX_FIRST_FILENAME; i < num_entries; i++) { ep = exfat_get_dentry_cached(es, i); exfat_init_name_entry(ep, uniname); uniname += EXFAT_FILE_NAME_LEN; } exfat_update_dir_chksum(es); } void exfat_remove_entries(struct inode *inode, struct exfat_entry_set_cache *es, int order) { int i; struct exfat_dentry *ep; for (i = order; i < es->num_entries; i++) { ep = exfat_get_dentry_cached(es, i); if (exfat_get_entry_type(ep) & TYPE_BENIGN_SEC) exfat_free_benign_secondary_clusters(inode, ep); exfat_set_entry_type(ep, TYPE_DELETED); } if (order < es->num_entries) es->modified = true; } void exfat_update_dir_chksum(struct exfat_entry_set_cache *es) { int chksum_type = CS_DIR_ENTRY, i; unsigned short chksum = 0; struct exfat_dentry *ep; for (i = ES_IDX_FILE; i < es->num_entries; i++) { ep = exfat_get_dentry_cached(es, i); chksum = exfat_calc_chksum16(ep, DENTRY_SIZE, chksum, chksum_type); chksum_type = CS_DEFAULT; } ep = exfat_get_dentry_cached(es, ES_IDX_FILE); ep->dentry.file.checksum = cpu_to_le16(chksum); es->modified = true; } int exfat_put_dentry_set(struct exfat_entry_set_cache *es, int sync) { int i, err = 0; if (es->modified) err = exfat_update_bhs(es->bh, es->num_bh, sync); for (i = 0; i < es->num_bh; i++) if (err) bforget(es->bh[i]); else brelse(es->bh[i]); if (IS_DYNAMIC_ES(es)) kfree(es->bh); return err; } static int exfat_walk_fat_chain(struct super_block *sb, struct exfat_chain *p_dir, unsigned int byte_offset, unsigned int *clu) { struct exfat_sb_info *sbi = EXFAT_SB(sb); unsigned int clu_offset; unsigned int cur_clu; clu_offset = EXFAT_B_TO_CLU(byte_offset, sbi); cur_clu = p_dir->dir; if (p_dir->flags == ALLOC_NO_FAT_CHAIN) { cur_clu += clu_offset; } else { while (clu_offset > 0) { if (exfat_get_next_cluster(sb, &cur_clu)) return -EIO; if (cur_clu == EXFAT_EOF_CLUSTER) { exfat_fs_error(sb, "invalid dentry access beyond EOF (clu : %u, eidx : %d)", p_dir->dir, EXFAT_B_TO_DEN(byte_offset)); return -EIO; } clu_offset--; } } *clu = cur_clu; return 0; } static int exfat_find_location(struct super_block *sb, struct exfat_chain *p_dir, int entry, sector_t *sector, int *offset) { int ret; unsigned int off, clu = 0; struct exfat_sb_info *sbi = EXFAT_SB(sb); off = EXFAT_DEN_TO_B(entry); ret = exfat_walk_fat_chain(sb, p_dir, off, &clu); if (ret) return ret; /* byte offset in cluster */ off = EXFAT_CLU_OFFSET(off, sbi); /* byte offset in sector */ *offset = EXFAT_BLK_OFFSET(off, sb); /* sector offset in cluster */ *sector = EXFAT_B_TO_BLK(off, sb); *sector += exfat_cluster_to_sector(sbi, clu); return 0; } #define EXFAT_MAX_RA_SIZE (128*1024) static int exfat_dir_readahead(struct super_block *sb, sector_t sec) { struct exfat_sb_info *sbi = EXFAT_SB(sb); struct buffer_head *bh; unsigned int max_ra_count = EXFAT_MAX_RA_SIZE >> sb->s_blocksize_bits; unsigned int page_ra_count = PAGE_SIZE >> sb->s_blocksize_bits; unsigned int adj_ra_count = max(sbi->sect_per_clus, page_ra_count); unsigned int ra_count = min(adj_ra_count, max_ra_count); /* Read-ahead is not required */ if (sbi->sect_per_clus == 1) return 0; if (sec < sbi->data_start_sector) { exfat_err(sb, "requested sector is invalid(sect:%llu, root:%llu)", (unsigned long long)sec, sbi->data_start_sector); return -EIO; } /* Not sector aligned with ra_count, resize ra_count to page size */ if ((sec - sbi->data_start_sector) & (ra_count - 1)) ra_count = page_ra_count; bh = sb_find_get_block(sb, sec); if (!bh || !buffer_uptodate(bh)) { unsigned int i; for (i = 0; i < ra_count; i++) sb_breadahead(sb, (sector_t)(sec + i)); } brelse(bh); return 0; } struct exfat_dentry *exfat_get_dentry(struct super_block *sb, struct exfat_chain *p_dir, int entry, struct buffer_head **bh) { unsigned int dentries_per_page = EXFAT_B_TO_DEN(PAGE_SIZE); int off; sector_t sec; if (p_dir->dir == DIR_DELETED) { exfat_err(sb, "abnormal access to deleted dentry"); return NULL; } if (exfat_find_location(sb, p_dir, entry, &sec, &off)) return NULL; if (p_dir->dir != EXFAT_FREE_CLUSTER && !(entry & (dentries_per_page - 1))) exfat_dir_readahead(sb, sec); *bh = sb_bread(sb, sec); if (!*bh) return NULL; return (struct exfat_dentry *)((*bh)->b_data + off); } enum exfat_validate_dentry_mode { ES_MODE_GET_FILE_ENTRY, ES_MODE_GET_STRM_ENTRY, ES_MODE_GET_NAME_ENTRY, ES_MODE_GET_CRITICAL_SEC_ENTRY, ES_MODE_GET_BENIGN_SEC_ENTRY, }; static bool exfat_validate_entry(unsigned int type, enum exfat_validate_dentry_mode *mode) { if (type == TYPE_UNUSED || type == TYPE_DELETED) return false; switch (*mode) { case ES_MODE_GET_FILE_ENTRY: if (type != TYPE_STREAM) return false; *mode = ES_MODE_GET_STRM_ENTRY; break; case ES_MODE_GET_STRM_ENTRY: if (type != TYPE_EXTEND) return false; *mode = ES_MODE_GET_NAME_ENTRY; break; case ES_MODE_GET_NAME_ENTRY: if (type & TYPE_BENIGN_SEC) *mode = ES_MODE_GET_BENIGN_SEC_ENTRY; else if (type != TYPE_EXTEND) return false; break; case ES_MODE_GET_BENIGN_SEC_ENTRY: /* Assume unreconized benign secondary entry */ if (!(type & TYPE_BENIGN_SEC)) return false; break; default: return false; } return true; } struct exfat_dentry *exfat_get_dentry_cached( struct exfat_entry_set_cache *es, int num) { int off = es->start_off + num * DENTRY_SIZE; struct buffer_head *bh = es->bh[EXFAT_B_TO_BLK(off, es->sb)]; char *p = bh->b_data + EXFAT_BLK_OFFSET(off, es->sb); return (struct exfat_dentry *)p; } /* * Returns a set of dentries. * * Note It provides a direct pointer to bh->data via exfat_get_dentry_cached(). * User should call exfat_get_dentry_set() after setting 'modified' to apply * changes made in this entry set to the real device. * * in: * sb+p_dir+entry: indicates a file/dir * num_entries: specifies how many dentries should be included. * It will be set to es->num_entries if it is not 0. * If num_entries is 0, es->num_entries will be obtained * from the first dentry. * out: * es: pointer of entry set on success. * return: * 0 on success * -error code on failure */ static int __exfat_get_dentry_set(struct exfat_entry_set_cache *es, struct super_block *sb, struct exfat_chain *p_dir, int entry, unsigned int num_entries) { int ret, i, num_bh; unsigned int off; sector_t sec; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct buffer_head *bh; if (p_dir->dir == DIR_DELETED) { exfat_err(sb, "access to deleted dentry"); return -EIO; } ret = exfat_find_location(sb, p_dir, entry, &sec, &off); if (ret) return ret; memset(es, 0, sizeof(*es)); es->sb = sb; es->modified = false; es->start_off = off; es->bh = es->__bh; bh = sb_bread(sb, sec); if (!bh) return -EIO; es->bh[es->num_bh++] = bh; if (num_entries == ES_ALL_ENTRIES) { struct exfat_dentry *ep; ep = exfat_get_dentry_cached(es, ES_IDX_FILE); if (ep->type != EXFAT_FILE) { brelse(bh); return -EIO; } num_entries = ep->dentry.file.num_ext + 1; } es->num_entries = num_entries; num_bh = EXFAT_B_TO_BLK_ROUND_UP(off + num_entries * DENTRY_SIZE, sb); if (num_bh > ARRAY_SIZE(es->__bh)) { es->bh = kmalloc_array(num_bh, sizeof(*es->bh), GFP_NOFS); if (!es->bh) { brelse(bh); return -ENOMEM; } es->bh[0] = bh; } for (i = 1; i < num_bh; i++) { /* get the next sector */ if (exfat_is_last_sector_in_cluster(sbi, sec)) { unsigned int clu = exfat_sector_to_cluster(sbi, sec); if (p_dir->flags == ALLOC_NO_FAT_CHAIN) clu++; else if (exfat_get_next_cluster(sb, &clu)) goto put_es; sec = exfat_cluster_to_sector(sbi, clu); } else { sec++; } bh = sb_bread(sb, sec); if (!bh) goto put_es; es->bh[es->num_bh++] = bh; } return 0; put_es: exfat_put_dentry_set(es, false); return -EIO; } int exfat_get_dentry_set(struct exfat_entry_set_cache *es, struct super_block *sb, struct exfat_chain *p_dir, int entry, unsigned int num_entries) { int ret, i; struct exfat_dentry *ep; enum exfat_validate_dentry_mode mode = ES_MODE_GET_FILE_ENTRY; ret = __exfat_get_dentry_set(es, sb, p_dir, entry, num_entries); if (ret < 0) return ret; /* validate cached dentries */ for (i = ES_IDX_STREAM; i < es->num_entries; i++) { ep = exfat_get_dentry_cached(es, i); if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode)) goto put_es; } return 0; put_es: exfat_put_dentry_set(es, false); return -EIO; } static int exfat_validate_empty_dentry_set(struct exfat_entry_set_cache *es) { struct exfat_dentry *ep; struct buffer_head *bh; int i, off; bool unused_hit = false; /* * ONLY UNUSED OR DELETED DENTRIES ARE ALLOWED: * Although it violates the specification for a deleted entry to * follow an unused entry, some exFAT implementations could work * like this. Therefore, to improve compatibility, let's allow it. */ for (i = 0; i < es->num_entries; i++) { ep = exfat_get_dentry_cached(es, i); if (ep->type == EXFAT_UNUSED) { unused_hit = true; } else if (!IS_EXFAT_DELETED(ep->type)) { if (unused_hit) goto err_used_follow_unused; i++; goto count_skip_entries; } } return 0; err_used_follow_unused: off = es->start_off + (i << DENTRY_SIZE_BITS); bh = es->bh[EXFAT_B_TO_BLK(off, es->sb)]; exfat_fs_error(es->sb, "in sector %lld, dentry %d should be unused, but 0x%x", bh->b_blocknr, off >> DENTRY_SIZE_BITS, ep->type); return -EIO; count_skip_entries: es->num_entries = EXFAT_B_TO_DEN(EXFAT_BLK_TO_B(es->num_bh, es->sb) - es->start_off); for (; i < es->num_entries; i++) { ep = exfat_get_dentry_cached(es, i); if (IS_EXFAT_DELETED(ep->type)) break; } return i; } /* * Get an empty dentry set. * * in: * sb+p_dir+entry: indicates the empty dentry location * num_entries: specifies how many empty dentries should be included. * out: * es: pointer of empty dentry set on success. * return: * 0 : on success * >0 : the dentries are not empty, the return value is the number of * dentries to be skipped for the next lookup. * <0 : on failure */ int exfat_get_empty_dentry_set(struct exfat_entry_set_cache *es, struct super_block *sb, struct exfat_chain *p_dir, int entry, unsigned int num_entries) { int ret; ret = __exfat_get_dentry_set(es, sb, p_dir, entry, num_entries); if (ret < 0) return ret; ret = exfat_validate_empty_dentry_set(es); if (ret) exfat_put_dentry_set(es, false); return ret; } static inline void exfat_reset_empty_hint(struct exfat_hint_femp *hint_femp) { hint_femp->eidx = EXFAT_HINT_NONE; hint_femp->count = 0; } static inline void exfat_set_empty_hint(struct exfat_inode_info *ei, struct exfat_hint_femp *candi_empty, struct exfat_chain *clu, int dentry, int num_entries, int entry_type) { if (ei->hint_femp.eidx == EXFAT_HINT_NONE || ei->hint_femp.eidx > dentry) { int total_entries = EXFAT_B_TO_DEN(i_size_read(&ei->vfs_inode)); if (candi_empty->count == 0) { candi_empty->cur = *clu; candi_empty->eidx = dentry; } if (entry_type == TYPE_UNUSED) candi_empty->count += total_entries - dentry; else candi_empty->count++; if (candi_empty->count == num_entries || candi_empty->count + candi_empty->eidx == total_entries) ei->hint_femp = *candi_empty; } } enum { DIRENT_STEP_FILE, DIRENT_STEP_STRM, DIRENT_STEP_NAME, DIRENT_STEP_SECD, }; /* * @ei: inode info of parent directory * @p_dir: directory structure of parent directory * @num_entries:entry size of p_uniname * @hint_opt: If p_uniname is found, filled with optimized dir/entry * for traversing cluster chain. * @return: * >= 0: file directory entry position where the name exists * -ENOENT: entry with the name does not exist * -EIO: I/O error */ int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei, struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname, struct exfat_hint *hint_opt) { int i, rewind = 0, dentry = 0, end_eidx = 0, num_ext = 0, len; int order, step, name_len = 0; int dentries_per_clu; unsigned int entry_type; unsigned short *uniname = NULL; struct exfat_chain clu; struct exfat_hint *hint_stat = &ei->hint_stat; struct exfat_hint_femp candi_empty; struct exfat_sb_info *sbi = EXFAT_SB(sb); int num_entries = exfat_calc_num_entries(p_uniname); if (num_entries < 0) return num_entries; dentries_per_clu = sbi->dentries_per_clu; exfat_chain_dup(&clu, p_dir); if (hint_stat->eidx) { clu.dir = hint_stat->clu; dentry = hint_stat->eidx; end_eidx = dentry; } exfat_reset_empty_hint(&ei->hint_femp); rewind: order = 0; step = DIRENT_STEP_FILE; exfat_reset_empty_hint(&candi_empty); while (clu.dir != EXFAT_EOF_CLUSTER) { i = dentry & (dentries_per_clu - 1); for (; i < dentries_per_clu; i++, dentry++) { struct exfat_dentry *ep; struct buffer_head *bh; if (rewind && dentry == end_eidx) goto not_found; ep = exfat_get_dentry(sb, &clu, i, &bh); if (!ep) return -EIO; entry_type = exfat_get_entry_type(ep); if (entry_type == TYPE_UNUSED || entry_type == TYPE_DELETED) { step = DIRENT_STEP_FILE; exfat_set_empty_hint(ei, &candi_empty, &clu, dentry, num_entries, entry_type); brelse(bh); if (entry_type == TYPE_UNUSED) goto not_found; continue; } exfat_reset_empty_hint(&candi_empty); if (entry_type == TYPE_FILE || entry_type == TYPE_DIR) { step = DIRENT_STEP_FILE; hint_opt->clu = clu.dir; hint_opt->eidx = i; num_ext = ep->dentry.file.num_ext; step = DIRENT_STEP_STRM; brelse(bh); continue; } if (entry_type == TYPE_STREAM) { u16 name_hash; if (step != DIRENT_STEP_STRM) { step = DIRENT_STEP_FILE; brelse(bh); continue; } step = DIRENT_STEP_FILE; name_hash = le16_to_cpu( ep->dentry.stream.name_hash); if (p_uniname->name_hash == name_hash && p_uniname->name_len == ep->dentry.stream.name_len) { step = DIRENT_STEP_NAME; order = 1; name_len = 0; } brelse(bh); continue; } brelse(bh); if (entry_type == TYPE_EXTEND) { unsigned short entry_uniname[16], unichar; if (step != DIRENT_STEP_NAME || name_len >= MAX_NAME_LENGTH) { step = DIRENT_STEP_FILE; continue; } if (++order == 2) uniname = p_uniname->name; else uniname += EXFAT_FILE_NAME_LEN; len = exfat_extract_uni_name(ep, entry_uniname); name_len += len; unichar = *(uniname+len); *(uniname+len) = 0x0; if (exfat_uniname_ncmp(sb, uniname, entry_uniname, len)) { step = DIRENT_STEP_FILE; } else if (p_uniname->name_len == name_len) { if (order == num_ext) goto found; step = DIRENT_STEP_SECD; } *(uniname+len) = unichar; continue; } if (entry_type & (TYPE_CRITICAL_SEC | TYPE_BENIGN_SEC)) { if (step == DIRENT_STEP_SECD) { if (++order == num_ext) goto found; continue; } } step = DIRENT_STEP_FILE; } if (clu.flags == ALLOC_NO_FAT_CHAIN) { if (--clu.size > 0) clu.dir++; else clu.dir = EXFAT_EOF_CLUSTER; } else { if (exfat_get_next_cluster(sb, &clu.dir)) return -EIO; } } not_found: /* * We started at not 0 index,so we should try to find target * from 0 index to the index we started at. */ if (!rewind && end_eidx) { rewind = 1; dentry = 0; clu.dir = p_dir->dir; goto rewind; } /* * set the EXFAT_EOF_CLUSTER flag to avoid search * from the beginning again when allocated a new cluster */ if (ei->hint_femp.eidx == EXFAT_HINT_NONE) { ei->hint_femp.cur.dir = EXFAT_EOF_CLUSTER; ei->hint_femp.eidx = p_dir->size * dentries_per_clu; ei->hint_femp.count = 0; } /* initialized hint_stat */ hint_stat->clu = p_dir->dir; hint_stat->eidx = 0; return -ENOENT; found: /* next dentry we'll find is out of this cluster */ if (!((dentry + 1) & (dentries_per_clu - 1))) { int ret = 0; if (clu.flags == ALLOC_NO_FAT_CHAIN) { if (--clu.size > 0) clu.dir++; else clu.dir = EXFAT_EOF_CLUSTER; } else { ret = exfat_get_next_cluster(sb, &clu.dir); } if (ret || clu.dir == EXFAT_EOF_CLUSTER) { /* just initialized hint_stat */ hint_stat->clu = p_dir->dir; hint_stat->eidx = 0; return (dentry - num_ext); } } hint_stat->clu = clu.dir; hint_stat->eidx = dentry + 1; return dentry - num_ext; } int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir) { int i, count = 0; int dentries_per_clu; unsigned int entry_type; struct exfat_chain clu; struct exfat_dentry *ep; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct buffer_head *bh; dentries_per_clu = sbi->dentries_per_clu; exfat_chain_dup(&clu, p_dir); while (clu.dir != EXFAT_EOF_CLUSTER) { for (i = 0; i < dentries_per_clu; i++) { ep = exfat_get_dentry(sb, &clu, i, &bh); if (!ep) return -EIO; entry_type = exfat_get_entry_type(ep); brelse(bh); if (entry_type == TYPE_UNUSED) return count; if (entry_type != TYPE_DIR) continue; count++; } if (clu.flags == ALLOC_NO_FAT_CHAIN) { if (--clu.size > 0) clu.dir++; else clu.dir = EXFAT_EOF_CLUSTER; } else { if (exfat_get_next_cluster(sb, &(clu.dir))) return -EIO; } } return count; }
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 // SPDX-License-Identifier: GPL-2.0 or MIT /* * Copyright 2018 Noralf Trønnes */ #include <linux/list.h> #include <linux/mutex.h> #include <linux/seq_file.h> #include <drm/drm_client.h> #include <drm/drm_client_event.h> #include <drm/drm_debugfs.h> #include <drm/drm_device.h> #include <drm/drm_drv.h> #include <drm/drm_print.h> #include "drm_internal.h" /** * drm_client_dev_unregister - Unregister clients * @dev: DRM device * * This function releases all clients by calling each client's * &drm_client_funcs.unregister callback. The callback function * is responsibe for releaseing all resources including the client * itself. * * The helper drm_dev_unregister() calls this function. Drivers * that use it don't need to call this function themselves. */ void drm_client_dev_unregister(struct drm_device *dev) { struct drm_client_dev *client, *tmp; if (!drm_core_check_feature(dev, DRIVER_MODESET)) return; mutex_lock(&dev->clientlist_mutex); list_for_each_entry_safe(client, tmp, &dev->clientlist, list) { list_del(&client->list); if (client->funcs && client->funcs->unregister) { client->funcs->unregister(client); } else { drm_client_release(client); kfree(client); } } mutex_unlock(&dev->clientlist_mutex); } EXPORT_SYMBOL(drm_client_dev_unregister); /** * drm_client_dev_hotplug - Send hotplug event to clients * @dev: DRM device * * This function calls the &drm_client_funcs.hotplug callback on the attached clients. * * drm_kms_helper_hotplug_event() calls this function, so drivers that use it * don't need to call this function themselves. */ void drm_client_dev_hotplug(struct drm_device *dev) { struct drm_client_dev *client; int ret; if (!drm_core_check_feature(dev, DRIVER_MODESET)) return; if (!dev->mode_config.num_connector) { drm_dbg_kms(dev, "No connectors found, will not send hotplug events!\n"); return; } mutex_lock(&dev->clientlist_mutex); list_for_each_entry(client, &dev->clientlist, list) { if (!client->funcs || !client->funcs->hotplug) continue; if (client->hotplug_failed) continue; ret = client->funcs->hotplug(client); drm_dbg_kms(dev, "%s: ret=%d\n", client->name, ret); if (ret) client->hotplug_failed = true; } mutex_unlock(&dev->clientlist_mutex); } EXPORT_SYMBOL(drm_client_dev_hotplug); void drm_client_dev_restore(struct drm_device *dev) { struct drm_client_dev *client; int ret; if (!drm_core_check_feature(dev, DRIVER_MODESET)) return; mutex_lock(&dev->clientlist_mutex); list_for_each_entry(client, &dev->clientlist, list) { if (!client->funcs || !client->funcs->restore) continue; ret = client->funcs->restore(client); drm_dbg_kms(dev, "%s: ret=%d\n", client->name, ret); if (!ret) /* The first one to return zero gets the privilege to restore */ break; } mutex_unlock(&dev->clientlist_mutex); } static int drm_client_suspend(struct drm_client_dev *client, bool holds_console_lock) { struct drm_device *dev = client->dev; int ret = 0; if (drm_WARN_ON_ONCE(dev, client->suspended)) return 0; if (client->funcs && client->funcs->suspend) ret = client->funcs->suspend(client, holds_console_lock); drm_dbg_kms(dev, "%s: ret=%d\n", client->name, ret); client->suspended = true; return ret; } void drm_client_dev_suspend(struct drm_device *dev, bool holds_console_lock) { struct drm_client_dev *client; mutex_lock(&dev->clientlist_mutex); list_for_each_entry(client, &dev->clientlist, list) { if (!client->suspended) drm_client_suspend(client, holds_console_lock); } mutex_unlock(&dev->clientlist_mutex); } EXPORT_SYMBOL(drm_client_dev_suspend); static int drm_client_resume(struct drm_client_dev *client, bool holds_console_lock) { struct drm_device *dev = client->dev; int ret = 0; if (drm_WARN_ON_ONCE(dev, !client->suspended)) return 0; if (client->funcs && client->funcs->resume) ret = client->funcs->resume(client, holds_console_lock); drm_dbg_kms(dev, "%s: ret=%d\n", client->name, ret); client->suspended = false; return ret; } void drm_client_dev_resume(struct drm_device *dev, bool holds_console_lock) { struct drm_client_dev *client; mutex_lock(&dev->clientlist_mutex); list_for_each_entry(client, &dev->clientlist, list) { if (client->suspended) drm_client_resume(client, holds_console_lock); } mutex_unlock(&dev->clientlist_mutex); } EXPORT_SYMBOL(drm_client_dev_resume); #ifdef CONFIG_DEBUG_FS static int drm_client_debugfs_internal_clients(struct seq_file *m, void *data) { struct drm_debugfs_entry *entry = m->private; struct drm_device *dev = entry->dev; struct drm_printer p = drm_seq_file_printer(m); struct drm_client_dev *client; mutex_lock(&dev->clientlist_mutex); list_for_each_entry(client, &dev->clientlist, list) drm_printf(&p, "%s\n", client->name); mutex_unlock(&dev->clientlist_mutex); return 0; } static const struct drm_debugfs_info drm_client_debugfs_list[] = { { "internal_clients", drm_client_debugfs_internal_clients, 0 }, }; void drm_client_debugfs_init(struct drm_device *dev) { drm_debugfs_add_files(dev, drm_client_debugfs_list, ARRAY_SIZE(drm_client_debugfs_list)); } #endif
1 1 2 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 // SPDX-License-Identifier: GPL-2.0-only /* * IPv6 library code, needed by static components when full IPv6 support is * not configured or static. */ #include <linux/export.h> #include <net/ipv6.h> #include <net/ipv6_stubs.h> #include <net/addrconf.h> #include <net/ip.h> /* if ipv6 module registers this function is used by xfrm to force all * sockets to relookup their nodes - this is fairly expensive, be * careful */ void (*__fib6_flush_trees)(struct net *); EXPORT_SYMBOL(__fib6_flush_trees); #define IPV6_ADDR_SCOPE_TYPE(scope) ((scope) << 16) static inline unsigned int ipv6_addr_scope2type(unsigned int scope) { switch (scope) { case IPV6_ADDR_SCOPE_NODELOCAL: return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_NODELOCAL) | IPV6_ADDR_LOOPBACK); case IPV6_ADDR_SCOPE_LINKLOCAL: return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL) | IPV6_ADDR_LINKLOCAL); case IPV6_ADDR_SCOPE_SITELOCAL: return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL) | IPV6_ADDR_SITELOCAL); } return IPV6_ADDR_SCOPE_TYPE(scope); } int __ipv6_addr_type(const struct in6_addr *addr) { __be32 st; st = addr->s6_addr32[0]; /* Consider all addresses with the first three bits different of 000 and 111 as unicasts. */ if ((st & htonl(0xE0000000)) != htonl(0x00000000) && (st & htonl(0xE0000000)) != htonl(0xE0000000)) return (IPV6_ADDR_UNICAST | IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); if ((st & htonl(0xFF000000)) == htonl(0xFF000000)) { /* multicast */ /* addr-select 3.1 */ return (IPV6_ADDR_MULTICAST | ipv6_addr_scope2type(IPV6_ADDR_MC_SCOPE(addr))); } if ((st & htonl(0xFFC00000)) == htonl(0xFE800000)) return (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST | IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL)); /* addr-select 3.1 */ if ((st & htonl(0xFFC00000)) == htonl(0xFEC00000)) return (IPV6_ADDR_SITELOCAL | IPV6_ADDR_UNICAST | IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL)); /* addr-select 3.1 */ if ((st & htonl(0xFE000000)) == htonl(0xFC000000)) return (IPV6_ADDR_UNICAST | IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* RFC 4193 */ if ((addr->s6_addr32[0] | addr->s6_addr32[1]) == 0) { if (addr->s6_addr32[2] == 0) { if (addr->s6_addr32[3] == 0) return IPV6_ADDR_ANY; if (addr->s6_addr32[3] == htonl(0x00000001)) return (IPV6_ADDR_LOOPBACK | IPV6_ADDR_UNICAST | IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL)); /* addr-select 3.4 */ return (IPV6_ADDR_COMPATv4 | IPV6_ADDR_UNICAST | IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */ } if (addr->s6_addr32[2] == htonl(0x0000ffff)) return (IPV6_ADDR_MAPPED | IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */ } return (IPV6_ADDR_UNICAST | IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.4 */ } EXPORT_SYMBOL(__ipv6_addr_type); static ATOMIC_NOTIFIER_HEAD(inet6addr_chain); static BLOCKING_NOTIFIER_HEAD(inet6addr_validator_chain); int register_inet6addr_notifier(struct notifier_block *nb) { return atomic_notifier_chain_register(&inet6addr_chain, nb); } EXPORT_SYMBOL(register_inet6addr_notifier); int unregister_inet6addr_notifier(struct notifier_block *nb) { return atomic_notifier_chain_unregister(&inet6addr_chain, nb); } EXPORT_SYMBOL(unregister_inet6addr_notifier); int inet6addr_notifier_call_chain(unsigned long val, void *v) { return atomic_notifier_call_chain(&inet6addr_chain, val, v); } EXPORT_SYMBOL(inet6addr_notifier_call_chain); int register_inet6addr_validator_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&inet6addr_validator_chain, nb); } EXPORT_SYMBOL(register_inet6addr_validator_notifier); int unregister_inet6addr_validator_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&inet6addr_validator_chain, nb); } EXPORT_SYMBOL(unregister_inet6addr_validator_notifier); int inet6addr_validator_notifier_call_chain(unsigned long val, void *v) { return blocking_notifier_call_chain(&inet6addr_validator_chain, val, v); } EXPORT_SYMBOL(inet6addr_validator_notifier_call_chain); static struct dst_entry *eafnosupport_ipv6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst) { return ERR_PTR(-EAFNOSUPPORT); } static int eafnosupport_ipv6_route_input(struct sk_buff *skb) { return -EAFNOSUPPORT; } static struct fib6_table *eafnosupport_fib6_get_table(struct net *net, u32 id) { return NULL; } static int eafnosupport_fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, struct fib6_result *res, int flags) { return -EAFNOSUPPORT; } static int eafnosupport_fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, struct fib6_result *res, int flags) { return -EAFNOSUPPORT; } static void eafnosupport_fib6_select_path(const struct net *net, struct fib6_result *res, struct flowi6 *fl6, int oif, bool have_oif_match, const struct sk_buff *skb, int strict) { } static u32 eafnosupport_ip6_mtu_from_fib6(const struct fib6_result *res, const struct in6_addr *daddr, const struct in6_addr *saddr) { return 0; } static int eafnosupport_fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel"); return -EAFNOSUPPORT; } static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt, bool skip_notify) { return -EAFNOSUPPORT; } static int eafnosupport_ipv6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)) { kfree_skb(skb); return -EAFNOSUPPORT; } static struct net_device *eafnosupport_ipv6_dev_find(struct net *net, const struct in6_addr *addr, struct net_device *dev) { return ERR_PTR(-EAFNOSUPPORT); } const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { .ipv6_dst_lookup_flow = eafnosupport_ipv6_dst_lookup_flow, .ipv6_route_input = eafnosupport_ipv6_route_input, .fib6_get_table = eafnosupport_fib6_get_table, .fib6_table_lookup = eafnosupport_fib6_table_lookup, .fib6_lookup = eafnosupport_fib6_lookup, .fib6_select_path = eafnosupport_fib6_select_path, .ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6, .fib6_nh_init = eafnosupport_fib6_nh_init, .ip6_del_rt = eafnosupport_ip6_del_rt, .ipv6_fragment = eafnosupport_ipv6_fragment, .ipv6_dev_find = eafnosupport_ipv6_dev_find, }; EXPORT_SYMBOL_GPL(ipv6_stub); /* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */ const struct in6_addr in6addr_loopback __aligned(BITS_PER_LONG/8) = IN6ADDR_LOOPBACK_INIT; EXPORT_SYMBOL(in6addr_loopback); const struct in6_addr in6addr_any __aligned(BITS_PER_LONG/8) = IN6ADDR_ANY_INIT; EXPORT_SYMBOL(in6addr_any); const struct in6_addr in6addr_linklocal_allnodes __aligned(BITS_PER_LONG/8) = IN6ADDR_LINKLOCAL_ALLNODES_INIT; EXPORT_SYMBOL(in6addr_linklocal_allnodes); const struct in6_addr in6addr_linklocal_allrouters __aligned(BITS_PER_LONG/8) = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; EXPORT_SYMBOL(in6addr_linklocal_allrouters); const struct in6_addr in6addr_interfacelocal_allnodes __aligned(BITS_PER_LONG/8) = IN6ADDR_INTERFACELOCAL_ALLNODES_INIT; EXPORT_SYMBOL(in6addr_interfacelocal_allnodes); const struct in6_addr in6addr_interfacelocal_allrouters __aligned(BITS_PER_LONG/8) = IN6ADDR_INTERFACELOCAL_ALLROUTERS_INIT; EXPORT_SYMBOL(in6addr_interfacelocal_allrouters); const struct in6_addr in6addr_sitelocal_allrouters __aligned(BITS_PER_LONG/8) = IN6ADDR_SITELOCAL_ALLROUTERS_INIT; EXPORT_SYMBOL(in6addr_sitelocal_allrouters); static void snmp6_free_dev(struct inet6_dev *idev) { kfree(idev->stats.icmpv6msgdev); kfree(idev->stats.icmpv6dev); free_percpu(idev->stats.ipv6); } static void in6_dev_finish_destroy_rcu(struct rcu_head *head) { struct inet6_dev *idev = container_of(head, struct inet6_dev, rcu); snmp6_free_dev(idev); kfree(idev); } /* Nobody refers to this device, we may destroy it. */ void in6_dev_finish_destroy(struct inet6_dev *idev) { struct net_device *dev = idev->dev; WARN_ON(!list_empty(&idev->addr_list)); WARN_ON(rcu_access_pointer(idev->mc_list)); WARN_ON(timer_pending(&idev->rs_timer)); #ifdef NET_REFCNT_DEBUG pr_debug("%s: %s\n", __func__, dev ? dev->name : "NIL"); #endif netdev_put(dev, &idev->dev_tracker); if (!idev->dead) { pr_warn("Freeing alive inet6 device %p\n", idev); return; } call_rcu(&idev->rcu, in6_dev_finish_destroy_rcu); } EXPORT_SYMBOL(in6_dev_finish_destroy);
7 13 13 7 7 7 7 7 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 // SPDX-License-Identifier: GPL-2.0 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/errno.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/cpu.h> #include <linux/prctl.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/idle.h> #include <linux/sched/debug.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/init.h> #include <linux/export.h> #include <linux/pm.h> #include <linux/tick.h> #include <linux/random.h> #include <linux/user-return-notifier.h> #include <linux/dmi.h> #include <linux/utsname.h> #include <linux/stackprotector.h> #include <linux/cpuidle.h> #include <linux/acpi.h> #include <linux/elf-randomize.h> #include <linux/static_call.h> #include <trace/events/power.h> #include <linux/hw_breakpoint.h> #include <linux/entry-common.h> #include <asm/cpu.h> #include <asm/cpuid.h> #include <asm/apic.h> #include <linux/uaccess.h> #include <asm/mwait.h> #include <asm/fpu/api.h> #include <asm/fpu/sched.h> #include <asm/fpu/xstate.h> #include <asm/debugreg.h> #include <asm/nmi.h> #include <asm/tlbflush.h> #include <asm/mce.h> #include <asm/vm86.h> #include <asm/switch_to.h> #include <asm/desc.h> #include <asm/prctl.h> #include <asm/spec-ctrl.h> #include <asm/io_bitmap.h> #include <asm/proto.h> #include <asm/frame.h> #include <asm/unwind.h> #include <asm/tdx.h> #include <asm/mmu_context.h> #include <asm/shstk.h> #include "process.h" /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, * no more per-task TSS's. The TSS size is kept cacheline-aligned * so they are allowed to end up in the .data..cacheline_aligned * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = { .x86_tss = { /* * .sp0 is only used when entering ring 0 from a lower * privilege level. Since the init task never runs anything * but ring 0 code, there is no need for a valid value here. * Poison it. */ .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, #ifdef CONFIG_X86_32 .sp1 = TOP_OF_INIT_STACK, .ss0 = __KERNEL_DS, .ss1 = __KERNEL_CS, #endif .io_bitmap_base = IO_BITMAP_OFFSET_INVALID, }, }; EXPORT_PER_CPU_SYMBOL(cpu_tss_rw); DEFINE_PER_CPU(bool, __tss_limit_invalid); EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); /* * this gets called so that we can store lazy state into memory and copy the * current task into the new thread. */ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { memcpy(dst, src, arch_task_struct_size); #ifdef CONFIG_VM86 dst->thread.vm86 = NULL; #endif /* Drop the copied pointer to current's fpstate */ dst->thread.fpu.fpstate = NULL; return 0; } #ifdef CONFIG_X86_64 void arch_release_task_struct(struct task_struct *tsk) { if (fpu_state_size_dynamic()) fpstate_free(&tsk->thread.fpu); } #endif /* * Free thread data structures etc.. */ void exit_thread(struct task_struct *tsk) { struct thread_struct *t = &tsk->thread; struct fpu *fpu = &t->fpu; if (test_thread_flag(TIF_IO_BITMAP)) io_bitmap_exit(tsk); free_vm86(t); shstk_free(tsk); fpu__drop(fpu); } static int set_new_tls(struct task_struct *p, unsigned long tls) { struct user_desc __user *utls = (struct user_desc __user *)tls; if (in_ia32_syscall()) return do_set_thread_area(p, -1, utls, 0); else return do_set_thread_area_64(p, ARCH_SET_FS, tls); } __visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs, int (*fn)(void *), void *fn_arg) { schedule_tail(prev); /* Is this a kernel thread? */ if (unlikely(fn)) { fn(fn_arg); /* * A kernel thread is allowed to return here after successfully * calling kernel_execve(). Exit to userspace to complete the * execve() syscall. */ regs->ax = 0; } syscall_exit_to_user_mode(regs); } int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long sp = args->stack; unsigned long tls = args->tls; struct inactive_task_frame *frame; struct fork_frame *fork_frame; struct pt_regs *childregs; unsigned long new_ssp; int ret = 0; childregs = task_pt_regs(p); fork_frame = container_of(childregs, struct fork_frame, regs); frame = &fork_frame->frame; frame->bp = encode_frame_pointer(childregs); frame->ret_addr = (unsigned long) ret_from_fork_asm; p->thread.sp = (unsigned long) fork_frame; p->thread.io_bitmap = NULL; p->thread.iopl_warn = 0; memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); #ifdef CONFIG_X86_64 current_save_fsgs(); p->thread.fsindex = current->thread.fsindex; p->thread.fsbase = current->thread.fsbase; p->thread.gsindex = current->thread.gsindex; p->thread.gsbase = current->thread.gsbase; savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); if (p->mm && (clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM) set_bit(MM_CONTEXT_LOCK_LAM, &p->mm->context.flags); #else p->thread.sp0 = (unsigned long) (childregs + 1); savesegment(gs, p->thread.gs); /* * Clear all status flags including IF and set fixed bit. 64bit * does not have this initialization as the frame does not contain * flags. The flags consistency (especially vs. AC) is there * ensured via objtool, which lacks 32bit support. */ frame->flags = X86_EFLAGS_FIXED; #endif /* * Allocate a new shadow stack for thread if needed. If shadow stack, * is disabled, new_ssp will remain 0, and fpu_clone() will know not to * update it. */ new_ssp = shstk_alloc_thread_stack(p, clone_flags, args->stack_size); if (IS_ERR_VALUE(new_ssp)) return PTR_ERR((void *)new_ssp); fpu_clone(p, clone_flags, args->fn, new_ssp); /* Kernel thread ? */ if (unlikely(p->flags & PF_KTHREAD)) { p->thread.pkru = pkru_get_init_value(); memset(childregs, 0, sizeof(struct pt_regs)); kthread_frame_init(frame, args->fn, args->fn_arg); return 0; } /* * Clone current's PKRU value from hardware. tsk->thread.pkru * is only valid when scheduled out. */ p->thread.pkru = read_pkru(); frame->bx = 0; *childregs = *current_pt_regs(); childregs->ax = 0; if (sp) childregs->sp = sp; if (unlikely(args->fn)) { /* * A user space thread, but it doesn't return to * ret_after_fork(). * * In order to indicate that to tools like gdb, * we reset the stack and instruction pointers. * * It does the same kernel frame setup to return to a kernel * function that a kernel thread does. */ childregs->sp = 0; childregs->ip = 0; kthread_frame_init(frame, args->fn, args->fn_arg); return 0; } /* Set a new TLS for the child thread? */ if (clone_flags & CLONE_SETTLS) ret = set_new_tls(p, tls); if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP))) io_bitmap_share(p); return ret; } static void pkru_flush_thread(void) { /* * If PKRU is enabled the default PKRU value has to be loaded into * the hardware right here (similar to context switch). */ pkru_write_default(); } void flush_thread(void) { struct task_struct *tsk = current; flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); fpu_flush_thread(); pkru_flush_thread(); } void disable_TSC(void) { preempt_disable(); if (!test_and_set_thread_flag(TIF_NOTSC)) /* * Must flip the CPU state synchronously with * TIF_NOTSC in the current running context. */ cr4_set_bits(X86_CR4_TSD); preempt_enable(); } static void enable_TSC(void) { preempt_disable(); if (test_and_clear_thread_flag(TIF_NOTSC)) /* * Must flip the CPU state synchronously with * TIF_NOTSC in the current running context. */ cr4_clear_bits(X86_CR4_TSD); preempt_enable(); } int get_tsc_mode(unsigned long adr) { unsigned int val; if (test_thread_flag(TIF_NOTSC)) val = PR_TSC_SIGSEGV; else val = PR_TSC_ENABLE; return put_user(val, (unsigned int __user *)adr); } int set_tsc_mode(unsigned int val) { if (val == PR_TSC_SIGSEGV) disable_TSC(); else if (val == PR_TSC_ENABLE) enable_TSC(); else return -EINVAL; return 0; } DEFINE_PER_CPU(u64, msr_misc_features_shadow); static void set_cpuid_faulting(bool on) { u64 msrval; msrval = this_cpu_read(msr_misc_features_shadow); msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT); this_cpu_write(msr_misc_features_shadow, msrval); wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval); } static void disable_cpuid(void) { preempt_disable(); if (!test_and_set_thread_flag(TIF_NOCPUID)) { /* * Must flip the CPU state synchronously with * TIF_NOCPUID in the current running context. */ set_cpuid_faulting(true); } preempt_enable(); } static void enable_cpuid(void) { preempt_disable(); if (test_and_clear_thread_flag(TIF_NOCPUID)) { /* * Must flip the CPU state synchronously with * TIF_NOCPUID in the current running context. */ set_cpuid_faulting(false); } preempt_enable(); } static int get_cpuid_mode(void) { return !test_thread_flag(TIF_NOCPUID); } static int set_cpuid_mode(unsigned long cpuid_enabled) { if (!boot_cpu_has(X86_FEATURE_CPUID_FAULT)) return -ENODEV; if (cpuid_enabled) enable_cpuid(); else disable_cpuid(); return 0; } /* * Called immediately after a successful exec. */ void arch_setup_new_exec(void) { /* If cpuid was previously disabled for this task, re-enable it. */ if (test_thread_flag(TIF_NOCPUID)) enable_cpuid(); /* * Don't inherit TIF_SSBD across exec boundary when * PR_SPEC_DISABLE_NOEXEC is used. */ if (test_thread_flag(TIF_SSBD) && task_spec_ssb_noexec(current)) { clear_thread_flag(TIF_SSBD); task_clear_spec_ssb_disable(current); task_clear_spec_ssb_noexec(current); speculation_ctrl_update(read_thread_flags()); } mm_reset_untag_mask(current->mm); } #ifdef CONFIG_X86_IOPL_IOPERM static inline void switch_to_bitmap(unsigned long tifp) { /* * Invalidate I/O bitmap if the previous task used it. This prevents * any possible leakage of an active I/O bitmap. * * If the next task has an I/O bitmap it will handle it on exit to * user mode. */ if (tifp & _TIF_IO_BITMAP) tss_invalidate_io_bitmap(); } static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm) { /* * Copy at least the byte range of the incoming tasks bitmap which * covers the permitted I/O ports. * * If the previous task which used an I/O bitmap had more bits * permitted, then the copy needs to cover those as well so they * get turned off. */ memcpy(tss->io_bitmap.bitmap, iobm->bitmap, max(tss->io_bitmap.prev_max, iobm->max)); /* * Store the new max and the sequence number of this bitmap * and a pointer to the bitmap itself. */ tss->io_bitmap.prev_max = iobm->max; tss->io_bitmap.prev_sequence = iobm->sequence; } /** * native_tss_update_io_bitmap - Update I/O bitmap before exiting to user mode */ void native_tss_update_io_bitmap(void) { struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); struct thread_struct *t = &current->thread; u16 *base = &tss->x86_tss.io_bitmap_base; if (!test_thread_flag(TIF_IO_BITMAP)) { native_tss_invalidate_io_bitmap(); return; } if (IS_ENABLED(CONFIG_X86_IOPL_IOPERM) && t->iopl_emul == 3) { *base = IO_BITMAP_OFFSET_VALID_ALL; } else { struct io_bitmap *iobm = t->io_bitmap; /* * Only copy bitmap data when the sequence number differs. The * update time is accounted to the incoming task. */ if (tss->io_bitmap.prev_sequence != iobm->sequence) tss_copy_io_bitmap(tss, iobm); /* Enable the bitmap */ *base = IO_BITMAP_OFFSET_VALID_MAP; } /* * Make sure that the TSS limit is covering the IO bitmap. It might have * been cut down by a VMEXIT to 0x67 which would cause a subsequent I/O * access from user space to trigger a #GP because the bitmap is outside * the TSS limit. */ refresh_tss_limit(); } #else /* CONFIG_X86_IOPL_IOPERM */ static inline void switch_to_bitmap(unsigned long tifp) { } #endif #ifdef CONFIG_SMP struct ssb_state { struct ssb_state *shared_state; raw_spinlock_t lock; unsigned int disable_state; unsigned long local_state; }; #define LSTATE_SSB 0 static DEFINE_PER_CPU(struct ssb_state, ssb_state); void speculative_store_bypass_ht_init(void) { struct ssb_state *st = this_cpu_ptr(&ssb_state); unsigned int this_cpu = smp_processor_id(); unsigned int cpu; st->local_state = 0; /* * Shared state setup happens once on the first bringup * of the CPU. It's not destroyed on CPU hotunplug. */ if (st->shared_state) return; raw_spin_lock_init(&st->lock); /* * Go over HT siblings and check whether one of them has set up the * shared state pointer already. */ for_each_cpu(cpu, topology_sibling_cpumask(this_cpu)) { if (cpu == this_cpu) continue; if (!per_cpu(ssb_state, cpu).shared_state) continue; /* Link it to the state of the sibling: */ st->shared_state = per_cpu(ssb_state, cpu).shared_state; return; } /* * First HT sibling to come up on the core. Link shared state of * the first HT sibling to itself. The siblings on the same core * which come up later will see the shared state pointer and link * themselves to the state of this CPU. */ st->shared_state = st; } /* * Logic is: First HT sibling enables SSBD for both siblings in the core * and last sibling to disable it, disables it for the whole core. This how * MSR_SPEC_CTRL works in "hardware": * * CORE_SPEC_CTRL = THREAD0_SPEC_CTRL | THREAD1_SPEC_CTRL */ static __always_inline void amd_set_core_ssb_state(unsigned long tifn) { struct ssb_state *st = this_cpu_ptr(&ssb_state); u64 msr = x86_amd_ls_cfg_base; if (!static_cpu_has(X86_FEATURE_ZEN)) { msr |= ssbd_tif_to_amd_ls_cfg(tifn); wrmsrl(MSR_AMD64_LS_CFG, msr); return; } if (tifn & _TIF_SSBD) { /* * Since this can race with prctl(), block reentry on the * same CPU. */ if (__test_and_set_bit(LSTATE_SSB, &st->local_state)) return; msr |= x86_amd_ls_cfg_ssbd_mask; raw_spin_lock(&st->shared_state->lock); /* First sibling enables SSBD: */ if (!st->shared_state->disable_state) wrmsrl(MSR_AMD64_LS_CFG, msr); st->shared_state->disable_state++; raw_spin_unlock(&st->shared_state->lock); } else { if (!__test_and_clear_bit(LSTATE_SSB, &st->local_state)) return; raw_spin_lock(&st->shared_state->lock); st->shared_state->disable_state--; if (!st->shared_state->disable_state) wrmsrl(MSR_AMD64_LS_CFG, msr); raw_spin_unlock(&st->shared_state->lock); } } #else static __always_inline void amd_set_core_ssb_state(unsigned long tifn) { u64 msr = x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn); wrmsrl(MSR_AMD64_LS_CFG, msr); } #endif static __always_inline void amd_set_ssb_virt_state(unsigned long tifn) { /* * SSBD has the same definition in SPEC_CTRL and VIRT_SPEC_CTRL, * so ssbd_tif_to_spec_ctrl() just works. */ wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn)); } /* * Update the MSRs managing speculation control, during context switch. * * tifp: Previous task's thread flags * tifn: Next task's thread flags */ static __always_inline void __speculation_ctrl_update(unsigned long tifp, unsigned long tifn) { unsigned long tif_diff = tifp ^ tifn; u64 msr = x86_spec_ctrl_base; bool updmsr = false; lockdep_assert_irqs_disabled(); /* Handle change of TIF_SSBD depending on the mitigation method. */ if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) { if (tif_diff & _TIF_SSBD) amd_set_ssb_virt_state(tifn); } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) { if (tif_diff & _TIF_SSBD) amd_set_core_ssb_state(tifn); } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || static_cpu_has(X86_FEATURE_AMD_SSBD)) { updmsr |= !!(tif_diff & _TIF_SSBD); msr |= ssbd_tif_to_spec_ctrl(tifn); } /* Only evaluate TIF_SPEC_IB if conditional STIBP is enabled. */ if (IS_ENABLED(CONFIG_SMP) && static_branch_unlikely(&switch_to_cond_stibp)) { updmsr |= !!(tif_diff & _TIF_SPEC_IB); msr |= stibp_tif_to_spec_ctrl(tifn); } if (updmsr) update_spec_ctrl_cond(msr); } static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) { if (test_and_clear_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE)) { if (task_spec_ssb_disable(tsk)) set_tsk_thread_flag(tsk, TIF_SSBD); else clear_tsk_thread_flag(tsk, TIF_SSBD); if (task_spec_ib_disable(tsk)) set_tsk_thread_flag(tsk, TIF_SPEC_IB); else clear_tsk_thread_flag(tsk, TIF_SPEC_IB); } /* Return the updated threadinfo flags*/ return read_task_thread_flags(tsk); } void speculation_ctrl_update(unsigned long tif) { unsigned long flags; /* Forced update. Make sure all relevant TIF flags are different */ local_irq_save(flags); __speculation_ctrl_update(~tif, tif); local_irq_restore(flags); } /* Called from seccomp/prctl update */ void speculation_ctrl_update_current(void) { preempt_disable(); speculation_ctrl_update(speculation_ctrl_update_tif(current)); preempt_enable(); } static inline void cr4_toggle_bits_irqsoff(unsigned long mask) { unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4); newval = cr4 ^ mask; if (newval != cr4) { this_cpu_write(cpu_tlbstate.cr4, newval); __write_cr4(newval); } } void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) { unsigned long tifp, tifn; tifn = read_task_thread_flags(next_p); tifp = read_task_thread_flags(prev_p); switch_to_bitmap(tifp); propagate_user_return_notify(prev_p, next_p); if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) && arch_has_block_step()) { unsigned long debugctl, msk; rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); debugctl &= ~DEBUGCTLMSR_BTF; msk = tifn & _TIF_BLOCKSTEP; debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT; wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); } if ((tifp ^ tifn) & _TIF_NOTSC) cr4_toggle_bits_irqsoff(X86_CR4_TSD); if ((tifp ^ tifn) & _TIF_NOCPUID) set_cpuid_faulting(!!(tifn & _TIF_NOCPUID)); if (likely(!((tifp | tifn) & _TIF_SPEC_FORCE_UPDATE))) { __speculation_ctrl_update(tifp, tifn); } else { speculation_ctrl_update_tif(prev_p); tifn = speculation_ctrl_update_tif(next_p); /* Enforce MSR update to ensure consistent state */ __speculation_ctrl_update(~tifn, tifn); } } /* * Idle related variables and functions */ unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE; EXPORT_SYMBOL(boot_option_idle_override); /* * We use this if we don't have any better idle routine.. */ void __cpuidle default_idle(void) { raw_safe_halt(); raw_local_irq_disable(); } #if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE) EXPORT_SYMBOL(default_idle); #endif DEFINE_STATIC_CALL_NULL(x86_idle, default_idle); static bool x86_idle_set(void) { return !!static_call_query(x86_idle); } #ifndef CONFIG_SMP static inline void __noreturn play_dead(void) { BUG(); } #endif void arch_cpu_idle_enter(void) { tsc_verify_tsc_adjust(false); local_touch_nmi(); } void __noreturn arch_cpu_idle_dead(void) { play_dead(); } /* * Called from the generic idle code. */ void __cpuidle arch_cpu_idle(void) { static_call(x86_idle)(); } EXPORT_SYMBOL_GPL(arch_cpu_idle); #ifdef CONFIG_XEN bool xen_set_default_idle(void) { bool ret = x86_idle_set(); static_call_update(x86_idle, default_idle); return ret; } #endif struct cpumask cpus_stop_mask; void __noreturn stop_this_cpu(void *dummy) { struct cpuinfo_x86 *c = this_cpu_ptr(&cpu_info); unsigned int cpu = smp_processor_id(); local_irq_disable(); /* * Remove this CPU from the online mask and disable it * unconditionally. This might be redundant in case that the reboot * vector was handled late and stop_other_cpus() sent an NMI. * * According to SDM and APM NMIs can be accepted even after soft * disabling the local APIC. */ set_cpu_online(cpu, false); disable_local_APIC(); mcheck_cpu_clear(c); /* * Use wbinvd on processors that support SME. This provides support * for performing a successful kexec when going from SME inactive * to SME active (or vice-versa). The cache must be cleared so that * if there are entries with the same physical address, both with and * without the encryption bit, they don't race each other when flushed * and potentially end up with the wrong entry being committed to * memory. * * Test the CPUID bit directly because the machine might've cleared * X86_FEATURE_SME due to cmdline options. */ if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0))) wbinvd(); /* * This brings a cache line back and dirties it, but * native_stop_other_cpus() will overwrite cpus_stop_mask after it * observed that all CPUs reported stop. This write will invalidate * the related cache line on this CPU. */ cpumask_clear_cpu(cpu, &cpus_stop_mask); #ifdef CONFIG_SMP if (smp_ops.stop_this_cpu) { smp_ops.stop_this_cpu(); BUG(); } #endif for (;;) { /* * Use native_halt() so that memory contents don't change * (stack usage and variables) after possibly issuing the * wbinvd() above. */ native_halt(); } } /* * Prefer MWAIT over HALT if MWAIT is supported, MWAIT_CPUID leaf * exists and whenever MONITOR/MWAIT extensions are present there is at * least one C1 substate. * * Do not prefer MWAIT if MONITOR instruction has a bug or idle=nomwait * is passed to kernel commandline parameter. */ static __init bool prefer_mwait_c1_over_halt(void) { const struct cpuinfo_x86 *c = &boot_cpu_data; u32 eax, ebx, ecx, edx; /* If override is enforced on the command line, fall back to HALT. */ if (boot_option_idle_override != IDLE_NO_OVERRIDE) return false; /* MWAIT is not supported on this platform. Fallback to HALT */ if (!cpu_has(c, X86_FEATURE_MWAIT)) return false; /* Monitor has a bug or APIC stops in C1E. Fallback to HALT */ if (boot_cpu_has_bug(X86_BUG_MONITOR) || boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) return false; cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &edx); /* * If MWAIT extensions are not available, it is safe to use MWAIT * with EAX=0, ECX=0. */ if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) return true; /* * If MWAIT extensions are available, there should be at least one * MWAIT C1 substate present. */ return !!(edx & MWAIT_C1_SUBSTATE_MASK); } /* * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT * with interrupts enabled and no flags, which is backwards compatible with the * original MWAIT implementation. */ static __cpuidle void mwait_idle(void) { if (!current_set_polling_and_test()) { if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) { mb(); /* quirk */ clflush((void *)&current_thread_info()->flags); mb(); /* quirk */ } __monitor((void *)&current_thread_info()->flags, 0, 0); if (!need_resched()) { __sti_mwait(0, 0); raw_local_irq_disable(); } } __current_clr_polling(); } void __init select_idle_routine(void) { if (boot_option_idle_override == IDLE_POLL) { if (IS_ENABLED(CONFIG_SMP) && __max_threads_per_core > 1) pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n"); return; } /* Required to guard against xen_set_default_idle() */ if (x86_idle_set()) return; if (prefer_mwait_c1_over_halt()) { pr_info("using mwait in idle threads\n"); static_call_update(x86_idle, mwait_idle); } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { pr_info("using TDX aware idle routine\n"); static_call_update(x86_idle, tdx_safe_halt); } else { static_call_update(x86_idle, default_idle); } } void amd_e400_c1e_apic_setup(void) { if (boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) { pr_info("Switch to broadcast mode on CPU%d\n", smp_processor_id()); local_irq_disable(); tick_broadcast_force(); local_irq_enable(); } } void __init arch_post_acpi_subsys_init(void) { u32 lo, hi; if (!boot_cpu_has_bug(X86_BUG_AMD_E400)) return; /* * AMD E400 detection needs to happen after ACPI has been enabled. If * the machine is affected K8_INTP_C1E_ACTIVE_MASK bits are set in * MSR_K8_INT_PENDING_MSG. */ rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); if (!(lo & K8_INTP_C1E_ACTIVE_MASK)) return; boot_cpu_set_bug(X86_BUG_AMD_APIC_C1E); if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) mark_tsc_unstable("TSC halt in AMD C1E"); if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE)) static_branch_enable(&arch_needs_tick_broadcast); pr_info("System has AMD C1E erratum E400. Workaround enabled.\n"); } static int __init idle_setup(char *str) { if (!str) return -EINVAL; if (!strcmp(str, "poll")) { pr_info("using polling idle threads\n"); boot_option_idle_override = IDLE_POLL; cpu_idle_poll_ctrl(true); } else if (!strcmp(str, "halt")) { /* 'idle=halt' HALT for idle. C-states are disabled. */ boot_option_idle_override = IDLE_HALT; } else if (!strcmp(str, "nomwait")) { /* 'idle=nomwait' disables MWAIT for idle */ boot_option_idle_override = IDLE_NOMWAIT; } else { return -EINVAL; } return 0; } early_param("idle", idle_setup); unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) sp -= get_random_u32_below(8192); return sp & ~0xf; } unsigned long arch_randomize_brk(struct mm_struct *mm) { if (mmap_is_ia32()) return randomize_page(mm->brk, SZ_32M); return randomize_page(mm->brk, SZ_1G); } /* * Called from fs/proc with a reference on @p to find the function * which called into schedule(). This needs to be done carefully * because the task might wake up and we might look at a stack * changing under us. */ unsigned long __get_wchan(struct task_struct *p) { struct unwind_state state; unsigned long addr = 0; if (!try_get_task_stack(p)) return 0; for (unwind_start(&state, p, NULL, NULL); !unwind_done(&state); unwind_next_frame(&state)) { addr = unwind_get_return_address(&state); if (!addr) break; if (in_sched_functions(addr)) continue; break; } put_task_stack(p); return addr; } long do_arch_prctl_common(int option, unsigned long arg2) { switch (option) { case ARCH_GET_CPUID: return get_cpuid_mode(); case ARCH_SET_CPUID: return set_cpuid_mode(arg2); case ARCH_GET_XCOMP_SUPP: case ARCH_GET_XCOMP_PERM: case ARCH_REQ_XCOMP_PERM: case ARCH_GET_XCOMP_GUEST_PERM: case ARCH_REQ_XCOMP_GUEST_PERM: return fpu_xstate_prctl(option, arg2); } return -EINVAL; }
196 348 3 6 348 196 6 347 201 382 145 357 347 201 357 33 33 33 358 218 359 323 322 359 358 176 370 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 // SPDX-License-Identifier: GPL-2.0 /* * A fast, small, non-recursive O(n log n) sort for the Linux kernel * * This performs n*log2(n) + 0.37*n + o(n) comparisons on average, * and 1.5*n*log2(n) + O(n) in the (very contrived) worst case. * * Quicksort manages n*log2(n) - 1.26*n for random inputs (1.63*n * better) at the expense of stack usage and much larger code to avoid * quicksort's O(n^2) worst case. */ #include <linux/types.h> #include <linux/export.h> #include <linux/sort.h> /** * is_aligned - is this pointer & size okay for word-wide copying? * @base: pointer to data * @size: size of each element * @align: required alignment (typically 4 or 8) * * Returns true if elements can be copied using word loads and stores. * The size must be a multiple of the alignment, and the base address must * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. * * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)" * to "if ((a | b) & mask)", so we do that by hand. */ __attribute_const__ __always_inline static bool is_aligned(const void *base, size_t size, unsigned char align) { unsigned char lsbits = (unsigned char)size; (void)base; #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS lsbits |= (unsigned char)(uintptr_t)base; #endif return (lsbits & (align - 1)) == 0; } /** * swap_words_32 - swap two elements in 32-bit chunks * @a: pointer to the first element to swap * @b: pointer to the second element to swap * @n: element size (must be a multiple of 4) * * Exchange the two objects in memory. This exploits base+index addressing, * which basically all CPUs have, to minimize loop overhead computations. * * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the * bottom of the loop, even though the zero flag is still valid from the * subtract (since the intervening mov instructions don't alter the flags). * Gcc 8.1.0 doesn't have that problem. */ static void swap_words_32(void *a, void *b, size_t n) { do { u32 t = *(u32 *)(a + (n -= 4)); *(u32 *)(a + n) = *(u32 *)(b + n); *(u32 *)(b + n) = t; } while (n); } /** * swap_words_64 - swap two elements in 64-bit chunks * @a: pointer to the first element to swap * @b: pointer to the second element to swap * @n: element size (must be a multiple of 8) * * Exchange the two objects in memory. This exploits base+index * addressing, which basically all CPUs have, to minimize loop overhead * computations. * * We'd like to use 64-bit loads if possible. If they're not, emulating * one requires base+index+4 addressing which x86 has but most other * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads, * but it's possible to have 64-bit loads without 64-bit pointers (e.g. * x32 ABI). Are there any cases the kernel needs to worry about? */ static void swap_words_64(void *a, void *b, size_t n) { do { #ifdef CONFIG_64BIT u64 t = *(u64 *)(a + (n -= 8)); *(u64 *)(a + n) = *(u64 *)(b + n); *(u64 *)(b + n) = t; #else /* Use two 32-bit transfers to avoid base+index+4 addressing */ u32 t = *(u32 *)(a + (n -= 4)); *(u32 *)(a + n) = *(u32 *)(b + n); *(u32 *)(b + n) = t; t = *(u32 *)(a + (n -= 4)); *(u32 *)(a + n) = *(u32 *)(b + n); *(u32 *)(b + n) = t; #endif } while (n); } /** * swap_bytes - swap two elements a byte at a time * @a: pointer to the first element to swap * @b: pointer to the second element to swap * @n: element size * * This is the fallback if alignment doesn't allow using larger chunks. */ static void swap_bytes(void *a, void *b, size_t n) { do { char t = ((char *)a)[--n]; ((char *)a)[n] = ((char *)b)[n]; ((char *)b)[n] = t; } while (n); } /* * The values are arbitrary as long as they can't be confused with * a pointer, but small integers make for the smallest compare * instructions. */ #define SWAP_WORDS_64 (swap_r_func_t)0 #define SWAP_WORDS_32 (swap_r_func_t)1 #define SWAP_BYTES (swap_r_func_t)2 #define SWAP_WRAPPER (swap_r_func_t)3 struct wrapper { cmp_func_t cmp; swap_func_t swap; }; /* * The function pointer is last to make tail calls most efficient if the * compiler decides not to inline this function. */ static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv) { if (swap_func == SWAP_WRAPPER) { ((const struct wrapper *)priv)->swap(a, b, (int)size); return; } if (swap_func == SWAP_WORDS_64) swap_words_64(a, b, size); else if (swap_func == SWAP_WORDS_32) swap_words_32(a, b, size); else if (swap_func == SWAP_BYTES) swap_bytes(a, b, size); else swap_func(a, b, (int)size, priv); } #define _CMP_WRAPPER ((cmp_r_func_t)0L) static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv) { if (cmp == _CMP_WRAPPER) return ((const struct wrapper *)priv)->cmp(a, b); return cmp(a, b, priv); } /** * parent - given the offset of the child, find the offset of the parent. * @i: the offset of the heap element whose parent is sought. Non-zero. * @lsbit: a precomputed 1-bit mask, equal to "size & -size" * @size: size of each element * * In terms of array indexes, the parent of element j = @i/@size is simply * (j-1)/2. But when working in byte offsets, we can't use implicit * truncation of integer divides. * * Fortunately, we only need one bit of the quotient, not the full divide. * @size has a least significant bit. That bit will be clear if @i is * an even multiple of @size, and set if it's an odd multiple. * * Logically, we're doing "if (i & lsbit) i -= size;", but since the * branch is unpredictable, it's done with a bit of clever branch-free * code instead. */ __attribute_const__ __always_inline static size_t parent(size_t i, unsigned int lsbit, size_t size) { i -= size; i -= size & -(i & lsbit); return i / 2; } /** * sort_r - sort an array of elements * @base: pointer to data to sort * @num: number of elements * @size: size of each element * @cmp_func: pointer to comparison function * @swap_func: pointer to swap function or NULL * @priv: third argument passed to comparison function * * This function does a heapsort on the given array. You may provide * a swap_func function if you need to do something more than a memory * copy (e.g. fix up pointers or auxiliary data), but the built-in swap * avoids a slow retpoline and so is significantly faster. * * The comparison function must adhere to specific mathematical * properties to ensure correct and stable sorting: * - Antisymmetry: cmp_func(a, b) must return the opposite sign of * cmp_func(b, a). * - Transitivity: if cmp_func(a, b) <= 0 and cmp_func(b, c) <= 0, then * cmp_func(a, c) <= 0. * * Sorting time is O(n log n) both on average and worst-case. While * quicksort is slightly faster on average, it suffers from exploitable * O(n*n) worst-case behavior and extra memory requirements that make * it less suitable for kernel use. */ void sort_r(void *base, size_t num, size_t size, cmp_r_func_t cmp_func, swap_r_func_t swap_func, const void *priv) { /* pre-scale counters for performance */ size_t n = num * size, a = (num/2) * size; const unsigned int lsbit = size & -size; /* Used to find parent */ size_t shift = 0; if (!a) /* num < 2 || size == 0 */ return; /* called from 'sort' without swap function, let's pick the default */ if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap) swap_func = NULL; if (!swap_func) { if (is_aligned(base, size, 8)) swap_func = SWAP_WORDS_64; else if (is_aligned(base, size, 4)) swap_func = SWAP_WORDS_32; else swap_func = SWAP_BYTES; } /* * Loop invariants: * 1. elements [a,n) satisfy the heap property (compare greater than * all of their children), * 2. elements [n,num*size) are sorted, and * 3. a <= b <= c <= d <= n (whenever they are valid). */ for (;;) { size_t b, c, d; if (a) /* Building heap: sift down a */ a -= size << shift; else if (n > 3 * size) { /* Sorting: Extract two largest elements */ n -= size; do_swap(base, base + n, size, swap_func, priv); shift = do_cmp(base + size, base + 2 * size, cmp_func, priv) <= 0; a = size << shift; n -= size; do_swap(base + a, base + n, size, swap_func, priv); } else { /* Sort complete */ break; } /* * Sift element at "a" down into heap. This is the * "bottom-up" variant, which significantly reduces * calls to cmp_func(): we find the sift-down path all * the way to the leaves (one compare per level), then * backtrack to find where to insert the target element. * * Because elements tend to sift down close to the leaves, * this uses fewer compares than doing two per level * on the way down. (A bit more than half as many on * average, 3/4 worst-case.) */ for (b = a; c = 2*b + size, (d = c + size) < n;) b = do_cmp(base + c, base + d, cmp_func, priv) > 0 ? c : d; if (d == n) /* Special case last leaf with no sibling */ b = c; /* Now backtrack from "b" to the correct location for "a" */ while (b != a && do_cmp(base + a, base + b, cmp_func, priv) >= 0) b = parent(b, lsbit, size); c = b; /* Where "a" belongs */ while (b != a) { /* Shift it into place */ b = parent(b, lsbit, size); do_swap(base + b, base + c, size, swap_func, priv); } } n -= size; do_swap(base, base + n, size, swap_func, priv); if (n == size * 2 && do_cmp(base, base + size, cmp_func, priv) > 0) do_swap(base, base + size, size, swap_func, priv); } EXPORT_SYMBOL(sort_r); void sort(void *base, size_t num, size_t size, cmp_func_t cmp_func, swap_func_t swap_func) { struct wrapper w = { .cmp = cmp_func, .swap = swap_func, }; return sort_r(base, num, size, _CMP_WRAPPER, SWAP_WRAPPER, &w); } EXPORT_SYMBOL(sort);
168 126 2 93 109 1 4 14 7 89 1 3 85 1 3 2 3 4 45 2 81 24 69 6 26 2 1 1 39 11 28 27 4 23 14 23 3 19 27 1 16 10 14 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 /* * linux/fs/hfs/dir.c * * Copyright (C) 1995-1997 Paul H. Hargrove * (C) 2003 Ardis Technologies <roman@ardistech.com> * This file may be distributed under the terms of the GNU General Public License. * * This file contains directory-related functions independent of which * scheme is being used to represent forks. * * Based on the minix file system code, (C) 1991, 1992 by Linus Torvalds */ #include "hfs_fs.h" #include "btree.h" /* * hfs_lookup() */ static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { hfs_cat_rec rec; struct hfs_find_data fd; struct inode *inode = NULL; int res; res = hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); if (res) return ERR_PTR(res); hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name); res = hfs_brec_read(&fd, &rec, sizeof(rec)); if (res) { if (res != -ENOENT) inode = ERR_PTR(res); } else { inode = hfs_iget(dir->i_sb, &fd.search_key->cat, &rec); if (!inode) inode = ERR_PTR(-EACCES); } hfs_find_exit(&fd); return d_splice_alias(inode, dentry); } /* * hfs_readdir */ static int hfs_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; int len, err; char strbuf[HFS_MAX_NAMELEN]; union hfs_cat_rec entry; struct hfs_find_data fd; struct hfs_readdir_data *rd; u16 type; if (ctx->pos >= inode->i_size) return 0; err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); if (err) return err; hfs_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); err = hfs_brec_find(&fd); if (err) goto out; if (ctx->pos == 0) { /* This is completely artificial... */ if (!dir_emit_dot(file, ctx)) goto out; ctx->pos = 1; } if (ctx->pos == 1) { if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { err = -EIO; goto out; } hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); if (entry.type != HFS_CDR_THD) { pr_err("bad catalog folder thread\n"); err = -EIO; goto out; } //if (fd.entrylength < HFS_MIN_THREAD_SZ) { // pr_err("truncated catalog thread\n"); // err = -EIO; // goto out; //} if (!dir_emit(ctx, "..", 2, be32_to_cpu(entry.thread.ParID), DT_DIR)) goto out; ctx->pos = 2; } if (ctx->pos >= inode->i_size) goto out; err = hfs_brec_goto(&fd, ctx->pos - 1); if (err) goto out; for (;;) { if (be32_to_cpu(fd.key->cat.ParID) != inode->i_ino) { pr_err("walked past end of dir\n"); err = -EIO; goto out; } if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { err = -EIO; goto out; } hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); type = entry.type; len = hfs_mac2asc(sb, strbuf, &fd.key->cat.CName); if (type == HFS_CDR_DIR) { if (fd.entrylength < sizeof(struct hfs_cat_dir)) { pr_err("small dir entry\n"); err = -EIO; goto out; } if (!dir_emit(ctx, strbuf, len, be32_to_cpu(entry.dir.DirID), DT_DIR)) break; } else if (type == HFS_CDR_FIL) { if (fd.entrylength < sizeof(struct hfs_cat_file)) { pr_err("small file entry\n"); err = -EIO; goto out; } if (!dir_emit(ctx, strbuf, len, be32_to_cpu(entry.file.FlNum), DT_REG)) break; } else { pr_err("bad catalog entry type %d\n", type); err = -EIO; goto out; } ctx->pos++; if (ctx->pos >= inode->i_size) goto out; err = hfs_brec_goto(&fd, 1); if (err) goto out; } rd = file->private_data; if (!rd) { rd = kmalloc(sizeof(struct hfs_readdir_data), GFP_KERNEL); if (!rd) { err = -ENOMEM; goto out; } file->private_data = rd; rd->file = file; spin_lock(&HFS_I(inode)->open_dir_lock); list_add(&rd->list, &HFS_I(inode)->open_dir_list); spin_unlock(&HFS_I(inode)->open_dir_lock); } /* * Can be done after the list insertion; exclusion with * hfs_delete_cat() is provided by directory lock. */ memcpy(&rd->key, &fd.key->cat, sizeof(struct hfs_cat_key)); out: hfs_find_exit(&fd); return err; } static int hfs_dir_release(struct inode *inode, struct file *file) { struct hfs_readdir_data *rd = file->private_data; if (rd) { spin_lock(&HFS_I(inode)->open_dir_lock); list_del(&rd->list); spin_unlock(&HFS_I(inode)->open_dir_lock); kfree(rd); } return 0; } /* * hfs_create() * * This is the create() entry in the inode_operations structure for * regular HFS directories. The purpose is to create a new file in * a directory and return a corresponding inode, given the inode for * the directory and the name (and its length) of the new file. */ static int hfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; int res; inode = hfs_new_inode(dir, &dentry->d_name, mode); if (!inode) return -ENOMEM; res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); if (res) { clear_nlink(inode); hfs_delete_inode(inode); iput(inode); return res; } d_instantiate(dentry, inode); mark_inode_dirty(inode); return 0; } /* * hfs_mkdir() * * This is the mkdir() entry in the inode_operations structure for * regular HFS directories. The purpose is to create a new directory * in a directory, given the inode for the parent directory and the * name (and its length) of the new directory. */ static int hfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; int res; inode = hfs_new_inode(dir, &dentry->d_name, S_IFDIR | mode); if (!inode) return -ENOMEM; res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); if (res) { clear_nlink(inode); hfs_delete_inode(inode); iput(inode); return res; } d_instantiate(dentry, inode); mark_inode_dirty(inode); return 0; } /* * hfs_remove() * * This serves as both unlink() and rmdir() in the inode_operations * structure for regular HFS directories. The purpose is to delete * an existing child, given the inode for the parent directory and * the name (and its length) of the existing directory. * * HFS does not have hardlinks, so both rmdir and unlink set the * link count to 0. The only difference is the emptiness check. */ static int hfs_remove(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); int res; if (S_ISDIR(inode->i_mode) && inode->i_size != 2) return -ENOTEMPTY; res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name); if (res) return res; clear_nlink(inode); inode_set_ctime_current(inode); hfs_delete_inode(inode); mark_inode_dirty(inode); return 0; } /* * hfs_rename() * * This is the rename() entry in the inode_operations structure for * regular HFS directories. The purpose is to rename an existing * file or directory, given the inode for the current directory and * the name (and its length) of the existing file/directory and the * inode for the new directory and the name (and its length) of the * new file/directory. * XXX: how do you handle must_be dir? */ static int hfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { int res; if (flags & ~RENAME_NOREPLACE) return -EINVAL; /* Unlink destination if it already exists */ if (d_really_is_positive(new_dentry)) { res = hfs_remove(new_dir, new_dentry); if (res) return res; } res = hfs_cat_move(d_inode(old_dentry)->i_ino, old_dir, &old_dentry->d_name, new_dir, &new_dentry->d_name); if (!res) hfs_cat_build_key(old_dir->i_sb, (btree_key *)&HFS_I(d_inode(old_dentry))->cat_key, new_dir->i_ino, &new_dentry->d_name); return res; } const struct file_operations hfs_dir_operations = { .read = generic_read_dir, .iterate_shared = hfs_readdir, .llseek = generic_file_llseek, .release = hfs_dir_release, }; const struct inode_operations hfs_dir_inode_operations = { .create = hfs_create, .lookup = hfs_lookup, .unlink = hfs_remove, .mkdir = hfs_mkdir, .rmdir = hfs_remove, .rename = hfs_rename, .setattr = hfs_inode_setattr, };
5 186 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. * All Rights Reserved. */ #ifndef __XFS_RTBITMAP_H__ #define __XFS_RTBITMAP_H__ #include "xfs_rtgroup.h" struct xfs_rtalloc_args { struct xfs_rtgroup *rtg; struct xfs_mount *mp; struct xfs_trans *tp; struct xfs_buf *rbmbp; /* bitmap block buffer */ struct xfs_buf *sumbp; /* summary block buffer */ xfs_fileoff_t rbmoff; /* bitmap block number */ xfs_fileoff_t sumoff; /* summary block number */ }; static inline xfs_rtblock_t xfs_rtx_to_rtb( struct xfs_rtgroup *rtg, xfs_rtxnum_t rtx) { struct xfs_mount *mp = rtg_mount(rtg); xfs_rtblock_t start = xfs_group_start_fsb(rtg_group(rtg)); if (mp->m_rtxblklog >= 0) return start + (rtx << mp->m_rtxblklog); return start + (rtx * mp->m_sb.sb_rextsize); } /* Convert an rgbno into an rt extent number. */ static inline xfs_rtxnum_t xfs_rgbno_to_rtx( struct xfs_mount *mp, xfs_rgblock_t rgbno) { if (likely(mp->m_rtxblklog >= 0)) return rgbno >> mp->m_rtxblklog; return rgbno / mp->m_sb.sb_rextsize; } static inline uint64_t xfs_rtbxlen_to_blen( struct xfs_mount *mp, xfs_rtbxlen_t rtbxlen) { if (mp->m_rtxblklog >= 0) return rtbxlen << mp->m_rtxblklog; return rtbxlen * mp->m_sb.sb_rextsize; } static inline xfs_extlen_t xfs_rtxlen_to_extlen( struct xfs_mount *mp, xfs_rtxlen_t rtxlen) { if (mp->m_rtxblklog >= 0) return rtxlen << mp->m_rtxblklog; return rtxlen * mp->m_sb.sb_rextsize; } /* Compute the misalignment between an extent length and a realtime extent .*/ static inline unsigned int xfs_extlen_to_rtxmod( struct xfs_mount *mp, xfs_extlen_t len) { if (mp->m_rtxblklog >= 0) return len & mp->m_rtxblkmask; return len % mp->m_sb.sb_rextsize; } static inline xfs_rtxlen_t xfs_extlen_to_rtxlen( struct xfs_mount *mp, xfs_extlen_t len) { if (mp->m_rtxblklog >= 0) return len >> mp->m_rtxblklog; return len / mp->m_sb.sb_rextsize; } /* Convert an rt block count into an rt extent count. */ static inline xfs_rtbxlen_t xfs_blen_to_rtbxlen( struct xfs_mount *mp, uint64_t blen) { if (likely(mp->m_rtxblklog >= 0)) return blen >> mp->m_rtxblklog; return div_u64(blen, mp->m_sb.sb_rextsize); } /* Return the offset of a file block length within an rt extent. */ static inline xfs_extlen_t xfs_blen_to_rtxoff( struct xfs_mount *mp, xfs_filblks_t blen) { if (likely(mp->m_rtxblklog >= 0)) return blen & mp->m_rtxblkmask; return do_div(blen, mp->m_sb.sb_rextsize); } /* Round this block count up to the nearest rt extent size. */ static inline xfs_filblks_t xfs_blen_roundup_rtx( struct xfs_mount *mp, xfs_filblks_t blen) { return roundup_64(blen, mp->m_sb.sb_rextsize); } /* Convert an rt block number into an rt extent number. */ static inline xfs_rtxnum_t xfs_rtb_to_rtx( struct xfs_mount *mp, xfs_rtblock_t rtbno) { /* open-coded 64-bit masking operation */ rtbno &= mp->m_groups[XG_TYPE_RTG].blkmask; if (likely(mp->m_rtxblklog >= 0)) return rtbno >> mp->m_rtxblklog; return div_u64(rtbno, mp->m_sb.sb_rextsize); } /* Return the offset of a rtgroup block number within an rt extent. */ static inline xfs_extlen_t xfs_rgbno_to_rtxoff( struct xfs_mount *mp, xfs_rgblock_t rgbno) { return rgbno % mp->m_sb.sb_rextsize; } /* Return the offset of an rt block number within an rt extent. */ static inline xfs_extlen_t xfs_rtb_to_rtxoff( struct xfs_mount *mp, xfs_rtblock_t rtbno) { /* open-coded 64-bit masking operation */ rtbno &= mp->m_groups[XG_TYPE_RTG].blkmask; if (likely(mp->m_rtxblklog >= 0)) return rtbno & mp->m_rtxblkmask; return do_div(rtbno, mp->m_sb.sb_rextsize); } /* Round this file block offset up to the nearest rt extent size. */ static inline xfs_rtblock_t xfs_fileoff_roundup_rtx( struct xfs_mount *mp, xfs_fileoff_t off) { return roundup_64(off, mp->m_sb.sb_rextsize); } /* Round this file block offset down to the nearest rt extent size. */ static inline xfs_rtblock_t xfs_fileoff_rounddown_rtx( struct xfs_mount *mp, xfs_fileoff_t off) { return rounddown_64(off, mp->m_sb.sb_rextsize); } /* Convert an rt extent number to a file block offset in the rt bitmap file. */ static inline xfs_fileoff_t xfs_rtx_to_rbmblock( struct xfs_mount *mp, xfs_rtxnum_t rtx) { if (xfs_has_rtgroups(mp)) return div_u64(rtx, mp->m_rtx_per_rbmblock); return rtx >> mp->m_blkbit_log; } /* Convert an rt extent number to a word offset within an rt bitmap block. */ static inline unsigned int xfs_rtx_to_rbmword( struct xfs_mount *mp, xfs_rtxnum_t rtx) { if (xfs_has_rtgroups(mp)) { unsigned int mod; div_u64_rem(rtx >> XFS_NBWORDLOG, mp->m_blockwsize, &mod); return mod; } return (rtx >> XFS_NBWORDLOG) & (mp->m_blockwsize - 1); } /* Convert a file block offset in the rt bitmap file to an rt extent number. */ static inline xfs_rtxnum_t xfs_rbmblock_to_rtx( struct xfs_mount *mp, xfs_fileoff_t rbmoff) { if (xfs_has_rtgroups(mp)) return rbmoff * mp->m_rtx_per_rbmblock; return rbmoff << mp->m_blkbit_log; } /* Return a pointer to a bitmap word within a rt bitmap block. */ static inline union xfs_rtword_raw * xfs_rbmblock_wordptr( struct xfs_rtalloc_args *args, unsigned int index) { struct xfs_mount *mp = args->mp; union xfs_rtword_raw *words; struct xfs_rtbuf_blkinfo *hdr = args->rbmbp->b_addr; if (xfs_has_rtgroups(mp)) words = (union xfs_rtword_raw *)(hdr + 1); else words = args->rbmbp->b_addr; return words + index; } /* Convert an ondisk bitmap word to its incore representation. */ static inline xfs_rtword_t xfs_rtbitmap_getword( struct xfs_rtalloc_args *args, unsigned int index) { union xfs_rtword_raw *word = xfs_rbmblock_wordptr(args, index); if (xfs_has_rtgroups(args->mp)) return be32_to_cpu(word->rtg); return word->old; } /* Set an ondisk bitmap word from an incore representation. */ static inline void xfs_rtbitmap_setword( struct xfs_rtalloc_args *args, unsigned int index, xfs_rtword_t value) { union xfs_rtword_raw *word = xfs_rbmblock_wordptr(args, index); if (xfs_has_rtgroups(args->mp)) word->rtg = cpu_to_be32(value); else word->old = value; } /* * Convert a rt extent length and rt bitmap block number to a xfs_suminfo_t * offset within the rt summary file. */ static inline xfs_rtsumoff_t xfs_rtsumoffs( struct xfs_mount *mp, int log2_len, xfs_fileoff_t rbmoff) { return log2_len * mp->m_sb.sb_rbmblocks + rbmoff; } /* * Convert an xfs_suminfo_t offset to a file block offset within the rt summary * file. */ static inline xfs_fileoff_t xfs_rtsumoffs_to_block( struct xfs_mount *mp, xfs_rtsumoff_t rsumoff) { if (xfs_has_rtgroups(mp)) return rsumoff / mp->m_blockwsize; return XFS_B_TO_FSBT(mp, rsumoff * sizeof(xfs_suminfo_t)); } /* * Convert an xfs_suminfo_t offset to an info word offset within an rt summary * block. */ static inline unsigned int xfs_rtsumoffs_to_infoword( struct xfs_mount *mp, xfs_rtsumoff_t rsumoff) { unsigned int mask = mp->m_blockmask >> XFS_SUMINFOLOG; if (xfs_has_rtgroups(mp)) return rsumoff % mp->m_blockwsize; return rsumoff & mask; } /* Return a pointer to a summary info word within a rt summary block. */ static inline union xfs_suminfo_raw * xfs_rsumblock_infoptr( struct xfs_rtalloc_args *args, unsigned int index) { union xfs_suminfo_raw *info; struct xfs_rtbuf_blkinfo *hdr = args->sumbp->b_addr; if (xfs_has_rtgroups(args->mp)) info = (union xfs_suminfo_raw *)(hdr + 1); else info = args->sumbp->b_addr; return info + index; } /* Get the current value of a summary counter. */ static inline xfs_suminfo_t xfs_suminfo_get( struct xfs_rtalloc_args *args, unsigned int index) { union xfs_suminfo_raw *info = xfs_rsumblock_infoptr(args, index); if (xfs_has_rtgroups(args->mp)) return be32_to_cpu(info->rtg); return info->old; } /* Add to the current value of a summary counter and return the new value. */ static inline xfs_suminfo_t xfs_suminfo_add( struct xfs_rtalloc_args *args, unsigned int index, int delta) { union xfs_suminfo_raw *info = xfs_rsumblock_infoptr(args, index); if (xfs_has_rtgroups(args->mp)) { be32_add_cpu(&info->rtg, delta); return be32_to_cpu(info->rtg); } info->old += delta; return info->old; } static inline const struct xfs_buf_ops * xfs_rtblock_ops( struct xfs_mount *mp, enum xfs_rtg_inodes type) { if (xfs_has_rtgroups(mp)) { if (type == XFS_RTGI_SUMMARY) return &xfs_rtsummary_buf_ops; return &xfs_rtbitmap_buf_ops; } return &xfs_rtbuf_ops; } /* * Functions for walking free space rtextents in the realtime bitmap. */ struct xfs_rtalloc_rec { xfs_rtxnum_t ar_startext; xfs_rtbxlen_t ar_extcount; }; typedef int (*xfs_rtalloc_query_range_fn)( struct xfs_rtgroup *rtg, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv); #ifdef CONFIG_XFS_RT void xfs_rtbuf_cache_relse(struct xfs_rtalloc_args *args); int xfs_rtbitmap_read_buf(struct xfs_rtalloc_args *args, xfs_fileoff_t block); int xfs_rtsummary_read_buf(struct xfs_rtalloc_args *args, xfs_fileoff_t block); int xfs_rtcheck_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, xfs_rtxlen_t len, int val, xfs_rtxnum_t *new, int *stat); int xfs_rtfind_back(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, xfs_rtxnum_t *rtblock); int xfs_rtfind_forw(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, xfs_rtxnum_t limit, xfs_rtxnum_t *rtblock); int xfs_rtmodify_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, xfs_rtxlen_t len, int val); int xfs_rtget_summary(struct xfs_rtalloc_args *args, int log, xfs_fileoff_t bbno, xfs_suminfo_t *sum); int xfs_rtmodify_summary(struct xfs_rtalloc_args *args, int log, xfs_fileoff_t bbno, int delta); int xfs_rtfree_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, xfs_rtxlen_t len); int xfs_rtalloc_query_range(struct xfs_rtgroup *rtg, struct xfs_trans *tp, xfs_rtxnum_t start, xfs_rtxnum_t end, xfs_rtalloc_query_range_fn fn, void *priv); int xfs_rtalloc_query_all(struct xfs_rtgroup *rtg, struct xfs_trans *tp, xfs_rtalloc_query_range_fn fn, void *priv); int xfs_rtalloc_extent_is_free(struct xfs_rtgroup *rtg, struct xfs_trans *tp, xfs_rtxnum_t start, xfs_rtxlen_t len, bool *is_free); int xfs_rtfree_extent(struct xfs_trans *tp, struct xfs_rtgroup *rtg, xfs_rtxnum_t start, xfs_rtxlen_t len); /* Same as above, but in units of rt blocks. */ int xfs_rtfree_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg, xfs_fsblock_t rtbno, xfs_filblks_t rtlen); xfs_rtxnum_t xfs_rtbitmap_rtx_per_rbmblock(struct xfs_mount *mp); xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp); xfs_filblks_t xfs_rtbitmap_blockcount_len(struct xfs_mount *mp, xfs_rtbxlen_t rtextents); xfs_filblks_t xfs_rtsummary_blockcount(struct xfs_mount *mp, unsigned int *rsumlevels); int xfs_rtfile_initialize_blocks(struct xfs_rtgroup *rtg, enum xfs_rtg_inodes type, xfs_fileoff_t offset_fsb, xfs_fileoff_t end_fsb, void *data); int xfs_rtbitmap_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip, struct xfs_trans *tp, bool init); int xfs_rtsummary_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip, struct xfs_trans *tp, bool init); #else /* CONFIG_XFS_RT */ # define xfs_rtfree_extent(t,b,l) (-ENOSYS) static inline int xfs_rtfree_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg, xfs_fsblock_t rtbno, xfs_filblks_t rtlen) { return -ENOSYS; } # define xfs_rtalloc_query_range(m,t,l,h,f,p) (-ENOSYS) # define xfs_rtalloc_query_all(m,t,f,p) (-ENOSYS) # define xfs_rtbitmap_read_buf(a,b) (-ENOSYS) # define xfs_rtsummary_read_buf(a,b) (-ENOSYS) # define xfs_rtbuf_cache_relse(a) (0) # define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS) static inline xfs_filblks_t xfs_rtbitmap_blockcount_len(struct xfs_mount *mp, xfs_rtbxlen_t rtextents) { /* shut up gcc */ return 0; } #endif /* CONFIG_XFS_RT */ #endif /* __XFS_RTBITMAP_H__ */
611 612 414 236 228 412 98 98 98 98 1 157 365 393 392 393 158 158 365 8 158 158 365 237 359 227 228 355 357 357 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) "%s: " fmt, __func__ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/percpu-refcount.h> /* * Initially, a percpu refcount is just a set of percpu counters. Initially, we * don't try to detect the ref hitting 0 - which means that get/put can just * increment or decrement the local counter. Note that the counter on a * particular cpu can (and will) wrap - this is fine, when we go to shutdown the * percpu counters will all sum to the correct value * * (More precisely: because modular arithmetic is commutative the sum of all the * percpu_count vars will be equal to what it would have been if all the gets * and puts were done to a single integer, even if some of the percpu integers * overflow or underflow). * * The real trick to implementing percpu refcounts is shutdown. We can't detect * the ref hitting 0 on every put - this would require global synchronization * and defeat the whole purpose of using percpu refs. * * What we do is require the user to keep track of the initial refcount; we know * the ref can't hit 0 before the user drops the initial ref, so as long as we * convert to non percpu mode before the initial ref is dropped everything * works. * * Converting to non percpu mode is done with some RCUish stuff in * percpu_ref_kill. Additionally, we need a bias value so that the * atomic_long_t can't hit 0 before we've added up all the percpu refs. */ #define PERCPU_COUNT_BIAS (1LU << (BITS_PER_LONG - 1)) static DEFINE_SPINLOCK(percpu_ref_switch_lock); static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq); static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref) { return (unsigned long __percpu *) (ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC_DEAD); } /** * percpu_ref_init - initialize a percpu refcount * @ref: percpu_ref to initialize * @release: function which will be called when refcount hits 0 * @flags: PERCPU_REF_INIT_* flags * @gfp: allocation mask to use * * Initializes @ref. @ref starts out in percpu mode with a refcount of 1 unless * @flags contains PERCPU_REF_INIT_ATOMIC or PERCPU_REF_INIT_DEAD. These flags * change the start state to atomic with the latter setting the initial refcount * to 0. See the definitions of PERCPU_REF_INIT_* flags for flag behaviors. * * Note that @release must not sleep - it may potentially be called from RCU * callback context by percpu_ref_kill(). */ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, unsigned int flags, gfp_t gfp) { size_t align = max_t(size_t, 1 << __PERCPU_REF_FLAG_BITS, __alignof__(unsigned long)); unsigned long start_count = 0; struct percpu_ref_data *data; ref->percpu_count_ptr = (unsigned long) __alloc_percpu_gfp(sizeof(unsigned long), align, gfp); if (!ref->percpu_count_ptr) return -ENOMEM; data = kzalloc(sizeof(*ref->data), gfp); if (!data) { free_percpu((void __percpu *)ref->percpu_count_ptr); ref->percpu_count_ptr = 0; return -ENOMEM; } data->force_atomic = flags & PERCPU_REF_INIT_ATOMIC; data->allow_reinit = flags & PERCPU_REF_ALLOW_REINIT; if (flags & (PERCPU_REF_INIT_ATOMIC | PERCPU_REF_INIT_DEAD)) { ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; data->allow_reinit = true; } else { start_count += PERCPU_COUNT_BIAS; } if (flags & PERCPU_REF_INIT_DEAD) ref->percpu_count_ptr |= __PERCPU_REF_DEAD; else start_count++; atomic_long_set(&data->count, start_count); data->release = release; data->confirm_switch = NULL; data->ref = ref; ref->data = data; return 0; } EXPORT_SYMBOL_GPL(percpu_ref_init); static void __percpu_ref_exit(struct percpu_ref *ref) { unsigned long __percpu *percpu_count = percpu_count_ptr(ref); if (percpu_count) { /* non-NULL confirm_switch indicates switching in progress */ WARN_ON_ONCE(ref->data && ref->data->confirm_switch); free_percpu(percpu_count); ref->percpu_count_ptr = __PERCPU_REF_ATOMIC_DEAD; } } /** * percpu_ref_exit - undo percpu_ref_init() * @ref: percpu_ref to exit * * This function exits @ref. The caller is responsible for ensuring that * @ref is no longer in active use. The usual places to invoke this * function from are the @ref->release() callback or in init failure path * where percpu_ref_init() succeeded but other parts of the initialization * of the embedding object failed. */ void percpu_ref_exit(struct percpu_ref *ref) { struct percpu_ref_data *data = ref->data; unsigned long flags; __percpu_ref_exit(ref); if (!data) return; spin_lock_irqsave(&percpu_ref_switch_lock, flags); ref->percpu_count_ptr |= atomic_long_read(&ref->data->count) << __PERCPU_REF_FLAG_BITS; ref->data = NULL; spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); kfree(data); } EXPORT_SYMBOL_GPL(percpu_ref_exit); static void percpu_ref_call_confirm_rcu(struct rcu_head *rcu) { struct percpu_ref_data *data = container_of(rcu, struct percpu_ref_data, rcu); struct percpu_ref *ref = data->ref; data->confirm_switch(ref); data->confirm_switch = NULL; wake_up_all(&percpu_ref_switch_waitq); if (!data->allow_reinit) __percpu_ref_exit(ref); /* drop ref from percpu_ref_switch_to_atomic() */ percpu_ref_put(ref); } static void percpu_ref_switch_to_atomic_rcu(struct rcu_head *rcu) { struct percpu_ref_data *data = container_of(rcu, struct percpu_ref_data, rcu); struct percpu_ref *ref = data->ref; unsigned long __percpu *percpu_count = percpu_count_ptr(ref); static atomic_t underflows; unsigned long count = 0; int cpu; for_each_possible_cpu(cpu) count += *per_cpu_ptr(percpu_count, cpu); pr_debug("global %lu percpu %lu\n", atomic_long_read(&data->count), count); /* * It's crucial that we sum the percpu counters _before_ adding the sum * to &ref->count; since gets could be happening on one cpu while puts * happen on another, adding a single cpu's count could cause * @ref->count to hit 0 before we've got a consistent value - but the * sum of all the counts will be consistent and correct. * * Subtracting the bias value then has to happen _after_ adding count to * &ref->count; we need the bias value to prevent &ref->count from * reaching 0 before we add the percpu counts. But doing it at the same * time is equivalent and saves us atomic operations: */ atomic_long_add((long)count - PERCPU_COUNT_BIAS, &data->count); if (WARN_ONCE(atomic_long_read(&data->count) <= 0, "percpu ref (%ps) <= 0 (%ld) after switching to atomic", data->release, atomic_long_read(&data->count)) && atomic_inc_return(&underflows) < 4) { pr_err("%s(): percpu_ref underflow", __func__); mem_dump_obj(data); } /* @ref is viewed as dead on all CPUs, send out switch confirmation */ percpu_ref_call_confirm_rcu(rcu); } static void percpu_ref_noop_confirm_switch(struct percpu_ref *ref) { } static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch) { if (ref->percpu_count_ptr & __PERCPU_REF_ATOMIC) { if (confirm_switch) confirm_switch(ref); return; } /* switching from percpu to atomic */ ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; /* * Non-NULL ->confirm_switch is used to indicate that switching is * in progress. Use noop one if unspecified. */ ref->data->confirm_switch = confirm_switch ?: percpu_ref_noop_confirm_switch; percpu_ref_get(ref); /* put after confirmation */ call_rcu_hurry(&ref->data->rcu, percpu_ref_switch_to_atomic_rcu); } static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) { unsigned long __percpu *percpu_count = percpu_count_ptr(ref); int cpu; BUG_ON(!percpu_count); if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) return; if (WARN_ON_ONCE(!ref->data->allow_reinit)) return; atomic_long_add(PERCPU_COUNT_BIAS, &ref->data->count); /* * Restore per-cpu operation. smp_store_release() is paired * with READ_ONCE() in __ref_is_percpu() and guarantees that the * zeroing is visible to all percpu accesses which can see the * following __PERCPU_REF_ATOMIC clearing. */ for_each_possible_cpu(cpu) *per_cpu_ptr(percpu_count, cpu) = 0; smp_store_release(&ref->percpu_count_ptr, ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC); } static void __percpu_ref_switch_mode(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch) { struct percpu_ref_data *data = ref->data; lockdep_assert_held(&percpu_ref_switch_lock); /* * If the previous ATOMIC switching hasn't finished yet, wait for * its completion. If the caller ensures that ATOMIC switching * isn't in progress, this function can be called from any context. */ wait_event_lock_irq(percpu_ref_switch_waitq, !data->confirm_switch, percpu_ref_switch_lock); if (data->force_atomic || percpu_ref_is_dying(ref)) __percpu_ref_switch_to_atomic(ref, confirm_switch); else __percpu_ref_switch_to_percpu(ref); } /** * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode * @ref: percpu_ref to switch to atomic mode * @confirm_switch: optional confirmation callback * * There's no reason to use this function for the usual reference counting. * Use percpu_ref_kill[_and_confirm](). * * Schedule switching of @ref to atomic mode. All its percpu counts will * be collected to the main atomic counter. On completion, when all CPUs * are guaraneed to be in atomic mode, @confirm_switch, which may not * block, is invoked. This function may be invoked concurrently with all * the get/put operations and can safely be mixed with kill and reinit * operations. Note that @ref will stay in atomic mode across kill/reinit * cycles until percpu_ref_switch_to_percpu() is called. * * This function may block if @ref is in the process of switching to atomic * mode. If the caller ensures that @ref is not in the process of * switching to atomic mode, this function can be called from any context. */ void percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch) { unsigned long flags; spin_lock_irqsave(&percpu_ref_switch_lock, flags); ref->data->force_atomic = true; __percpu_ref_switch_mode(ref, confirm_switch); spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic); /** * percpu_ref_switch_to_atomic_sync - switch a percpu_ref to atomic mode * @ref: percpu_ref to switch to atomic mode * * Schedule switching the ref to atomic mode, and wait for the * switch to complete. Caller must ensure that no other thread * will switch back to percpu mode. */ void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref) { percpu_ref_switch_to_atomic(ref, NULL); wait_event(percpu_ref_switch_waitq, !ref->data->confirm_switch); } EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic_sync); /** * percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode * @ref: percpu_ref to switch to percpu mode * * There's no reason to use this function for the usual reference counting. * To re-use an expired ref, use percpu_ref_reinit(). * * Switch @ref to percpu mode. This function may be invoked concurrently * with all the get/put operations and can safely be mixed with kill and * reinit operations. This function reverses the sticky atomic state set * by PERCPU_REF_INIT_ATOMIC or percpu_ref_switch_to_atomic(). If @ref is * dying or dead, the actual switching takes place on the following * percpu_ref_reinit(). * * This function may block if @ref is in the process of switching to atomic * mode. If the caller ensures that @ref is not in the process of * switching to atomic mode, this function can be called from any context. */ void percpu_ref_switch_to_percpu(struct percpu_ref *ref) { unsigned long flags; spin_lock_irqsave(&percpu_ref_switch_lock, flags); ref->data->force_atomic = false; __percpu_ref_switch_mode(ref, NULL); spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ref_switch_to_percpu); /** * percpu_ref_kill_and_confirm - drop the initial ref and schedule confirmation * @ref: percpu_ref to kill * @confirm_kill: optional confirmation callback * * Equivalent to percpu_ref_kill() but also schedules kill confirmation if * @confirm_kill is not NULL. @confirm_kill, which may not block, will be * called after @ref is seen as dead from all CPUs at which point all * further invocations of percpu_ref_tryget_live() will fail. See * percpu_ref_tryget_live() for details. * * This function normally doesn't block and can be called from any context * but it may block if @confirm_kill is specified and @ref is in the * process of switching to atomic mode by percpu_ref_switch_to_atomic(). * * There are no implied RCU grace periods between kill and release. */ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill) { unsigned long flags; spin_lock_irqsave(&percpu_ref_switch_lock, flags); WARN_ONCE(percpu_ref_is_dying(ref), "%s called more than once on %ps!", __func__, ref->data->release); ref->percpu_count_ptr |= __PERCPU_REF_DEAD; __percpu_ref_switch_mode(ref, confirm_kill); percpu_ref_put(ref); spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); /** * percpu_ref_is_zero - test whether a percpu refcount reached zero * @ref: percpu_ref to test * * Returns %true if @ref reached zero. * * This function is safe to call as long as @ref is between init and exit. */ bool percpu_ref_is_zero(struct percpu_ref *ref) { unsigned long __percpu *percpu_count; unsigned long count, flags; if (__ref_is_percpu(ref, &percpu_count)) return false; /* protect us from being destroyed */ spin_lock_irqsave(&percpu_ref_switch_lock, flags); if (ref->data) count = atomic_long_read(&ref->data->count); else count = ref->percpu_count_ptr >> __PERCPU_REF_FLAG_BITS; spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); return count == 0; } EXPORT_SYMBOL_GPL(percpu_ref_is_zero); /** * percpu_ref_reinit - re-initialize a percpu refcount * @ref: perpcu_ref to re-initialize * * Re-initialize @ref so that it's in the same state as when it finished * percpu_ref_init() ignoring %PERCPU_REF_INIT_DEAD. @ref must have been * initialized successfully and reached 0 but not exited. * * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while * this function is in progress. */ void percpu_ref_reinit(struct percpu_ref *ref) { WARN_ON_ONCE(!percpu_ref_is_zero(ref)); percpu_ref_resurrect(ref); } EXPORT_SYMBOL_GPL(percpu_ref_reinit); /** * percpu_ref_resurrect - modify a percpu refcount from dead to live * @ref: perpcu_ref to resurrect * * Modify @ref so that it's in the same state as before percpu_ref_kill() was * called. @ref must be dead but must not yet have exited. * * If @ref->release() frees @ref then the caller is responsible for * guaranteeing that @ref->release() does not get called while this * function is in progress. * * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while * this function is in progress. */ void percpu_ref_resurrect(struct percpu_ref *ref) { unsigned long __percpu *percpu_count; unsigned long flags; spin_lock_irqsave(&percpu_ref_switch_lock, flags); WARN_ON_ONCE(!percpu_ref_is_dying(ref)); WARN_ON_ONCE(__ref_is_percpu(ref, &percpu_count)); ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD; percpu_ref_get(ref); __percpu_ref_switch_mode(ref, NULL); spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ref_resurrect);
51 49 8 8 8 8 8 40 25 25 23 2 23 2 74 77 14 6 65 40 42 2 40 40 43 43 3 1 40 43 30 39 16 30 22 9 24 24 24 12 7 6 12 2 12 12 12 12 24 4 14 1 6 27 5 14 11 14 11 27 1 26 7 2 13 6 7 17 2 15 26 26 3 5 5 5 2 3 12 1 1 1 2 7 7 2 9 29 29 23 29 25 25 25 25 25 8 15 15 15 4 21 18 21 21 23 21 2 3 3 30 12 21 17 13 13 25 5 28 8 23 45 2 44 30 1 5 9 3 44 44 7 8 8 14 2 12 11 5 8 7 6 1 10 10 10 1 4 4 3 1 1 1 10 5 5 1 7 98 228 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 // SPDX-License-Identifier: GPL-2.0 #ifndef NO_BCACHEFS_FS #include "bcachefs.h" #include "alloc_foreground.h" #include "bkey_buf.h" #include "btree_update.h" #include "buckets.h" #include "clock.h" #include "error.h" #include "extents.h" #include "extent_update.h" #include "fs.h" #include "fs-io.h" #include "fs-io-buffered.h" #include "fs-io-pagecache.h" #include "fsck.h" #include "inode.h" #include "journal.h" #include "io_misc.h" #include "keylist.h" #include "quota.h" #include "reflink.h" #include "trace.h" #include <linux/aio.h> #include <linux/backing-dev.h> #include <linux/falloc.h> #include <linux/migrate.h> #include <linux/mmu_context.h> #include <linux/pagevec.h> #include <linux/rmap.h> #include <linux/sched/signal.h> #include <linux/task_io_accounting_ops.h> #include <linux/uio.h> #include <trace/events/writeback.h> struct nocow_flush { struct closure *cl; struct bch_dev *ca; struct bio bio; }; static void nocow_flush_endio(struct bio *_bio) { struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); closure_put(bio->cl); percpu_ref_put(&bio->ca->io_ref); bio_put(&bio->bio); } void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, struct bch_inode_info *inode, struct closure *cl) { struct nocow_flush *bio; struct bch_dev *ca; struct bch_devs_mask devs; unsigned dev; dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX); if (dev == BCH_SB_MEMBERS_MAX) return; devs = inode->ei_devs_need_flush; memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { rcu_read_lock(); ca = rcu_dereference(c->devs[dev]); if (ca && !percpu_ref_tryget(&ca->io_ref)) ca = NULL; rcu_read_unlock(); if (!ca) continue; bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0, REQ_OP_WRITE|REQ_PREFLUSH, GFP_KERNEL, &c->nocow_flush_bioset), struct nocow_flush, bio); bio->cl = cl; bio->ca = ca; bio->bio.bi_end_io = nocow_flush_endio; closure_bio_submit(&bio->bio, cl); } } static int bch2_inode_flush_nocow_writes(struct bch_fs *c, struct bch_inode_info *inode) { struct closure cl; closure_init_stack(&cl); bch2_inode_flush_nocow_writes_async(c, inode, &cl); closure_sync(&cl); return 0; } /* i_size updates: */ struct inode_new_size { loff_t new_size; u64 now; unsigned fields; }; static int inode_set_size(struct btree_trans *trans, struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { struct inode_new_size *s = p; bi->bi_size = s->new_size; if (s->fields & ATTR_ATIME) bi->bi_atime = s->now; if (s->fields & ATTR_MTIME) bi->bi_mtime = s->now; if (s->fields & ATTR_CTIME) bi->bi_ctime = s->now; return 0; } int __must_check bch2_write_inode_size(struct bch_fs *c, struct bch_inode_info *inode, loff_t new_size, unsigned fields) { struct inode_new_size s = { .new_size = new_size, .now = bch2_current_time(c), .fields = fields, }; return bch2_write_inode(c, inode, inode_set_size, &s, fields); } void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, struct quota_res *quota_res, s64 sectors) { bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", inode->v.i_ino, (u64) inode->v.i_blocks, sectors, inode->ei_inode.bi_sectors); inode->v.i_blocks += sectors; #ifdef CONFIG_BCACHEFS_QUOTA if (quota_res && !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) && sectors > 0) { BUG_ON(sectors > quota_res->sectors); BUG_ON(sectors > inode->ei_quota_reserved); quota_res->sectors -= sectors; inode->ei_quota_reserved -= sectors; } else { bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); } #endif } /* fsync: */ static int bch2_get_inode_journal_seq_trans(struct btree_trans *trans, subvol_inum inum, u64 *seq) { struct printbuf buf = PRINTBUF; struct bch_inode_unpacked u; struct btree_iter iter; int ret = bch2_inode_peek(trans, &iter, &u, inum, 0); if (ret) return ret; u64 cur_seq = journal_cur_seq(&trans->c->journal); *seq = min(cur_seq, u.bi_journal_seq); if (fsck_err_on(u.bi_journal_seq > cur_seq, trans, inode_journal_seq_in_future, "inode journal seq in future (currently at %llu)\n%s", cur_seq, (bch2_inode_unpacked_to_text(&buf, &u), buf.buf))) { u.bi_journal_seq = cur_seq; ret = bch2_inode_write(trans, &iter, &u); } fsck_err: bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); return ret; } /* * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an * insert trigger: look up the btree inode instead */ static int bch2_flush_inode(struct bch_fs *c, struct bch_inode_info *inode) { if (c->opts.journal_flush_disabled) return 0; if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) return -EROFS; u64 seq; int ret = bch2_trans_commit_do(c, NULL, NULL, 0, bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?: bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?: bch2_inode_flush_nocow_writes(c, inode); bch2_write_ref_put(c, BCH_WRITE_REF_fsync); return ret; } int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; int ret, err; trace_bch2_fsync(file, datasync); ret = file_write_and_wait_range(file, start, end); if (ret) goto out; ret = sync_inode_metadata(&inode->v, 1); if (ret) goto out; ret = bch2_flush_inode(c, inode); out: ret = bch2_err_class(ret); if (ret == -EROFS) ret = -EIO; err = file_check_and_advance_wb_err(file); if (!ret) ret = err; return ret; } /* truncate: */ static inline int range_has_data(struct bch_fs *c, u32 subvol, struct bpos start, struct bpos end) { return bch2_trans_run(c, for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, start, end, subvol, 0, k, ({ bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k); }))); } static int __bch2_truncate_folio(struct bch_inode_info *inode, pgoff_t index, loff_t start, loff_t end) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; struct bch_folio *s; unsigned start_offset; unsigned end_offset; unsigned i; struct folio *folio; s64 i_sectors_delta = 0; int ret = 0; u64 end_pos; folio = filemap_lock_folio(mapping, index); if (IS_ERR_OR_NULL(folio)) { /* * XXX: we're doing two index lookups when we end up reading the * folio */ ret = range_has_data(c, inode->ei_inum.subvol, POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)), POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS)); if (ret <= 0) return ret; folio = __filemap_get_folio(mapping, index, FGP_LOCK|FGP_CREAT, GFP_KERNEL); if (IS_ERR(folio)) { ret = -ENOMEM; goto out; } } BUG_ON(start >= folio_end_pos(folio)); BUG_ON(end <= folio_pos(folio)); start_offset = max(start, folio_pos(folio)) - folio_pos(folio); end_offset = min_t(u64, end, folio_end_pos(folio)) - folio_pos(folio); /* Folio boundary? Nothing to do */ if (start_offset == 0 && end_offset == folio_size(folio)) { ret = 0; goto unlock; } s = bch2_folio_create(folio, 0); if (!s) { ret = -ENOMEM; goto unlock; } if (!folio_test_uptodate(folio)) { ret = bch2_read_single_folio(folio, mapping); if (ret) goto unlock; } ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); if (ret) goto unlock; for (i = round_up(start_offset, block_bytes(c)) >> 9; i < round_down(end_offset, block_bytes(c)) >> 9; i++) { s->s[i].nr_replicas = 0; i_sectors_delta -= s->s[i].state == SECTOR_dirty; bch2_folio_sector_set(folio, s, i, SECTOR_unallocated); } bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); /* * Caller needs to know whether this folio will be written out by * writeback - doing an i_size update if necessary - or whether it will * be responsible for the i_size update. * * Note that we shouldn't ever see a folio beyond EOF, but check and * warn if so. This has been observed by failure to clean up folios * after a short write and there's still a chance reclaim will fix * things up. */ WARN_ON_ONCE(folio_pos(folio) >= inode->v.i_size); end_pos = folio_end_pos(folio); if (inode->v.i_size > folio_pos(folio)) end_pos = min_t(u64, inode->v.i_size, end_pos); ret = s->s[folio_pos_to_s(folio, end_pos - 1)].state >= SECTOR_dirty; folio_zero_segment(folio, start_offset, end_offset); /* * Bit of a hack - we don't want truncate to fail due to -ENOSPC. * * XXX: because we aren't currently tracking whether the folio has actual * data in it (vs. just 0s, or only partially written) this wrong. ick. */ BUG_ON(bch2_get_folio_disk_reservation(c, inode, folio, false)); /* * This removes any writeable userspace mappings; we need to force * .page_mkwrite to be called again before any mmapped writes, to * redirty the full page: */ folio_mkclean(folio); filemap_dirty_folio(mapping, folio); unlock: folio_unlock(folio); folio_put(folio); out: return ret; } static int bch2_truncate_folio(struct bch_inode_info *inode, loff_t from) { return __bch2_truncate_folio(inode, from >> PAGE_SHIFT, from, ANYSINT_MAX(loff_t)); } static int bch2_truncate_folios(struct bch_inode_info *inode, loff_t start, loff_t end) { int ret = __bch2_truncate_folio(inode, start >> PAGE_SHIFT, start, end); if (ret >= 0 && start >> PAGE_SHIFT != end >> PAGE_SHIFT) ret = __bch2_truncate_folio(inode, (end - 1) >> PAGE_SHIFT, start, end); return ret; } static int bch2_extend(struct mnt_idmap *idmap, struct bch_inode_info *inode, struct bch_inode_unpacked *inode_u, struct iattr *iattr) { struct address_space *mapping = inode->v.i_mapping; int ret; /* * sync appends: * * this has to be done _before_ extending i_size: */ ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX); if (ret) return ret; truncate_setsize(&inode->v, iattr->ia_size); return bch2_setattr_nonsize(idmap, inode, iattr); } int bchfs_truncate(struct mnt_idmap *idmap, struct bch_inode_info *inode, struct iattr *iattr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; struct bch_inode_unpacked inode_u; s64 i_sectors_delta = 0; int ret = 0; /* * If the truncate call with change the size of the file, the * cmtimes should be updated. If the size will not change, we * do not need to update the cmtimes. */ if (iattr->ia_size != inode->v.i_size) { if (!(iattr->ia_valid & ATTR_MTIME)) ktime_get_coarse_real_ts64(&iattr->ia_mtime); if (!(iattr->ia_valid & ATTR_CTIME)) ktime_get_coarse_real_ts64(&iattr->ia_ctime); iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME; } inode_dio_wait(&inode->v); bch2_pagecache_block_get(inode); ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u); if (ret) goto err; /* * check this before next assertion; on filesystem error our normal * invariants are a bit broken (truncate has to truncate the page cache * before the inode). */ ret = bch2_journal_error(&c->journal); if (ret) goto err; WARN_ONCE(!test_bit(EI_INODE_ERROR, &inode->ei_flags) && inode->v.i_size < inode_u.bi_size, "truncate spotted in mem i_size < btree i_size: %llu < %llu\n", (u64) inode->v.i_size, inode_u.bi_size); if (iattr->ia_size > inode->v.i_size) { ret = bch2_extend(idmap, inode, &inode_u, iattr); goto err; } iattr->ia_valid &= ~ATTR_SIZE; ret = bch2_truncate_folio(inode, iattr->ia_size); if (unlikely(ret < 0)) goto err; ret = 0; truncate_setsize(&inode->v, iattr->ia_size); /* * When extending, we're going to write the new i_size to disk * immediately so we need to flush anything above the current on disk * i_size first: * * Also, when extending we need to flush the page that i_size currently * straddles - if it's mapped to userspace, we need to ensure that * userspace has to redirty it and call .mkwrite -> set_page_dirty * again to allocate the part of the page that was extended. */ if (iattr->ia_size > inode_u.bi_size) ret = filemap_write_and_wait_range(mapping, inode_u.bi_size, iattr->ia_size - 1); else if (iattr->ia_size & (PAGE_SIZE - 1)) ret = filemap_write_and_wait_range(mapping, round_down(iattr->ia_size, PAGE_SIZE), iattr->ia_size - 1); if (ret) goto err; ret = bch2_truncate(c, inode_inum(inode), iattr->ia_size, &i_sectors_delta); bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); if (unlikely(ret)) { /* * If we error here, VFS caches are now inconsistent with btree */ set_bit(EI_INODE_ERROR, &inode->ei_flags); goto err; } bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks && !bch2_journal_error(&c->journal), c, "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)", inode->v.i_ino, (u64) inode->v.i_blocks, inode->ei_inode.bi_sectors); ret = bch2_setattr_nonsize(idmap, inode, iattr); err: bch2_pagecache_block_put(inode); return bch2_err_class(ret); } /* fallocate: */ static int inode_update_times_fn(struct btree_trans *trans, struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { struct bch_fs *c = inode->v.i_sb->s_fs_info; bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); return 0; } static noinline long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; u64 end = offset + len; u64 block_start = round_up(offset, block_bytes(c)); u64 block_end = round_down(end, block_bytes(c)); bool truncated_last_page; int ret = 0; ret = bch2_truncate_folios(inode, offset, end); if (unlikely(ret < 0)) goto err; truncated_last_page = ret; truncate_pagecache_range(&inode->v, offset, end - 1); if (block_start < block_end) { s64 i_sectors_delta = 0; ret = bch2_fpunch(c, inode_inum(inode), block_start >> 9, block_end >> 9, &i_sectors_delta); bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); } mutex_lock(&inode->ei_update_lock); if (end >= inode->v.i_size && !truncated_last_page) { ret = bch2_write_inode_size(c, inode, inode->v.i_size, ATTR_MTIME|ATTR_CTIME); } else { ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, ATTR_MTIME|ATTR_CTIME); } mutex_unlock(&inode->ei_update_lock); err: return ret; } static noinline long bchfs_fcollapse_finsert(struct bch_inode_info *inode, loff_t offset, loff_t len, bool insert) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; s64 i_sectors_delta = 0; int ret = 0; if ((offset | len) & (block_bytes(c) - 1)) return -EINVAL; if (insert) { if (offset >= inode->v.i_size) return -EINVAL; } else { if (offset + len >= inode->v.i_size) return -EINVAL; } ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); if (ret) return ret; if (insert) i_size_write(&inode->v, inode->v.i_size + len); ret = bch2_fcollapse_finsert(c, inode_inum(inode), offset >> 9, len >> 9, insert, &i_sectors_delta); if (!ret && !insert) i_size_write(&inode->v, inode->v.i_size - len); bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); return ret; } static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, u64 start_sector, u64 end_sector) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bpos end_pos = POS(inode->v.i_ino, end_sector); struct bch_io_opts opts; int ret = 0; bch2_inode_opts_get(&opts, c, &inode->ei_inode); bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, POS(inode->v.i_ino, start_sector), BTREE_ITER_slots|BTREE_ITER_intent); while (!ret) { s64 i_sectors_delta = 0; struct quota_res quota_res = { 0 }; struct bkey_s_c k; unsigned sectors; bool is_allocation; u64 hole_start, hole_end; u32 snapshot; bch2_trans_begin(trans); if (bkey_ge(iter.pos, end_pos)) break; ret = bch2_subvolume_get_snapshot(trans, inode->ei_inum.subvol, &snapshot); if (ret) goto bkey_err; bch2_btree_iter_set_snapshot(&iter, snapshot); k = bch2_btree_iter_peek_slot(&iter); if ((ret = bkey_err(k))) goto bkey_err; hole_start = iter.pos.offset; hole_end = bpos_min(k.k->p, end_pos).offset; is_allocation = bkey_extent_is_allocation(k.k); /* already reserved */ if (bkey_extent_is_reservation(k) && bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) { bch2_btree_iter_advance(&iter); continue; } if (bkey_extent_is_data(k.k) && !(mode & FALLOC_FL_ZERO_RANGE)) { bch2_btree_iter_advance(&iter); continue; } if (!(mode & FALLOC_FL_ZERO_RANGE)) { /* * Lock ordering - can't be holding btree locks while * blocking on a folio lock: */ if (bch2_clamp_data_hole(&inode->v, &hole_start, &hole_end, opts.data_replicas, true)) { ret = drop_locks_do(trans, (bch2_clamp_data_hole(&inode->v, &hole_start, &hole_end, opts.data_replicas, false), 0)); if (ret) goto bkey_err; } bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start)); if (ret) goto bkey_err; if (hole_start == hole_end) continue; } sectors = hole_end - hole_start; if (!is_allocation) { ret = bch2_quota_reservation_add(c, inode, &quota_res, sectors, true); if (unlikely(ret)) goto bkey_err; } ret = bch2_extent_fallocate(trans, inode_inum(inode), &iter, sectors, opts, &i_sectors_delta, writepoint_hashed((unsigned long) current)); if (ret) goto bkey_err; bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta); if (bch2_mark_pagecache_reserved(inode, &hole_start, iter.pos.offset, true)) { ret = drop_locks_do(trans, bch2_mark_pagecache_reserved(inode, &hole_start, iter.pos.offset, false)); if (ret) goto bkey_err; } bkey_err: bch2_quota_reservation_put(c, inode, &quota_res); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ret = 0; } if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) { struct quota_res quota_res = { 0 }; s64 i_sectors_delta = 0; bch2_fpunch_at(trans, &iter, inode_inum(inode), end_sector, &i_sectors_delta); bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta); bch2_quota_reservation_put(c, inode, &quota_res); } bch2_trans_iter_exit(trans, &iter); bch2_trans_put(trans); return ret; } static noinline long bchfs_fallocate(struct bch_inode_info *inode, int mode, loff_t offset, loff_t len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; u64 end = offset + len; u64 block_start = round_down(offset, block_bytes(c)); u64 block_end = round_up(end, block_bytes(c)); bool truncated_last_page = false; int ret, ret2 = 0; if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { ret = inode_newsize_ok(&inode->v, end); if (ret) return ret; } if (mode & FALLOC_FL_ZERO_RANGE) { ret = bch2_truncate_folios(inode, offset, end); if (unlikely(ret < 0)) return ret; truncated_last_page = ret; truncate_pagecache_range(&inode->v, offset, end - 1); block_start = round_up(offset, block_bytes(c)); block_end = round_down(end, block_bytes(c)); } ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9); /* * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update, * so that the VFS cache i_size is consistent with the btree i_size: */ if (ret && !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE))) return ret; if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size) end = inode->v.i_size; if (end >= inode->v.i_size && (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) || !(mode & FALLOC_FL_KEEP_SIZE))) { spin_lock(&inode->v.i_lock); i_size_write(&inode->v, end); spin_unlock(&inode->v.i_lock); mutex_lock(&inode->ei_update_lock); ret2 = bch2_write_inode_size(c, inode, end, 0); mutex_unlock(&inode->ei_update_lock); } return ret ?: ret2; } long bch2_fallocate_dispatch(struct file *file, int mode, loff_t offset, loff_t len) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; long ret; if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate)) return -EROFS; inode_lock(&inode->v); inode_dio_wait(&inode->v); bch2_pagecache_block_get(inode); ret = file_modified(file); if (ret) goto err; if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) ret = bchfs_fallocate(inode, mode, offset, len); else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) ret = bchfs_fpunch(inode, offset, len); else if (mode == FALLOC_FL_INSERT_RANGE) ret = bchfs_fcollapse_finsert(inode, offset, len, true); else if (mode == FALLOC_FL_COLLAPSE_RANGE) ret = bchfs_fcollapse_finsert(inode, offset, len, false); else ret = -EOPNOTSUPP; err: bch2_pagecache_block_put(inode); inode_unlock(&inode->v); bch2_write_ref_put(c, BCH_WRITE_REF_fallocate); return bch2_err_class(ret); } /* * Take a quota reservation for unallocated blocks in a given file range * Does not check pagecache */ static int quota_reserve_range(struct bch_inode_info *inode, struct quota_res *res, u64 start, u64 end) { struct bch_fs *c = inode->v.i_sb->s_fs_info; u64 sectors = end - start; int ret = bch2_trans_run(c, for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, POS(inode->v.i_ino, start), POS(inode->v.i_ino, end - 1), inode->ei_inum.subvol, 0, k, ({ if (bkey_extent_is_allocation(k.k)) { u64 s = min(end, k.k->p.offset) - max(start, bkey_start_offset(k.k)); BUG_ON(s > sectors); sectors -= s; } 0; }))); return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true); } loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, struct file *file_dst, loff_t pos_dst, loff_t len, unsigned remap_flags) { struct bch_inode_info *src = file_bch_inode(file_src); struct bch_inode_info *dst = file_bch_inode(file_dst); struct bch_fs *c = src->v.i_sb->s_fs_info; struct quota_res quota_res = { 0 }; s64 i_sectors_delta = 0; u64 aligned_len; loff_t ret = 0; if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) return -EINVAL; if ((pos_src & (block_bytes(c) - 1)) || (pos_dst & (block_bytes(c) - 1))) return -EINVAL; if (src == dst && abs(pos_src - pos_dst) < len) return -EINVAL; lock_two_nondirectories(&src->v, &dst->v); bch2_lock_inodes(INODE_PAGECACHE_BLOCK, src, dst); inode_dio_wait(&src->v); inode_dio_wait(&dst->v); ret = generic_remap_file_range_prep(file_src, pos_src, file_dst, pos_dst, &len, remap_flags); if (ret < 0 || len == 0) goto err; aligned_len = round_up((u64) len, block_bytes(c)); ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping, pos_dst, pos_dst + len - 1); if (ret) goto err; ret = quota_reserve_range(dst, &quota_res, pos_dst >> 9, (pos_dst + aligned_len) >> 9); if (ret) goto err; if (!(remap_flags & REMAP_FILE_DEDUP)) file_update_time(file_dst); bch2_mark_pagecache_unallocated(src, pos_src >> 9, (pos_src + aligned_len) >> 9); /* * XXX: we'd like to be telling bch2_remap_range() if we have * permission to write to the source file, and thus if io path option * changes should be propagated through the copy, but we need mnt_idmap * from the pathwalk, awkward */ ret = bch2_remap_range(c, inode_inum(dst), pos_dst >> 9, inode_inum(src), pos_src >> 9, aligned_len >> 9, pos_dst + len, &i_sectors_delta, false); if (ret < 0) goto err; /* * due to alignment, we might have remapped slightly more than requsted */ ret = min((u64) ret << 9, (u64) len); bch2_i_sectors_acct(c, dst, &quota_res, i_sectors_delta); spin_lock(&dst->v.i_lock); if (pos_dst + ret > dst->v.i_size) i_size_write(&dst->v, pos_dst + ret); spin_unlock(&dst->v.i_lock); if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || IS_SYNC(file_inode(file_dst))) ret = bch2_flush_inode(c, dst); err: bch2_quota_reservation_put(c, dst, &quota_res); bch2_unlock_inodes(INODE_PAGECACHE_BLOCK, src, dst); unlock_two_nondirectories(&src->v, &dst->v); return bch2_err_class(ret); } /* fseek: */ static loff_t bch2_seek_data(struct file *file, u64 offset) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; subvol_inum inum = inode_inum(inode); u64 isize, next_data = MAX_LFS_FILESIZE; isize = i_size_read(&inode->v); if (offset >= isize) return -ENXIO; int ret = bch2_trans_run(c, for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, POS(inode->v.i_ino, offset >> 9), POS(inode->v.i_ino, U64_MAX), inum.subvol, 0, k, ({ if (bkey_extent_is_data(k.k)) { next_data = max(offset, bkey_start_offset(k.k) << 9); break; } else if (k.k->p.offset >> 9 > isize) break; 0; }))); if (ret) return ret; if (next_data > offset) next_data = bch2_seek_pagecache_data(&inode->v, offset, next_data, 0, false); if (next_data >= isize) return -ENXIO; return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); } static loff_t bch2_seek_hole(struct file *file, u64 offset) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; subvol_inum inum = inode_inum(inode); u64 isize, next_hole = MAX_LFS_FILESIZE; isize = i_size_read(&inode->v); if (offset >= isize) return -ENXIO; int ret = bch2_trans_run(c, for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, POS(inode->v.i_ino, offset >> 9), POS(inode->v.i_ino, U64_MAX), inum.subvol, BTREE_ITER_slots, k, ({ if (k.k->p.inode != inode->v.i_ino) { next_hole = bch2_seek_pagecache_hole(&inode->v, offset, MAX_LFS_FILESIZE, 0, false); break; } else if (!bkey_extent_is_data(k.k)) { next_hole = bch2_seek_pagecache_hole(&inode->v, max(offset, bkey_start_offset(k.k) << 9), k.k->p.offset << 9, 0, false); if (next_hole < k.k->p.offset << 9) break; } else { offset = max(offset, bkey_start_offset(k.k) << 9); } 0; }))); if (ret) return ret; if (next_hole > isize) next_hole = isize; return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE); } loff_t bch2_llseek(struct file *file, loff_t offset, int whence) { loff_t ret; switch (whence) { case SEEK_SET: case SEEK_CUR: case SEEK_END: ret = generic_file_llseek(file, offset, whence); break; case SEEK_DATA: ret = bch2_seek_data(file, offset); break; case SEEK_HOLE: ret = bch2_seek_hole(file, offset); break; default: ret = -EINVAL; break; } return bch2_err_class(ret); } void bch2_fs_fsio_exit(struct bch_fs *c) { bioset_exit(&c->nocow_flush_bioset); } int bch2_fs_fsio_init(struct bch_fs *c) { if (bioset_init(&c->nocow_flush_bioset, 1, offsetof(struct nocow_flush, bio), 0)) return -BCH_ERR_ENOMEM_nocow_flush_bioset_init; return 0; } #endif /* NO_BCACHEFS_FS */
1 1 1 1 1 1 1 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 // SPDX-License-Identifier: GPL-2.0-or-later /* * PTP 1588 clock support - character device implementation. * * Copyright (C) 2010 OMICRON electronics GmbH */ #include <linux/compat.h> #include <linux/module.h> #include <linux/posix-clock.h> #include <linux/poll.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/timekeeping.h> #include <linux/debugfs.h> #include <linux/nospec.h> #include "ptp_private.h" static int ptp_disable_pinfunc(struct ptp_clock_info *ops, enum ptp_pin_function func, unsigned int chan) { struct ptp_clock_request rq; int err = 0; memset(&rq, 0, sizeof(rq)); switch (func) { case PTP_PF_NONE: break; case PTP_PF_EXTTS: rq.type = PTP_CLK_REQ_EXTTS; rq.extts.index = chan; err = ops->enable(ops, &rq, 0); break; case PTP_PF_PEROUT: rq.type = PTP_CLK_REQ_PEROUT; rq.perout.index = chan; err = ops->enable(ops, &rq, 0); break; case PTP_PF_PHYSYNC: break; default: return -EINVAL; } return err; } int ptp_set_pinfunc(struct ptp_clock *ptp, unsigned int pin, enum ptp_pin_function func, unsigned int chan) { struct ptp_clock_info *info = ptp->info; struct ptp_pin_desc *pin1 = NULL, *pin2 = &info->pin_config[pin]; unsigned int i; /* Check to see if any other pin previously had this function. */ for (i = 0; i < info->n_pins; i++) { if (info->pin_config[i].func == func && info->pin_config[i].chan == chan) { pin1 = &info->pin_config[i]; break; } } if (pin1 && i == pin) return 0; /* Check the desired function and channel. */ switch (func) { case PTP_PF_NONE: break; case PTP_PF_EXTTS: if (chan >= info->n_ext_ts) return -EINVAL; break; case PTP_PF_PEROUT: if (chan >= info->n_per_out) return -EINVAL; break; case PTP_PF_PHYSYNC: if (chan != 0) return -EINVAL; break; default: return -EINVAL; } if (info->verify(info, pin, func, chan)) { pr_err("driver cannot use function %u and channel %u on pin %u\n", func, chan, pin); return -EOPNOTSUPP; } /* Disable whatever function was previously assigned. */ if (pin1) { ptp_disable_pinfunc(info, func, chan); pin1->func = PTP_PF_NONE; pin1->chan = 0; } ptp_disable_pinfunc(info, pin2->func, pin2->chan); pin2->func = func; pin2->chan = chan; return 0; } int ptp_open(struct posix_clock_context *pccontext, fmode_t fmode) { struct ptp_clock *ptp = container_of(pccontext->clk, struct ptp_clock, clock); struct timestamp_event_queue *queue; char debugfsname[32]; unsigned long flags; queue = kzalloc(sizeof(*queue), GFP_KERNEL); if (!queue) return -EINVAL; queue->mask = bitmap_alloc(PTP_MAX_CHANNELS, GFP_KERNEL); if (!queue->mask) { kfree(queue); return -EINVAL; } bitmap_set(queue->mask, 0, PTP_MAX_CHANNELS); spin_lock_init(&queue->lock); spin_lock_irqsave(&ptp->tsevqs_lock, flags); list_add_tail(&queue->qlist, &ptp->tsevqs); spin_unlock_irqrestore(&ptp->tsevqs_lock, flags); pccontext->private_clkdata = queue; /* Debugfs contents */ sprintf(debugfsname, "0x%p", queue); queue->debugfs_instance = debugfs_create_dir(debugfsname, ptp->debugfs_root); queue->dfs_bitmap.array = (u32 *)queue->mask; queue->dfs_bitmap.n_elements = DIV_ROUND_UP(PTP_MAX_CHANNELS, BITS_PER_BYTE * sizeof(u32)); debugfs_create_u32_array("mask", 0444, queue->debugfs_instance, &queue->dfs_bitmap); return 0; } int ptp_release(struct posix_clock_context *pccontext) { struct timestamp_event_queue *queue = pccontext->private_clkdata; unsigned long flags; struct ptp_clock *ptp = container_of(pccontext->clk, struct ptp_clock, clock); debugfs_remove(queue->debugfs_instance); pccontext->private_clkdata = NULL; spin_lock_irqsave(&ptp->tsevqs_lock, flags); list_del(&queue->qlist); spin_unlock_irqrestore(&ptp->tsevqs_lock, flags); bitmap_free(queue->mask); kfree(queue); return 0; } long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, unsigned long arg) { struct ptp_clock *ptp = container_of(pccontext->clk, struct ptp_clock, clock); struct ptp_sys_offset_extended *extoff = NULL; struct ptp_sys_offset_precise precise_offset; struct system_device_crosststamp xtstamp; struct ptp_clock_info *ops = ptp->info; struct ptp_sys_offset *sysoff = NULL; struct timestamp_event_queue *tsevq; struct ptp_system_timestamp sts; struct ptp_clock_request req; struct ptp_clock_caps caps; struct ptp_clock_time *pct; unsigned int i, pin_index; struct ptp_pin_desc pd; struct timespec64 ts; int enable, err = 0; if (in_compat_syscall() && cmd != PTP_ENABLE_PPS && cmd != PTP_ENABLE_PPS2) arg = (unsigned long)compat_ptr(arg); tsevq = pccontext->private_clkdata; switch (cmd) { case PTP_CLOCK_GETCAPS: case PTP_CLOCK_GETCAPS2: memset(&caps, 0, sizeof(caps)); caps.max_adj = ptp->info->max_adj; caps.n_alarm = ptp->info->n_alarm; caps.n_ext_ts = ptp->info->n_ext_ts; caps.n_per_out = ptp->info->n_per_out; caps.pps = ptp->info->pps; caps.n_pins = ptp->info->n_pins; caps.cross_timestamping = ptp->info->getcrosststamp != NULL; caps.adjust_phase = ptp->info->adjphase != NULL && ptp->info->getmaxphase != NULL; if (caps.adjust_phase) caps.max_phase_adj = ptp->info->getmaxphase(ptp->info); if (copy_to_user((void __user *)arg, &caps, sizeof(caps))) err = -EFAULT; break; case PTP_EXTTS_REQUEST: case PTP_EXTTS_REQUEST2: memset(&req, 0, sizeof(req)); if (copy_from_user(&req.extts, (void __user *)arg, sizeof(req.extts))) { err = -EFAULT; break; } if (cmd == PTP_EXTTS_REQUEST2) { /* Tell the drivers to check the flags carefully. */ req.extts.flags |= PTP_STRICT_FLAGS; /* Make sure no reserved bit is set. */ if ((req.extts.flags & ~PTP_EXTTS_VALID_FLAGS) || req.extts.rsv[0] || req.extts.rsv[1]) { err = -EINVAL; break; } /* Ensure one of the rising/falling edge bits is set. */ if ((req.extts.flags & PTP_ENABLE_FEATURE) && (req.extts.flags & PTP_EXTTS_EDGES) == 0) { err = -EINVAL; break; } } else if (cmd == PTP_EXTTS_REQUEST) { req.extts.flags &= PTP_EXTTS_V1_VALID_FLAGS; req.extts.rsv[0] = 0; req.extts.rsv[1] = 0; } if (req.extts.index >= ops->n_ext_ts) { err = -EINVAL; break; } req.type = PTP_CLK_REQ_EXTTS; enable = req.extts.flags & PTP_ENABLE_FEATURE ? 1 : 0; if (mutex_lock_interruptible(&ptp->pincfg_mux)) return -ERESTARTSYS; err = ops->enable(ops, &req, enable); mutex_unlock(&ptp->pincfg_mux); break; case PTP_PEROUT_REQUEST: case PTP_PEROUT_REQUEST2: memset(&req, 0, sizeof(req)); if (copy_from_user(&req.perout, (void __user *)arg, sizeof(req.perout))) { err = -EFAULT; break; } if (cmd == PTP_PEROUT_REQUEST2) { struct ptp_perout_request *perout = &req.perout; if (perout->flags & ~PTP_PEROUT_VALID_FLAGS) { err = -EINVAL; break; } /* * The "on" field has undefined meaning if * PTP_PEROUT_DUTY_CYCLE isn't set, we must still treat * it as reserved, which must be set to zero. */ if (!(perout->flags & PTP_PEROUT_DUTY_CYCLE) && (perout->rsv[0] || perout->rsv[1] || perout->rsv[2] || perout->rsv[3])) { err = -EINVAL; break; } if (perout->flags & PTP_PEROUT_DUTY_CYCLE) { /* The duty cycle must be subunitary. */ if (perout->on.sec > perout->period.sec || (perout->on.sec == perout->period.sec && perout->on.nsec > perout->period.nsec)) { err = -ERANGE; break; } } if (perout->flags & PTP_PEROUT_PHASE) { /* * The phase should be specified modulo the * period, therefore anything equal or larger * than 1 period is invalid. */ if (perout->phase.sec > perout->period.sec || (perout->phase.sec == perout->period.sec && perout->phase.nsec >= perout->period.nsec)) { err = -ERANGE; break; } } } else if (cmd == PTP_PEROUT_REQUEST) { req.perout.flags &= PTP_PEROUT_V1_VALID_FLAGS; req.perout.rsv[0] = 0; req.perout.rsv[1] = 0; req.perout.rsv[2] = 0; req.perout.rsv[3] = 0; } if (req.perout.index >= ops->n_per_out) { err = -EINVAL; break; } req.type = PTP_CLK_REQ_PEROUT; enable = req.perout.period.sec || req.perout.period.nsec; if (mutex_lock_interruptible(&ptp->pincfg_mux)) return -ERESTARTSYS; err = ops->enable(ops, &req, enable); mutex_unlock(&ptp->pincfg_mux); break; case PTP_ENABLE_PPS: case PTP_ENABLE_PPS2: memset(&req, 0, sizeof(req)); if (!capable(CAP_SYS_TIME)) return -EPERM; req.type = PTP_CLK_REQ_PPS; enable = arg ? 1 : 0; if (mutex_lock_interruptible(&ptp->pincfg_mux)) return -ERESTARTSYS; err = ops->enable(ops, &req, enable); mutex_unlock(&ptp->pincfg_mux); break; case PTP_SYS_OFFSET_PRECISE: case PTP_SYS_OFFSET_PRECISE2: if (!ptp->info->getcrosststamp) { err = -EOPNOTSUPP; break; } err = ptp->info->getcrosststamp(ptp->info, &xtstamp); if (err) break; memset(&precise_offset, 0, sizeof(precise_offset)); ts = ktime_to_timespec64(xtstamp.device); precise_offset.device.sec = ts.tv_sec; precise_offset.device.nsec = ts.tv_nsec; ts = ktime_to_timespec64(xtstamp.sys_realtime); precise_offset.sys_realtime.sec = ts.tv_sec; precise_offset.sys_realtime.nsec = ts.tv_nsec; ts = ktime_to_timespec64(xtstamp.sys_monoraw); precise_offset.sys_monoraw.sec = ts.tv_sec; precise_offset.sys_monoraw.nsec = ts.tv_nsec; if (copy_to_user((void __user *)arg, &precise_offset, sizeof(precise_offset))) err = -EFAULT; break; case PTP_SYS_OFFSET_EXTENDED: case PTP_SYS_OFFSET_EXTENDED2: if (!ptp->info->gettimex64) { err = -EOPNOTSUPP; break; } extoff = memdup_user((void __user *)arg, sizeof(*extoff)); if (IS_ERR(extoff)) { err = PTR_ERR(extoff); extoff = NULL; break; } if (extoff->n_samples > PTP_MAX_SAMPLES || extoff->rsv[0] || extoff->rsv[1] || (extoff->clockid != CLOCK_REALTIME && extoff->clockid != CLOCK_MONOTONIC && extoff->clockid != CLOCK_MONOTONIC_RAW)) { err = -EINVAL; break; } sts.clockid = extoff->clockid; for (i = 0; i < extoff->n_samples; i++) { err = ptp->info->gettimex64(ptp->info, &ts, &sts); if (err) goto out; extoff->ts[i][0].sec = sts.pre_ts.tv_sec; extoff->ts[i][0].nsec = sts.pre_ts.tv_nsec; extoff->ts[i][1].sec = ts.tv_sec; extoff->ts[i][1].nsec = ts.tv_nsec; extoff->ts[i][2].sec = sts.post_ts.tv_sec; extoff->ts[i][2].nsec = sts.post_ts.tv_nsec; } if (copy_to_user((void __user *)arg, extoff, sizeof(*extoff))) err = -EFAULT; break; case PTP_SYS_OFFSET: case PTP_SYS_OFFSET2: sysoff = memdup_user((void __user *)arg, sizeof(*sysoff)); if (IS_ERR(sysoff)) { err = PTR_ERR(sysoff); sysoff = NULL; break; } if (sysoff->n_samples > PTP_MAX_SAMPLES) { err = -EINVAL; break; } pct = &sysoff->ts[0]; for (i = 0; i < sysoff->n_samples; i++) { ktime_get_real_ts64(&ts); pct->sec = ts.tv_sec; pct->nsec = ts.tv_nsec; pct++; if (ops->gettimex64) err = ops->gettimex64(ops, &ts, NULL); else err = ops->gettime64(ops, &ts); if (err) goto out; pct->sec = ts.tv_sec; pct->nsec = ts.tv_nsec; pct++; } ktime_get_real_ts64(&ts); pct->sec = ts.tv_sec; pct->nsec = ts.tv_nsec; if (copy_to_user((void __user *)arg, sysoff, sizeof(*sysoff))) err = -EFAULT; break; case PTP_PIN_GETFUNC: case PTP_PIN_GETFUNC2: if (copy_from_user(&pd, (void __user *)arg, sizeof(pd))) { err = -EFAULT; break; } if ((pd.rsv[0] || pd.rsv[1] || pd.rsv[2] || pd.rsv[3] || pd.rsv[4]) && cmd == PTP_PIN_GETFUNC2) { err = -EINVAL; break; } else if (cmd == PTP_PIN_GETFUNC) { pd.rsv[0] = 0; pd.rsv[1] = 0; pd.rsv[2] = 0; pd.rsv[3] = 0; pd.rsv[4] = 0; } pin_index = pd.index; if (pin_index >= ops->n_pins) { err = -EINVAL; break; } pin_index = array_index_nospec(pin_index, ops->n_pins); if (mutex_lock_interruptible(&ptp->pincfg_mux)) return -ERESTARTSYS; pd = ops->pin_config[pin_index]; mutex_unlock(&ptp->pincfg_mux); if (!err && copy_to_user((void __user *)arg, &pd, sizeof(pd))) err = -EFAULT; break; case PTP_PIN_SETFUNC: case PTP_PIN_SETFUNC2: if (copy_from_user(&pd, (void __user *)arg, sizeof(pd))) { err = -EFAULT; break; } if ((pd.rsv[0] || pd.rsv[1] || pd.rsv[2] || pd.rsv[3] || pd.rsv[4]) && cmd == PTP_PIN_SETFUNC2) { err = -EINVAL; break; } else if (cmd == PTP_PIN_SETFUNC) { pd.rsv[0] = 0; pd.rsv[1] = 0; pd.rsv[2] = 0; pd.rsv[3] = 0; pd.rsv[4] = 0; } pin_index = pd.index; if (pin_index >= ops->n_pins) { err = -EINVAL; break; } pin_index = array_index_nospec(pin_index, ops->n_pins); if (mutex_lock_interruptible(&ptp->pincfg_mux)) return -ERESTARTSYS; err = ptp_set_pinfunc(ptp, pin_index, pd.func, pd.chan); mutex_unlock(&ptp->pincfg_mux); break; case PTP_MASK_CLEAR_ALL: bitmap_clear(tsevq->mask, 0, PTP_MAX_CHANNELS); break; case PTP_MASK_EN_SINGLE: if (copy_from_user(&i, (void __user *)arg, sizeof(i))) { err = -EFAULT; break; } if (i >= PTP_MAX_CHANNELS) { err = -EFAULT; break; } set_bit(i, tsevq->mask); break; default: err = -ENOTTY; break; } out: kfree(extoff); kfree(sysoff); return err; } __poll_t ptp_poll(struct posix_clock_context *pccontext, struct file *fp, poll_table *wait) { struct ptp_clock *ptp = container_of(pccontext->clk, struct ptp_clock, clock); struct timestamp_event_queue *queue; queue = pccontext->private_clkdata; if (!queue) return EPOLLERR; poll_wait(fp, &ptp->tsev_wq, wait); return queue_cnt(queue) ? EPOLLIN : 0; } #define EXTTS_BUFSIZE (PTP_BUF_TIMESTAMPS * sizeof(struct ptp_extts_event)) ssize_t ptp_read(struct posix_clock_context *pccontext, uint rdflags, char __user *buf, size_t cnt) { struct ptp_clock *ptp = container_of(pccontext->clk, struct ptp_clock, clock); struct timestamp_event_queue *queue; struct ptp_extts_event *event; unsigned long flags; size_t qcnt, i; int result; queue = pccontext->private_clkdata; if (!queue) { result = -EINVAL; goto exit; } if (cnt % sizeof(struct ptp_extts_event) != 0) { result = -EINVAL; goto exit; } if (cnt > EXTTS_BUFSIZE) cnt = EXTTS_BUFSIZE; cnt = cnt / sizeof(struct ptp_extts_event); if (wait_event_interruptible(ptp->tsev_wq, ptp->defunct || queue_cnt(queue))) { return -ERESTARTSYS; } if (ptp->defunct) { result = -ENODEV; goto exit; } event = kmalloc(EXTTS_BUFSIZE, GFP_KERNEL); if (!event) { result = -ENOMEM; goto exit; } spin_lock_irqsave(&queue->lock, flags); qcnt = queue_cnt(queue); if (cnt > qcnt) cnt = qcnt; for (i = 0; i < cnt; i++) { event[i] = queue->buf[queue->head]; /* Paired with READ_ONCE() in queue_cnt() */ WRITE_ONCE(queue->head, (queue->head + 1) % PTP_MAX_TIMESTAMPS); } spin_unlock_irqrestore(&queue->lock, flags); cnt = cnt * sizeof(struct ptp_extts_event); result = cnt; if (copy_to_user(buf, event, cnt)) { result = -EFAULT; goto free_event; } free_event: kfree(event); exit: return result; }
1 1 1 1 1 1 1 1 1 3 3 1 1 1 2 2 2 2 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2009 Red Hat, Inc. * Author: Michael S. Tsirkin <mst@redhat.com> * * virtio-net server in host kernel. */ #include <linux/compat.h> #include <linux/eventfd.h> #include <linux/vhost.h> #include <linux/virtio_net.h> #include <linux/miscdevice.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/mutex.h> #include <linux/workqueue.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/sched/clock.h> #include <linux/sched/signal.h> #include <linux/vmalloc.h> #include <linux/net.h> #include <linux/if_packet.h> #include <linux/if_arp.h> #include <linux/if_tun.h> #include <linux/if_macvlan.h> #include <linux/if_tap.h> #include <linux/if_vlan.h> #include <linux/skb_array.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/xdp.h> #include "vhost.h" static int experimental_zcopytx = 0; module_param(experimental_zcopytx, int, 0444); MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;" " 1 -Enable; 0 - Disable"); /* Max number of bytes transferred before requeueing the job. * Using this limit prevents one virtqueue from starving others. */ #define VHOST_NET_WEIGHT 0x80000 /* Max number of packets transferred before requeueing the job. * Using this limit prevents one virtqueue from starving others with small * pkts. */ #define VHOST_NET_PKT_WEIGHT 256 /* MAX number of TX used buffers for outstanding zerocopy */ #define VHOST_MAX_PEND 128 #define VHOST_GOODCOPY_LEN 256 /* * For transmit, used buffer len is unused; we override it to track buffer * status internally; used for zerocopy tx only. */ /* Lower device DMA failed */ #define VHOST_DMA_FAILED_LEN ((__force __virtio32)3) /* Lower device DMA done */ #define VHOST_DMA_DONE_LEN ((__force __virtio32)2) /* Lower device DMA in progress */ #define VHOST_DMA_IN_PROGRESS ((__force __virtio32)1) /* Buffer unused */ #define VHOST_DMA_CLEAR_LEN ((__force __virtio32)0) #define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force u32)VHOST_DMA_DONE_LEN) enum { VHOST_NET_FEATURES = VHOST_FEATURES | (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) | (1ULL << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_ACCESS_PLATFORM) | (1ULL << VIRTIO_F_RING_RESET) }; enum { VHOST_NET_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2) }; enum { VHOST_NET_VQ_RX = 0, VHOST_NET_VQ_TX = 1, VHOST_NET_VQ_MAX = 2, }; struct vhost_net_ubuf_ref { /* refcount follows semantics similar to kref: * 0: object is released * 1: no outstanding ubufs * >1: outstanding ubufs */ atomic_t refcount; wait_queue_head_t wait; struct vhost_virtqueue *vq; }; #define VHOST_NET_BATCH 64 struct vhost_net_buf { void **queue; int tail; int head; }; struct vhost_net_virtqueue { struct vhost_virtqueue vq; size_t vhost_hlen; size_t sock_hlen; /* vhost zerocopy support fields below: */ /* last used idx for outstanding DMA zerocopy buffers */ int upend_idx; /* For TX, first used idx for DMA done zerocopy buffers * For RX, number of batched heads */ int done_idx; /* Number of XDP frames batched */ int batched_xdp; /* an array of userspace buffers info */ struct ubuf_info_msgzc *ubuf_info; /* Reference counting for outstanding ubufs. * Protected by vq mutex. Writers must also take device mutex. */ struct vhost_net_ubuf_ref *ubufs; struct ptr_ring *rx_ring; struct vhost_net_buf rxq; /* Batched XDP buffs */ struct xdp_buff *xdp; }; struct vhost_net { struct vhost_dev dev; struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX]; struct vhost_poll poll[VHOST_NET_VQ_MAX]; /* Number of TX recently submitted. * Protected by tx vq lock. */ unsigned tx_packets; /* Number of times zerocopy TX recently failed. * Protected by tx vq lock. */ unsigned tx_zcopy_err; /* Flush in progress. Protected by tx vq lock. */ bool tx_flush; /* Private page frag cache */ struct page_frag_cache pf_cache; }; static unsigned vhost_net_zcopy_mask __read_mostly; static void *vhost_net_buf_get_ptr(struct vhost_net_buf *rxq) { if (rxq->tail != rxq->head) return rxq->queue[rxq->head]; else return NULL; } static int vhost_net_buf_get_size(struct vhost_net_buf *rxq) { return rxq->tail - rxq->head; } static int vhost_net_buf_is_empty(struct vhost_net_buf *rxq) { return rxq->tail == rxq->head; } static void *vhost_net_buf_consume(struct vhost_net_buf *rxq) { void *ret = vhost_net_buf_get_ptr(rxq); ++rxq->head; return ret; } static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq) { struct vhost_net_buf *rxq = &nvq->rxq; rxq->head = 0; rxq->tail = ptr_ring_consume_batched(nvq->rx_ring, rxq->queue, VHOST_NET_BATCH); return rxq->tail; } static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq) { struct vhost_net_buf *rxq = &nvq->rxq; if (nvq->rx_ring && !vhost_net_buf_is_empty(rxq)) { ptr_ring_unconsume(nvq->rx_ring, rxq->queue + rxq->head, vhost_net_buf_get_size(rxq), tun_ptr_free); rxq->head = rxq->tail = 0; } } static int vhost_net_buf_peek_len(void *ptr) { if (tun_is_xdp_frame(ptr)) { struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); return xdpf->len; } return __skb_array_len_with_tag(ptr); } static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq) { struct vhost_net_buf *rxq = &nvq->rxq; if (!vhost_net_buf_is_empty(rxq)) goto out; if (!vhost_net_buf_produce(nvq)) return 0; out: return vhost_net_buf_peek_len(vhost_net_buf_get_ptr(rxq)); } static void vhost_net_buf_init(struct vhost_net_buf *rxq) { rxq->head = rxq->tail = 0; } static void vhost_net_enable_zcopy(int vq) { vhost_net_zcopy_mask |= 0x1 << vq; } static struct vhost_net_ubuf_ref * vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy) { struct vhost_net_ubuf_ref *ubufs; /* No zero copy backend? Nothing to count. */ if (!zcopy) return NULL; ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL); if (!ubufs) return ERR_PTR(-ENOMEM); atomic_set(&ubufs->refcount, 1); init_waitqueue_head(&ubufs->wait); ubufs->vq = vq; return ubufs; } static int vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs) { int r = atomic_sub_return(1, &ubufs->refcount); if (unlikely(!r)) wake_up(&ubufs->wait); return r; } static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs) { vhost_net_ubuf_put(ubufs); wait_event(ubufs->wait, !atomic_read(&ubufs->refcount)); } static void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs) { vhost_net_ubuf_put_and_wait(ubufs); kfree(ubufs); } static void vhost_net_clear_ubuf_info(struct vhost_net *n) { int i; for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { kfree(n->vqs[i].ubuf_info); n->vqs[i].ubuf_info = NULL; } } static int vhost_net_set_ubuf_info(struct vhost_net *n) { bool zcopy; int i; for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { zcopy = vhost_net_zcopy_mask & (0x1 << i); if (!zcopy) continue; n->vqs[i].ubuf_info = kmalloc_array(UIO_MAXIOV, sizeof(*n->vqs[i].ubuf_info), GFP_KERNEL); if (!n->vqs[i].ubuf_info) goto err; } return 0; err: vhost_net_clear_ubuf_info(n); return -ENOMEM; } static void vhost_net_vq_reset(struct vhost_net *n) { int i; vhost_net_clear_ubuf_info(n); for (i = 0; i < VHOST_NET_VQ_MAX; i++) { n->vqs[i].done_idx = 0; n->vqs[i].upend_idx = 0; n->vqs[i].ubufs = NULL; n->vqs[i].vhost_hlen = 0; n->vqs[i].sock_hlen = 0; vhost_net_buf_init(&n->vqs[i].rxq); } } static void vhost_net_tx_packet(struct vhost_net *net) { ++net->tx_packets; if (net->tx_packets < 1024) return; net->tx_packets = 0; net->tx_zcopy_err = 0; } static void vhost_net_tx_err(struct vhost_net *net) { ++net->tx_zcopy_err; } static bool vhost_net_tx_select_zcopy(struct vhost_net *net) { /* TX flush waits for outstanding DMAs to be done. * Don't start new DMAs. */ return !net->tx_flush && net->tx_packets / 64 >= net->tx_zcopy_err; } static bool vhost_sock_zcopy(struct socket *sock) { return unlikely(experimental_zcopytx) && sock_flag(sock->sk, SOCK_ZEROCOPY); } static bool vhost_sock_xdp(struct socket *sock) { return sock_flag(sock->sk, SOCK_XDP); } /* In case of DMA done not in order in lower device driver for some reason. * upend_idx is used to track end of used idx, done_idx is used to track head * of used idx. Once lower device DMA done contiguously, we will signal KVM * guest used idx. */ static void vhost_zerocopy_signal_used(struct vhost_net *net, struct vhost_virtqueue *vq) { struct vhost_net_virtqueue *nvq = container_of(vq, struct vhost_net_virtqueue, vq); int i, add; int j = 0; for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) { if (vq->heads[i].len == VHOST_DMA_FAILED_LEN) vhost_net_tx_err(net); if (VHOST_DMA_IS_DONE(vq->heads[i].len)) { vq->heads[i].len = VHOST_DMA_CLEAR_LEN; ++j; } else break; } while (j) { add = min(UIO_MAXIOV - nvq->done_idx, j); vhost_add_used_and_signal_n(vq->dev, vq, &vq->heads[nvq->done_idx], add); nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV; j -= add; } } static void vhost_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *ubuf_base, bool success) { struct ubuf_info_msgzc *ubuf = uarg_to_msgzc(ubuf_base); struct vhost_net_ubuf_ref *ubufs = ubuf->ctx; struct vhost_virtqueue *vq = ubufs->vq; int cnt; rcu_read_lock_bh(); /* set len to mark this desc buffers done DMA */ vq->heads[ubuf->desc].len = success ? VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN; cnt = vhost_net_ubuf_put(ubufs); /* * Trigger polling thread if guest stopped submitting new buffers: * in this case, the refcount after decrement will eventually reach 1. * We also trigger polling periodically after each 16 packets * (the value 16 here is more or less arbitrary, it's tuned to trigger * less than 10% of times). */ if (cnt <= 1 || !(cnt % 16)) vhost_poll_queue(&vq->poll); rcu_read_unlock_bh(); } static const struct ubuf_info_ops vhost_ubuf_ops = { .complete = vhost_zerocopy_complete, }; static inline unsigned long busy_clock(void) { return local_clock() >> 10; } static bool vhost_can_busy_poll(unsigned long endtime) { return likely(!need_resched() && !time_after(busy_clock(), endtime) && !signal_pending(current)); } static void vhost_net_disable_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { struct vhost_net_virtqueue *nvq = container_of(vq, struct vhost_net_virtqueue, vq); struct vhost_poll *poll = n->poll + (nvq - n->vqs); if (!vhost_vq_get_backend(vq)) return; vhost_poll_stop(poll); } static int vhost_net_enable_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { struct vhost_net_virtqueue *nvq = container_of(vq, struct vhost_net_virtqueue, vq); struct vhost_poll *poll = n->poll + (nvq - n->vqs); struct socket *sock; sock = vhost_vq_get_backend(vq); if (!sock) return 0; return vhost_poll_start(poll, sock->file); } static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq) { struct vhost_virtqueue *vq = &nvq->vq; struct vhost_dev *dev = vq->dev; if (!nvq->done_idx) return; vhost_add_used_and_signal_n(dev, vq, vq->heads, nvq->done_idx); nvq->done_idx = 0; } static void vhost_tx_batch(struct vhost_net *net, struct vhost_net_virtqueue *nvq, struct socket *sock, struct msghdr *msghdr) { struct tun_msg_ctl ctl = { .type = TUN_MSG_PTR, .num = nvq->batched_xdp, .ptr = nvq->xdp, }; int i, err; if (nvq->batched_xdp == 0) goto signal_used; msghdr->msg_control = &ctl; msghdr->msg_controllen = sizeof(ctl); err = sock->ops->sendmsg(sock, msghdr, 0); if (unlikely(err < 0)) { vq_err(&nvq->vq, "Fail to batch sending packets\n"); /* free pages owned by XDP; since this is an unlikely error path, * keep it simple and avoid more complex bulk update for the * used pages */ for (i = 0; i < nvq->batched_xdp; ++i) put_page(virt_to_head_page(nvq->xdp[i].data)); nvq->batched_xdp = 0; nvq->done_idx = 0; return; } signal_used: vhost_net_signal_used(nvq); nvq->batched_xdp = 0; } static int sock_has_rx_data(struct socket *sock) { if (unlikely(!sock)) return 0; if (sock->ops->peek_len) return sock->ops->peek_len(sock); return skb_queue_empty(&sock->sk->sk_receive_queue); } static void vhost_net_busy_poll_try_queue(struct vhost_net *net, struct vhost_virtqueue *vq) { if (!vhost_vq_avail_empty(&net->dev, vq)) { vhost_poll_queue(&vq->poll); } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { vhost_disable_notify(&net->dev, vq); vhost_poll_queue(&vq->poll); } } static void vhost_net_busy_poll(struct vhost_net *net, struct vhost_virtqueue *rvq, struct vhost_virtqueue *tvq, bool *busyloop_intr, bool poll_rx) { unsigned long busyloop_timeout; unsigned long endtime; struct socket *sock; struct vhost_virtqueue *vq = poll_rx ? tvq : rvq; /* Try to hold the vq mutex of the paired virtqueue. We can't * use mutex_lock() here since we could not guarantee a * consistenet lock ordering. */ if (!mutex_trylock(&vq->mutex)) return; vhost_disable_notify(&net->dev, vq); sock = vhost_vq_get_backend(rvq); busyloop_timeout = poll_rx ? rvq->busyloop_timeout: tvq->busyloop_timeout; preempt_disable(); endtime = busy_clock() + busyloop_timeout; while (vhost_can_busy_poll(endtime)) { if (vhost_vq_has_work(vq)) { *busyloop_intr = true; break; } if ((sock_has_rx_data(sock) && !vhost_vq_avail_empty(&net->dev, rvq)) || !vhost_vq_avail_empty(&net->dev, tvq)) break; cpu_relax(); } preempt_enable(); if (poll_rx || sock_has_rx_data(sock)) vhost_net_busy_poll_try_queue(net, vq); else if (!poll_rx) /* On tx here, sock has no rx data. */ vhost_enable_notify(&net->dev, rvq); mutex_unlock(&vq->mutex); } static int vhost_net_tx_get_vq_desc(struct vhost_net *net, struct vhost_net_virtqueue *tnvq, unsigned int *out_num, unsigned int *in_num, struct msghdr *msghdr, bool *busyloop_intr) { struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX]; struct vhost_virtqueue *rvq = &rnvq->vq; struct vhost_virtqueue *tvq = &tnvq->vq; int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov), out_num, in_num, NULL, NULL); if (r == tvq->num && tvq->busyloop_timeout) { /* Flush batched packets first */ if (!vhost_sock_zcopy(vhost_vq_get_backend(tvq))) vhost_tx_batch(net, tnvq, vhost_vq_get_backend(tvq), msghdr); vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false); r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov), out_num, in_num, NULL, NULL); } return r; } static bool vhost_exceeds_maxpend(struct vhost_net *net) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *vq = &nvq->vq; return (nvq->upend_idx + UIO_MAXIOV - nvq->done_idx) % UIO_MAXIOV > min_t(unsigned int, VHOST_MAX_PEND, vq->num >> 2); } static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter, size_t hdr_size, int out) { /* Skip header. TODO: support TSO. */ size_t len = iov_length(vq->iov, out); iov_iter_init(iter, ITER_SOURCE, vq->iov, out, len); iov_iter_advance(iter, hdr_size); return iov_iter_count(iter); } static int get_tx_bufs(struct vhost_net *net, struct vhost_net_virtqueue *nvq, struct msghdr *msg, unsigned int *out, unsigned int *in, size_t *len, bool *busyloop_intr) { struct vhost_virtqueue *vq = &nvq->vq; int ret; ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr); if (ret < 0 || ret == vq->num) return ret; if (*in) { vq_err(vq, "Unexpected descriptor format for TX: out %d, int %d\n", *out, *in); return -EFAULT; } /* Sanity check */ *len = init_iov_iter(vq, &msg->msg_iter, nvq->vhost_hlen, *out); if (*len == 0) { vq_err(vq, "Unexpected header len for TX: %zd expected %zd\n", *len, nvq->vhost_hlen); return -EFAULT; } return ret; } static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len) { return total_len < VHOST_NET_WEIGHT && !vhost_vq_avail_empty(vq->dev, vq); } #define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq, struct iov_iter *from) { struct vhost_virtqueue *vq = &nvq->vq; struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); struct socket *sock = vhost_vq_get_backend(vq); struct virtio_net_hdr *gso; struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp]; struct tun_xdp_hdr *hdr; size_t len = iov_iter_count(from); int headroom = vhost_sock_xdp(sock) ? XDP_PACKET_HEADROOM : 0; int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + headroom + nvq->sock_hlen); int sock_hlen = nvq->sock_hlen; void *buf; int copied; int ret; if (unlikely(len < nvq->sock_hlen)) return -EFAULT; if (SKB_DATA_ALIGN(len + pad) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE) return -ENOSPC; buflen += SKB_DATA_ALIGN(len + pad); buf = page_frag_alloc_align(&net->pf_cache, buflen, GFP_KERNEL, SMP_CACHE_BYTES); if (unlikely(!buf)) return -ENOMEM; copied = copy_from_iter(buf + offsetof(struct tun_xdp_hdr, gso), sock_hlen, from); if (copied != sock_hlen) { ret = -EFAULT; goto err; } hdr = buf; gso = &hdr->gso; if (!sock_hlen) memset(buf, 0, pad); if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && vhost16_to_cpu(vq, gso->csum_start) + vhost16_to_cpu(vq, gso->csum_offset) + 2 > vhost16_to_cpu(vq, gso->hdr_len)) { gso->hdr_len = cpu_to_vhost16(vq, vhost16_to_cpu(vq, gso->csum_start) + vhost16_to_cpu(vq, gso->csum_offset) + 2); if (vhost16_to_cpu(vq, gso->hdr_len) > len) { ret = -EINVAL; goto err; } } len -= sock_hlen; copied = copy_from_iter(buf + pad, len, from); if (copied != len) { ret = -EFAULT; goto err; } xdp_init_buff(xdp, buflen, NULL); xdp_prepare_buff(xdp, buf, pad, len, true); hdr->buflen = buflen; ++nvq->batched_xdp; return 0; err: page_frag_free(buf); return ret; } static void handle_tx_copy(struct vhost_net *net, struct socket *sock) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *vq = &nvq->vq; unsigned out, in; int head; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, .msg_control = NULL, .msg_controllen = 0, .msg_flags = MSG_DONTWAIT, }; size_t len, total_len = 0; int err; int sent_pkts = 0; bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX); do { bool busyloop_intr = false; if (nvq->done_idx == VHOST_NET_BATCH) vhost_tx_batch(net, nvq, sock, &msg); head = get_tx_bufs(net, nvq, &msg, &out, &in, &len, &busyloop_intr); /* On error, stop handling until the next kick. */ if (unlikely(head < 0)) break; /* Nothing new? Wait for eventfd to tell us they refilled. */ if (head == vq->num) { if (unlikely(busyloop_intr)) { vhost_poll_queue(&vq->poll); } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { vhost_disable_notify(&net->dev, vq); continue; } break; } total_len += len; /* For simplicity, TX batching is only enabled if * sndbuf is unlimited. */ if (sock_can_batch) { err = vhost_net_build_xdp(nvq, &msg.msg_iter); if (!err) { goto done; } else if (unlikely(err != -ENOSPC)) { vhost_tx_batch(net, nvq, sock, &msg); vhost_discard_vq_desc(vq, 1); vhost_net_enable_vq(net, vq); break; } /* We can't build XDP buff, go for single * packet path but let's flush batched * packets. */ vhost_tx_batch(net, nvq, sock, &msg); msg.msg_control = NULL; } else { if (tx_can_batch(vq, total_len)) msg.msg_flags |= MSG_MORE; else msg.msg_flags &= ~MSG_MORE; } err = sock->ops->sendmsg(sock, &msg, len); if (unlikely(err < 0)) { if (err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS) { vhost_discard_vq_desc(vq, 1); vhost_net_enable_vq(net, vq); break; } pr_debug("Fail to send packet: err %d", err); } else if (unlikely(err != len)) pr_debug("Truncated TX packet: len %d != %zd\n", err, len); done: vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head); vq->heads[nvq->done_idx].len = 0; ++nvq->done_idx; } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len))); vhost_tx_batch(net, nvq, sock, &msg); } static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *vq = &nvq->vq; unsigned out, in; int head; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, .msg_control = NULL, .msg_controllen = 0, .msg_flags = MSG_DONTWAIT, }; struct tun_msg_ctl ctl; size_t len, total_len = 0; int err; struct vhost_net_ubuf_ref *ubufs; struct ubuf_info_msgzc *ubuf; bool zcopy_used; int sent_pkts = 0; do { bool busyloop_intr; /* Release DMAs done buffers first */ vhost_zerocopy_signal_used(net, vq); busyloop_intr = false; head = get_tx_bufs(net, nvq, &msg, &out, &in, &len, &busyloop_intr); /* On error, stop handling until the next kick. */ if (unlikely(head < 0)) break; /* Nothing new? Wait for eventfd to tell us they refilled. */ if (head == vq->num) { if (unlikely(busyloop_intr)) { vhost_poll_queue(&vq->poll); } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { vhost_disable_notify(&net->dev, vq); continue; } break; } zcopy_used = len >= VHOST_GOODCOPY_LEN && !vhost_exceeds_maxpend(net) && vhost_net_tx_select_zcopy(net); /* use msg_control to pass vhost zerocopy ubuf info to skb */ if (zcopy_used) { ubuf = nvq->ubuf_info + nvq->upend_idx; vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head); vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS; ubuf->ctx = nvq->ubufs; ubuf->desc = nvq->upend_idx; ubuf->ubuf.ops = &vhost_ubuf_ops; ubuf->ubuf.flags = SKBFL_ZEROCOPY_FRAG; refcount_set(&ubuf->ubuf.refcnt, 1); msg.msg_control = &ctl; ctl.type = TUN_MSG_UBUF; ctl.ptr = &ubuf->ubuf; msg.msg_controllen = sizeof(ctl); ubufs = nvq->ubufs; atomic_inc(&ubufs->refcount); nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV; } else { msg.msg_control = NULL; ubufs = NULL; } total_len += len; if (tx_can_batch(vq, total_len) && likely(!vhost_exceeds_maxpend(net))) { msg.msg_flags |= MSG_MORE; } else { msg.msg_flags &= ~MSG_MORE; } err = sock->ops->sendmsg(sock, &msg, len); if (unlikely(err < 0)) { bool retry = err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS; if (zcopy_used) { if (vq->heads[ubuf->desc].len == VHOST_DMA_IN_PROGRESS) vhost_net_ubuf_put(ubufs); if (retry) nvq->upend_idx = ((unsigned)nvq->upend_idx - 1) % UIO_MAXIOV; else vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN; } if (retry) { vhost_discard_vq_desc(vq, 1); vhost_net_enable_vq(net, vq); break; } pr_debug("Fail to send packet: err %d", err); } else if (unlikely(err != len)) pr_debug("Truncated TX packet: " " len %d != %zd\n", err, len); if (!zcopy_used) vhost_add_used_and_signal(&net->dev, vq, head, 0); else vhost_zerocopy_signal_used(net, vq); vhost_net_tx_packet(net); } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len))); } /* Expects to be always run from workqueue - which acts as * read-size critical section for our kind of RCU. */ static void handle_tx(struct vhost_net *net) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *vq = &nvq->vq; struct socket *sock; mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX); sock = vhost_vq_get_backend(vq); if (!sock) goto out; if (!vq_meta_prefetch(vq)) goto out; vhost_disable_notify(&net->dev, vq); vhost_net_disable_vq(net, vq); if (vhost_sock_zcopy(sock)) handle_tx_zerocopy(net, sock); else handle_tx_copy(net, sock); out: mutex_unlock(&vq->mutex); } static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk) { struct sk_buff *head; int len = 0; unsigned long flags; if (rvq->rx_ring) return vhost_net_buf_peek(rvq); spin_lock_irqsave(&sk->sk_receive_queue.lock, flags); head = skb_peek(&sk->sk_receive_queue); if (likely(head)) { len = head->len; if (skb_vlan_tag_present(head)) len += VLAN_HLEN; } spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags); return len; } static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk, bool *busyloop_intr) { struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX]; struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *rvq = &rnvq->vq; struct vhost_virtqueue *tvq = &tnvq->vq; int len = peek_head_len(rnvq, sk); if (!len && rvq->busyloop_timeout) { /* Flush batched heads first */ vhost_net_signal_used(rnvq); /* Both tx vq and rx socket were polled here */ vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, true); len = peek_head_len(rnvq, sk); } return len; } /* This is a multi-buffer version of vhost_get_desc, that works if * vq has read descriptors only. * @vq - the relevant virtqueue * @datalen - data length we'll be reading * @iovcount - returned count of io vectors we fill * @log - vhost log * @log_num - log offset * @quota - headcount quota, 1 for big buffer * returns number of buffer heads allocated, negative on error */ static int get_rx_bufs(struct vhost_virtqueue *vq, struct vring_used_elem *heads, int datalen, unsigned *iovcount, struct vhost_log *log, unsigned *log_num, unsigned int quota) { unsigned int out, in; int seg = 0; int headcount = 0; unsigned d; int r, nlogs = 0; /* len is always initialized before use since we are always called with * datalen > 0. */ u32 len; while (datalen > 0 && headcount < quota) { if (unlikely(seg >= UIO_MAXIOV)) { r = -ENOBUFS; goto err; } r = vhost_get_vq_desc(vq, vq->iov + seg, ARRAY_SIZE(vq->iov) - seg, &out, &in, log, log_num); if (unlikely(r < 0)) goto err; d = r; if (d == vq->num) { r = 0; goto err; } if (unlikely(out || in <= 0)) { vq_err(vq, "unexpected descriptor format for RX: " "out %d, in %d\n", out, in); r = -EINVAL; goto err; } if (unlikely(log)) { nlogs += *log_num; log += *log_num; } heads[headcount].id = cpu_to_vhost32(vq, d); len = iov_length(vq->iov + seg, in); heads[headcount].len = cpu_to_vhost32(vq, len); datalen -= len; ++headcount; seg += in; } heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen); *iovcount = seg; if (unlikely(log)) *log_num = nlogs; /* Detect overrun */ if (unlikely(datalen > 0)) { r = UIO_MAXIOV + 1; goto err; } return headcount; err: vhost_discard_vq_desc(vq, headcount); return r; } /* Expects to be always run from workqueue - which acts as * read-size critical section for our kind of RCU. */ static void handle_rx(struct vhost_net *net) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX]; struct vhost_virtqueue *vq = &nvq->vq; unsigned in, log; struct vhost_log *vq_log; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, .msg_control = NULL, /* FIXME: get and handle RX aux data. */ .msg_controllen = 0, .msg_flags = MSG_DONTWAIT, }; struct virtio_net_hdr hdr = { .flags = 0, .gso_type = VIRTIO_NET_HDR_GSO_NONE }; size_t total_len = 0; int err, mergeable; s16 headcount; size_t vhost_hlen, sock_hlen; size_t vhost_len, sock_len; bool busyloop_intr = false; bool set_num_buffers; struct socket *sock; struct iov_iter fixup; __virtio16 num_buffers; int recv_pkts = 0; mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX); sock = vhost_vq_get_backend(vq); if (!sock) goto out; if (!vq_meta_prefetch(vq)) goto out; vhost_disable_notify(&net->dev, vq); vhost_net_disable_vq(net, vq); vhost_hlen = nvq->vhost_hlen; sock_hlen = nvq->sock_hlen; vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ? vq->log : NULL; mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF); set_num_buffers = mergeable || vhost_has_feature(vq, VIRTIO_F_VERSION_1); do { sock_len = vhost_net_rx_peek_head_len(net, sock->sk, &busyloop_intr); if (!sock_len) break; sock_len += sock_hlen; vhost_len = sock_len + vhost_hlen; headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx, vhost_len, &in, vq_log, &log, likely(mergeable) ? UIO_MAXIOV : 1); /* On error, stop handling until the next kick. */ if (unlikely(headcount < 0)) goto out; /* OK, now we need to know about added descriptors. */ if (!headcount) { if (unlikely(busyloop_intr)) { vhost_poll_queue(&vq->poll); } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { /* They have slipped one in as we were * doing that: check again. */ vhost_disable_notify(&net->dev, vq); continue; } /* Nothing new? Wait for eventfd to tell us * they refilled. */ goto out; } busyloop_intr = false; if (nvq->rx_ring) msg.msg_control = vhost_net_buf_consume(&nvq->rxq); /* On overrun, truncate and discard */ if (unlikely(headcount > UIO_MAXIOV)) { iov_iter_init(&msg.msg_iter, ITER_DEST, vq->iov, 1, 1); err = sock->ops->recvmsg(sock, &msg, 1, MSG_DONTWAIT | MSG_TRUNC); pr_debug("Discarded rx packet: len %zd\n", sock_len); continue; } /* We don't need to be notified again. */ iov_iter_init(&msg.msg_iter, ITER_DEST, vq->iov, in, vhost_len); fixup = msg.msg_iter; if (unlikely((vhost_hlen))) { /* We will supply the header ourselves * TODO: support TSO. */ iov_iter_advance(&msg.msg_iter, vhost_hlen); } err = sock->ops->recvmsg(sock, &msg, sock_len, MSG_DONTWAIT | MSG_TRUNC); /* Userspace might have consumed the packet meanwhile: * it's not supposed to do this usually, but might be hard * to prevent. Discard data we got (if any) and keep going. */ if (unlikely(err != sock_len)) { pr_debug("Discarded rx packet: " " len %d, expected %zd\n", err, sock_len); vhost_discard_vq_desc(vq, headcount); continue; } /* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */ if (unlikely(vhost_hlen)) { if (copy_to_iter(&hdr, sizeof(hdr), &fixup) != sizeof(hdr)) { vq_err(vq, "Unable to write vnet_hdr " "at addr %p\n", vq->iov->iov_base); goto out; } } else { /* Header came from socket; we'll need to patch * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF */ iov_iter_advance(&fixup, sizeof(hdr)); } /* TODO: Should check and handle checksum. */ num_buffers = cpu_to_vhost16(vq, headcount); if (likely(set_num_buffers) && copy_to_iter(&num_buffers, sizeof num_buffers, &fixup) != sizeof num_buffers) { vq_err(vq, "Failed num_buffers write"); vhost_discard_vq_desc(vq, headcount); goto out; } nvq->done_idx += headcount; if (nvq->done_idx > VHOST_NET_BATCH) vhost_net_signal_used(nvq); if (unlikely(vq_log)) vhost_log_write(vq, vq_log, log, vhost_len, vq->iov, in); total_len += vhost_len; } while (likely(!vhost_exceeds_weight(vq, ++recv_pkts, total_len))); if (unlikely(busyloop_intr)) vhost_poll_queue(&vq->poll); else if (!sock_len) vhost_net_enable_vq(net, vq); out: vhost_net_signal_used(nvq); mutex_unlock(&vq->mutex); } static void handle_tx_kick(struct vhost_work *work) { struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, poll.work); struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); handle_tx(net); } static void handle_rx_kick(struct vhost_work *work) { struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, poll.work); struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); handle_rx(net); } static void handle_tx_net(struct vhost_work *work) { struct vhost_net *net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work); handle_tx(net); } static void handle_rx_net(struct vhost_work *work) { struct vhost_net *net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work); handle_rx(net); } static int vhost_net_open(struct inode *inode, struct file *f) { struct vhost_net *n; struct vhost_dev *dev; struct vhost_virtqueue **vqs; void **queue; struct xdp_buff *xdp; int i; n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!n) return -ENOMEM; vqs = kmalloc_array(VHOST_NET_VQ_MAX, sizeof(*vqs), GFP_KERNEL); if (!vqs) { kvfree(n); return -ENOMEM; } queue = kmalloc_array(VHOST_NET_BATCH, sizeof(void *), GFP_KERNEL); if (!queue) { kfree(vqs); kvfree(n); return -ENOMEM; } n->vqs[VHOST_NET_VQ_RX].rxq.queue = queue; xdp = kmalloc_array(VHOST_NET_BATCH, sizeof(*xdp), GFP_KERNEL); if (!xdp) { kfree(vqs); kvfree(n); kfree(queue); return -ENOMEM; } n->vqs[VHOST_NET_VQ_TX].xdp = xdp; dev = &n->dev; vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq; vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq; n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick; n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick; for (i = 0; i < VHOST_NET_VQ_MAX; i++) { n->vqs[i].ubufs = NULL; n->vqs[i].ubuf_info = NULL; n->vqs[i].upend_idx = 0; n->vqs[i].done_idx = 0; n->vqs[i].batched_xdp = 0; n->vqs[i].vhost_hlen = 0; n->vqs[i].sock_hlen = 0; n->vqs[i].rx_ring = NULL; vhost_net_buf_init(&n->vqs[i].rxq); } vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX, UIO_MAXIOV + VHOST_NET_BATCH, VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT, true, NULL); vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev, vqs[VHOST_NET_VQ_TX]); vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev, vqs[VHOST_NET_VQ_RX]); f->private_data = n; page_frag_cache_init(&n->pf_cache); return 0; } static struct socket *vhost_net_stop_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { struct socket *sock; struct vhost_net_virtqueue *nvq = container_of(vq, struct vhost_net_virtqueue, vq); mutex_lock(&vq->mutex); sock = vhost_vq_get_backend(vq); vhost_net_disable_vq(n, vq); vhost_vq_set_backend(vq, NULL); vhost_net_buf_unproduce(nvq); nvq->rx_ring = NULL; mutex_unlock(&vq->mutex); return sock; } static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock, struct socket **rx_sock) { *tx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_TX].vq); *rx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_RX].vq); } static void vhost_net_flush(struct vhost_net *n) { vhost_dev_flush(&n->dev); if (n->vqs[VHOST_NET_VQ_TX].ubufs) { mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); n->tx_flush = true; mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); /* Wait for all lower device DMAs done. */ vhost_net_ubuf_put_and_wait(n->vqs[VHOST_NET_VQ_TX].ubufs); mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); n->tx_flush = false; atomic_set(&n->vqs[VHOST_NET_VQ_TX].ubufs->refcount, 1); mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); } } static int vhost_net_release(struct inode *inode, struct file *f) { struct vhost_net *n = f->private_data; struct socket *tx_sock; struct socket *rx_sock; vhost_net_stop(n, &tx_sock, &rx_sock); vhost_net_flush(n); vhost_dev_stop(&n->dev); vhost_dev_cleanup(&n->dev); vhost_net_vq_reset(n); if (tx_sock) sockfd_put(tx_sock); if (rx_sock) sockfd_put(rx_sock); /* Make sure no callbacks are outstanding */ synchronize_rcu(); /* We do an extra flush before freeing memory, * since jobs can re-queue themselves. */ vhost_net_flush(n); kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue); kfree(n->vqs[VHOST_NET_VQ_TX].xdp); kfree(n->dev.vqs); page_frag_cache_drain(&n->pf_cache); kvfree(n); return 0; } static struct socket *get_raw_socket(int fd) { int r; struct socket *sock = sockfd_lookup(fd, &r); if (!sock) return ERR_PTR(-ENOTSOCK); /* Parameter checking */ if (sock->sk->sk_type != SOCK_RAW) { r = -ESOCKTNOSUPPORT; goto err; } if (sock->sk->sk_family != AF_PACKET) { r = -EPFNOSUPPORT; goto err; } return sock; err: sockfd_put(sock); return ERR_PTR(r); } static struct ptr_ring *get_tap_ptr_ring(struct file *file) { struct ptr_ring *ring; ring = tun_get_tx_ring(file); if (!IS_ERR(ring)) goto out; ring = tap_get_ptr_ring(file); if (!IS_ERR(ring)) goto out; ring = NULL; out: return ring; } static struct socket *get_tap_socket(int fd) { struct file *file = fget(fd); struct socket *sock; if (!file) return ERR_PTR(-EBADF); sock = tun_get_socket(file); if (!IS_ERR(sock)) return sock; sock = tap_get_socket(file); if (IS_ERR(sock)) fput(file); return sock; } static struct socket *get_socket(int fd) { struct socket *sock; /* special case to disable backend */ if (fd == -1) return NULL; sock = get_raw_socket(fd); if (!IS_ERR(sock)) return sock; sock = get_tap_socket(fd); if (!IS_ERR(sock)) return sock; return ERR_PTR(-ENOTSOCK); } static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) { struct socket *sock, *oldsock; struct vhost_virtqueue *vq; struct vhost_net_virtqueue *nvq; struct vhost_net_ubuf_ref *ubufs, *oldubufs = NULL; int r; mutex_lock(&n->dev.mutex); r = vhost_dev_check_owner(&n->dev); if (r) goto err; if (index >= VHOST_NET_VQ_MAX) { r = -ENOBUFS; goto err; } vq = &n->vqs[index].vq; nvq = &n->vqs[index]; mutex_lock(&vq->mutex); if (fd == -1) vhost_clear_msg(&n->dev); /* Verify that ring has been setup correctly. */ if (!vhost_vq_access_ok(vq)) { r = -EFAULT; goto err_vq; } sock = get_socket(fd); if (IS_ERR(sock)) { r = PTR_ERR(sock); goto err_vq; } /* start polling new socket */ oldsock = vhost_vq_get_backend(vq); if (sock != oldsock) { ubufs = vhost_net_ubuf_alloc(vq, sock && vhost_sock_zcopy(sock)); if (IS_ERR(ubufs)) { r = PTR_ERR(ubufs); goto err_ubufs; } vhost_net_disable_vq(n, vq); vhost_vq_set_backend(vq, sock); vhost_net_buf_unproduce(nvq); r = vhost_vq_init_access(vq); if (r) goto err_used; r = vhost_net_enable_vq(n, vq); if (r) goto err_used; if (index == VHOST_NET_VQ_RX) { if (sock) nvq->rx_ring = get_tap_ptr_ring(sock->file); else nvq->rx_ring = NULL; } oldubufs = nvq->ubufs; nvq->ubufs = ubufs; n->tx_packets = 0; n->tx_zcopy_err = 0; n->tx_flush = false; } mutex_unlock(&vq->mutex); if (oldubufs) { vhost_net_ubuf_put_wait_and_free(oldubufs); mutex_lock(&vq->mutex); vhost_zerocopy_signal_used(n, vq); mutex_unlock(&vq->mutex); } if (oldsock) { vhost_dev_flush(&n->dev); sockfd_put(oldsock); } mutex_unlock(&n->dev.mutex); return 0; err_used: vhost_vq_set_backend(vq, oldsock); vhost_net_enable_vq(n, vq); if (ubufs) vhost_net_ubuf_put_wait_and_free(ubufs); err_ubufs: if (sock) sockfd_put(sock); err_vq: mutex_unlock(&vq->mutex); err: mutex_unlock(&n->dev.mutex); return r; } static long vhost_net_reset_owner(struct vhost_net *n) { struct socket *tx_sock = NULL; struct socket *rx_sock = NULL; long err; struct vhost_iotlb *umem; mutex_lock(&n->dev.mutex); err = vhost_dev_check_owner(&n->dev); if (err) goto done; umem = vhost_dev_reset_owner_prepare(); if (!umem) { err = -ENOMEM; goto done; } vhost_net_stop(n, &tx_sock, &rx_sock); vhost_net_flush(n); vhost_dev_stop(&n->dev); vhost_dev_reset_owner(&n->dev, umem); vhost_net_vq_reset(n); done: mutex_unlock(&n->dev.mutex); if (tx_sock) sockfd_put(tx_sock); if (rx_sock) sockfd_put(rx_sock); return err; } static int vhost_net_set_features(struct vhost_net *n, u64 features) { size_t vhost_hlen, sock_hlen, hdr_len; int i; hdr_len = (features & ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) ? sizeof(struct virtio_net_hdr_mrg_rxbuf) : sizeof(struct virtio_net_hdr); if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) { /* vhost provides vnet_hdr */ vhost_hlen = hdr_len; sock_hlen = 0; } else { /* socket provides vnet_hdr */ vhost_hlen = 0; sock_hlen = hdr_len; } mutex_lock(&n->dev.mutex); if ((features & (1 << VHOST_F_LOG_ALL)) && !vhost_log_access_ok(&n->dev)) goto out_unlock; if ((features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) { if (vhost_init_device_iotlb(&n->dev)) goto out_unlock; } for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { mutex_lock(&n->vqs[i].vq.mutex); n->vqs[i].vq.acked_features = features; n->vqs[i].vhost_hlen = vhost_hlen; n->vqs[i].sock_hlen = sock_hlen; mutex_unlock(&n->vqs[i].vq.mutex); } mutex_unlock(&n->dev.mutex); return 0; out_unlock: mutex_unlock(&n->dev.mutex); return -EFAULT; } static long vhost_net_set_owner(struct vhost_net *n) { int r; mutex_lock(&n->dev.mutex); if (vhost_dev_has_owner(&n->dev)) { r = -EBUSY; goto out; } r = vhost_net_set_ubuf_info(n); if (r) goto out; r = vhost_dev_set_owner(&n->dev); if (r) vhost_net_clear_ubuf_info(n); vhost_net_flush(n); out: mutex_unlock(&n->dev.mutex); return r; } static long vhost_net_ioctl(struct file *f, unsigned int ioctl, unsigned long arg) { struct vhost_net *n = f->private_data; void __user *argp = (void __user *)arg; u64 __user *featurep = argp; struct vhost_vring_file backend; u64 features; int r; switch (ioctl) { case VHOST_NET_SET_BACKEND: if (copy_from_user(&backend, argp, sizeof backend)) return -EFAULT; return vhost_net_set_backend(n, backend.index, backend.fd); case VHOST_GET_FEATURES: features = VHOST_NET_FEATURES; if (copy_to_user(featurep, &features, sizeof features)) return -EFAULT; return 0; case VHOST_SET_FEATURES: if (copy_from_user(&features, featurep, sizeof features)) return -EFAULT; if (features & ~VHOST_NET_FEATURES) return -EOPNOTSUPP; return vhost_net_set_features(n, features); case VHOST_GET_BACKEND_FEATURES: features = VHOST_NET_BACKEND_FEATURES; if (copy_to_user(featurep, &features, sizeof(features))) return -EFAULT; return 0; case VHOST_SET_BACKEND_FEATURES: if (copy_from_user(&features, featurep, sizeof(features))) return -EFAULT; if (features & ~VHOST_NET_BACKEND_FEATURES) return -EOPNOTSUPP; vhost_set_backend_features(&n->dev, features); return 0; case VHOST_RESET_OWNER: return vhost_net_reset_owner(n); case VHOST_SET_OWNER: return vhost_net_set_owner(n); default: mutex_lock(&n->dev.mutex); r = vhost_dev_ioctl(&n->dev, ioctl, argp); if (r == -ENOIOCTLCMD) r = vhost_vring_ioctl(&n->dev, ioctl, argp); else vhost_net_flush(n); mutex_unlock(&n->dev.mutex); return r; } } static ssize_t vhost_net_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct vhost_net *n = file->private_data; struct vhost_dev *dev = &n->dev; int noblock = file->f_flags & O_NONBLOCK; return vhost_chr_read_iter(dev, to, noblock); } static ssize_t vhost_net_chr_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct vhost_net *n = file->private_data; struct vhost_dev *dev = &n->dev; return vhost_chr_write_iter(dev, from); } static __poll_t vhost_net_chr_poll(struct file *file, poll_table *wait) { struct vhost_net *n = file->private_data; struct vhost_dev *dev = &n->dev; return vhost_chr_poll(file, dev, wait); } static const struct file_operations vhost_net_fops = { .owner = THIS_MODULE, .release = vhost_net_release, .read_iter = vhost_net_chr_read_iter, .write_iter = vhost_net_chr_write_iter, .poll = vhost_net_chr_poll, .unlocked_ioctl = vhost_net_ioctl, .compat_ioctl = compat_ptr_ioctl, .open = vhost_net_open, .llseek = noop_llseek, }; static struct miscdevice vhost_net_misc = { .minor = VHOST_NET_MINOR, .name = "vhost-net", .fops = &vhost_net_fops, }; static int __init vhost_net_init(void) { if (experimental_zcopytx) vhost_net_enable_zcopy(VHOST_NET_VQ_TX); return misc_register(&vhost_net_misc); } module_init(vhost_net_init); static void __exit vhost_net_exit(void) { misc_deregister(&vhost_net_misc); } module_exit(vhost_net_exit); MODULE_VERSION("0.0.1"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Michael S. Tsirkin"); MODULE_DESCRIPTION("Host kernel accelerator for virtio net"); MODULE_ALIAS_MISCDEV(VHOST_NET_MINOR); MODULE_ALIAS("devname:vhost-net");
26 29 13 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* internal AFS stuff * * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/ktime.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/pagemap.h> #include <linux/rxrpc.h> #include <linux/key.h> #include <linux/workqueue.h> #include <linux/sched.h> #include <linux/fscache.h> #include <linux/backing-dev.h> #include <linux/uuid.h> #include <linux/mm_types.h> #include <linux/dns_resolver.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include "afs.h" #include "afs_vl.h" #define AFS_CELL_MAX_ADDRS 15 struct pagevec; struct afs_call; struct afs_vnode; struct afs_server_probe; /* * Partial file-locking emulation mode. (The problem being that AFS3 only * allows whole-file locks and no upgrading/downgrading). */ enum afs_flock_mode { afs_flock_mode_unset, afs_flock_mode_local, /* Local locking only */ afs_flock_mode_openafs, /* Don't get server lock for a partial lock */ afs_flock_mode_strict, /* Always get a server lock for a partial lock */ afs_flock_mode_write, /* Get an exclusive server lock for a partial lock */ }; struct afs_fs_context { bool force; /* T to force cell type */ bool autocell; /* T if set auto mount operation */ bool dyn_root; /* T if dynamic root */ bool no_cell; /* T if the source is "none" (for dynroot) */ enum afs_flock_mode flock_mode; /* Partial file-locking emulation mode */ afs_voltype_t type; /* type of volume requested */ unsigned int volnamesz; /* size of volume name */ const char *volname; /* name of volume to mount */ struct afs_net *net; /* the AFS net namespace stuff */ struct afs_cell *cell; /* cell in which to find volume */ struct afs_volume *volume; /* volume record */ struct key *key; /* key to use for secure mounting */ }; enum afs_call_state { AFS_CALL_CL_REQUESTING, /* Client: Request is being sent */ AFS_CALL_CL_AWAIT_REPLY, /* Client: Awaiting reply */ AFS_CALL_CL_PROC_REPLY, /* Client: rxrpc call complete; processing reply */ AFS_CALL_SV_AWAIT_OP_ID, /* Server: Awaiting op ID */ AFS_CALL_SV_AWAIT_REQUEST, /* Server: Awaiting request data */ AFS_CALL_SV_REPLYING, /* Server: Replying */ AFS_CALL_SV_AWAIT_ACK, /* Server: Awaiting final ACK */ AFS_CALL_COMPLETE, /* Completed or failed */ }; /* * Address preferences. */ struct afs_addr_preference { union { struct in_addr ipv4_addr; /* AF_INET address to compare against */ struct in6_addr ipv6_addr; /* AF_INET6 address to compare against */ }; sa_family_t family; /* Which address to use */ u16 prio; /* Priority */ u8 subnet_mask; /* How many bits to compare */ }; struct afs_addr_preference_list { struct rcu_head rcu; u16 version; /* Incremented when prefs list changes */ u8 ipv6_off; /* Offset of IPv6 addresses */ u8 nr; /* Number of addresses in total */ u8 max_prefs; /* Number of prefs allocated */ struct afs_addr_preference prefs[] __counted_by(max_prefs); }; struct afs_address { struct rxrpc_peer *peer; short last_error; /* Last error from this address */ u16 prio; /* Address priority */ }; /* * List of server addresses. */ struct afs_addr_list { struct rcu_head rcu; refcount_t usage; u32 version; /* Version */ unsigned int debug_id; unsigned int addr_pref_version; /* Version of address preference list */ unsigned char max_addrs; unsigned char nr_addrs; unsigned char preferred; /* Preferred address */ unsigned char nr_ipv4; /* Number of IPv4 addresses */ enum dns_record_source source:8; enum dns_lookup_status status:8; unsigned long probe_failed; /* Mask of addrs that failed locally/ICMP */ unsigned long responded; /* Mask of addrs that responded */ struct afs_address addrs[] __counted_by(max_addrs); #define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8)) }; /* * a record of an in-progress RxRPC call */ struct afs_call { const struct afs_call_type *type; /* type of call */ wait_queue_head_t waitq; /* processes awaiting completion */ struct work_struct async_work; /* async I/O processor */ struct work_struct work; /* actual work processor */ struct work_struct free_work; /* Deferred free processor */ struct rxrpc_call *rxcall; /* RxRPC call handle */ struct rxrpc_peer *peer; /* Remote endpoint */ struct key *key; /* security for this call */ struct afs_net *net; /* The network namespace */ struct afs_server *server; /* The fileserver record if fs op (pins ref) */ struct afs_vlserver *vlserver; /* The vlserver record if vl op */ void *request; /* request data (first part) */ size_t iov_len; /* Size of *iter to be used */ struct iov_iter def_iter; /* Default buffer/data iterator */ struct iov_iter *write_iter; /* Iterator defining write to be made */ struct iov_iter *iter; /* Iterator currently in use */ union { /* Convenience for ->def_iter */ struct kvec kvec[1]; struct bio_vec bvec[1]; }; void *buffer; /* reply receive buffer */ union { struct afs_endpoint_state *probe; struct afs_addr_list *vl_probe; struct afs_addr_list *ret_alist; struct afs_vldb_entry *ret_vldb; char *ret_str; }; struct afs_fid fid; /* Primary vnode ID (or all zeroes) */ unsigned char probe_index; /* Address in ->probe_alist */ struct afs_operation *op; unsigned int server_index; refcount_t ref; enum afs_call_state state; spinlock_t state_lock; int error; /* error code */ u32 abort_code; /* Remote abort ID or 0 */ unsigned long long remaining; /* How much is left to receive */ unsigned int max_lifespan; /* Maximum lifespan in secs to set if not 0 */ unsigned request_size; /* size of request data */ unsigned reply_max; /* maximum size of reply */ unsigned count2; /* count used in unmarshalling */ unsigned char unmarshall; /* unmarshalling phase */ bool drop_ref; /* T if need to drop ref for incoming call */ bool need_attention; /* T if RxRPC poked us */ bool async; /* T if asynchronous */ bool upgrade; /* T to request service upgrade */ bool intr; /* T if interruptible */ bool unmarshalling_error; /* T if an unmarshalling error occurred */ bool responded; /* Got a response from the call (may be abort) */ u16 service_id; /* Actual service ID (after upgrade) */ unsigned int debug_id; /* Trace ID */ u32 operation_ID; /* operation ID for an incoming call */ u32 count; /* count for use in unmarshalling */ union { /* place to extract temporary data */ struct { __be32 tmp_u; __be32 tmp; } __attribute__((packed)); __be64 tmp64; }; ktime_t issue_time; /* Time of issue of operation */ }; struct afs_call_type { const char *name; unsigned int op; /* Really enum afs_fs_operation */ /* deliver request or reply data to an call * - returning an error will cause the call to be aborted */ int (*deliver)(struct afs_call *call); /* clean up a call */ void (*destructor)(struct afs_call *call); /* Async receive processing function */ void (*async_rx)(struct work_struct *work); /* Work function */ void (*work)(struct work_struct *work); /* Call done function (gets called immediately on success or failure) */ void (*done)(struct afs_call *call); /* Handle a call being immediately cancelled. */ void (*immediate_cancel)(struct afs_call *call); }; /* * Key available for writeback on a file. */ struct afs_wb_key { refcount_t usage; struct key *key; struct list_head vnode_link; /* Link in vnode->wb_keys */ }; /* * AFS open file information record. Pointed to by file->private_data. */ struct afs_file { struct key *key; /* The key this file was opened with */ struct afs_wb_key *wb; /* Writeback key record for this file */ }; static inline struct key *afs_file_key(struct file *file) { struct afs_file *af = file->private_data; return af->key; } /* * AFS superblock private data * - there's one superblock per volume */ struct afs_super_info { struct net *net_ns; /* Network namespace */ struct afs_cell *cell; /* The cell in which the volume resides */ struct afs_volume *volume; /* volume record */ enum afs_flock_mode flock_mode:8; /* File locking emulation mode */ bool dyn_root; /* True if dynamic root */ }; static inline struct afs_super_info *AFS_FS_S(struct super_block *sb) { return sb->s_fs_info; } extern struct file_system_type afs_fs_type; /* * Set of substitutes for @sys. */ struct afs_sysnames { #define AFS_NR_SYSNAME 16 char *subs[AFS_NR_SYSNAME]; refcount_t usage; unsigned short nr; char blank[1]; }; /* * AFS network namespace record. */ struct afs_net { struct net *net; /* Backpointer to the owning net namespace */ struct afs_uuid uuid; bool live; /* F if this namespace is being removed */ /* AF_RXRPC I/O stuff */ struct socket *socket; struct afs_call *spare_incoming_call; struct work_struct charge_preallocation_work; struct mutex socket_mutex; atomic_t nr_outstanding_calls; atomic_t nr_superblocks; /* Cell database */ struct rb_root cells; struct afs_cell *ws_cell; struct work_struct cells_manager; struct timer_list cells_timer; atomic_t cells_outstanding; struct rw_semaphore cells_lock; struct mutex cells_alias_lock; struct mutex proc_cells_lock; struct hlist_head proc_cells; /* Known servers. Theoretically each fileserver can only be in one * cell, but in practice, people create aliases and subsets and there's * no easy way to distinguish them. */ seqlock_t fs_lock; /* For fs_servers, fs_probe_*, fs_proc */ struct rb_root fs_servers; /* afs_server (by server UUID or address) */ struct list_head fs_probe_fast; /* List of afs_server to probe at 30s intervals */ struct list_head fs_probe_slow; /* List of afs_server to probe at 5m intervals */ struct hlist_head fs_proc; /* procfs servers list */ struct hlist_head fs_addresses; /* afs_server (by lowest IPv6 addr) */ seqlock_t fs_addr_lock; /* For fs_addresses[46] */ struct work_struct fs_manager; struct timer_list fs_timer; struct work_struct fs_prober; struct timer_list fs_probe_timer; atomic_t servers_outstanding; /* File locking renewal management */ struct mutex lock_manager_mutex; /* Misc */ struct super_block *dynroot_sb; /* Dynamic root mount superblock */ struct proc_dir_entry *proc_afs; /* /proc/net/afs directory */ struct afs_sysnames *sysnames; rwlock_t sysnames_lock; struct afs_addr_preference_list __rcu *address_prefs; u16 address_pref_version; /* Statistics counters */ atomic_t n_lookup; /* Number of lookups done */ atomic_t n_reval; /* Number of dentries needing revalidation */ atomic_t n_inval; /* Number of invalidations by the server */ atomic_t n_relpg; /* Number of invalidations by release_folio */ atomic_t n_read_dir; /* Number of directory pages read */ atomic_t n_dir_cr; /* Number of directory entry creation edits */ atomic_t n_dir_rm; /* Number of directory entry removal edits */ atomic_t n_stores; /* Number of store ops */ atomic_long_t n_store_bytes; /* Number of bytes stored */ atomic_long_t n_fetch_bytes; /* Number of bytes fetched */ atomic_t n_fetches; /* Number of data fetch ops */ }; extern const char afs_init_sysname[]; enum afs_cell_state { AFS_CELL_UNSET, AFS_CELL_ACTIVATING, AFS_CELL_ACTIVE, AFS_CELL_DEACTIVATING, AFS_CELL_INACTIVE, AFS_CELL_FAILED, AFS_CELL_REMOVED, }; /* * AFS cell record. * * This is a tricky concept to get right as it is possible to create aliases * simply by pointing AFSDB/SRV records for two names at the same set of VL * servers; it is also possible to do things like setting up two sets of VL * servers, one of which provides a superset of the volumes provided by the * other (for internal/external division, for example). * * Cells only exist in the sense that (a) a cell's name maps to a set of VL * servers and (b) a cell's name is used by the client to select the key to use * for authentication and encryption. The cell name is not typically used in * the protocol. * * Two cells are determined to be aliases if they have an explicit alias (YFS * only), share any VL servers in common or have at least one volume in common. * "In common" means that the address list of the VL servers or the fileservers * share at least one endpoint. */ struct afs_cell { union { struct rcu_head rcu; struct rb_node net_node; /* Node in net->cells */ }; struct afs_net *net; struct afs_cell *alias_of; /* The cell this is an alias of */ struct afs_volume *root_volume; /* The root.cell volume if there is one */ struct key *anonymous_key; /* anonymous user key for this cell */ struct work_struct manager; /* Manager for init/deinit/dns */ struct hlist_node proc_link; /* /proc cell list link */ time64_t dns_expiry; /* Time AFSDB/SRV record expires */ time64_t last_inactive; /* Time of last drop of usage count */ refcount_t ref; /* Struct refcount */ atomic_t active; /* Active usage counter */ unsigned long flags; #define AFS_CELL_FL_NO_GC 0 /* The cell was added manually, don't auto-gc */ #define AFS_CELL_FL_DO_LOOKUP 1 /* DNS lookup requested */ #define AFS_CELL_FL_CHECK_ALIAS 2 /* Need to check for aliases */ enum afs_cell_state state; short error; enum dns_record_source dns_source:8; /* Latest source of data from lookup */ enum dns_lookup_status dns_status:8; /* Latest status of data from lookup */ unsigned int dns_lookup_count; /* Counter of DNS lookups */ unsigned int debug_id; /* The volumes belonging to this cell */ struct rw_semaphore vs_lock; /* Lock for server->volumes */ struct rb_root volumes; /* Tree of volumes on this server */ struct hlist_head proc_volumes; /* procfs volume list */ seqlock_t volume_lock; /* For volumes */ /* Active fileserver interaction state. */ struct rb_root fs_servers; /* afs_server (by server UUID) */ seqlock_t fs_lock; /* For fs_servers */ /* VL server list. */ rwlock_t vl_servers_lock; /* Lock on vl_servers */ struct afs_vlserver_list __rcu *vl_servers; u8 name_len; /* Length of name */ char *name; /* Cell name, case-flattened and NUL-padded */ }; /* * Volume Location server record. */ struct afs_vlserver { struct rcu_head rcu; struct afs_addr_list __rcu *addresses; /* List of addresses for this VL server */ unsigned long flags; #define AFS_VLSERVER_FL_PROBED 0 /* The VL server has been probed */ #define AFS_VLSERVER_FL_PROBING 1 /* VL server is being probed */ #define AFS_VLSERVER_FL_IS_YFS 2 /* Server is YFS not AFS */ #define AFS_VLSERVER_FL_RESPONDING 3 /* VL server is responding */ rwlock_t lock; /* Lock on addresses */ refcount_t ref; unsigned int rtt; /* Server's current RTT in uS */ unsigned int debug_id; /* Probe state */ wait_queue_head_t probe_wq; atomic_t probe_outstanding; spinlock_t probe_lock; struct { unsigned int rtt; /* Best RTT in uS (or UINT_MAX) */ u32 abort_code; short error; unsigned short flags; #define AFS_VLSERVER_PROBE_RESPONDED 0x01 /* At least once response (may be abort) */ #define AFS_VLSERVER_PROBE_IS_YFS 0x02 /* The peer appears to be YFS */ #define AFS_VLSERVER_PROBE_NOT_YFS 0x04 /* The peer appears not to be YFS */ #define AFS_VLSERVER_PROBE_LOCAL_FAILURE 0x08 /* A local failure prevented a probe */ } probe; u16 service_id; /* Service ID we're using */ u16 port; u16 name_len; /* Length of name */ char name[]; /* Server name, case-flattened */ }; /* * Weighted list of Volume Location servers. */ struct afs_vlserver_entry { u16 priority; /* Preference (as SRV) */ u16 weight; /* Weight (as SRV) */ enum dns_record_source source:8; enum dns_lookup_status status:8; struct afs_vlserver *server; }; struct afs_vlserver_list { struct rcu_head rcu; refcount_t ref; u8 nr_servers; u8 index; /* Server currently in use */ u8 preferred; /* Preferred server */ enum dns_record_source source:8; enum dns_lookup_status status:8; rwlock_t lock; struct afs_vlserver_entry servers[]; }; /* * Cached VLDB entry. * * This is pointed to by cell->vldb_entries, indexed by name. */ struct afs_vldb_entry { afs_volid_t vid[3]; /* Volume IDs for R/W, R/O and Bak volumes */ unsigned long flags; #define AFS_VLDB_HAS_RW 0 /* - R/W volume exists */ #define AFS_VLDB_HAS_RO 1 /* - R/O volume exists */ #define AFS_VLDB_HAS_BAK 2 /* - Backup volume exists */ #define AFS_VLDB_QUERY_VALID 3 /* - Record is valid */ #define AFS_VLDB_QUERY_ERROR 4 /* - VL server returned error */ uuid_t fs_server[AFS_NMAXNSERVERS]; u32 addr_version[AFS_NMAXNSERVERS]; /* Registration change counters */ u8 fs_mask[AFS_NMAXNSERVERS]; #define AFS_VOL_VTM_RW 0x01 /* R/W version of the volume is available (on this server) */ #define AFS_VOL_VTM_RO 0x02 /* R/O version of the volume is available (on this server) */ #define AFS_VOL_VTM_BAK 0x04 /* backup version of the volume is available (on this server) */ u8 vlsf_flags[AFS_NMAXNSERVERS]; short error; u8 nr_servers; /* Number of server records */ u8 name_len; u8 name[AFS_MAXVOLNAME + 1]; /* NUL-padded volume name */ }; /* * Fileserver endpoint state. The records the addresses of a fileserver's * endpoints and the state and result of a round of probing on them. This * allows the rotation algorithm to access those results without them being * erased by a subsequent round of probing. */ struct afs_endpoint_state { struct rcu_head rcu; struct afs_addr_list *addresses; /* The addresses being probed */ unsigned long responsive_set; /* Bitset of responsive endpoints */ unsigned long failed_set; /* Bitset of endpoints we failed to probe */ refcount_t ref; unsigned int server_id; /* Debug ID of server */ unsigned int probe_seq; /* Probe sequence (from server::probe_counter) */ atomic_t nr_probing; /* Number of outstanding probes */ unsigned int rtt; /* Best RTT in uS (or UINT_MAX) */ s32 abort_code; short error; unsigned long flags; #define AFS_ESTATE_RESPONDED 0 /* Set if the server responded */ #define AFS_ESTATE_SUPERSEDED 1 /* Set if this record has been superseded */ #define AFS_ESTATE_IS_YFS 2 /* Set if probe upgraded to YFS */ #define AFS_ESTATE_NOT_YFS 3 /* Set if probe didn't upgrade to YFS */ #define AFS_ESTATE_LOCAL_FAILURE 4 /* Set if there was a local failure (eg. ENOMEM) */ }; /* * Record of fileserver with which we're actively communicating. */ struct afs_server { struct rcu_head rcu; union { uuid_t uuid; /* Server ID */ struct afs_uuid _uuid; }; struct afs_cell *cell; /* Cell to which belongs (pins ref) */ struct rb_node uuid_rb; /* Link in net->fs_servers */ struct afs_server __rcu *uuid_next; /* Next server with same UUID */ struct afs_server *uuid_prev; /* Previous server with same UUID */ struct list_head probe_link; /* Link in net->fs_probe_list */ struct hlist_node addr_link; /* Link in net->fs_addresses6 */ struct hlist_node proc_link; /* Link in net->fs_proc */ struct list_head volumes; /* RCU list of afs_server_entry objects */ struct afs_server *gc_next; /* Next server in manager's list */ time64_t unuse_time; /* Time at which last unused */ unsigned long flags; #define AFS_SERVER_FL_RESPONDING 0 /* The server is responding */ #define AFS_SERVER_FL_UPDATING 1 #define AFS_SERVER_FL_NEEDS_UPDATE 2 /* Fileserver address list is out of date */ #define AFS_SERVER_FL_NOT_READY 4 /* The record is not ready for use */ #define AFS_SERVER_FL_NOT_FOUND 5 /* VL server says no such server */ #define AFS_SERVER_FL_VL_FAIL 6 /* Failed to access VL server */ #define AFS_SERVER_FL_MAY_HAVE_CB 8 /* May have callbacks on this fileserver */ #define AFS_SERVER_FL_IS_YFS 16 /* Server is YFS not AFS */ #define AFS_SERVER_FL_NO_IBULK 17 /* Fileserver doesn't support FS.InlineBulkStatus */ #define AFS_SERVER_FL_NO_RM2 18 /* Fileserver doesn't support YFS.RemoveFile2 */ #define AFS_SERVER_FL_HAS_FS64 19 /* Fileserver supports FS.{Fetch,Store}Data64 */ refcount_t ref; /* Object refcount */ atomic_t active; /* Active user count */ u32 addr_version; /* Address list version */ u16 service_id; /* Service ID we're using. */ unsigned int rtt; /* Server's current RTT in uS */ unsigned int debug_id; /* Debugging ID for traces */ /* file service access */ rwlock_t fs_lock; /* access lock */ /* Probe state */ struct afs_endpoint_state __rcu *endpoint_state; /* Latest endpoint/probe state */ unsigned long probed_at; /* Time last probe was dispatched (jiffies) */ wait_queue_head_t probe_wq; unsigned int probe_counter; /* Number of probes issued */ spinlock_t probe_lock; }; enum afs_ro_replicating { AFS_RO_NOT_REPLICATING, /* Not doing replication */ AFS_RO_REPLICATING_USE_OLD, /* Replicating; use old version */ AFS_RO_REPLICATING_USE_NEW, /* Replicating; switch to new version */ } __mode(byte); /* * Replaceable volume server list. */ struct afs_server_entry { struct afs_server *server; struct afs_volume *volume; struct list_head slink; /* Link in server->volumes */ time64_t cb_expires_at; /* Time at which volume-level callback expires */ unsigned long flags; #define AFS_SE_EXCLUDED 0 /* Set if server is to be excluded in rotation */ #define AFS_SE_VOLUME_OFFLINE 1 /* Set if volume offline notice given */ #define AFS_SE_VOLUME_BUSY 2 /* Set if volume busy notice given */ }; struct afs_server_list { struct rcu_head rcu; refcount_t usage; bool attached; /* T if attached to servers */ enum afs_ro_replicating ro_replicating; /* RW->RO update (probably) in progress */ unsigned char nr_servers; unsigned short vnovol_mask; /* Servers to be skipped due to VNOVOL */ unsigned int seq; /* Set to ->servers_seq when installed */ rwlock_t lock; struct afs_server_entry servers[]; }; /* * Live AFS volume management. */ struct afs_volume { struct rcu_head rcu; afs_volid_t vid; /* The volume ID of this volume */ afs_volid_t vids[AFS_MAXTYPES]; /* All associated volume IDs */ refcount_t ref; time64_t update_at; /* Time at which to next update */ struct afs_cell *cell; /* Cell to which belongs (pins ref) */ struct rb_node cell_node; /* Link in cell->volumes */ struct hlist_node proc_link; /* Link in cell->proc_volumes */ struct super_block __rcu *sb; /* Superblock on which inodes reside */ struct work_struct destructor; /* Deferred destructor */ unsigned long flags; #define AFS_VOLUME_NEEDS_UPDATE 0 /* - T if an update needs performing */ #define AFS_VOLUME_UPDATING 1 /* - T if an update is in progress */ #define AFS_VOLUME_WAIT 2 /* - T if users must wait for update */ #define AFS_VOLUME_DELETED 3 /* - T if volume appears deleted */ #define AFS_VOLUME_MAYBE_NO_IBULK 4 /* - T if some servers don't have InlineBulkStatus */ #define AFS_VOLUME_RM_TREE 5 /* - Set if volume removed from cell->volumes */ #ifdef CONFIG_AFS_FSCACHE struct fscache_volume *cache; /* Caching cookie */ #endif struct afs_server_list __rcu *servers; /* List of servers on which volume resides */ rwlock_t servers_lock; /* Lock for ->servers */ unsigned int servers_seq; /* Incremented each time ->servers changes */ /* RO release tracking */ struct mutex volsync_lock; /* Time/state evaluation lock */ time64_t creation_time; /* Volume creation time (or TIME64_MIN) */ time64_t update_time; /* Volume update time (or TIME64_MIN) */ /* Callback management */ struct mutex cb_check_lock; /* Lock to control race to check after v_break */ time64_t cb_expires_at; /* Earliest volume callback expiry time */ atomic_t cb_ro_snapshot; /* RO volume update-from-snapshot counter */ atomic_t cb_v_break; /* Volume-break event counter. */ atomic_t cb_v_check; /* Volume-break has-been-checked counter. */ atomic_t cb_scrub; /* Scrub-all-data event counter. */ rwlock_t cb_v_break_lock; struct rw_semaphore open_mmaps_lock; struct list_head open_mmaps; /* List of vnodes that are mmapped */ afs_voltype_t type; /* type of volume */ char type_force; /* force volume type (suppress R/O -> R/W) */ u8 name_len; u8 name[AFS_MAXVOLNAME + 1]; /* NUL-padded volume name */ }; enum afs_lock_state { AFS_VNODE_LOCK_NONE, /* The vnode has no lock on the server */ AFS_VNODE_LOCK_WAITING_FOR_CB, /* We're waiting for the server to break the callback */ AFS_VNODE_LOCK_SETTING, /* We're asking the server for a lock */ AFS_VNODE_LOCK_GRANTED, /* We have a lock on the server */ AFS_VNODE_LOCK_EXTENDING, /* We're extending a lock on the server */ AFS_VNODE_LOCK_NEED_UNLOCK, /* We need to unlock on the server */ AFS_VNODE_LOCK_UNLOCKING, /* We're telling the server to unlock */ AFS_VNODE_LOCK_DELETED, /* The vnode has been deleted whilst we have a lock */ }; /* * AFS inode private data. * * Note that afs_alloc_inode() *must* reset anything that could incorrectly * leak from one inode to another. */ struct afs_vnode { struct netfs_inode netfs; /* Netfslib context and vfs inode */ struct afs_volume *volume; /* volume on which vnode resides */ struct afs_fid fid; /* the file identifier for this inode */ struct afs_file_status status; /* AFS status info for this file */ afs_dataversion_t invalid_before; /* Child dentries are invalid before this */ struct afs_permits __rcu *permit_cache; /* cache of permits so far obtained */ struct list_head io_lock_waiters; /* Threads waiting for the I/O lock */ struct rw_semaphore validate_lock; /* lock for validating this vnode */ struct rw_semaphore rmdir_lock; /* Lock for rmdir vs sillyrename */ struct key *silly_key; /* Silly rename key */ spinlock_t wb_lock; /* lock for wb_keys */ spinlock_t lock; /* waitqueue/flags lock */ unsigned long flags; #define AFS_VNODE_IO_LOCK 0 /* Set if the I/O serialisation lock is held */ #define AFS_VNODE_UNSET 1 /* set if vnode attributes not yet set */ #define AFS_VNODE_DIR_VALID 2 /* Set if dir contents are valid */ #define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */ #define AFS_VNODE_DELETED 4 /* set if vnode deleted on server */ #define AFS_VNODE_MOUNTPOINT 5 /* set if vnode is a mountpoint symlink */ #define AFS_VNODE_AUTOCELL 6 /* set if Vnode is an auto mount point */ #define AFS_VNODE_PSEUDODIR 7 /* set if Vnode is a pseudo directory */ #define AFS_VNODE_NEW_CONTENT 8 /* Set if file has new content (create/trunc-0) */ #define AFS_VNODE_SILLY_DELETED 9 /* Set if file has been silly-deleted */ #define AFS_VNODE_MODIFYING 10 /* Set if we're performing a modification op */ #define AFS_VNODE_DIR_READ 11 /* Set if we've read a dir's contents */ struct folio_queue *directory; /* Directory contents */ struct list_head wb_keys; /* List of keys available for writeback */ struct list_head pending_locks; /* locks waiting to be granted */ struct list_head granted_locks; /* locks granted on this file */ struct delayed_work lock_work; /* work to be done in locking */ struct key *lock_key; /* Key to be used in lock ops */ ktime_t locked_at; /* Time at which lock obtained */ enum afs_lock_state lock_state : 8; afs_lock_type_t lock_type : 8; unsigned int directory_size; /* Amount of space in ->directory */ /* outstanding callback notification on this file */ struct work_struct cb_work; /* Work for mmap'd files */ struct list_head cb_mmap_link; /* Link in cell->fs_open_mmaps */ void *cb_server; /* Server with callback/filelock */ atomic_t cb_nr_mmap; /* Number of mmaps */ unsigned int cb_ro_snapshot; /* RO volume release counter on ->volume */ unsigned int cb_scrub; /* Scrub counter on ->volume */ unsigned int cb_break; /* Break counter on vnode */ unsigned int cb_v_check; /* Break check counter on ->volume */ seqlock_t cb_lock; /* Lock for ->cb_server, ->status, ->cb_*break */ atomic64_t cb_expires_at; /* time at which callback expires */ #define AFS_NO_CB_PROMISE TIME64_MIN }; static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode) { #ifdef CONFIG_AFS_FSCACHE return netfs_i_cookie(&vnode->netfs); #else return NULL; #endif } static inline void afs_vnode_set_cache(struct afs_vnode *vnode, struct fscache_cookie *cookie) { #ifdef CONFIG_AFS_FSCACHE vnode->netfs.cache = cookie; if (cookie) mapping_set_release_always(vnode->netfs.inode.i_mapping); #endif } /* * cached security record for one user's attempt to access a vnode */ struct afs_permit { struct key *key; /* RxRPC ticket holding a security context */ afs_access_t access; /* CallerAccess value for this key */ }; /* * Immutable cache of CallerAccess records from attempts to access vnodes. * These may be shared between multiple vnodes. */ struct afs_permits { struct rcu_head rcu; struct hlist_node hash_node; /* Link in hash */ unsigned long h; /* Hash value for this permit list */ refcount_t usage; unsigned short nr_permits; /* Number of records */ bool invalidated; /* Invalidated due to key change */ struct afs_permit permits[] __counted_by(nr_permits); /* List of permits sorted by key pointer */ }; /* * Error prioritisation and accumulation. */ struct afs_error { s32 abort_code; /* Cumulative abort code */ short error; /* Cumulative error */ bool responded; /* T if server responded */ bool aborted; /* T if ->error is from an abort */ }; /* * Cursor for iterating over a set of volume location servers. */ struct afs_vl_cursor { struct afs_cell *cell; /* The cell we're querying */ struct afs_vlserver_list *server_list; /* Current server list (pins ref) */ struct afs_vlserver *server; /* Server on which this resides */ struct afs_addr_list *alist; /* Current address list (pins ref) */ struct key *key; /* Key for the server */ unsigned long untried_servers; /* Bitmask of untried servers */ unsigned long addr_tried; /* Tried addresses */ struct afs_error cumul_error; /* Cumulative error */ unsigned int debug_id; s32 call_abort_code; short call_error; /* Error from single call */ short server_index; /* Current server */ signed char addr_index; /* Current address */ unsigned short flags; #define AFS_VL_CURSOR_STOP 0x0001 /* Set to cease iteration */ #define AFS_VL_CURSOR_RETRY 0x0002 /* Set to do a retry */ #define AFS_VL_CURSOR_RETRIED 0x0004 /* Set if started a retry */ short nr_iterations; /* Number of server iterations */ bool call_responded; /* T if the current address responded */ }; /* * Fileserver state tracking for an operation. An array of these is kept, * indexed by server index. */ struct afs_server_state { /* Tracking of fileserver probe state. Other operations may interfere * by probing a fileserver when accessing other volumes. */ unsigned int probe_seq; unsigned long untried_addrs; /* Addresses we haven't tried yet */ struct wait_queue_entry probe_waiter; struct afs_endpoint_state *endpoint_state; /* Endpoint state being monitored */ }; /* * Fileserver operation methods. */ struct afs_operation_ops { void (*issue_afs_rpc)(struct afs_operation *op); void (*issue_yfs_rpc)(struct afs_operation *op); void (*success)(struct afs_operation *op); void (*aborted)(struct afs_operation *op); void (*failed)(struct afs_operation *op); void (*edit_dir)(struct afs_operation *op); void (*put)(struct afs_operation *op); }; struct afs_vnode_param { struct afs_vnode *vnode; struct afs_fid fid; /* Fid to access */ struct afs_status_cb scb; /* Returned status and callback promise */ afs_dataversion_t dv_before; /* Data version before the call */ unsigned int cb_break_before; /* cb_break before the call */ u8 dv_delta; /* Expected change in data version */ bool put_vnode:1; /* T if we have a ref on the vnode */ bool need_io_lock:1; /* T if we need the I/O lock on this */ bool update_ctime:1; /* Need to update the ctime */ bool set_size:1; /* Must update i_size */ bool op_unlinked:1; /* True if file was unlinked by op */ bool speculative:1; /* T if speculative status fetch (no vnode lock) */ bool modification:1; /* Set if the content gets modified */ }; /* * Fileserver operation wrapper, handling server and address rotation * asynchronously. May make simultaneous calls to multiple servers. */ struct afs_operation { struct afs_net *net; /* Network namespace */ struct key *key; /* Key for the cell */ const struct afs_call_type *type; /* Type of call done */ const struct afs_operation_ops *ops; /* Parameters/results for the operation */ struct afs_volume *volume; /* Volume being accessed */ struct afs_vnode_param file[2]; struct afs_vnode_param *more_files; struct afs_volsync pre_volsync; /* Volsync before op */ struct afs_volsync volsync; /* Volsync returned by op */ struct dentry *dentry; /* Dentry to be altered */ struct dentry *dentry_2; /* Second dentry to be altered */ struct timespec64 mtime; /* Modification time to record */ struct timespec64 ctime; /* Change time to set */ struct afs_error cumul_error; /* Cumulative error */ short nr_files; /* Number of entries in file[], more_files */ unsigned int debug_id; unsigned int cb_v_break; /* Volume break counter before op */ union { struct { int which; /* Which ->file[] to fetch for */ } fetch_status; struct { int reason; /* enum afs_edit_dir_reason */ mode_t mode; const char *symlink; } create; struct { bool need_rehash; } unlink; struct { struct dentry *rehash; struct dentry *tmp; bool new_negative; } rename; struct { struct netfs_io_subrequest *subreq; } fetch; struct { afs_lock_type_t type; } lock; struct { struct iov_iter *write_iter; loff_t pos; loff_t size; loff_t i_size; } store; struct { struct iattr *attr; loff_t old_i_size; } setattr; struct afs_acl *acl; struct yfs_acl *yacl; struct { struct afs_volume_status vs; struct kstatfs *buf; } volstatus; }; /* Fileserver iteration state */ struct afs_server_list *server_list; /* Current server list (pins ref) */ struct afs_server *server; /* Server we're using (ref pinned by server_list) */ struct afs_endpoint_state *estate; /* Current endpoint state (doesn't pin ref) */ struct afs_server_state *server_states; /* States of the servers involved */ struct afs_call *call; unsigned long untried_servers; /* Bitmask of untried servers */ unsigned long addr_tried; /* Tried addresses */ s32 call_abort_code; /* Abort code from single call */ short call_error; /* Error from single call */ short server_index; /* Current server */ short nr_iterations; /* Number of server iterations */ signed char addr_index; /* Current address */ bool call_responded; /* T if the current address responded */ unsigned int flags; #define AFS_OPERATION_STOP 0x0001 /* Set to cease iteration */ #define AFS_OPERATION_VBUSY 0x0002 /* Set if seen VBUSY */ #define AFS_OPERATION_VMOVED 0x0004 /* Set if seen VMOVED */ #define AFS_OPERATION_VNOVOL 0x0008 /* Set if seen VNOVOL */ #define AFS_OPERATION_CUR_ONLY 0x0010 /* Set if current server only (file lock held) */ #define AFS_OPERATION_NO_VSLEEP 0x0020 /* Set to prevent sleep on VBUSY, VOFFLINE, ... */ #define AFS_OPERATION_UNINTR 0x0040 /* Set if op is uninterruptible */ #define AFS_OPERATION_DOWNGRADE 0x0080 /* Set to retry with downgraded opcode */ #define AFS_OPERATION_LOCK_0 0x0100 /* Set if have io_lock on file[0] */ #define AFS_OPERATION_LOCK_1 0x0200 /* Set if have io_lock on file[1] */ #define AFS_OPERATION_TRIED_ALL 0x0400 /* Set if we've tried all the fileservers */ #define AFS_OPERATION_RETRY_SERVER 0x0800 /* Set if we should retry the current server */ #define AFS_OPERATION_DIR_CONFLICT 0x1000 /* Set if we detected a 3rd-party dir change */ #define AFS_OPERATION_ASYNC 0x2000 /* Set if should run asynchronously */ }; /* * Cache auxiliary data. */ struct afs_vnode_cache_aux { __be64 data_version; } __packed; static inline void afs_set_cache_aux(struct afs_vnode *vnode, struct afs_vnode_cache_aux *aux) { aux->data_version = cpu_to_be64(vnode->status.data_version); } static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int flags) { struct afs_vnode_cache_aux aux; afs_set_cache_aux(vnode, &aux); fscache_invalidate(afs_vnode_cache(vnode), &aux, i_size_read(&vnode->netfs.inode), flags); } /* * Directory iteration management. */ struct afs_dir_iter { struct afs_vnode *dvnode; union afs_xdr_dir_block *block; struct folio_queue *fq; unsigned int fpos; int fq_slot; unsigned int loop_check; u8 nr_slots; u8 bucket; unsigned int prev_entry; }; #include <trace/events/afs.h> /*****************************************************************************/ /* * addr_list.c */ struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason); extern struct afs_addr_list *afs_alloc_addrlist(unsigned int nr); extern void afs_put_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason); extern struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *, const char *, size_t, char, unsigned short, unsigned short); bool afs_addr_list_same(const struct afs_addr_list *a, const struct afs_addr_list *b); extern struct afs_vlserver_list *afs_dns_query(struct afs_cell *, time64_t *); extern int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *addr, __be32 xdr, u16 port); extern int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *addr, __be32 *xdr, u16 port); /* * addr_prefs.c */ int afs_proc_addr_prefs_write(struct file *file, char *buf, size_t size); void afs_get_address_preferences_rcu(struct afs_net *net, struct afs_addr_list *alist); void afs_get_address_preferences(struct afs_net *net, struct afs_addr_list *alist); /* * callback.c */ extern void afs_invalidate_mmap_work(struct work_struct *); extern void afs_init_callback_state(struct afs_server *); extern void __afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason); extern void afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason); extern void afs_break_callbacks(struct afs_server *, size_t, struct afs_callback_break *); static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode) { return vnode->cb_break + vnode->cb_ro_snapshot + vnode->cb_scrub; } static inline bool afs_cb_is_broken(unsigned int cb_break, const struct afs_vnode *vnode) { return cb_break != (vnode->cb_break + atomic_read(&vnode->volume->cb_ro_snapshot) + atomic_read(&vnode->volume->cb_scrub)); } /* * cell.c */ extern int afs_cell_init(struct afs_net *, const char *); extern struct afs_cell *afs_find_cell(struct afs_net *, const char *, unsigned, enum afs_cell_trace); extern struct afs_cell *afs_lookup_cell(struct afs_net *, const char *, unsigned, const char *, bool); extern struct afs_cell *afs_use_cell(struct afs_cell *, enum afs_cell_trace); extern void afs_unuse_cell(struct afs_net *, struct afs_cell *, enum afs_cell_trace); extern struct afs_cell *afs_get_cell(struct afs_cell *, enum afs_cell_trace); extern void afs_see_cell(struct afs_cell *, enum afs_cell_trace); extern void afs_put_cell(struct afs_cell *, enum afs_cell_trace); extern void afs_queue_cell(struct afs_cell *, enum afs_cell_trace); extern void afs_manage_cells(struct work_struct *); extern void afs_cells_timer(struct timer_list *); extern void __net_exit afs_cell_purge(struct afs_net *); /* * cmservice.c */ extern bool afs_cm_incoming_call(struct afs_call *); /* * dir.c */ extern const struct file_operations afs_dir_file_operations; extern const struct inode_operations afs_dir_inode_operations; extern const struct address_space_operations afs_dir_aops; extern const struct dentry_operations afs_fs_dentry_operations; ssize_t afs_read_single(struct afs_vnode *dvnode, struct file *file); ssize_t afs_read_dir(struct afs_vnode *dvnode, struct file *file) __acquires(&dvnode->validate_lock); extern void afs_d_release(struct dentry *); extern void afs_check_for_remote_deletion(struct afs_operation *); int afs_single_writepages(struct address_space *mapping, struct writeback_control *wbc); /* * dir_edit.c */ extern void afs_edit_dir_add(struct afs_vnode *, struct qstr *, struct afs_fid *, enum afs_edit_dir_reason); extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason); void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why); void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_vnode); /* * dir_search.c */ unsigned int afs_dir_hash_name(const struct qstr *name); bool afs_dir_init_iter(struct afs_dir_iter *iter, const struct qstr *name); union afs_xdr_dir_block *afs_dir_find_block(struct afs_dir_iter *iter, size_t block); int afs_dir_search_bucket(struct afs_dir_iter *iter, const struct qstr *name, struct afs_fid *_fid); int afs_dir_search(struct afs_vnode *dvnode, struct qstr *name, struct afs_fid *_fid, afs_dataversion_t *_dir_version); /* * dir_silly.c */ extern int afs_sillyrename(struct afs_vnode *, struct afs_vnode *, struct dentry *, struct key *); extern int afs_silly_iput(struct dentry *, struct inode *); /* * dynroot.c */ extern const struct inode_operations afs_dynroot_inode_operations; extern const struct dentry_operations afs_dynroot_dentry_operations; extern struct inode *afs_try_auto_mntpt(struct dentry *, struct inode *); extern int afs_dynroot_mkdir(struct afs_net *, struct afs_cell *); extern void afs_dynroot_rmdir(struct afs_net *, struct afs_cell *); extern int afs_dynroot_populate(struct super_block *); extern void afs_dynroot_depopulate(struct super_block *); /* * file.c */ extern const struct address_space_operations afs_file_aops; extern const struct inode_operations afs_file_inode_operations; extern const struct file_operations afs_file_operations; extern const struct afs_operation_ops afs_fetch_data_operation; extern const struct netfs_request_ops afs_req_ops; extern int afs_cache_wb_key(struct afs_vnode *, struct afs_file *); extern void afs_put_wb_key(struct afs_wb_key *); extern int afs_open(struct inode *, struct file *); extern int afs_release(struct inode *, struct file *); void afs_fetch_data_async_rx(struct work_struct *work); void afs_fetch_data_immediate_cancel(struct afs_call *call); /* * flock.c */ extern struct workqueue_struct *afs_lock_manager; extern void afs_lock_op_done(struct afs_call *); extern void afs_lock_work(struct work_struct *); extern void afs_lock_may_be_available(struct afs_vnode *); extern int afs_lock(struct file *, int, struct file_lock *); extern int afs_flock(struct file *, int, struct file_lock *); /* * fsclient.c */ extern void afs_fs_fetch_status(struct afs_operation *); extern void afs_fs_fetch_data(struct afs_operation *); extern void afs_fs_create_file(struct afs_operation *); extern void afs_fs_make_dir(struct afs_operation *); extern void afs_fs_remove_file(struct afs_operation *); extern void afs_fs_remove_dir(struct afs_operation *); extern void afs_fs_link(struct afs_operation *); extern void afs_fs_symlink(struct afs_operation *); extern void afs_fs_rename(struct afs_operation *); extern void afs_fs_store_data(struct afs_operation *); extern void afs_fs_setattr(struct afs_operation *); extern void afs_fs_get_volume_status(struct afs_operation *); extern void afs_fs_set_lock(struct afs_operation *); extern void afs_fs_extend_lock(struct afs_operation *); extern void afs_fs_release_lock(struct afs_operation *); int afs_fs_give_up_all_callbacks(struct afs_net *net, struct afs_server *server, struct afs_address *addr, struct key *key); bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server, struct afs_endpoint_state *estate, unsigned int addr_index, struct key *key); extern void afs_fs_inline_bulk_status(struct afs_operation *); struct afs_acl { u32 size; u8 data[] __counted_by(size); }; extern void afs_fs_fetch_acl(struct afs_operation *); extern void afs_fs_store_acl(struct afs_operation *); /* * fs_operation.c */ extern struct afs_operation *afs_alloc_operation(struct key *, struct afs_volume *); extern int afs_put_operation(struct afs_operation *); extern bool afs_begin_vnode_operation(struct afs_operation *); extern void afs_end_vnode_operation(struct afs_operation *op); extern void afs_wait_for_operation(struct afs_operation *); extern int afs_do_sync_operation(struct afs_operation *); static inline void afs_op_set_vnode(struct afs_operation *op, unsigned int n, struct afs_vnode *vnode) { op->file[n].vnode = vnode; op->file[n].need_io_lock = true; } static inline void afs_op_set_fid(struct afs_operation *op, unsigned int n, const struct afs_fid *fid) { op->file[n].fid = *fid; } /* * fs_probe.c */ struct afs_endpoint_state *afs_get_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_trace where); void afs_put_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_trace where); extern void afs_fileserver_probe_result(struct afs_call *); void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, struct afs_addr_list *new_addrs, struct key *key); int afs_wait_for_fs_probes(struct afs_operation *op, struct afs_server_state *states, bool intr); extern void afs_probe_fileserver(struct afs_net *, struct afs_server *); extern void afs_fs_probe_dispatcher(struct work_struct *); int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_state *estate, unsigned long exclude, bool is_intr); extern void afs_fs_probe_cleanup(struct afs_net *); /* * inode.c */ extern const struct afs_operation_ops afs_fetch_status_operation; void afs_init_new_symlink(struct afs_vnode *vnode, struct afs_operation *op); const char *afs_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *callback); int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen); extern void afs_vnode_commit_status(struct afs_operation *, struct afs_vnode_param *); extern int afs_fetch_status(struct afs_vnode *, struct key *, bool, afs_access_t *); extern int afs_ilookup5_test_by_fid(struct inode *, void *); extern struct inode *afs_iget_pseudo_dir(struct super_block *, bool); extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *); extern struct inode *afs_root_iget(struct super_block *, struct key *); extern int afs_getattr(struct mnt_idmap *idmap, const struct path *, struct kstat *, u32, unsigned int); extern int afs_setattr(struct mnt_idmap *idmap, struct dentry *, struct iattr *); extern void afs_evict_inode(struct inode *); extern int afs_drop_inode(struct inode *); /* * main.c */ extern struct workqueue_struct *afs_wq; extern int afs_net_id; static inline struct afs_net *afs_net(struct net *net) { return net_generic(net, afs_net_id); } static inline struct afs_net *afs_sb2net(struct super_block *sb) { return afs_net(AFS_FS_S(sb)->net_ns); } static inline struct afs_net *afs_d2net(struct dentry *dentry) { return afs_sb2net(dentry->d_sb); } static inline struct afs_net *afs_i2net(struct inode *inode) { return afs_sb2net(inode->i_sb); } static inline struct afs_net *afs_v2net(struct afs_vnode *vnode) { return afs_i2net(&vnode->netfs.inode); } static inline struct afs_net *afs_sock2net(struct sock *sk) { return net_generic(sock_net(sk), afs_net_id); } static inline void __afs_stat(atomic_t *s) { atomic_inc(s); } #define afs_stat_v(vnode, n) __afs_stat(&afs_v2net(vnode)->n) /* * misc.c */ extern int afs_abort_to_error(u32); extern void afs_prioritise_error(struct afs_error *, int, u32); static inline void afs_op_nomem(struct afs_operation *op) { op->cumul_error.error = -ENOMEM; } static inline int afs_op_error(const struct afs_operation *op) { return op->cumul_error.error; } static inline s32 afs_op_abort_code(const struct afs_operation *op) { return op->cumul_error.abort_code; } static inline int afs_op_set_error(struct afs_operation *op, int error) { return op->cumul_error.error = error; } static inline void afs_op_accumulate_error(struct afs_operation *op, int error, s32 abort_code) { afs_prioritise_error(&op->cumul_error, error, abort_code); } /* * mntpt.c */ extern const struct inode_operations afs_mntpt_inode_operations; extern const struct inode_operations afs_autocell_inode_operations; extern const struct file_operations afs_mntpt_file_operations; extern struct vfsmount *afs_d_automount(struct path *); extern void afs_mntpt_kill_timer(void); /* * proc.c */ #ifdef CONFIG_PROC_FS extern int __net_init afs_proc_init(struct afs_net *); extern void __net_exit afs_proc_cleanup(struct afs_net *); extern int afs_proc_cell_setup(struct afs_cell *); extern void afs_proc_cell_remove(struct afs_cell *); extern void afs_put_sysnames(struct afs_sysnames *); #else static inline int afs_proc_init(struct afs_net *net) { return 0; } static inline void afs_proc_cleanup(struct afs_net *net) {} static inline int afs_proc_cell_setup(struct afs_cell *cell) { return 0; } static inline void afs_proc_cell_remove(struct afs_cell *cell) {} static inline void afs_put_sysnames(struct afs_sysnames *sysnames) {} #endif /* * rotate.c */ void afs_clear_server_states(struct afs_operation *op); extern bool afs_select_fileserver(struct afs_operation *); extern void afs_dump_edestaddrreq(const struct afs_operation *); /* * rxrpc.c */ extern struct workqueue_struct *afs_async_calls; extern int __net_init afs_open_socket(struct afs_net *); extern void __net_exit afs_close_socket(struct afs_net *); extern void afs_charge_preallocation(struct work_struct *); extern void afs_put_call(struct afs_call *); void afs_deferred_put_call(struct afs_call *call); void afs_make_call(struct afs_call *call, gfp_t gfp); void afs_deliver_to_call(struct afs_call *call); void afs_wait_for_call_to_complete(struct afs_call *call); extern struct afs_call *afs_alloc_flat_call(struct afs_net *, const struct afs_call_type *, size_t, size_t); extern void afs_flat_call_destructor(struct afs_call *); extern void afs_send_empty_reply(struct afs_call *); extern void afs_send_simple_reply(struct afs_call *, const void *, size_t); extern int afs_extract_data(struct afs_call *, bool); extern int afs_protocol_error(struct afs_call *, enum afs_eproto_cause); static inline struct afs_call *afs_get_call(struct afs_call *call, enum afs_call_trace why) { int r; __refcount_inc(&call->ref, &r); trace_afs_call(call->debug_id, why, r + 1, atomic_read(&call->net->nr_outstanding_calls), __builtin_return_address(0)); return call; } static inline void afs_see_call(struct afs_call *call, enum afs_call_trace why) { int r = refcount_read(&call->ref); trace_afs_call(call->debug_id, why, r, atomic_read(&call->net->nr_outstanding_calls), __builtin_return_address(0)); } static inline void afs_make_op_call(struct afs_operation *op, struct afs_call *call, gfp_t gfp) { struct afs_addr_list *alist = op->estate->addresses; op->call = call; op->type = call->type; call->op = op; call->key = op->key; call->intr = !(op->flags & AFS_OPERATION_UNINTR); call->peer = rxrpc_kernel_get_peer(alist->addrs[op->addr_index].peer); call->service_id = op->server->service_id; afs_make_call(call, gfp); } static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t size) { call->iov_len = size; call->kvec[0].iov_base = buf; call->kvec[0].iov_len = size; iov_iter_kvec(&call->def_iter, ITER_DEST, call->kvec, 1, size); } static inline void afs_extract_to_tmp(struct afs_call *call) { call->iov_len = sizeof(call->tmp); afs_extract_begin(call, &call->tmp, sizeof(call->tmp)); } static inline void afs_extract_to_tmp64(struct afs_call *call) { call->iov_len = sizeof(call->tmp64); afs_extract_begin(call, &call->tmp64, sizeof(call->tmp64)); } static inline void afs_extract_discard(struct afs_call *call, size_t size) { call->iov_len = size; iov_iter_discard(&call->def_iter, ITER_DEST, size); } static inline void afs_extract_to_buf(struct afs_call *call, size_t size) { call->iov_len = size; afs_extract_begin(call, call->buffer, size); } static inline int afs_transfer_reply(struct afs_call *call) { return afs_extract_data(call, false); } static inline bool afs_check_call_state(struct afs_call *call, enum afs_call_state state) { return READ_ONCE(call->state) == state; } static inline bool afs_set_call_state(struct afs_call *call, enum afs_call_state from, enum afs_call_state to) { bool ok = false; spin_lock_bh(&call->state_lock); if (call->state == from) { call->state = to; trace_afs_call_state(call, from, to, 0, 0); ok = true; } spin_unlock_bh(&call->state_lock); return ok; } static inline void afs_set_call_complete(struct afs_call *call, int error, u32 remote_abort) { enum afs_call_state state; bool ok = false; spin_lock_bh(&call->state_lock); state = call->state; if (state != AFS_CALL_COMPLETE) { call->abort_code = remote_abort; call->error = error; call->state = AFS_CALL_COMPLETE; trace_afs_call_state(call, state, AFS_CALL_COMPLETE, error, remote_abort); ok = true; } spin_unlock_bh(&call->state_lock); if (ok) { trace_afs_call_done(call); /* Asynchronous calls have two refs to release - one from the alloc and * one queued with the work item - and we can't just deallocate the * call because the work item may be queued again. */ if (call->drop_ref) afs_put_call(call); } } /* * security.c */ extern void afs_put_permits(struct afs_permits *); extern void afs_clear_permits(struct afs_vnode *); extern void afs_cache_permit(struct afs_vnode *, struct key *, unsigned int, struct afs_status_cb *); extern struct key *afs_request_key(struct afs_cell *); extern struct key *afs_request_key_rcu(struct afs_cell *); extern int afs_check_permit(struct afs_vnode *, struct key *, afs_access_t *); extern int afs_permission(struct mnt_idmap *, struct inode *, int); extern void __exit afs_clean_up_permit_cache(void); /* * server.c */ extern spinlock_t afs_server_peer_lock; extern struct afs_server *afs_find_server(struct afs_net *, const struct rxrpc_peer *); extern struct afs_server *afs_find_server_by_uuid(struct afs_net *, const uuid_t *); extern struct afs_server *afs_lookup_server(struct afs_cell *, struct key *, const uuid_t *, u32); extern struct afs_server *afs_get_server(struct afs_server *, enum afs_server_trace); extern struct afs_server *afs_use_server(struct afs_server *, enum afs_server_trace); extern void afs_unuse_server(struct afs_net *, struct afs_server *, enum afs_server_trace); extern void afs_unuse_server_notime(struct afs_net *, struct afs_server *, enum afs_server_trace); extern void afs_put_server(struct afs_net *, struct afs_server *, enum afs_server_trace); extern void afs_manage_servers(struct work_struct *); extern void afs_servers_timer(struct timer_list *); extern void afs_fs_probe_timer(struct timer_list *); extern void __net_exit afs_purge_servers(struct afs_net *); bool afs_check_server_record(struct afs_operation *op, struct afs_server *server, struct key *key); static inline void afs_inc_servers_outstanding(struct afs_net *net) { atomic_inc(&net->servers_outstanding); } static inline void afs_dec_servers_outstanding(struct afs_net *net) { if (atomic_dec_and_test(&net->servers_outstanding)) wake_up_var(&net->servers_outstanding); } static inline bool afs_is_probing_server(struct afs_server *server) { return list_empty(&server->probe_link); } /* * server_list.c */ static inline struct afs_server_list *afs_get_serverlist(struct afs_server_list *slist) { refcount_inc(&slist->usage); return slist; } extern void afs_put_serverlist(struct afs_net *, struct afs_server_list *); struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume, struct key *key, struct afs_vldb_entry *vldb); extern bool afs_annotate_server_list(struct afs_server_list *, struct afs_server_list *); void afs_attach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *slist); void afs_reattach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *slist, struct afs_server_list *old); void afs_detach_volume_from_servers(struct afs_volume *volume, struct afs_server_list *slist); /* * super.c */ extern int __init afs_fs_init(void); extern void afs_fs_exit(void); /* * validation.c */ bool afs_check_validity(const struct afs_vnode *vnode); int afs_update_volume_state(struct afs_operation *op); int afs_validate(struct afs_vnode *vnode, struct key *key); /* * vlclient.c */ extern struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *, const char *, int); extern struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *, const uuid_t *); struct afs_call *afs_vl_get_capabilities(struct afs_net *net, struct afs_addr_list *alist, unsigned int addr_index, struct key *key, struct afs_vlserver *server, unsigned int server_index); extern struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *, const uuid_t *); extern char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *); /* * vl_alias.c */ extern int afs_cell_detect_alias(struct afs_cell *, struct key *); /* * vl_probe.c */ extern void afs_vlserver_probe_result(struct afs_call *); extern int afs_send_vl_probes(struct afs_net *, struct key *, struct afs_vlserver_list *); extern int afs_wait_for_vl_probes(struct afs_vlserver_list *, unsigned long); /* * vl_rotate.c */ extern bool afs_begin_vlserver_operation(struct afs_vl_cursor *, struct afs_cell *, struct key *); extern bool afs_select_vlserver(struct afs_vl_cursor *); extern bool afs_select_current_vlserver(struct afs_vl_cursor *); extern int afs_end_vlserver_operation(struct afs_vl_cursor *); /* * vlserver_list.c */ static inline struct afs_vlserver *afs_get_vlserver(struct afs_vlserver *vlserver) { refcount_inc(&vlserver->ref); return vlserver; } static inline struct afs_vlserver_list *afs_get_vlserverlist(struct afs_vlserver_list *vllist) { if (vllist) refcount_inc(&vllist->ref); return vllist; } extern struct afs_vlserver *afs_alloc_vlserver(const char *, size_t, unsigned short); extern void afs_put_vlserver(struct afs_net *, struct afs_vlserver *); extern struct afs_vlserver_list *afs_alloc_vlserver_list(unsigned int); extern void afs_put_vlserverlist(struct afs_net *, struct afs_vlserver_list *); extern struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *, const void *, size_t); /* * volume.c */ extern struct afs_volume *afs_create_volume(struct afs_fs_context *); extern int afs_activate_volume(struct afs_volume *); extern void afs_deactivate_volume(struct afs_volume *); bool afs_try_get_volume(struct afs_volume *volume, enum afs_volume_trace reason); extern struct afs_volume *afs_get_volume(struct afs_volume *, enum afs_volume_trace); void afs_put_volume(struct afs_volume *volume, enum afs_volume_trace reason); extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *); /* * write.c */ void afs_prepare_write(struct netfs_io_subrequest *subreq); void afs_issue_write(struct netfs_io_subrequest *subreq); void afs_begin_writeback(struct netfs_io_request *wreq); void afs_retry_request(struct netfs_io_request *wreq, struct netfs_io_stream *stream); extern int afs_writepages(struct address_space *, struct writeback_control *); extern int afs_fsync(struct file *, loff_t, loff_t, int); extern vm_fault_t afs_page_mkwrite(struct vm_fault *vmf); extern void afs_prune_wb_keys(struct afs_vnode *); /* * xattr.c */ extern const struct xattr_handler * const afs_xattr_handlers[]; /* * yfsclient.c */ extern void yfs_fs_fetch_data(struct afs_operation *); extern void yfs_fs_create_file(struct afs_operation *); extern void yfs_fs_make_dir(struct afs_operation *); extern void yfs_fs_remove_file2(struct afs_operation *); extern void yfs_fs_remove_file(struct afs_operation *); extern void yfs_fs_remove_dir(struct afs_operation *); extern void yfs_fs_link(struct afs_operation *); extern void yfs_fs_symlink(struct afs_operation *); extern void yfs_fs_rename(struct afs_operation *); extern void yfs_fs_store_data(struct afs_operation *); extern void yfs_fs_setattr(struct afs_operation *); extern void yfs_fs_get_volume_status(struct afs_operation *); extern void yfs_fs_set_lock(struct afs_operation *); extern void yfs_fs_extend_lock(struct afs_operation *); extern void yfs_fs_release_lock(struct afs_operation *); extern void yfs_fs_fetch_status(struct afs_operation *); extern void yfs_fs_inline_bulk_status(struct afs_operation *); struct yfs_acl { struct afs_acl *acl; /* Dir/file/symlink ACL */ struct afs_acl *vol_acl; /* Whole volume ACL */ u32 inherit_flag; /* True if ACL is inherited from parent dir */ u32 num_cleaned; /* Number of ACEs removed due to subject removal */ unsigned int flags; #define YFS_ACL_WANT_ACL 0x01 /* Set if caller wants ->acl */ #define YFS_ACL_WANT_VOL_ACL 0x02 /* Set if caller wants ->vol_acl */ }; extern void yfs_free_opaque_acl(struct yfs_acl *); extern void yfs_fs_fetch_opaque_acl(struct afs_operation *); extern void yfs_fs_store_opaque_acl2(struct afs_operation *); /* * Miscellaneous inline functions. */ static inline struct afs_vnode *AFS_FS_I(struct inode *inode) { return container_of(inode, struct afs_vnode, netfs.inode); } static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode) { return &vnode->netfs.inode; } /* * Note that a dentry got changed. We need to set d_fsdata to the data version * number derived from the result of the operation. It doesn't matter if * d_fsdata goes backwards as we'll just revalidate. */ static inline void afs_update_dentry_version(struct afs_operation *op, struct afs_vnode_param *dir_vp, struct dentry *dentry) { if (!op->cumul_error.error) dentry->d_fsdata = (void *)(unsigned long)dir_vp->scb.status.data_version; } /* * Set the file size and block count. Estimate the number of 512 bytes blocks * used, rounded up to nearest 1K for consistency with other AFS clients. */ static inline void afs_set_i_size(struct afs_vnode *vnode, u64 size) { i_size_write(&vnode->netfs.inode, size); vnode->netfs.inode.i_blocks = ((size + 1023) >> 10) << 1; } /* * Check for a conflicting operation on a directory that we just unlinked from. * If someone managed to sneak a link or an unlink in on the file we just * unlinked, we won't be able to trust nlink on an AFS file (but not YFS). */ static inline void afs_check_dir_conflict(struct afs_operation *op, struct afs_vnode_param *dvp) { if (dvp->dv_before + dvp->dv_delta != dvp->scb.status.data_version) op->flags |= AFS_OPERATION_DIR_CONFLICT; } static inline int afs_io_error(struct afs_call *call, enum afs_io_error where) { trace_afs_io_error(call->debug_id, -EIO, where); return -EIO; } static inline int afs_bad(struct afs_vnode *vnode, enum afs_file_error where) { trace_afs_file_error(vnode, -EIO, where); return -EIO; } /* * Set the callback promise on a vnode. */ static inline void afs_set_cb_promise(struct afs_vnode *vnode, time64_t expires_at, enum afs_cb_promise_trace trace) { atomic64_set(&vnode->cb_expires_at, expires_at); trace_afs_cb_promise(vnode, trace); } /* * Clear the callback promise on a vnode, returning true if it was promised. */ static inline bool afs_clear_cb_promise(struct afs_vnode *vnode, enum afs_cb_promise_trace trace) { trace_afs_cb_promise(vnode, trace); return atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE; } /* * Mark a directory as being invalid. */ static inline void afs_invalidate_dir(struct afs_vnode *dvnode, enum afs_dir_invalid_trace trace) { if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) { trace_afs_dir_invalid(dvnode, trace); afs_stat_v(dvnode, n_inval); } } /*****************************************************************************/ /* * debug tracing */ extern unsigned afs_debug; #define dbgprintk(FMT,...) \ printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__) #define kenter(FMT,...) dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__) #define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) #define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__) #if defined(__KDEBUG) #define _enter(FMT,...) kenter(FMT,##__VA_ARGS__) #define _leave(FMT,...) kleave(FMT,##__VA_ARGS__) #define _debug(FMT,...) kdebug(FMT,##__VA_ARGS__) #elif defined(CONFIG_AFS_DEBUG) #define AFS_DEBUG_KENTER 0x01 #define AFS_DEBUG_KLEAVE 0x02 #define AFS_DEBUG_KDEBUG 0x04 #define _enter(FMT,...) \ do { \ if (unlikely(afs_debug & AFS_DEBUG_KENTER)) \ kenter(FMT,##__VA_ARGS__); \ } while (0) #define _leave(FMT,...) \ do { \ if (unlikely(afs_debug & AFS_DEBUG_KLEAVE)) \ kleave(FMT,##__VA_ARGS__); \ } while (0) #define _debug(FMT,...) \ do { \ if (unlikely(afs_debug & AFS_DEBUG_KDEBUG)) \ kdebug(FMT,##__VA_ARGS__); \ } while (0) #else #define _enter(FMT,...) no_printk("==> %s("FMT")",__func__ ,##__VA_ARGS__) #define _leave(FMT,...) no_printk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) #define _debug(FMT,...) no_printk(" "FMT ,##__VA_ARGS__) #endif /* * debug assertion checking */ #if 1 // defined(__KDEBUGALL) #define ASSERT(X) \ do { \ if (unlikely(!(X))) { \ printk(KERN_ERR "\n"); \ printk(KERN_ERR "AFS: Assertion failed\n"); \ BUG(); \ } \ } while(0) #define ASSERTCMP(X, OP, Y) \ do { \ if (unlikely(!((X) OP (Y)))) { \ printk(KERN_ERR "\n"); \ printk(KERN_ERR "AFS: Assertion failed\n"); \ printk(KERN_ERR "%lu " #OP " %lu is false\n", \ (unsigned long)(X), (unsigned long)(Y)); \ printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \ (unsigned long)(X), (unsigned long)(Y)); \ BUG(); \ } \ } while(0) #define ASSERTRANGE(L, OP1, N, OP2, H) \ do { \ if (unlikely(!((L) OP1 (N)) || !((N) OP2 (H)))) { \ printk(KERN_ERR "\n"); \ printk(KERN_ERR "AFS: Assertion failed\n"); \ printk(KERN_ERR "%lu "#OP1" %lu "#OP2" %lu is false\n", \ (unsigned long)(L), (unsigned long)(N), \ (unsigned long)(H)); \ printk(KERN_ERR "0x%lx "#OP1" 0x%lx "#OP2" 0x%lx is false\n", \ (unsigned long)(L), (unsigned long)(N), \ (unsigned long)(H)); \ BUG(); \ } \ } while(0) #define ASSERTIF(C, X) \ do { \ if (unlikely((C) && !(X))) { \ printk(KERN_ERR "\n"); \ printk(KERN_ERR "AFS: Assertion failed\n"); \ BUG(); \ } \ } while(0) #define ASSERTIFCMP(C, X, OP, Y) \ do { \ if (unlikely((C) && !((X) OP (Y)))) { \ printk(KERN_ERR "\n"); \ printk(KERN_ERR "AFS: Assertion failed\n"); \ printk(KERN_ERR "%lu " #OP " %lu is false\n", \ (unsigned long)(X), (unsigned long)(Y)); \ printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \ (unsigned long)(X), (unsigned long)(Y)); \ BUG(); \ } \ } while(0) #else #define ASSERT(X) \ do { \ } while(0) #define ASSERTCMP(X, OP, Y) \ do { \ } while(0) #define ASSERTRANGE(L, OP1, N, OP2, H) \ do { \ } while(0) #define ASSERTIF(C, X) \ do { \ } while(0) #define ASSERTIFCMP(C, X, OP, Y) \ do { \ } while(0) #endif /* __KDEBUGALL */
3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 /* SPDX-License-Identifier: GPL-2.0 */ /* * Device core Trace Support * Copyright (C) 2021, Intel Corporation * * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com> */ #undef TRACE_SYSTEM #define TRACE_SYSTEM dev #if !defined(__DEV_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) #define __DEV_TRACE_H #include <linux/device.h> #include <linux/tracepoint.h> #include <linux/types.h> DECLARE_EVENT_CLASS(devres, TP_PROTO(struct device *dev, const char *op, void *node, const char *name, size_t size), TP_ARGS(dev, op, node, name, size), TP_STRUCT__entry( __string(devname, dev_name(dev)) __field(struct device *, dev) __field(const char *, op) __field(void *, node) __string(name, name) __field(size_t, size) ), TP_fast_assign( __assign_str(devname); __entry->op = op; __entry->node = node; __assign_str(name); __entry->size = size; ), TP_printk("%s %3s %p %s (%zu bytes)", __get_str(devname), __entry->op, __entry->node, __get_str(name), __entry->size) ); DEFINE_EVENT(devres, devres_log, TP_PROTO(struct device *dev, const char *op, void *node, const char *name, size_t size), TP_ARGS(dev, op, node, name, size) ); #endif /* __DEV_TRACE_H */ /* this part has to be here */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace #include <trace/define_trace.h>
3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 259 260 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 // SPDX-License-Identifier: GPL-2.0 /* * drivers/base/devres.c - device resource management * * Copyright (c) 2006 SUSE Linux Products GmbH * Copyright (c) 2006 Tejun Heo <teheo@suse.de> */ #include <linux/device.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/percpu.h> #include <asm/sections.h> #include "base.h" #include "trace.h" struct devres_node { struct list_head entry; dr_release_t release; const char *name; size_t size; }; struct devres { struct devres_node node; /* * Some archs want to perform DMA into kmalloc caches * and need a guaranteed alignment larger than * the alignment of a 64-bit integer. * Thus we use ARCH_DMA_MINALIGN for data[] which will force the same * alignment for struct devres when allocated by kmalloc(). */ u8 __aligned(ARCH_DMA_MINALIGN) data[]; }; struct devres_group { struct devres_node node[2]; void *id; int color; /* -- 8 pointers */ }; static void set_node_dbginfo(struct devres_node *node, const char *name, size_t size) { node->name = name; node->size = size; } #ifdef CONFIG_DEBUG_DEVRES static int log_devres = 0; module_param_named(log, log_devres, int, S_IRUGO | S_IWUSR); static void devres_dbg(struct device *dev, struct devres_node *node, const char *op) { if (unlikely(log_devres)) dev_err(dev, "DEVRES %3s %p %s (%zu bytes)\n", op, node, node->name, node->size); } #else /* CONFIG_DEBUG_DEVRES */ #define devres_dbg(dev, node, op) do {} while (0) #endif /* CONFIG_DEBUG_DEVRES */ static void devres_log(struct device *dev, struct devres_node *node, const char *op) { trace_devres_log(dev, op, node, node->name, node->size); devres_dbg(dev, node, op); } /* * Release functions for devres group. These callbacks are used only * for identification. */ static void group_open_release(struct device *dev, void *res) { /* noop */ } static void group_close_release(struct device *dev, void *res) { /* noop */ } static struct devres_group *node_to_group(struct devres_node *node) { if (node->release == &group_open_release) return container_of(node, struct devres_group, node[0]); if (node->release == &group_close_release) return container_of(node, struct devres_group, node[1]); return NULL; } static bool check_dr_size(size_t size, size_t *tot_size) { /* We must catch any near-SIZE_MAX cases that could overflow. */ if (unlikely(check_add_overflow(sizeof(struct devres), size, tot_size))) return false; /* Actually allocate the full kmalloc bucket size. */ *tot_size = kmalloc_size_roundup(*tot_size); return true; } static __always_inline struct devres *alloc_dr(dr_release_t release, size_t size, gfp_t gfp, int nid) { size_t tot_size; struct devres *dr; if (!check_dr_size(size, &tot_size)) return NULL; dr = kmalloc_node_track_caller(tot_size, gfp, nid); if (unlikely(!dr)) return NULL; /* No need to clear memory twice */ if (!(gfp & __GFP_ZERO)) memset(dr, 0, offsetof(struct devres, data)); INIT_LIST_HEAD(&dr->node.entry); dr->node.release = release; return dr; } static void add_dr(struct device *dev, struct devres_node *node) { devres_log(dev, node, "ADD"); BUG_ON(!list_empty(&node->entry)); list_add_tail(&node->entry, &dev->devres_head); } static void replace_dr(struct device *dev, struct devres_node *old, struct devres_node *new) { devres_log(dev, old, "REPLACE"); BUG_ON(!list_empty(&new->entry)); list_replace(&old->entry, &new->entry); } /** * __devres_alloc_node - Allocate device resource data * @release: Release function devres will be associated with * @size: Allocation size * @gfp: Allocation flags * @nid: NUMA node * @name: Name of the resource * * Allocate devres of @size bytes. The allocated area is zeroed, then * associated with @release. The returned pointer can be passed to * other devres_*() functions. * * RETURNS: * Pointer to allocated devres on success, NULL on failure. */ void *__devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp, int nid, const char *name) { struct devres *dr; dr = alloc_dr(release, size, gfp | __GFP_ZERO, nid); if (unlikely(!dr)) return NULL; set_node_dbginfo(&dr->node, name, size); return dr->data; } EXPORT_SYMBOL_GPL(__devres_alloc_node); /** * devres_for_each_res - Resource iterator * @dev: Device to iterate resource from * @release: Look for resources associated with this release function * @match: Match function (optional) * @match_data: Data for the match function * @fn: Function to be called for each matched resource. * @data: Data for @fn, the 3rd parameter of @fn * * Call @fn for each devres of @dev which is associated with @release * and for which @match returns 1. * * RETURNS: * void */ void devres_for_each_res(struct device *dev, dr_release_t release, dr_match_t match, void *match_data, void (*fn)(struct device *, void *, void *), void *data) { struct devres_node *node; struct devres_node *tmp; unsigned long flags; if (!fn) return; spin_lock_irqsave(&dev->devres_lock, flags); list_for_each_entry_safe_reverse(node, tmp, &dev->devres_head, entry) { struct devres *dr = container_of(node, struct devres, node); if (node->release != release) continue; if (match && !match(dev, dr->data, match_data)) continue; fn(dev, dr->data, data); } spin_unlock_irqrestore(&dev->devres_lock, flags); } EXPORT_SYMBOL_GPL(devres_for_each_res); /** * devres_free - Free device resource data * @res: Pointer to devres data to free * * Free devres created with devres_alloc(). */ void devres_free(void *res) { if (res) { struct devres *dr = container_of(res, struct devres, data); BUG_ON(!list_empty(&dr->node.entry)); kfree(dr); } } EXPORT_SYMBOL_GPL(devres_free); /** * devres_add - Register device resource * @dev: Device to add resource to * @res: Resource to register * * Register devres @res to @dev. @res should have been allocated * using devres_alloc(). On driver detach, the associated release * function will be invoked and devres will be freed automatically. */ void devres_add(struct device *dev, void *res) { struct devres *dr = container_of(res, struct devres, data); unsigned long flags; spin_lock_irqsave(&dev->devres_lock, flags); add_dr(dev, &dr->node); spin_unlock_irqrestore(&dev->devres_lock, flags); } EXPORT_SYMBOL_GPL(devres_add); static struct devres *find_dr(struct device *dev, dr_release_t release, dr_match_t match, void *match_data) { struct devres_node *node; list_for_each_entry_reverse(node, &dev->devres_head, entry) { struct devres *dr = container_of(node, struct devres, node); if (node->release != release) continue; if (match && !match(dev, dr->data, match_data)) continue; return dr; } return NULL; } /** * devres_find - Find device resource * @dev: Device to lookup resource from * @release: Look for resources associated with this release function * @match: Match function (optional) * @match_data: Data for the match function * * Find the latest devres of @dev which is associated with @release * and for which @match returns 1. If @match is NULL, it's considered * to match all. * * RETURNS: * Pointer to found devres, NULL if not found. */ void *devres_find(struct device *dev, dr_release_t release, dr_match_t match, void *match_data) { struct devres *dr; unsigned long flags; spin_lock_irqsave(&dev->devres_lock, flags); dr = find_dr(dev, release, match, match_data); spin_unlock_irqrestore(&dev->devres_lock, flags); if (dr) return dr->data; return NULL; } EXPORT_SYMBOL_GPL(devres_find); /** * devres_get - Find devres, if non-existent, add one atomically * @dev: Device to lookup or add devres for * @new_res: Pointer to new initialized devres to add if not found * @match: Match function (optional) * @match_data: Data for the match function * * Find the latest devres of @dev which has the same release function * as @new_res and for which @match return 1. If found, @new_res is * freed; otherwise, @new_res is added atomically. * * RETURNS: * Pointer to found or added devres. */ void *devres_get(struct device *dev, void *new_res, dr_match_t match, void *match_data) { struct devres *new_dr = container_of(new_res, struct devres, data); struct devres *dr; unsigned long flags; spin_lock_irqsave(&dev->devres_lock, flags); dr = find_dr(dev, new_dr->node.release, match, match_data); if (!dr) { add_dr(dev, &new_dr->node); dr = new_dr; new_res = NULL; } spin_unlock_irqrestore(&dev->devres_lock, flags); devres_free(new_res); return dr->data; } EXPORT_SYMBOL_GPL(devres_get); /** * devres_remove - Find a device resource and remove it * @dev: Device to find resource from * @release: Look for resources associated with this release function * @match: Match function (optional) * @match_data: Data for the match function * * Find the latest devres of @dev associated with @release and for * which @match returns 1. If @match is NULL, it's considered to * match all. If found, the resource is removed atomically and * returned. * * RETURNS: * Pointer to removed devres on success, NULL if not found. */ void *devres_remove(struct device *dev, dr_release_t release, dr_match_t match, void *match_data) { struct devres *dr; unsigned long flags; spin_lock_irqsave(&dev->devres_lock, flags); dr = find_dr(dev, release, match, match_data); if (dr) { list_del_init(&dr->node.entry); devres_log(dev, &dr->node, "REM"); } spin_unlock_irqrestore(&dev->devres_lock, flags); if (dr) return dr->data; return NULL; } EXPORT_SYMBOL_GPL(devres_remove); /** * devres_destroy - Find a device resource and destroy it * @dev: Device to find resource from * @release: Look for resources associated with this release function * @match: Match function (optional) * @match_data: Data for the match function * * Find the latest devres of @dev associated with @release and for * which @match returns 1. If @match is NULL, it's considered to * match all. If found, the resource is removed atomically and freed. * * Note that the release function for the resource will not be called, * only the devres-allocated data will be freed. The caller becomes * responsible for freeing any other data. * * RETURNS: * 0 if devres is found and freed, -ENOENT if not found. */ int devres_destroy(struct device *dev, dr_release_t release, dr_match_t match, void *match_data) { void *res; res = devres_remove(dev, release, match, match_data); if (unlikely(!res)) return -ENOENT; devres_free(res); return 0; } EXPORT_SYMBOL_GPL(devres_destroy); /** * devres_release - Find a device resource and destroy it, calling release * @dev: Device to find resource from * @release: Look for resources associated with this release function * @match: Match function (optional) * @match_data: Data for the match function * * Find the latest devres of @dev associated with @release and for * which @match returns 1. If @match is NULL, it's considered to * match all. If found, the resource is removed atomically, the * release function called and the resource freed. * * RETURNS: * 0 if devres is found and freed, -ENOENT if not found. */ int devres_release(struct device *dev, dr_release_t release, dr_match_t match, void *match_data) { void *res; res = devres_remove(dev, release, match, match_data); if (unlikely(!res)) return -ENOENT; (*release)(dev, res); devres_free(res); return 0; } EXPORT_SYMBOL_GPL(devres_release); static int remove_nodes(struct device *dev, struct list_head *first, struct list_head *end, struct list_head *todo) { struct devres_node *node, *n; int cnt = 0, nr_groups = 0; /* First pass - move normal devres entries to @todo and clear * devres_group colors. */ node = list_entry(first, struct devres_node, entry); list_for_each_entry_safe_from(node, n, end, entry) { struct devres_group *grp; grp = node_to_group(node); if (grp) { /* clear color of group markers in the first pass */ grp->color = 0; nr_groups++; } else { /* regular devres entry */ if (&node->entry == first) first = first->next; list_move_tail(&node->entry, todo); cnt++; } } if (!nr_groups) return cnt; /* Second pass - Scan groups and color them. A group gets * color value of two iff the group is wholly contained in * [current node, end). That is, for a closed group, both opening * and closing markers should be in the range, while just the * opening marker is enough for an open group. */ node = list_entry(first, struct devres_node, entry); list_for_each_entry_safe_from(node, n, end, entry) { struct devres_group *grp; grp = node_to_group(node); BUG_ON(!grp || list_empty(&grp->node[0].entry)); grp->color++; if (list_empty(&grp->node[1].entry)) grp->color++; BUG_ON(grp->color <= 0 || grp->color > 2); if (grp->color == 2) { /* No need to update current node or end. The removed * nodes are always before both. */ list_move_tail(&grp->node[0].entry, todo); list_del_init(&grp->node[1].entry); } } return cnt; } static void release_nodes(struct device *dev, struct list_head *todo) { struct devres *dr, *tmp; /* Release. Note that both devres and devres_group are * handled as devres in the following loop. This is safe. */ list_for_each_entry_safe_reverse(dr, tmp, todo, node.entry) { devres_log(dev, &dr->node, "REL"); dr->node.release(dev, dr->data); kfree(dr); } } /** * devres_release_all - Release all managed resources * @dev: Device to release resources for * * Release all resources associated with @dev. This function is * called on driver detach. */ int devres_release_all(struct device *dev) { unsigned long flags; LIST_HEAD(todo); int cnt; /* Looks like an uninitialized device structure */ if (WARN_ON(dev->devres_head.next == NULL)) return -ENODEV; /* Nothing to release if list is empty */ if (list_empty(&dev->devres_head)) return 0; spin_lock_irqsave(&dev->devres_lock, flags); cnt = remove_nodes(dev, dev->devres_head.next, &dev->devres_head, &todo); spin_unlock_irqrestore(&dev->devres_lock, flags); release_nodes(dev, &todo); return cnt; } /** * devres_open_group - Open a new devres group * @dev: Device to open devres group for * @id: Separator ID * @gfp: Allocation flags * * Open a new devres group for @dev with @id. For @id, using a * pointer to an object which won't be used for another group is * recommended. If @id is NULL, address-wise unique ID is created. * * RETURNS: * ID of the new group, NULL on failure. */ void *devres_open_group(struct device *dev, void *id, gfp_t gfp) { struct devres_group *grp; unsigned long flags; grp = kmalloc(sizeof(*grp), gfp); if (unlikely(!grp)) return NULL; grp->node[0].release = &group_open_release; grp->node[1].release = &group_close_release; INIT_LIST_HEAD(&grp->node[0].entry); INIT_LIST_HEAD(&grp->node[1].entry); set_node_dbginfo(&grp->node[0], "grp<", 0); set_node_dbginfo(&grp->node[1], "grp>", 0); grp->id = grp; if (id) grp->id = id; grp->color = 0; spin_lock_irqsave(&dev->devres_lock, flags); add_dr(dev, &grp->node[0]); spin_unlock_irqrestore(&dev->devres_lock, flags); return grp->id; } EXPORT_SYMBOL_GPL(devres_open_group); /* Find devres group with ID @id. If @id is NULL, look for the latest. */ static struct devres_group *find_group(struct device *dev, void *id) { struct devres_node *node; list_for_each_entry_reverse(node, &dev->devres_head, entry) { struct devres_group *grp; if (node->release != &group_open_release) continue; grp = container_of(node, struct devres_group, node[0]); if (id) { if (grp->id == id) return grp; } else if (list_empty(&grp->node[1].entry)) return grp; } return NULL; } /** * devres_close_group - Close a devres group * @dev: Device to close devres group for * @id: ID of target group, can be NULL * * Close the group identified by @id. If @id is NULL, the latest open * group is selected. */ void devres_close_group(struct device *dev, void *id) { struct devres_group *grp; unsigned long flags; spin_lock_irqsave(&dev->devres_lock, flags); grp = find_group(dev, id); if (grp) add_dr(dev, &grp->node[1]); else WARN_ON(1); spin_unlock_irqrestore(&dev->devres_lock, flags); } EXPORT_SYMBOL_GPL(devres_close_group); /** * devres_remove_group - Remove a devres group * @dev: Device to remove group for * @id: ID of target group, can be NULL * * Remove the group identified by @id. If @id is NULL, the latest * open group is selected. Note that removing a group doesn't affect * any other resources. */ void devres_remove_group(struct device *dev, void *id) { struct devres_group *grp; unsigned long flags; spin_lock_irqsave(&dev->devres_lock, flags); grp = find_group(dev, id); if (grp) { list_del_init(&grp->node[0].entry); list_del_init(&grp->node[1].entry); devres_log(dev, &grp->node[0], "REM"); } else WARN_ON(1); spin_unlock_irqrestore(&dev->devres_lock, flags); kfree(grp); } EXPORT_SYMBOL_GPL(devres_remove_group); /** * devres_release_group - Release resources in a devres group * @dev: Device to release group for * @id: ID of target group, can be NULL * * Release all resources in the group identified by @id. If @id is * NULL, the latest open group is selected. The selected group and * groups properly nested inside the selected group are removed. * * RETURNS: * The number of released non-group resources. */ int devres_release_group(struct device *dev, void *id) { struct devres_group *grp; unsigned long flags; LIST_HEAD(todo); int cnt = 0; spin_lock_irqsave(&dev->devres_lock, flags); grp = find_group(dev, id); if (grp) { struct list_head *first = &grp->node[0].entry; struct list_head *end = &dev->devres_head; if (!list_empty(&grp->node[1].entry)) end = grp->node[1].entry.next; cnt = remove_nodes(dev, first, end, &todo); spin_unlock_irqrestore(&dev->devres_lock, flags); release_nodes(dev, &todo); } else { WARN_ON(1); spin_unlock_irqrestore(&dev->devres_lock, flags); } return cnt; } EXPORT_SYMBOL_GPL(devres_release_group); /* * Custom devres actions allow inserting a simple function call * into the teardown sequence. */ struct action_devres { void *data; void (*action)(void *); }; static int devm_action_match(struct device *dev, void *res, void *p) { struct action_devres *devres = res; struct action_devres *target = p; return devres->action == target->action && devres->data == target->data; } static void devm_action_release(struct device *dev, void *res) { struct action_devres *devres = res; devres->action(devres->data); } /** * __devm_add_action() - add a custom action to list of managed resources * @dev: Device that owns the action * @action: Function that should be called * @data: Pointer to data passed to @action implementation * @name: Name of the resource (for debugging purposes) * * This adds a custom action to the list of managed resources so that * it gets executed as part of standard resource unwinding. */ int __devm_add_action(struct device *dev, void (*action)(void *), void *data, const char *name) { struct action_devres *devres; devres = __devres_alloc_node(devm_action_release, sizeof(struct action_devres), GFP_KERNEL, NUMA_NO_NODE, name); if (!devres) return -ENOMEM; devres->data = data; devres->action = action; devres_add(dev, devres); return 0; } EXPORT_SYMBOL_GPL(__devm_add_action); /** * devm_remove_action_nowarn() - removes previously added custom action * @dev: Device that owns the action * @action: Function implementing the action * @data: Pointer to data passed to @action implementation * * Removes instance of @action previously added by devm_add_action(). * Both action and data should match one of the existing entries. * * In contrast to devm_remove_action(), this function does not WARN() if no * entry could have been found. * * This should only be used if the action is contained in an object with * independent lifetime management, e.g. the Devres rust abstraction. * * Causing the warning from regular driver code most likely indicates an abuse * of the devres API. * * Returns: 0 on success, -ENOENT if no entry could have been found. */ int devm_remove_action_nowarn(struct device *dev, void (*action)(void *), void *data) { struct action_devres devres = { .data = data, .action = action, }; return devres_destroy(dev, devm_action_release, devm_action_match, &devres); } EXPORT_SYMBOL_GPL(devm_remove_action_nowarn); /** * devm_release_action() - release previously added custom action * @dev: Device that owns the action * @action: Function implementing the action * @data: Pointer to data passed to @action implementation * * Releases and removes instance of @action previously added by * devm_add_action(). Both action and data should match one of the * existing entries. */ void devm_release_action(struct device *dev, void (*action)(void *), void *data) { struct action_devres devres = { .data = data, .action = action, }; WARN_ON(devres_release(dev, devm_action_release, devm_action_match, &devres)); } EXPORT_SYMBOL_GPL(devm_release_action); /* * Managed kmalloc/kfree */ static void devm_kmalloc_release(struct device *dev, void *res) { /* noop */ } static int devm_kmalloc_match(struct device *dev, void *res, void *data) { return res == data; } /** * devm_kmalloc - Resource-managed kmalloc * @dev: Device to allocate memory for * @size: Allocation size * @gfp: Allocation gfp flags * * Managed kmalloc. Memory allocated with this function is * automatically freed on driver detach. Like all other devres * resources, guaranteed alignment is unsigned long long. * * RETURNS: * Pointer to allocated memory on success, NULL on failure. */ void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp) { struct devres *dr; if (unlikely(!size)) return ZERO_SIZE_PTR; /* use raw alloc_dr for kmalloc caller tracing */ dr = alloc_dr(devm_kmalloc_release, size, gfp, dev_to_node(dev)); if (unlikely(!dr)) return NULL; /* * This is named devm_kzalloc_release for historical reasons * The initial implementation did not support kmalloc, only kzalloc */ set_node_dbginfo(&dr->node, "devm_kzalloc_release", size); devres_add(dev, dr->data); return dr->data; } EXPORT_SYMBOL_GPL(devm_kmalloc); /** * devm_krealloc - Resource-managed krealloc() * @dev: Device to re-allocate memory for * @ptr: Pointer to the memory chunk to re-allocate * @new_size: New allocation size * @gfp: Allocation gfp flags * * Managed krealloc(). Resizes the memory chunk allocated with devm_kmalloc(). * Behaves similarly to regular krealloc(): if @ptr is NULL or ZERO_SIZE_PTR, * it's the equivalent of devm_kmalloc(). If new_size is zero, it frees the * previously allocated memory and returns ZERO_SIZE_PTR. This function doesn't * change the order in which the release callback for the re-alloc'ed devres * will be called (except when falling back to devm_kmalloc() or when freeing * resources when new_size is zero). The contents of the memory are preserved * up to the lesser of new and old sizes. */ void *devm_krealloc(struct device *dev, void *ptr, size_t new_size, gfp_t gfp) { size_t total_new_size, total_old_size; struct devres *old_dr, *new_dr; unsigned long flags; if (unlikely(!new_size)) { devm_kfree(dev, ptr); return ZERO_SIZE_PTR; } if (unlikely(ZERO_OR_NULL_PTR(ptr))) return devm_kmalloc(dev, new_size, gfp); if (WARN_ON(is_kernel_rodata((unsigned long)ptr))) /* * We cannot reliably realloc a const string returned by * devm_kstrdup_const(). */ return NULL; if (!check_dr_size(new_size, &total_new_size)) return NULL; total_old_size = ksize(container_of(ptr, struct devres, data)); if (total_old_size == 0) { WARN(1, "Pointer doesn't point to dynamically allocated memory."); return NULL; } /* * If new size is smaller or equal to the actual number of bytes * allocated previously - just return the same pointer. */ if (total_new_size <= total_old_size) return ptr; /* * Otherwise: allocate new, larger chunk. We need to allocate before * taking the lock as most probably the caller uses GFP_KERNEL. * alloc_dr() will call check_dr_size() to reserve extra memory * for struct devres automatically, so size @new_size user request * is delivered to it directly as devm_kmalloc() does. */ new_dr = alloc_dr(devm_kmalloc_release, new_size, gfp, dev_to_node(dev)); if (!new_dr) return NULL; /* * The spinlock protects the linked list against concurrent * modifications but not the resource itself. */ spin_lock_irqsave(&dev->devres_lock, flags); old_dr = find_dr(dev, devm_kmalloc_release, devm_kmalloc_match, ptr); if (!old_dr) { spin_unlock_irqrestore(&dev->devres_lock, flags); kfree(new_dr); WARN(1, "Memory chunk not managed or managed by a different device."); return NULL; } replace_dr(dev, &old_dr->node, &new_dr->node); spin_unlock_irqrestore(&dev->devres_lock, flags); /* * We can copy the memory contents after releasing the lock as we're * no longer modifying the list links. */ memcpy(new_dr->data, old_dr->data, total_old_size - offsetof(struct devres, data)); /* * Same for releasing the old devres - it's now been removed from the * list. This is also the reason why we must not use devm_kfree() - the * links are no longer valid. */ kfree(old_dr); return new_dr->data; } EXPORT_SYMBOL_GPL(devm_krealloc); /** * devm_kstrdup - Allocate resource managed space and * copy an existing string into that. * @dev: Device to allocate memory for * @s: the string to duplicate * @gfp: the GFP mask used in the devm_kmalloc() call when * allocating memory * RETURNS: * Pointer to allocated string on success, NULL on failure. */ char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp) { size_t size; char *buf; if (!s) return NULL; size = strlen(s) + 1; buf = devm_kmalloc(dev, size, gfp); if (buf) memcpy(buf, s, size); return buf; } EXPORT_SYMBOL_GPL(devm_kstrdup); /** * devm_kstrdup_const - resource managed conditional string duplication * @dev: device for which to duplicate the string * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Strings allocated by devm_kstrdup_const will be automatically freed when * the associated device is detached. * * RETURNS: * Source string if it is in .rodata section otherwise it falls back to * devm_kstrdup. */ const char *devm_kstrdup_const(struct device *dev, const char *s, gfp_t gfp) { if (is_kernel_rodata((unsigned long)s)) return s; return devm_kstrdup(dev, s, gfp); } EXPORT_SYMBOL_GPL(devm_kstrdup_const); /** * devm_kvasprintf - Allocate resource managed space and format a string * into that. * @dev: Device to allocate memory for * @gfp: the GFP mask used in the devm_kmalloc() call when * allocating memory * @fmt: The printf()-style format string * @ap: Arguments for the format string * RETURNS: * Pointer to allocated string on success, NULL on failure. */ char *devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt, va_list ap) { unsigned int len; char *p; va_list aq; va_copy(aq, ap); len = vsnprintf(NULL, 0, fmt, aq); va_end(aq); p = devm_kmalloc(dev, len+1, gfp); if (!p) return NULL; vsnprintf(p, len+1, fmt, ap); return p; } EXPORT_SYMBOL(devm_kvasprintf); /** * devm_kasprintf - Allocate resource managed space and format a string * into that. * @dev: Device to allocate memory for * @gfp: the GFP mask used in the devm_kmalloc() call when * allocating memory * @fmt: The printf()-style format string * @...: Arguments for the format string * RETURNS: * Pointer to allocated string on success, NULL on failure. */ char *devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...) { va_list ap; char *p; va_start(ap, fmt); p = devm_kvasprintf(dev, gfp, fmt, ap); va_end(ap); return p; } EXPORT_SYMBOL_GPL(devm_kasprintf); /** * devm_kfree - Resource-managed kfree * @dev: Device this memory belongs to * @p: Memory to free * * Free memory allocated with devm_kmalloc(). */ void devm_kfree(struct device *dev, const void *p) { int rc; /* * Special cases: pointer to a string in .rodata returned by * devm_kstrdup_const() or NULL/ZERO ptr. */ if (unlikely(is_kernel_rodata((unsigned long)p) || ZERO_OR_NULL_PTR(p))) return; rc = devres_destroy(dev, devm_kmalloc_release, devm_kmalloc_match, (void *)p); WARN_ON(rc); } EXPORT_SYMBOL_GPL(devm_kfree); /** * devm_kmemdup - Resource-managed kmemdup * @dev: Device this memory belongs to * @src: Memory region to duplicate * @len: Memory region length * @gfp: GFP mask to use * * Duplicate region of a memory using resource managed kmalloc */ void *devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp) { void *p; p = devm_kmalloc(dev, len, gfp); if (p) memcpy(p, src, len); return p; } EXPORT_SYMBOL_GPL(devm_kmemdup); struct pages_devres { unsigned long addr; unsigned int order; }; static int devm_pages_match(struct device *dev, void *res, void *p) { struct pages_devres *devres = res; struct pages_devres *target = p; return devres->addr == target->addr; } static void devm_pages_release(struct device *dev, void *res) { struct pages_devres *devres = res; free_pages(devres->addr, devres->order); } /** * devm_get_free_pages - Resource-managed __get_free_pages * @dev: Device to allocate memory for * @gfp_mask: Allocation gfp flags * @order: Allocation size is (1 << order) pages * * Managed get_free_pages. Memory allocated with this function is * automatically freed on driver detach. * * RETURNS: * Address of allocated memory on success, 0 on failure. */ unsigned long devm_get_free_pages(struct device *dev, gfp_t gfp_mask, unsigned int order) { struct pages_devres *devres; unsigned long addr; addr = __get_free_pages(gfp_mask, order); if (unlikely(!addr)) return 0; devres = devres_alloc(devm_pages_release, sizeof(struct pages_devres), GFP_KERNEL); if (unlikely(!devres)) { free_pages(addr, order); return 0; } devres->addr = addr; devres->order = order; devres_add(dev, devres); return addr; } EXPORT_SYMBOL_GPL(devm_get_free_pages); /** * devm_free_pages - Resource-managed free_pages * @dev: Device this memory belongs to * @addr: Memory to free * * Free memory allocated with devm_get_free_pages(). Unlike free_pages, * there is no need to supply the @order. */ void devm_free_pages(struct device *dev, unsigned long addr) { struct pages_devres devres = { .addr = addr }; WARN_ON(devres_release(dev, devm_pages_release, devm_pages_match, &devres)); } EXPORT_SYMBOL_GPL(devm_free_pages); static void devm_percpu_release(struct device *dev, void *pdata) { void __percpu *p; p = *(void __percpu **)pdata; free_percpu(p); } static int devm_percpu_match(struct device *dev, void *data, void *p) { struct devres *devr = container_of(data, struct devres, data); return *(void **)devr->data == p; } /** * __devm_alloc_percpu - Resource-managed alloc_percpu * @dev: Device to allocate per-cpu memory for * @size: Size of per-cpu memory to allocate * @align: Alignment of per-cpu memory to allocate * * Managed alloc_percpu. Per-cpu memory allocated with this function is * automatically freed on driver detach. * * RETURNS: * Pointer to allocated memory on success, NULL on failure. */ void __percpu *__devm_alloc_percpu(struct device *dev, size_t size, size_t align) { void *p; void __percpu *pcpu; pcpu = __alloc_percpu(size, align); if (!pcpu) return NULL; p = devres_alloc(devm_percpu_release, sizeof(void *), GFP_KERNEL); if (!p) { free_percpu(pcpu); return NULL; } *(void __percpu **)p = pcpu; devres_add(dev, p); return pcpu; } EXPORT_SYMBOL_GPL(__devm_alloc_percpu); /** * devm_free_percpu - Resource-managed free_percpu * @dev: Device this memory belongs to * @pdata: Per-cpu memory to free * * Free memory allocated with devm_alloc_percpu(). */ void devm_free_percpu(struct device *dev, void __percpu *pdata) { /* * Use devres_release() to prevent memory leakage as * devm_free_pages() does. */ WARN_ON(devres_release(dev, devm_percpu_release, devm_percpu_match, (void *)(__force unsigned long)pdata)); } EXPORT_SYMBOL_GPL(devm_free_percpu);
16 18 111 6 1 5 3 3 3 1 3 3 3 3 8 8 8 8 2 7 8 8 2 2 4 1 4 4 4 4 4 4 4 4 6 1 5 4 4 4 4 2 2 1 1 1 7 7 7 5 5 5 4 4 1 1 1 2 2 5 78 1 77 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 // SPDX-License-Identifier: GPL-2.0+ /* * NILFS segment usage file. * * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. * * Written by Koji Sato. * Revised by Ryusuke Konishi. */ #include <linux/kernel.h> #include <linux/fs.h> #include <linux/string.h> #include <linux/buffer_head.h> #include <linux/errno.h> #include "mdt.h" #include "sufile.h" #include <trace/events/nilfs2.h> /** * struct nilfs_sufile_info - on-memory private data of sufile * @mi: on-memory private data of metadata file * @ncleansegs: number of clean segments * @allocmin: lower limit of allocatable segment range * @allocmax: upper limit of allocatable segment range */ struct nilfs_sufile_info { struct nilfs_mdt_info mi; unsigned long ncleansegs;/* number of clean segments */ __u64 allocmin; /* lower limit of allocatable segment range */ __u64 allocmax; /* upper limit of allocatable segment range */ }; static inline struct nilfs_sufile_info *NILFS_SUI(struct inode *sufile) { return (struct nilfs_sufile_info *)NILFS_MDT(sufile); } static inline unsigned long nilfs_sufile_segment_usages_per_block(const struct inode *sufile) { return NILFS_MDT(sufile)->mi_entries_per_block; } static unsigned long nilfs_sufile_get_blkoff(const struct inode *sufile, __u64 segnum) { __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset; t = div64_ul(t, nilfs_sufile_segment_usages_per_block(sufile)); return (unsigned long)t; } static unsigned long nilfs_sufile_get_offset(const struct inode *sufile, __u64 segnum) { __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset; return do_div(t, nilfs_sufile_segment_usages_per_block(sufile)); } static unsigned long nilfs_sufile_segment_usages_in_block(const struct inode *sufile, __u64 curr, __u64 max) { return min_t(unsigned long, nilfs_sufile_segment_usages_per_block(sufile) - nilfs_sufile_get_offset(sufile, curr), max - curr + 1); } /** * nilfs_sufile_segment_usage_offset - calculate the byte offset of a segment * usage entry in the folio containing it * @sufile: segment usage file inode * @segnum: number of segment usage * @bh: buffer head of block containing segment usage indexed by @segnum * * Return: Byte offset in the folio of the segment usage entry. */ static size_t nilfs_sufile_segment_usage_offset(const struct inode *sufile, __u64 segnum, struct buffer_head *bh) { return offset_in_folio(bh->b_folio, bh->b_data) + nilfs_sufile_get_offset(sufile, segnum) * NILFS_MDT(sufile)->mi_entry_size; } static int nilfs_sufile_get_header_block(struct inode *sufile, struct buffer_head **bhp) { int err = nilfs_mdt_get_block(sufile, 0, 0, NULL, bhp); if (unlikely(err == -ENOENT)) { nilfs_error(sufile->i_sb, "missing header block in segment usage metadata"); err = -EIO; } return err; } static inline int nilfs_sufile_get_segment_usage_block(struct inode *sufile, __u64 segnum, int create, struct buffer_head **bhp) { return nilfs_mdt_get_block(sufile, nilfs_sufile_get_blkoff(sufile, segnum), create, NULL, bhp); } static int nilfs_sufile_delete_segment_usage_block(struct inode *sufile, __u64 segnum) { return nilfs_mdt_delete_block(sufile, nilfs_sufile_get_blkoff(sufile, segnum)); } static void nilfs_sufile_mod_counter(struct buffer_head *header_bh, u64 ncleanadd, u64 ndirtyadd) { struct nilfs_sufile_header *header; header = kmap_local_folio(header_bh->b_folio, 0); le64_add_cpu(&header->sh_ncleansegs, ncleanadd); le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd); kunmap_local(header); mark_buffer_dirty(header_bh); } /** * nilfs_sufile_get_ncleansegs - return the number of clean segments * @sufile: inode of segment usage file * * Return: Number of clean segments. */ unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile) { return NILFS_SUI(sufile)->ncleansegs; } /** * nilfs_sufile_updatev - modify multiple segment usages at a time * @sufile: inode of segment usage file * @segnumv: array of segment numbers * @nsegs: size of @segnumv array * @create: creation flag * @ndone: place to store number of modified segments on @segnumv * @dofunc: primitive operation for the update * * Description: nilfs_sufile_updatev() repeatedly calls @dofunc * against the given array of segments. The @dofunc is called with * buffers of a header block and the sufile block in which the target * segment usage entry is contained. If @ndone is given, the number * of successfully modified segments from the head is stored in the * place @ndone points to. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINVAL - Invalid segment usage number * * %-EIO - I/O error (including metadata corruption). * * %-ENOENT - Given segment usage is in hole block (may be returned if * @create is zero) * * %-ENOMEM - Insufficient memory available. */ int nilfs_sufile_updatev(struct inode *sufile, __u64 *segnumv, size_t nsegs, int create, size_t *ndone, void (*dofunc)(struct inode *, __u64, struct buffer_head *, struct buffer_head *)) { struct buffer_head *header_bh, *bh; unsigned long blkoff, prev_blkoff; __u64 *seg; size_t nerr = 0, n = 0; int ret = 0; if (unlikely(nsegs == 0)) goto out; down_write(&NILFS_MDT(sufile)->mi_sem); for (seg = segnumv; seg < segnumv + nsegs; seg++) { if (unlikely(*seg >= nilfs_sufile_get_nsegments(sufile))) { nilfs_warn(sufile->i_sb, "%s: invalid segment number: %llu", __func__, (unsigned long long)*seg); nerr++; } } if (nerr > 0) { ret = -EINVAL; goto out_sem; } ret = nilfs_sufile_get_header_block(sufile, &header_bh); if (ret < 0) goto out_sem; seg = segnumv; blkoff = nilfs_sufile_get_blkoff(sufile, *seg); ret = nilfs_mdt_get_block(sufile, blkoff, create, NULL, &bh); if (ret < 0) goto out_header; for (;;) { dofunc(sufile, *seg, header_bh, bh); if (++seg >= segnumv + nsegs) break; prev_blkoff = blkoff; blkoff = nilfs_sufile_get_blkoff(sufile, *seg); if (blkoff == prev_blkoff) continue; /* get different block */ brelse(bh); ret = nilfs_mdt_get_block(sufile, blkoff, create, NULL, &bh); if (unlikely(ret < 0)) goto out_header; } brelse(bh); out_header: n = seg - segnumv; brelse(header_bh); out_sem: up_write(&NILFS_MDT(sufile)->mi_sem); out: if (ndone) *ndone = n; return ret; } int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create, void (*dofunc)(struct inode *, __u64, struct buffer_head *, struct buffer_head *)) { struct buffer_head *header_bh, *bh; int ret; if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) { nilfs_warn(sufile->i_sb, "%s: invalid segment number: %llu", __func__, (unsigned long long)segnum); return -EINVAL; } down_write(&NILFS_MDT(sufile)->mi_sem); ret = nilfs_sufile_get_header_block(sufile, &header_bh); if (ret < 0) goto out_sem; ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, create, &bh); if (!ret) { dofunc(sufile, segnum, header_bh, bh); brelse(bh); } brelse(header_bh); out_sem: up_write(&NILFS_MDT(sufile)->mi_sem); return ret; } /** * nilfs_sufile_set_alloc_range - limit range of segment to be allocated * @sufile: inode of segment usage file * @start: minimum segment number of allocatable region (inclusive) * @end: maximum segment number of allocatable region (inclusive) * * Return: 0 on success, or %-ERANGE if segment range is invalid. */ int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end) { struct nilfs_sufile_info *sui = NILFS_SUI(sufile); __u64 nsegs; int ret = -ERANGE; down_write(&NILFS_MDT(sufile)->mi_sem); nsegs = nilfs_sufile_get_nsegments(sufile); if (start <= end && end < nsegs) { sui->allocmin = start; sui->allocmax = end; ret = 0; } up_write(&NILFS_MDT(sufile)->mi_sem); return ret; } /** * nilfs_sufile_alloc - allocate a segment * @sufile: inode of segment usage file * @segnump: pointer to segment number * * Description: nilfs_sufile_alloc() allocates a clean segment, and stores * its segment number in the place pointed to by @segnump. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. * * %-ENOSPC - No clean segment left. */ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) { struct buffer_head *header_bh, *su_bh; struct nilfs_sufile_header *header; struct nilfs_segment_usage *su; struct nilfs_sufile_info *sui = NILFS_SUI(sufile); size_t susz = NILFS_MDT(sufile)->mi_entry_size; __u64 segnum, maxsegnum, last_alloc; size_t offset; void *kaddr; unsigned long nsegments, nsus, cnt; int ret, j; down_write(&NILFS_MDT(sufile)->mi_sem); ret = nilfs_sufile_get_header_block(sufile, &header_bh); if (ret < 0) goto out_sem; header = kmap_local_folio(header_bh->b_folio, 0); last_alloc = le64_to_cpu(header->sh_last_alloc); kunmap_local(header); nsegments = nilfs_sufile_get_nsegments(sufile); maxsegnum = sui->allocmax; segnum = last_alloc + 1; if (segnum < sui->allocmin || segnum > sui->allocmax) segnum = sui->allocmin; for (cnt = 0; cnt < nsegments; cnt += nsus) { if (segnum > maxsegnum) { if (cnt < sui->allocmax - sui->allocmin + 1) { /* * wrap around in the limited region. * if allocation started from * sui->allocmin, this never happens. */ segnum = sui->allocmin; maxsegnum = last_alloc; } else if (segnum > sui->allocmin && sui->allocmax + 1 < nsegments) { segnum = sui->allocmax + 1; maxsegnum = nsegments - 1; } else if (sui->allocmin > 0) { segnum = 0; maxsegnum = sui->allocmin - 1; } else { break; /* never happens */ } } trace_nilfs2_segment_usage_check(sufile, segnum, cnt); ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, &su_bh); if (ret < 0) goto out_header; offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); su = kaddr = kmap_local_folio(su_bh->b_folio, offset); nsus = nilfs_sufile_segment_usages_in_block( sufile, segnum, maxsegnum); for (j = 0; j < nsus; j++, su = (void *)su + susz, segnum++) { if (!nilfs_segment_usage_clean(su)) continue; /* found a clean segment */ nilfs_segment_usage_set_dirty(su); kunmap_local(kaddr); header = kmap_local_folio(header_bh->b_folio, 0); le64_add_cpu(&header->sh_ncleansegs, -1); le64_add_cpu(&header->sh_ndirtysegs, 1); header->sh_last_alloc = cpu_to_le64(segnum); kunmap_local(header); sui->ncleansegs--; mark_buffer_dirty(header_bh); mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); brelse(su_bh); *segnump = segnum; trace_nilfs2_segment_usage_allocated(sufile, segnum); goto out_header; } kunmap_local(kaddr); brelse(su_bh); } /* no segments left */ ret = -ENOSPC; out_header: brelse(header_bh); out_sem: up_write(&NILFS_MDT(sufile)->mi_sem); return ret; } void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum, struct buffer_head *header_bh, struct buffer_head *su_bh) { struct nilfs_segment_usage *su; size_t offset; offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); su = kmap_local_folio(su_bh->b_folio, offset); if (unlikely(!nilfs_segment_usage_clean(su))) { nilfs_warn(sufile->i_sb, "%s: segment %llu must be clean", __func__, (unsigned long long)segnum); kunmap_local(su); return; } nilfs_segment_usage_set_dirty(su); kunmap_local(su); nilfs_sufile_mod_counter(header_bh, -1, 1); NILFS_SUI(sufile)->ncleansegs--; mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); } void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum, struct buffer_head *header_bh, struct buffer_head *su_bh) { struct nilfs_segment_usage *su; size_t offset; int clean, dirty; offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); su = kmap_local_folio(su_bh->b_folio, offset); if (su->su_flags == cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY)) && su->su_nblocks == cpu_to_le32(0)) { kunmap_local(su); return; } clean = nilfs_segment_usage_clean(su); dirty = nilfs_segment_usage_dirty(su); /* make the segment garbage */ su->su_lastmod = cpu_to_le64(0); su->su_nblocks = cpu_to_le32(0); su->su_flags = cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY)); kunmap_local(su); nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1); NILFS_SUI(sufile)->ncleansegs -= clean; mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); } void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, struct buffer_head *header_bh, struct buffer_head *su_bh) { struct nilfs_segment_usage *su; size_t offset; int sudirty; offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); su = kmap_local_folio(su_bh->b_folio, offset); if (nilfs_segment_usage_clean(su)) { nilfs_warn(sufile->i_sb, "%s: segment %llu is already clean", __func__, (unsigned long long)segnum); kunmap_local(su); return; } if (unlikely(nilfs_segment_usage_error(su))) nilfs_warn(sufile->i_sb, "free segment %llu marked in error", (unsigned long long)segnum); sudirty = nilfs_segment_usage_dirty(su); if (unlikely(!sudirty)) nilfs_warn(sufile->i_sb, "free unallocated segment %llu", (unsigned long long)segnum); nilfs_segment_usage_set_clean(su); kunmap_local(su); mark_buffer_dirty(su_bh); nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0); NILFS_SUI(sufile)->ncleansegs++; nilfs_mdt_mark_dirty(sufile); trace_nilfs2_segment_usage_freed(sufile, segnum); } /** * nilfs_sufile_mark_dirty - mark the buffer having a segment usage dirty * @sufile: inode of segment usage file * @segnum: segment number * * Return: 0 on success, or a negative error code on failure. */ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) { struct buffer_head *bh; size_t offset; struct nilfs_segment_usage *su; int ret; down_write(&NILFS_MDT(sufile)->mi_sem); ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh); if (unlikely(ret)) { if (ret == -ENOENT) { nilfs_error(sufile->i_sb, "segment usage for segment %llu is unreadable due to a hole block", (unsigned long long)segnum); ret = -EIO; } goto out_sem; } offset = nilfs_sufile_segment_usage_offset(sufile, segnum, bh); su = kmap_local_folio(bh->b_folio, offset); if (unlikely(nilfs_segment_usage_error(su))) { struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; kunmap_local(su); brelse(bh); if (nilfs_segment_is_active(nilfs, segnum)) { nilfs_error(sufile->i_sb, "active segment %llu is erroneous", (unsigned long long)segnum); } else { /* * Segments marked erroneous are never allocated by * nilfs_sufile_alloc(); only active segments, ie, * the segments indexed by ns_segnum or ns_nextnum, * can be erroneous here. */ WARN_ON_ONCE(1); } ret = -EIO; } else { nilfs_segment_usage_set_dirty(su); kunmap_local(su); mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(sufile); brelse(bh); } out_sem: up_write(&NILFS_MDT(sufile)->mi_sem); return ret; } /** * nilfs_sufile_set_segment_usage - set usage of a segment * @sufile: inode of segment usage file * @segnum: segment number * @nblocks: number of live blocks in the segment * @modtime: modification time (option) * * Return: 0 on success, or a negative error code on failure. */ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, unsigned long nblocks, time64_t modtime) { struct buffer_head *bh; struct nilfs_segment_usage *su; size_t offset; int ret; down_write(&NILFS_MDT(sufile)->mi_sem); ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh); if (ret < 0) goto out_sem; offset = nilfs_sufile_segment_usage_offset(sufile, segnum, bh); su = kmap_local_folio(bh->b_folio, offset); if (modtime) { /* * Check segusage error and set su_lastmod only when updating * this entry with a valid timestamp, not for cancellation. */ WARN_ON_ONCE(nilfs_segment_usage_error(su)); su->su_lastmod = cpu_to_le64(modtime); } su->su_nblocks = cpu_to_le32(nblocks); kunmap_local(su); mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(sufile); brelse(bh); out_sem: up_write(&NILFS_MDT(sufile)->mi_sem); return ret; } /** * nilfs_sufile_get_stat - get segment usage statistics * @sufile: inode of segment usage file * @sustat: pointer to a structure of segment usage statistics * * Description: nilfs_sufile_get_stat() retrieves segment usage statistics * and stores them in the location pointed to by @sustat. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. */ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) { struct buffer_head *header_bh; struct nilfs_sufile_header *header; struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; int ret; down_read(&NILFS_MDT(sufile)->mi_sem); ret = nilfs_sufile_get_header_block(sufile, &header_bh); if (ret < 0) goto out_sem; header = kmap_local_folio(header_bh->b_folio, 0); sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile); sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs); sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs); sustat->ss_ctime = nilfs->ns_ctime; sustat->ss_nongc_ctime = nilfs->ns_nongc_ctime; spin_lock(&nilfs->ns_last_segment_lock); sustat->ss_prot_seq = nilfs->ns_prot_seq; spin_unlock(&nilfs->ns_last_segment_lock); kunmap_local(header); brelse(header_bh); out_sem: up_read(&NILFS_MDT(sufile)->mi_sem); return ret; } void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, struct buffer_head *header_bh, struct buffer_head *su_bh) { struct nilfs_segment_usage *su; size_t offset; int suclean; offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); su = kmap_local_folio(su_bh->b_folio, offset); if (nilfs_segment_usage_error(su)) { kunmap_local(su); return; } suclean = nilfs_segment_usage_clean(su); nilfs_segment_usage_set_error(su); kunmap_local(su); if (suclean) { nilfs_sufile_mod_counter(header_bh, -1, 0); NILFS_SUI(sufile)->ncleansegs--; } mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); } /** * nilfs_sufile_truncate_range - truncate range of segment array * @sufile: inode of segment usage file * @start: start segment number (inclusive) * @end: end segment number (inclusive) * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EBUSY - Dirty or active segments are present in the range. * * %-EINVAL - Invalid number of segments specified. * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. */ static int nilfs_sufile_truncate_range(struct inode *sufile, __u64 start, __u64 end) { struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; struct buffer_head *header_bh; struct buffer_head *su_bh; struct nilfs_segment_usage *su, *su2; size_t susz = NILFS_MDT(sufile)->mi_entry_size; unsigned long segusages_per_block; unsigned long nsegs, ncleaned; __u64 segnum; size_t offset; ssize_t n, nc; int ret; int j; nsegs = nilfs_sufile_get_nsegments(sufile); ret = -EINVAL; if (start > end || start >= nsegs) goto out; ret = nilfs_sufile_get_header_block(sufile, &header_bh); if (ret < 0) goto out; segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile); ncleaned = 0; for (segnum = start; segnum <= end; segnum += n) { n = min_t(unsigned long, segusages_per_block - nilfs_sufile_get_offset(sufile, segnum), end - segnum + 1); ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &su_bh); if (ret < 0) { if (ret != -ENOENT) goto out_header; /* hole */ continue; } offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); su = kmap_local_folio(su_bh->b_folio, offset); su2 = su; for (j = 0; j < n; j++, su = (void *)su + susz) { if ((le32_to_cpu(su->su_flags) & ~BIT(NILFS_SEGMENT_USAGE_ERROR)) || nilfs_segment_is_active(nilfs, segnum + j)) { ret = -EBUSY; kunmap_local(su2); brelse(su_bh); goto out_header; } } nc = 0; for (su = su2, j = 0; j < n; j++, su = (void *)su + susz) { if (nilfs_segment_usage_error(su)) { nilfs_segment_usage_set_clean(su); nc++; } } kunmap_local(su2); if (nc > 0) { mark_buffer_dirty(su_bh); ncleaned += nc; } brelse(su_bh); if (n == segusages_per_block) { /* make hole */ nilfs_sufile_delete_segment_usage_block(sufile, segnum); } } ret = 0; out_header: if (ncleaned > 0) { NILFS_SUI(sufile)->ncleansegs += ncleaned; nilfs_sufile_mod_counter(header_bh, ncleaned, 0); nilfs_mdt_mark_dirty(sufile); } brelse(header_bh); out: return ret; } /** * nilfs_sufile_resize - resize segment array * @sufile: inode of segment usage file * @newnsegs: new number of segments * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EBUSY - Dirty or active segments exist in the region to be truncated. * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. * * %-ENOSPC - Enough free space is not left for shrinking. */ int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs) { struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; struct buffer_head *header_bh; struct nilfs_sufile_header *header; struct nilfs_sufile_info *sui = NILFS_SUI(sufile); unsigned long nsegs, nrsvsegs; int ret = 0; down_write(&NILFS_MDT(sufile)->mi_sem); nsegs = nilfs_sufile_get_nsegments(sufile); if (nsegs == newnsegs) goto out; ret = -ENOSPC; nrsvsegs = nilfs_nrsvsegs(nilfs, newnsegs); if (newnsegs < nsegs && nsegs - newnsegs + nrsvsegs > sui->ncleansegs) goto out; ret = nilfs_sufile_get_header_block(sufile, &header_bh); if (ret < 0) goto out; if (newnsegs > nsegs) { sui->ncleansegs += newnsegs - nsegs; } else /* newnsegs < nsegs */ { ret = nilfs_sufile_truncate_range(sufile, newnsegs, nsegs - 1); if (ret < 0) goto out_header; sui->ncleansegs -= nsegs - newnsegs; /* * If the sufile is successfully truncated, immediately adjust * the segment allocation space while locking the semaphore * "mi_sem" so that nilfs_sufile_alloc() never allocates * segments in the truncated space. */ sui->allocmax = newnsegs - 1; sui->allocmin = 0; } header = kmap_local_folio(header_bh->b_folio, 0); header->sh_ncleansegs = cpu_to_le64(sui->ncleansegs); kunmap_local(header); mark_buffer_dirty(header_bh); nilfs_mdt_mark_dirty(sufile); nilfs_set_nsegments(nilfs, newnsegs); out_header: brelse(header_bh); out: up_write(&NILFS_MDT(sufile)->mi_sem); return ret; } /** * nilfs_sufile_get_suinfo - get segment usage information * @sufile: inode of segment usage file * @segnum: segment number to start looking * @buf: array of suinfo * @sisz: byte size of suinfo * @nsi: size of suinfo array * * Return: Count of segment usage info items stored in the output buffer on * success, or one of the following negative error codes on failure: * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. */ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, unsigned int sisz, size_t nsi) { struct buffer_head *su_bh; struct nilfs_segment_usage *su; struct nilfs_suinfo *si = buf; size_t susz = NILFS_MDT(sufile)->mi_entry_size; struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; size_t offset; void *kaddr; unsigned long nsegs, segusages_per_block; ssize_t n; int ret, i, j; down_read(&NILFS_MDT(sufile)->mi_sem); segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile); nsegs = min_t(unsigned long, nilfs_sufile_get_nsegments(sufile) - segnum, nsi); for (i = 0; i < nsegs; i += n, segnum += n) { n = min_t(unsigned long, segusages_per_block - nilfs_sufile_get_offset(sufile, segnum), nsegs - i); ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &su_bh); if (ret < 0) { if (ret != -ENOENT) goto out; /* hole */ memset(si, 0, sisz * n); si = (void *)si + sisz * n; continue; } offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); su = kaddr = kmap_local_folio(su_bh->b_folio, offset); for (j = 0; j < n; j++, su = (void *)su + susz, si = (void *)si + sisz) { si->sui_lastmod = le64_to_cpu(su->su_lastmod); si->sui_nblocks = le32_to_cpu(su->su_nblocks); si->sui_flags = le32_to_cpu(su->su_flags) & ~BIT(NILFS_SEGMENT_USAGE_ACTIVE); if (nilfs_segment_is_active(nilfs, segnum + j)) si->sui_flags |= BIT(NILFS_SEGMENT_USAGE_ACTIVE); } kunmap_local(kaddr); brelse(su_bh); } ret = nsegs; out: up_read(&NILFS_MDT(sufile)->mi_sem); return ret; } /** * nilfs_sufile_set_suinfo - sets segment usage info * @sufile: inode of segment usage file * @buf: array of suinfo_update * @supsz: byte size of suinfo_update * @nsup: size of suinfo_update array * * Description: Takes an array of nilfs_suinfo_update structs and updates * segment usage accordingly. Only the fields indicated by the sup_flags * are updated. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINVAL - Invalid values in input (segment number, flags or nblocks). * * %-EIO - I/O error (including metadata corruption). * * %-ENOMEM - Insufficient memory available. */ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, unsigned int supsz, size_t nsup) { struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; struct buffer_head *header_bh, *bh; struct nilfs_suinfo_update *sup, *supend = buf + supsz * nsup; struct nilfs_segment_usage *su; size_t offset; unsigned long blkoff, prev_blkoff; int cleansi, cleansu, dirtysi, dirtysu; long ncleaned = 0, ndirtied = 0; int ret = 0; if (unlikely(nsup == 0)) return ret; for (sup = buf; sup < supend; sup = (void *)sup + supsz) { if (sup->sup_segnum >= nilfs->ns_nsegments || (sup->sup_flags & (~0UL << __NR_NILFS_SUINFO_UPDATE_FIELDS)) || (nilfs_suinfo_update_nblocks(sup) && sup->sup_sui.sui_nblocks > nilfs->ns_blocks_per_segment)) return -EINVAL; } down_write(&NILFS_MDT(sufile)->mi_sem); ret = nilfs_sufile_get_header_block(sufile, &header_bh); if (ret < 0) goto out_sem; sup = buf; blkoff = nilfs_sufile_get_blkoff(sufile, sup->sup_segnum); ret = nilfs_mdt_get_block(sufile, blkoff, 1, NULL, &bh); if (ret < 0) goto out_header; for (;;) { offset = nilfs_sufile_segment_usage_offset( sufile, sup->sup_segnum, bh); su = kmap_local_folio(bh->b_folio, offset); if (nilfs_suinfo_update_lastmod(sup)) su->su_lastmod = cpu_to_le64(sup->sup_sui.sui_lastmod); if (nilfs_suinfo_update_nblocks(sup)) su->su_nblocks = cpu_to_le32(sup->sup_sui.sui_nblocks); if (nilfs_suinfo_update_flags(sup)) { /* * Active flag is a virtual flag projected by running * nilfs kernel code - drop it not to write it to * disk. */ sup->sup_sui.sui_flags &= ~BIT(NILFS_SEGMENT_USAGE_ACTIVE); cleansi = nilfs_suinfo_clean(&sup->sup_sui); cleansu = nilfs_segment_usage_clean(su); dirtysi = nilfs_suinfo_dirty(&sup->sup_sui); dirtysu = nilfs_segment_usage_dirty(su); if (cleansi && !cleansu) ++ncleaned; else if (!cleansi && cleansu) --ncleaned; if (dirtysi && !dirtysu) ++ndirtied; else if (!dirtysi && dirtysu) --ndirtied; su->su_flags = cpu_to_le32(sup->sup_sui.sui_flags); } kunmap_local(su); sup = (void *)sup + supsz; if (sup >= supend) break; prev_blkoff = blkoff; blkoff = nilfs_sufile_get_blkoff(sufile, sup->sup_segnum); if (blkoff == prev_blkoff) continue; /* get different block */ mark_buffer_dirty(bh); put_bh(bh); ret = nilfs_mdt_get_block(sufile, blkoff, 1, NULL, &bh); if (unlikely(ret < 0)) goto out_mark; } mark_buffer_dirty(bh); put_bh(bh); out_mark: if (ncleaned || ndirtied) { nilfs_sufile_mod_counter(header_bh, (u64)ncleaned, (u64)ndirtied); NILFS_SUI(sufile)->ncleansegs += ncleaned; } nilfs_mdt_mark_dirty(sufile); out_header: put_bh(header_bh); out_sem: up_write(&NILFS_MDT(sufile)->mi_sem); return ret; } /** * nilfs_sufile_trim_fs() - trim ioctl handle function * @sufile: inode of segment usage file * @range: fstrim_range structure * * start: First Byte to trim * len: number of Bytes to trim from start * minlen: minimum extent length in Bytes * * Decription: nilfs_sufile_trim_fs goes through all segments containing bytes * from start to start+len. start is rounded up to the next block boundary * and start+len is rounded down. For each clean segment blkdev_issue_discard * function is invoked. * * Return: 0 on success, or a negative error code on failure. */ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range) { struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; struct buffer_head *su_bh; struct nilfs_segment_usage *su; size_t offset; void *kaddr; size_t n, i, susz = NILFS_MDT(sufile)->mi_entry_size; sector_t seg_start, seg_end, start_block, end_block; sector_t start = 0, nblocks = 0; u64 segnum, segnum_end, minlen, len, max_blocks, ndiscarded = 0; int ret = 0; unsigned int sects_per_block; sects_per_block = (1 << nilfs->ns_blocksize_bits) / bdev_logical_block_size(nilfs->ns_bdev); len = range->len >> nilfs->ns_blocksize_bits; minlen = range->minlen >> nilfs->ns_blocksize_bits; max_blocks = ((u64)nilfs->ns_nsegments * nilfs->ns_blocks_per_segment); if (!len || range->start >= max_blocks << nilfs->ns_blocksize_bits) return -EINVAL; start_block = (range->start + nilfs->ns_blocksize - 1) >> nilfs->ns_blocksize_bits; /* * range->len can be very large (actually, it is set to * ULLONG_MAX by default) - truncate upper end of the range * carefully so as not to overflow. */ if (max_blocks - start_block < len) end_block = max_blocks - 1; else end_block = start_block + len - 1; segnum = nilfs_get_segnum_of_block(nilfs, start_block); segnum_end = nilfs_get_segnum_of_block(nilfs, end_block); down_read(&NILFS_MDT(sufile)->mi_sem); while (segnum <= segnum_end) { n = nilfs_sufile_segment_usages_in_block(sufile, segnum, segnum_end); ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &su_bh); if (ret < 0) { if (ret != -ENOENT) goto out_sem; /* hole */ segnum += n; continue; } offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); su = kaddr = kmap_local_folio(su_bh->b_folio, offset); for (i = 0; i < n; ++i, ++segnum, su = (void *)su + susz) { if (!nilfs_segment_usage_clean(su)) continue; nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); if (!nblocks) { /* start new extent */ start = seg_start; nblocks = seg_end - seg_start + 1; continue; } if (start + nblocks == seg_start) { /* add to previous extent */ nblocks += seg_end - seg_start + 1; continue; } /* discard previous extent */ if (start < start_block) { nblocks -= start_block - start; start = start_block; } if (nblocks >= minlen) { kunmap_local(kaddr); ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, nblocks * sects_per_block, GFP_NOFS); if (ret < 0) { put_bh(su_bh); goto out_sem; } ndiscarded += nblocks; offset = nilfs_sufile_segment_usage_offset( sufile, segnum, su_bh); su = kaddr = kmap_local_folio(su_bh->b_folio, offset); } /* start new extent */ start = seg_start; nblocks = seg_end - seg_start + 1; } kunmap_local(kaddr); put_bh(su_bh); } if (nblocks) { /* discard last extent */ if (start < start_block) { nblocks -= start_block - start; start = start_block; } if (start + nblocks > end_block + 1) nblocks = end_block - start + 1; if (nblocks >= minlen) { ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, nblocks * sects_per_block, GFP_NOFS); if (!ret) ndiscarded += nblocks; } } out_sem: up_read(&NILFS_MDT(sufile)->mi_sem); range->len = ndiscarded << nilfs->ns_blocksize_bits; return ret; } /** * nilfs_sufile_read - read or get sufile inode * @sb: super block instance * @susize: size of a segment usage entry * @raw_inode: on-disk sufile inode * @inodep: buffer to store the inode * * Return: 0 on success, or a negative error code on failure. */ int nilfs_sufile_read(struct super_block *sb, size_t susize, struct nilfs_inode *raw_inode, struct inode **inodep) { struct inode *sufile; struct nilfs_sufile_info *sui; struct buffer_head *header_bh; struct nilfs_sufile_header *header; int err; if (susize > sb->s_blocksize) { nilfs_err(sb, "too large segment usage size: %zu bytes", susize); return -EINVAL; } else if (susize < NILFS_MIN_SEGMENT_USAGE_SIZE) { nilfs_err(sb, "too small segment usage size: %zu bytes", susize); return -EINVAL; } sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO); if (unlikely(!sufile)) return -ENOMEM; if (!(sufile->i_state & I_NEW)) goto out; err = nilfs_mdt_init(sufile, NILFS_MDT_GFP, sizeof(*sui)); if (err) goto failed; nilfs_mdt_set_entry_size(sufile, susize, sizeof(struct nilfs_sufile_header)); err = nilfs_read_inode_common(sufile, raw_inode); if (err) goto failed; err = nilfs_mdt_get_block(sufile, 0, 0, NULL, &header_bh); if (unlikely(err)) { if (err == -ENOENT) { nilfs_err(sb, "missing header block in segment usage metadata"); err = -EINVAL; } goto failed; } sui = NILFS_SUI(sufile); header = kmap_local_folio(header_bh->b_folio, 0); sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs); kunmap_local(header); brelse(header_bh); sui->allocmax = nilfs_sufile_get_nsegments(sufile) - 1; sui->allocmin = 0; unlock_new_inode(sufile); out: *inodep = sufile; return 0; failed: iget_failed(sufile); return err; }
31 31 135 138 139 135 136 136 137 52 52 52 136 136 134 49 30 104 76 76 4 4 73 1 73 77 77 6 76 1 1 1 1 1 1 77 90 90 90 90 34 20 34 20 31 77 1 77 6 72 5 74 8 73 6 72 5 8 1 4 4 1 1 4 2 2 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ * Copyright (C) 2021, Alibaba Cloud */ #include "internal.h" #include <linux/sched/mm.h> #include <trace/events/erofs.h> void erofs_unmap_metabuf(struct erofs_buf *buf) { if (!buf->base) return; kunmap_local(buf->base); buf->base = NULL; } void erofs_put_metabuf(struct erofs_buf *buf) { if (!buf->page) return; erofs_unmap_metabuf(buf); folio_put(page_folio(buf->page)); buf->page = NULL; } void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, enum erofs_kmap_type type) { pgoff_t index = offset >> PAGE_SHIFT; struct folio *folio = NULL; if (buf->page) { folio = page_folio(buf->page); if (folio_file_page(folio, index) != buf->page) erofs_unmap_metabuf(buf); } if (!folio || !folio_contains(folio, index)) { erofs_put_metabuf(buf); folio = read_mapping_folio(buf->mapping, index, buf->file); if (IS_ERR(folio)) return folio; } buf->page = folio_file_page(folio, index); if (!buf->base && type == EROFS_KMAP) buf->base = kmap_local_page(buf->page); if (type == EROFS_NO_KMAP) return NULL; return buf->base + (offset & ~PAGE_MASK); } void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); buf->file = NULL; if (erofs_is_fileio_mode(sbi)) { buf->file = sbi->dif0.file; /* some fs like FUSE needs it */ buf->mapping = buf->file->f_mapping; } else if (erofs_is_fscache_mode(sb)) buf->mapping = sbi->dif0.fscache->inode->i_mapping; else buf->mapping = sb->s_bdev->bd_mapping; } void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, erofs_off_t offset, enum erofs_kmap_type type) { erofs_init_metabuf(buf, sb); return erofs_bread(buf, offset, type); } static int erofs_map_blocks_flatmode(struct inode *inode, struct erofs_map_blocks *map) { struct erofs_inode *vi = EROFS_I(inode); struct super_block *sb = inode->i_sb; bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE); erofs_blk_t lastblk = erofs_iblks(inode) - tailendpacking; map->m_flags = EROFS_MAP_MAPPED; /* no hole in flat inodes */ if (map->m_la < erofs_pos(sb, lastblk)) { map->m_pa = erofs_pos(sb, vi->raw_blkaddr) + map->m_la; map->m_plen = erofs_pos(sb, lastblk) - map->m_la; } else { DBG_BUGON(!tailendpacking); map->m_pa = erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize + erofs_blkoff(sb, map->m_la); map->m_plen = inode->i_size - map->m_la; /* inline data should be located in the same meta block */ if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) { erofs_err(sb, "inline data across blocks @ nid %llu", vi->nid); DBG_BUGON(1); return -EFSCORRUPTED; } map->m_flags |= EROFS_MAP_META; } return 0; } int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) { struct super_block *sb = inode->i_sb; struct erofs_inode *vi = EROFS_I(inode); struct erofs_inode_chunk_index *idx; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; u64 chunknr; unsigned int unit; erofs_off_t pos; void *kaddr; int err = 0; trace_erofs_map_blocks_enter(inode, map, 0); map->m_deviceid = 0; if (map->m_la >= inode->i_size) { /* leave out-of-bound access unmapped */ map->m_flags = 0; map->m_plen = map->m_llen; goto out; } if (vi->datalayout != EROFS_INODE_CHUNK_BASED) { err = erofs_map_blocks_flatmode(inode, map); goto out; } if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES) unit = sizeof(*idx); /* chunk index */ else unit = EROFS_BLOCK_MAP_ENTRY_SIZE; /* block map */ chunknr = map->m_la >> vi->chunkbits; pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, unit) + unit * chunknr; kaddr = erofs_read_metabuf(&buf, sb, pos, EROFS_KMAP); if (IS_ERR(kaddr)) { err = PTR_ERR(kaddr); goto out; } map->m_la = chunknr << vi->chunkbits; map->m_plen = min_t(erofs_off_t, 1UL << vi->chunkbits, round_up(inode->i_size - map->m_la, sb->s_blocksize)); /* handle block map */ if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) { __le32 *blkaddr = kaddr; if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) { map->m_flags = 0; } else { map->m_pa = erofs_pos(sb, le32_to_cpu(*blkaddr)); map->m_flags = EROFS_MAP_MAPPED; } goto out_unlock; } /* parse chunk indexes */ idx = kaddr; switch (le32_to_cpu(idx->blkaddr)) { case EROFS_NULL_ADDR: map->m_flags = 0; break; default: map->m_deviceid = le16_to_cpu(idx->device_id) & EROFS_SB(sb)->device_id_mask; map->m_pa = erofs_pos(sb, le32_to_cpu(idx->blkaddr)); map->m_flags = EROFS_MAP_MAPPED; break; } out_unlock: erofs_put_metabuf(&buf); out: if (!err) map->m_llen = map->m_plen; trace_erofs_map_blocks_exit(inode, map, 0, err); return err; } static void erofs_fill_from_devinfo(struct erofs_map_dev *map, struct super_block *sb, struct erofs_device_info *dif) { map->m_sb = sb; map->m_dif = dif; map->m_bdev = NULL; if (dif->file && S_ISBLK(file_inode(dif->file)->i_mode)) map->m_bdev = file_bdev(dif->file); } int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) { struct erofs_dev_context *devs = EROFS_SB(sb)->devs; struct erofs_device_info *dif; erofs_off_t startoff, length; int id; erofs_fill_from_devinfo(map, sb, &EROFS_SB(sb)->dif0); map->m_bdev = sb->s_bdev; /* use s_bdev for the primary device */ if (map->m_deviceid) { down_read(&devs->rwsem); dif = idr_find(&devs->tree, map->m_deviceid - 1); if (!dif) { up_read(&devs->rwsem); return -ENODEV; } if (devs->flatdev) { map->m_pa += erofs_pos(sb, dif->mapped_blkaddr); up_read(&devs->rwsem); return 0; } erofs_fill_from_devinfo(map, sb, dif); up_read(&devs->rwsem); } else if (devs->extra_devices && !devs->flatdev) { down_read(&devs->rwsem); idr_for_each_entry(&devs->tree, dif, id) { if (!dif->mapped_blkaddr) continue; startoff = erofs_pos(sb, dif->mapped_blkaddr); length = erofs_pos(sb, dif->blocks); if (map->m_pa >= startoff && map->m_pa < startoff + length) { map->m_pa -= startoff; erofs_fill_from_devinfo(map, sb, dif); break; } } up_read(&devs->rwsem); } return 0; } /* * bit 30: I/O error occurred on this folio * bit 0 - 29: remaining parts to complete this folio */ #define EROFS_ONLINEFOLIO_EIO (1 << 30) void erofs_onlinefolio_init(struct folio *folio) { union { atomic_t o; void *v; } u = { .o = ATOMIC_INIT(1) }; folio->private = u.v; /* valid only if file-backed folio is locked */ } void erofs_onlinefolio_split(struct folio *folio) { atomic_inc((atomic_t *)&folio->private); } void erofs_onlinefolio_end(struct folio *folio, int err) { int orig, v; do { orig = atomic_read((atomic_t *)&folio->private); v = (orig - 1) | (err ? EROFS_ONLINEFOLIO_EIO : 0); } while (atomic_cmpxchg((atomic_t *)&folio->private, orig, v) != orig); if (v & ~EROFS_ONLINEFOLIO_EIO) return; folio->private = 0; folio_end_read(folio, !(v & EROFS_ONLINEFOLIO_EIO)); } static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { int ret; struct super_block *sb = inode->i_sb; struct erofs_map_blocks map; struct erofs_map_dev mdev; map.m_la = offset; map.m_llen = length; ret = erofs_map_blocks(inode, &map); if (ret < 0) return ret; mdev = (struct erofs_map_dev) { .m_deviceid = map.m_deviceid, .m_pa = map.m_pa, }; ret = erofs_map_dev(sb, &mdev); if (ret) return ret; iomap->offset = map.m_la; if (flags & IOMAP_DAX) iomap->dax_dev = mdev.m_dif->dax_dev; else iomap->bdev = mdev.m_bdev; iomap->length = map.m_llen; iomap->flags = 0; iomap->private = NULL; if (!(map.m_flags & EROFS_MAP_MAPPED)) { iomap->type = IOMAP_HOLE; iomap->addr = IOMAP_NULL_ADDR; if (!iomap->length) iomap->length = length; return 0; } if (map.m_flags & EROFS_MAP_META) { void *ptr; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; iomap->type = IOMAP_INLINE; ptr = erofs_read_metabuf(&buf, sb, mdev.m_pa, EROFS_KMAP); if (IS_ERR(ptr)) return PTR_ERR(ptr); iomap->inline_data = ptr; iomap->private = buf.base; } else { iomap->type = IOMAP_MAPPED; iomap->addr = mdev.m_pa; if (flags & IOMAP_DAX) iomap->addr += mdev.m_dif->dax_part_off; } return 0; } static int erofs_iomap_end(struct inode *inode, loff_t pos, loff_t length, ssize_t written, unsigned int flags, struct iomap *iomap) { void *ptr = iomap->private; if (ptr) { struct erofs_buf buf = { .page = kmap_to_page(ptr), .base = ptr, }; DBG_BUGON(iomap->type != IOMAP_INLINE); erofs_put_metabuf(&buf); } else { DBG_BUGON(iomap->type == IOMAP_INLINE); } return written; } static const struct iomap_ops erofs_iomap_ops = { .iomap_begin = erofs_iomap_begin, .iomap_end = erofs_iomap_end, }; int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) { #ifdef CONFIG_EROFS_FS_ZIP return iomap_fiemap(inode, fieinfo, start, len, &z_erofs_iomap_report_ops); #else return -EOPNOTSUPP; #endif } return iomap_fiemap(inode, fieinfo, start, len, &erofs_iomap_ops); } /* * since we dont have write or truncate flows, so no inode * locking needs to be held at the moment. */ static int erofs_read_folio(struct file *file, struct folio *folio) { return iomap_read_folio(folio, &erofs_iomap_ops); } static void erofs_readahead(struct readahead_control *rac) { return iomap_readahead(rac, &erofs_iomap_ops); } static sector_t erofs_bmap(struct address_space *mapping, sector_t block) { return iomap_bmap(mapping, block, &erofs_iomap_ops); } static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); /* no need taking (shared) inode lock since it's a ro filesystem */ if (!iov_iter_count(to)) return 0; #ifdef CONFIG_FS_DAX if (IS_DAX(inode)) return dax_iomap_rw(iocb, to, &erofs_iomap_ops); #endif if ((iocb->ki_flags & IOCB_DIRECT) && inode->i_sb->s_bdev) return iomap_dio_rw(iocb, to, &erofs_iomap_ops, NULL, 0, NULL, 0); return filemap_read(iocb, to, 0); } /* for uncompressed (aligned) files and raw access for other files */ const struct address_space_operations erofs_aops = { .read_folio = erofs_read_folio, .readahead = erofs_readahead, .bmap = erofs_bmap, .direct_IO = noop_direct_IO, .release_folio = iomap_release_folio, .invalidate_folio = iomap_invalidate_folio, }; #ifdef CONFIG_FS_DAX static vm_fault_t erofs_dax_huge_fault(struct vm_fault *vmf, unsigned int order) { return dax_iomap_fault(vmf, order, NULL, NULL, &erofs_iomap_ops); } static vm_fault_t erofs_dax_fault(struct vm_fault *vmf) { return erofs_dax_huge_fault(vmf, 0); } static const struct vm_operations_struct erofs_dax_vm_ops = { .fault = erofs_dax_fault, .huge_fault = erofs_dax_huge_fault, }; static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma) { if (!IS_DAX(file_inode(file))) return generic_file_readonly_mmap(file, vma); if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) return -EINVAL; vma->vm_ops = &erofs_dax_vm_ops; vm_flags_set(vma, VM_HUGEPAGE); return 0; } #else #define erofs_file_mmap generic_file_readonly_mmap #endif static loff_t erofs_file_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; const struct iomap_ops *ops = &erofs_iomap_ops; if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) #ifdef CONFIG_EROFS_FS_ZIP ops = &z_erofs_iomap_report_ops; #else return generic_file_llseek(file, offset, whence); #endif if (whence == SEEK_HOLE) offset = iomap_seek_hole(inode, offset, ops); else if (whence == SEEK_DATA) offset = iomap_seek_data(inode, offset, ops); else return generic_file_llseek(file, offset, whence); if (offset < 0) return offset; return vfs_setpos(file, offset, inode->i_sb->s_maxbytes); } const struct file_operations erofs_file_fops = { .llseek = erofs_file_llseek, .read_iter = erofs_file_read_iter, .mmap = erofs_file_mmap, .get_unmapped_area = thp_get_unmapped_area, .splice_read = filemap_splice_read, };
112 113 93 186 117 94 145 63 63 63 63 24 53 14 62 75 63 25 62 62 25 62 83 82 100 48 83 137 137 107 99 98 3 99 44 44 44 44 44 112 112 110 111 36 14 84 84 1 77 89 1 2 2 3 3 3 69 69 68 69 69 73 126 123 69 8 40 16 37 47 3 46 3 47 5 43 79 79 11 69 79 79 79 79 79 5 75 57 52 57 43 14 13 57 5 56 57 57 57 18 43 43 18 6 6 4 6 16 16 16 2 14 43 44 43 32 33 9 9 2 91 91 91 91 61 42 6 37 39 40 40 1 62 99 99 99 99 99 90 12 99 99 99 8 90 99 38 8 91 1 98 90 90 90 97 8 90 81 13 1 77 77 38 30 56 69 29 9 9 77 1 82 78 4 82 81 9 75 9 9 7 2 9 1 1 1 1 2 1 1 1 50 50 50 50 50 50 32 24 80 80 32 53 8 8 79 79 75 75 75 44 31 31 21 9 5 5 7 7 7 7 7 5 2 5 2 80 75 16 75 75 75 69 16 34 2 12 2 1 75 75 75 75 25 61 55 45 1 75 75 75 1 56 49 1 48 77 77 77 77 77 77 77 77 52 27 63 77 77 75 2 75 76 62 25 76 81 80 77 80 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Red Hat, Inc. * Copyright (C) 2016-2023 Christoph Hellwig. */ #include <linux/module.h> #include <linux/compiler.h> #include <linux/fs.h> #include <linux/iomap.h> #include <linux/pagemap.h> #include <linux/uio.h> #include <linux/buffer_head.h> #include <linux/dax.h> #include <linux/writeback.h> #include <linux/list_sort.h> #include <linux/swap.h> #include <linux/bio.h> #include <linux/sched/signal.h> #include <linux/migrate.h> #include "trace.h" #include "../internal.h" #define IOEND_BATCH_SIZE 4096 /* * Structure allocated for each folio to track per-block uptodate, dirty state * and I/O completions. */ struct iomap_folio_state { spinlock_t state_lock; unsigned int read_bytes_pending; atomic_t write_bytes_pending; /* * Each block has two bits in this bitmap: * Bits [0..blocks_per_folio) has the uptodate status. * Bits [b_p_f...(2*b_p_f)) has the dirty status. */ unsigned long state[]; }; static struct bio_set iomap_ioend_bioset; static inline bool ifs_is_fully_uptodate(struct folio *folio, struct iomap_folio_state *ifs) { struct inode *inode = folio->mapping->host; return bitmap_full(ifs->state, i_blocks_per_folio(inode, folio)); } static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs, unsigned int block) { return test_bit(block, ifs->state); } static bool ifs_set_range_uptodate(struct folio *folio, struct iomap_folio_state *ifs, size_t off, size_t len) { struct inode *inode = folio->mapping->host; unsigned int first_blk = off >> inode->i_blkbits; unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; unsigned int nr_blks = last_blk - first_blk + 1; bitmap_set(ifs->state, first_blk, nr_blks); return ifs_is_fully_uptodate(folio, ifs); } static void iomap_set_range_uptodate(struct folio *folio, size_t off, size_t len) { struct iomap_folio_state *ifs = folio->private; unsigned long flags; bool uptodate = true; if (ifs) { spin_lock_irqsave(&ifs->state_lock, flags); uptodate = ifs_set_range_uptodate(folio, ifs, off, len); spin_unlock_irqrestore(&ifs->state_lock, flags); } if (uptodate) folio_mark_uptodate(folio); } static inline bool ifs_block_is_dirty(struct folio *folio, struct iomap_folio_state *ifs, int block) { struct inode *inode = folio->mapping->host; unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); return test_bit(block + blks_per_folio, ifs->state); } static unsigned ifs_find_dirty_range(struct folio *folio, struct iomap_folio_state *ifs, u64 *range_start, u64 range_end) { struct inode *inode = folio->mapping->host; unsigned start_blk = offset_in_folio(folio, *range_start) >> inode->i_blkbits; unsigned end_blk = min_not_zero( offset_in_folio(folio, range_end) >> inode->i_blkbits, i_blocks_per_folio(inode, folio)); unsigned nblks = 1; while (!ifs_block_is_dirty(folio, ifs, start_blk)) if (++start_blk == end_blk) return 0; while (start_blk + nblks < end_blk) { if (!ifs_block_is_dirty(folio, ifs, start_blk + nblks)) break; nblks++; } *range_start = folio_pos(folio) + (start_blk << inode->i_blkbits); return nblks << inode->i_blkbits; } static unsigned iomap_find_dirty_range(struct folio *folio, u64 *range_start, u64 range_end) { struct iomap_folio_state *ifs = folio->private; if (*range_start >= range_end) return 0; if (ifs) return ifs_find_dirty_range(folio, ifs, range_start, range_end); return range_end - *range_start; } static void ifs_clear_range_dirty(struct folio *folio, struct iomap_folio_state *ifs, size_t off, size_t len) { struct inode *inode = folio->mapping->host; unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); unsigned int first_blk = (off >> inode->i_blkbits); unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; unsigned int nr_blks = last_blk - first_blk + 1; unsigned long flags; spin_lock_irqsave(&ifs->state_lock, flags); bitmap_clear(ifs->state, first_blk + blks_per_folio, nr_blks); spin_unlock_irqrestore(&ifs->state_lock, flags); } static void iomap_clear_range_dirty(struct folio *folio, size_t off, size_t len) { struct iomap_folio_state *ifs = folio->private; if (ifs) ifs_clear_range_dirty(folio, ifs, off, len); } static void ifs_set_range_dirty(struct folio *folio, struct iomap_folio_state *ifs, size_t off, size_t len) { struct inode *inode = folio->mapping->host; unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); unsigned int first_blk = (off >> inode->i_blkbits); unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; unsigned int nr_blks = last_blk - first_blk + 1; unsigned long flags; spin_lock_irqsave(&ifs->state_lock, flags); bitmap_set(ifs->state, first_blk + blks_per_folio, nr_blks); spin_unlock_irqrestore(&ifs->state_lock, flags); } static void iomap_set_range_dirty(struct folio *folio, size_t off, size_t len) { struct iomap_folio_state *ifs = folio->private; if (ifs) ifs_set_range_dirty(folio, ifs, off, len); } static struct iomap_folio_state *ifs_alloc(struct inode *inode, struct folio *folio, unsigned int flags) { struct iomap_folio_state *ifs = folio->private; unsigned int nr_blocks = i_blocks_per_folio(inode, folio); gfp_t gfp; if (ifs || nr_blocks <= 1) return ifs; if (flags & IOMAP_NOWAIT) gfp = GFP_NOWAIT; else gfp = GFP_NOFS | __GFP_NOFAIL; /* * ifs->state tracks two sets of state flags when the * filesystem block size is smaller than the folio size. * The first state tracks per-block uptodate and the * second tracks per-block dirty state. */ ifs = kzalloc(struct_size(ifs, state, BITS_TO_LONGS(2 * nr_blocks)), gfp); if (!ifs) return ifs; spin_lock_init(&ifs->state_lock); if (folio_test_uptodate(folio)) bitmap_set(ifs->state, 0, nr_blocks); if (folio_test_dirty(folio)) bitmap_set(ifs->state, nr_blocks, nr_blocks); folio_attach_private(folio, ifs); return ifs; } static void ifs_free(struct folio *folio) { struct iomap_folio_state *ifs = folio_detach_private(folio); if (!ifs) return; WARN_ON_ONCE(ifs->read_bytes_pending != 0); WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending)); WARN_ON_ONCE(ifs_is_fully_uptodate(folio, ifs) != folio_test_uptodate(folio)); kfree(ifs); } /* * Calculate the range inside the folio that we actually need to read. */ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, loff_t *pos, loff_t length, size_t *offp, size_t *lenp) { struct iomap_folio_state *ifs = folio->private; loff_t orig_pos = *pos; loff_t isize = i_size_read(inode); unsigned block_bits = inode->i_blkbits; unsigned block_size = (1 << block_bits); size_t poff = offset_in_folio(folio, *pos); size_t plen = min_t(loff_t, folio_size(folio) - poff, length); size_t orig_plen = plen; unsigned first = poff >> block_bits; unsigned last = (poff + plen - 1) >> block_bits; /* * If the block size is smaller than the page size, we need to check the * per-block uptodate status and adjust the offset and length if needed * to avoid reading in already uptodate ranges. */ if (ifs) { unsigned int i; /* move forward for each leading block marked uptodate */ for (i = first; i <= last; i++) { if (!ifs_block_is_uptodate(ifs, i)) break; *pos += block_size; poff += block_size; plen -= block_size; first++; } /* truncate len if we find any trailing uptodate block(s) */ for ( ; i <= last; i++) { if (ifs_block_is_uptodate(ifs, i)) { plen -= (last - i + 1) * block_size; last = i - 1; break; } } } /* * If the extent spans the block that contains the i_size, we need to * handle both halves separately so that we properly zero data in the * page cache for blocks that are entirely outside of i_size. */ if (orig_pos <= isize && orig_pos + orig_plen > isize) { unsigned end = offset_in_folio(folio, isize - 1) >> block_bits; if (first <= end && last > end) plen -= (last - end) * block_size; } *offp = poff; *lenp = plen; } static void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len, int error) { struct iomap_folio_state *ifs = folio->private; bool uptodate = !error; bool finished = true; if (ifs) { unsigned long flags; spin_lock_irqsave(&ifs->state_lock, flags); if (!error) uptodate = ifs_set_range_uptodate(folio, ifs, off, len); ifs->read_bytes_pending -= len; finished = !ifs->read_bytes_pending; spin_unlock_irqrestore(&ifs->state_lock, flags); } if (finished) folio_end_read(folio, uptodate); } static void iomap_read_end_io(struct bio *bio) { int error = blk_status_to_errno(bio->bi_status); struct folio_iter fi; bio_for_each_folio_all(fi, bio) iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error); bio_put(bio); } struct iomap_readpage_ctx { struct folio *cur_folio; bool cur_folio_in_bio; struct bio *bio; struct readahead_control *rac; }; /** * iomap_read_inline_data - copy inline data into the page cache * @iter: iteration structure * @folio: folio to copy to * * Copy the inline data in @iter into @folio and zero out the rest of the folio. * Only a single IOMAP_INLINE extent is allowed at the end of each file. * Returns zero for success to complete the read, or the usual negative errno. */ static int iomap_read_inline_data(const struct iomap_iter *iter, struct folio *folio) { const struct iomap *iomap = iomap_iter_srcmap(iter); size_t size = i_size_read(iter->inode) - iomap->offset; size_t offset = offset_in_folio(folio, iomap->offset); if (folio_test_uptodate(folio)) return 0; if (WARN_ON_ONCE(size > iomap->length)) return -EIO; if (offset > 0) ifs_alloc(iter->inode, folio, iter->flags); folio_fill_tail(folio, offset, iomap->inline_data, size); iomap_set_range_uptodate(folio, offset, folio_size(folio) - offset); return 0; } static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter, loff_t pos) { const struct iomap *srcmap = iomap_iter_srcmap(iter); return srcmap->type != IOMAP_MAPPED || (srcmap->flags & IOMAP_F_NEW) || pos >= i_size_read(iter->inode); } static loff_t iomap_readpage_iter(const struct iomap_iter *iter, struct iomap_readpage_ctx *ctx, loff_t offset) { const struct iomap *iomap = &iter->iomap; loff_t pos = iter->pos + offset; loff_t length = iomap_length(iter) - offset; struct folio *folio = ctx->cur_folio; struct iomap_folio_state *ifs; loff_t orig_pos = pos; size_t poff, plen; sector_t sector; if (iomap->type == IOMAP_INLINE) return iomap_read_inline_data(iter, folio); /* zero post-eof blocks as the page may be mapped */ ifs = ifs_alloc(iter->inode, folio, iter->flags); iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen); if (plen == 0) goto done; if (iomap_block_needs_zeroing(iter, pos)) { folio_zero_range(folio, poff, plen); iomap_set_range_uptodate(folio, poff, plen); goto done; } ctx->cur_folio_in_bio = true; if (ifs) { spin_lock_irq(&ifs->state_lock); ifs->read_bytes_pending += plen; spin_unlock_irq(&ifs->state_lock); } sector = iomap_sector(iomap, pos); if (!ctx->bio || bio_end_sector(ctx->bio) != sector || !bio_add_folio(ctx->bio, folio, plen, poff)) { gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL); gfp_t orig_gfp = gfp; unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE); if (ctx->bio) submit_bio(ctx->bio); if (ctx->rac) /* same as readahead_gfp_mask */ gfp |= __GFP_NORETRY | __GFP_NOWARN; ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs), REQ_OP_READ, gfp); /* * If the bio_alloc fails, try it again for a single page to * avoid having to deal with partial page reads. This emulates * what do_mpage_read_folio does. */ if (!ctx->bio) { ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ, orig_gfp); } if (ctx->rac) ctx->bio->bi_opf |= REQ_RAHEAD; ctx->bio->bi_iter.bi_sector = sector; ctx->bio->bi_end_io = iomap_read_end_io; bio_add_folio_nofail(ctx->bio, folio, plen, poff); } done: /* * Move the caller beyond our range so that it keeps making progress. * For that, we have to include any leading non-uptodate ranges, but * we can skip trailing ones as they will be handled in the next * iteration. */ return pos - orig_pos + plen; } static loff_t iomap_read_folio_iter(const struct iomap_iter *iter, struct iomap_readpage_ctx *ctx) { struct folio *folio = ctx->cur_folio; size_t offset = offset_in_folio(folio, iter->pos); loff_t length = min_t(loff_t, folio_size(folio) - offset, iomap_length(iter)); loff_t done, ret; for (done = 0; done < length; done += ret) { ret = iomap_readpage_iter(iter, ctx, done); if (ret <= 0) return ret; } return done; } int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) { struct iomap_iter iter = { .inode = folio->mapping->host, .pos = folio_pos(folio), .len = folio_size(folio), }; struct iomap_readpage_ctx ctx = { .cur_folio = folio, }; int ret; trace_iomap_readpage(iter.inode, 1); while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = iomap_read_folio_iter(&iter, &ctx); if (ctx.bio) { submit_bio(ctx.bio); WARN_ON_ONCE(!ctx.cur_folio_in_bio); } else { WARN_ON_ONCE(ctx.cur_folio_in_bio); folio_unlock(folio); } /* * Just like mpage_readahead and block_read_full_folio, we always * return 0 and just set the folio error flag on errors. This * should be cleaned up throughout the stack eventually. */ return 0; } EXPORT_SYMBOL_GPL(iomap_read_folio); static loff_t iomap_readahead_iter(const struct iomap_iter *iter, struct iomap_readpage_ctx *ctx) { loff_t length = iomap_length(iter); loff_t done, ret; for (done = 0; done < length; done += ret) { if (ctx->cur_folio && offset_in_folio(ctx->cur_folio, iter->pos + done) == 0) { if (!ctx->cur_folio_in_bio) folio_unlock(ctx->cur_folio); ctx->cur_folio = NULL; } if (!ctx->cur_folio) { ctx->cur_folio = readahead_folio(ctx->rac); ctx->cur_folio_in_bio = false; } ret = iomap_readpage_iter(iter, ctx, done); if (ret <= 0) return ret; } return done; } /** * iomap_readahead - Attempt to read pages from a file. * @rac: Describes the pages to be read. * @ops: The operations vector for the filesystem. * * This function is for filesystems to call to implement their readahead * address_space operation. * * Context: The @ops callbacks may submit I/O (eg to read the addresses of * blocks from disc), and may wait for it. The caller may be trying to * access a different page, and so sleeping excessively should be avoided. * It may allocate memory, but should avoid costly allocations. This * function is called with memalloc_nofs set, so allocations will not cause * the filesystem to be reentered. */ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) { struct iomap_iter iter = { .inode = rac->mapping->host, .pos = readahead_pos(rac), .len = readahead_length(rac), }; struct iomap_readpage_ctx ctx = { .rac = rac, }; trace_iomap_readahead(rac->mapping->host, readahead_count(rac)); while (iomap_iter(&iter, ops) > 0) iter.processed = iomap_readahead_iter(&iter, &ctx); if (ctx.bio) submit_bio(ctx.bio); if (ctx.cur_folio) { if (!ctx.cur_folio_in_bio) folio_unlock(ctx.cur_folio); } } EXPORT_SYMBOL_GPL(iomap_readahead); /* * iomap_is_partially_uptodate checks whether blocks within a folio are * uptodate or not. * * Returns true if all blocks which correspond to the specified part * of the folio are uptodate. */ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) { struct iomap_folio_state *ifs = folio->private; struct inode *inode = folio->mapping->host; unsigned first, last, i; if (!ifs) return false; /* Caller's range may extend past the end of this folio */ count = min(folio_size(folio) - from, count); /* First and last blocks in range within folio */ first = from >> inode->i_blkbits; last = (from + count - 1) >> inode->i_blkbits; for (i = first; i <= last; i++) if (!ifs_block_is_uptodate(ifs, i)) return false; return true; } EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); /** * iomap_get_folio - get a folio reference for writing * @iter: iteration structure * @pos: start offset of write * @len: Suggested size of folio to create. * * Returns a locked reference to the folio at @pos, or an error pointer if the * folio could not be obtained. */ struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len) { fgf_t fgp = FGP_WRITEBEGIN | FGP_NOFS; if (iter->flags & IOMAP_NOWAIT) fgp |= FGP_NOWAIT; fgp |= fgf_set_order(len); return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, fgp, mapping_gfp_mask(iter->inode->i_mapping)); } EXPORT_SYMBOL_GPL(iomap_get_folio); bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags) { trace_iomap_release_folio(folio->mapping->host, folio_pos(folio), folio_size(folio)); /* * If the folio is dirty, we refuse to release our metadata because * it may be partially dirty. Once we track per-block dirty state, * we can release the metadata if every block is dirty. */ if (folio_test_dirty(folio)) return false; ifs_free(folio); return true; } EXPORT_SYMBOL_GPL(iomap_release_folio); void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len) { trace_iomap_invalidate_folio(folio->mapping->host, folio_pos(folio) + offset, len); /* * If we're invalidating the entire folio, clear the dirty state * from it and release it to avoid unnecessary buildup of the LRU. */ if (offset == 0 && len == folio_size(folio)) { WARN_ON_ONCE(folio_test_writeback(folio)); folio_cancel_dirty(folio); ifs_free(folio); } } EXPORT_SYMBOL_GPL(iomap_invalidate_folio); bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio) { struct inode *inode = mapping->host; size_t len = folio_size(folio); ifs_alloc(inode, folio, 0); iomap_set_range_dirty(folio, 0, len); return filemap_dirty_folio(mapping, folio); } EXPORT_SYMBOL_GPL(iomap_dirty_folio); static void iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) { loff_t i_size = i_size_read(inode); /* * Only truncate newly allocated pages beyoned EOF, even if the * write started inside the existing inode size. */ if (pos + len > i_size) truncate_pagecache_range(inode, max(pos, i_size), pos + len - 1); } static int iomap_read_folio_sync(loff_t block_start, struct folio *folio, size_t poff, size_t plen, const struct iomap *iomap) { struct bio_vec bvec; struct bio bio; bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ); bio.bi_iter.bi_sector = iomap_sector(iomap, block_start); bio_add_folio_nofail(&bio, folio, plen, poff); return submit_bio_wait(&bio); } static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, size_t len, struct folio *folio) { const struct iomap *srcmap = iomap_iter_srcmap(iter); struct iomap_folio_state *ifs; loff_t block_size = i_blocksize(iter->inode); loff_t block_start = round_down(pos, block_size); loff_t block_end = round_up(pos + len, block_size); unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio); size_t from = offset_in_folio(folio, pos), to = from + len; size_t poff, plen; /* * If the write or zeroing completely overlaps the current folio, then * entire folio will be dirtied so there is no need for * per-block state tracking structures to be attached to this folio. * For the unshare case, we must read in the ondisk contents because we * are not changing pagecache contents. */ if (!(iter->flags & IOMAP_UNSHARE) && pos <= folio_pos(folio) && pos + len >= folio_pos(folio) + folio_size(folio)) return 0; ifs = ifs_alloc(iter->inode, folio, iter->flags); if ((iter->flags & IOMAP_NOWAIT) && !ifs && nr_blocks > 1) return -EAGAIN; if (folio_test_uptodate(folio)) return 0; do { iomap_adjust_read_range(iter->inode, folio, &block_start, block_end - block_start, &poff, &plen); if (plen == 0) break; if (!(iter->flags & IOMAP_UNSHARE) && (from <= poff || from >= poff + plen) && (to <= poff || to >= poff + plen)) continue; if (iomap_block_needs_zeroing(iter, block_start)) { if (WARN_ON_ONCE(iter->flags & IOMAP_UNSHARE)) return -EIO; folio_zero_segments(folio, poff, from, to, poff + plen); } else { int status; if (iter->flags & IOMAP_NOWAIT) return -EAGAIN; status = iomap_read_folio_sync(block_start, folio, poff, plen, srcmap); if (status) return status; } iomap_set_range_uptodate(folio, poff, plen); } while ((block_start += plen) < block_end); return 0; } static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len) { const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; if (folio_ops && folio_ops->get_folio) return folio_ops->get_folio(iter, pos, len); else return iomap_get_folio(iter, pos, len); } static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret, struct folio *folio) { const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; if (folio_ops && folio_ops->put_folio) { folio_ops->put_folio(iter->inode, pos, ret, folio); } else { folio_unlock(folio); folio_put(folio); } } static int iomap_write_begin_inline(const struct iomap_iter *iter, struct folio *folio) { /* needs more work for the tailpacking case; disable for now */ if (WARN_ON_ONCE(iomap_iter_srcmap(iter)->offset != 0)) return -EIO; return iomap_read_inline_data(iter, folio); } static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, size_t len, struct folio **foliop) { const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; const struct iomap *srcmap = iomap_iter_srcmap(iter); struct folio *folio; int status = 0; BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length); if (srcmap != &iter->iomap) BUG_ON(pos + len > srcmap->offset + srcmap->length); if (fatal_signal_pending(current)) return -EINTR; if (!mapping_large_folio_support(iter->inode->i_mapping)) len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos)); folio = __iomap_get_folio(iter, pos, len); if (IS_ERR(folio)) return PTR_ERR(folio); /* * Now we have a locked folio, before we do anything with it we need to * check that the iomap we have cached is not stale. The inode extent * mapping can change due to concurrent IO in flight (e.g. * IOMAP_UNWRITTEN state can change and memory reclaim could have * reclaimed a previously partially written page at this index after IO * completion before this write reaches this file offset) and hence we * could do the wrong thing here (zero a page range incorrectly or fail * to zero) and corrupt data. */ if (folio_ops && folio_ops->iomap_valid) { bool iomap_valid = folio_ops->iomap_valid(iter->inode, &iter->iomap); if (!iomap_valid) { iter->iomap.flags |= IOMAP_F_STALE; status = 0; goto out_unlock; } } if (pos + len > folio_pos(folio) + folio_size(folio)) len = folio_pos(folio) + folio_size(folio) - pos; if (srcmap->type == IOMAP_INLINE) status = iomap_write_begin_inline(iter, folio); else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) status = __block_write_begin_int(folio, pos, len, NULL, srcmap); else status = __iomap_write_begin(iter, pos, len, folio); if (unlikely(status)) goto out_unlock; *foliop = folio; return 0; out_unlock: __iomap_put_folio(iter, pos, 0, folio); return status; } static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len, size_t copied, struct folio *folio) { flush_dcache_folio(folio); /* * The blocks that were entirely written will now be uptodate, so we * don't have to worry about a read_folio reading them and overwriting a * partial write. However, if we've encountered a short write and only * partially written into a block, it will not be marked uptodate, so a * read_folio might come in and destroy our partial write. * * Do the simplest thing and just treat any short write to a * non-uptodate page as a zero-length write, and force the caller to * redo the whole thing. */ if (unlikely(copied < len && !folio_test_uptodate(folio))) return false; iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len); iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied); filemap_dirty_folio(inode->i_mapping, folio); return true; } static void iomap_write_end_inline(const struct iomap_iter *iter, struct folio *folio, loff_t pos, size_t copied) { const struct iomap *iomap = &iter->iomap; void *addr; WARN_ON_ONCE(!folio_test_uptodate(folio)); BUG_ON(!iomap_inline_data_valid(iomap)); flush_dcache_folio(folio); addr = kmap_local_folio(folio, pos); memcpy(iomap_inline_data(iomap, pos), addr, copied); kunmap_local(addr); mark_inode_dirty(iter->inode); } /* * Returns true if all copied bytes have been written to the pagecache, * otherwise return false. */ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, size_t copied, struct folio *folio) { const struct iomap *srcmap = iomap_iter_srcmap(iter); if (srcmap->type == IOMAP_INLINE) { iomap_write_end_inline(iter, folio, pos, copied); return true; } if (srcmap->flags & IOMAP_F_BUFFER_HEAD) { size_t bh_written; bh_written = block_write_end(NULL, iter->inode->i_mapping, pos, len, copied, folio, NULL); WARN_ON_ONCE(bh_written != copied && bh_written != 0); return bh_written == copied; } return __iomap_write_end(iter->inode, pos, len, copied, folio); } static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) { loff_t length = iomap_length(iter); loff_t pos = iter->pos; ssize_t total_written = 0; long status = 0; struct address_space *mapping = iter->inode->i_mapping; size_t chunk = mapping_max_folio_size(mapping); unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0; do { struct folio *folio; loff_t old_size; size_t offset; /* Offset into folio */ size_t bytes; /* Bytes to write to folio */ size_t copied; /* Bytes copied from user */ size_t written; /* Bytes have been written */ bytes = iov_iter_count(i); retry: offset = pos & (chunk - 1); bytes = min(chunk - offset, bytes); status = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags); if (unlikely(status)) break; if (bytes > length) bytes = length; /* * Bring in the user page that we'll copy from _first_. * Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. * * For async buffered writes the assumption is that the user * page has already been faulted in. This can be optimized by * faulting the user page. */ if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) { status = -EFAULT; break; } status = iomap_write_begin(iter, pos, bytes, &folio); if (unlikely(status)) { iomap_write_failed(iter->inode, pos, bytes); break; } if (iter->iomap.flags & IOMAP_F_STALE) break; offset = offset_in_folio(folio, pos); if (bytes > folio_size(folio) - offset) bytes = folio_size(folio) - offset; if (mapping_writably_mapped(mapping)) flush_dcache_folio(folio); copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); written = iomap_write_end(iter, pos, bytes, copied, folio) ? copied : 0; /* * Update the in-memory inode size after copying the data into * the page cache. It's up to the file system to write the * updated size to disk, preferably after I/O completion so that * no stale data is exposed. Only once that's done can we * unlock and release the folio. */ old_size = iter->inode->i_size; if (pos + written > old_size) { i_size_write(iter->inode, pos + written); iter->iomap.flags |= IOMAP_F_SIZE_CHANGED; } __iomap_put_folio(iter, pos, written, folio); if (old_size < pos) pagecache_isize_extended(iter->inode, old_size, pos); cond_resched(); if (unlikely(written == 0)) { /* * A short copy made iomap_write_end() reject the * thing entirely. Might be memory poisoning * halfway through, might be a race with munmap, * might be severe memory pressure. */ iomap_write_failed(iter->inode, pos, bytes); iov_iter_revert(i, copied); if (chunk > PAGE_SIZE) chunk /= 2; if (copied) { bytes = copied; goto retry; } } else { pos += written; total_written += written; length -= written; } } while (iov_iter_count(i) && length); if (status == -EAGAIN) { iov_iter_revert(i, total_written); return -EAGAIN; } return total_written ? total_written : status; } ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, const struct iomap_ops *ops, void *private) { struct iomap_iter iter = { .inode = iocb->ki_filp->f_mapping->host, .pos = iocb->ki_pos, .len = iov_iter_count(i), .flags = IOMAP_WRITE, .private = private, }; ssize_t ret; if (iocb->ki_flags & IOCB_NOWAIT) iter.flags |= IOMAP_NOWAIT; while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = iomap_write_iter(&iter, i); if (unlikely(iter.pos == iocb->ki_pos)) return ret; ret = iter.pos - iocb->ki_pos; iocb->ki_pos = iter.pos; return ret; } EXPORT_SYMBOL_GPL(iomap_file_buffered_write); static void iomap_write_delalloc_ifs_punch(struct inode *inode, struct folio *folio, loff_t start_byte, loff_t end_byte, struct iomap *iomap, iomap_punch_t punch) { unsigned int first_blk, last_blk, i; loff_t last_byte; u8 blkbits = inode->i_blkbits; struct iomap_folio_state *ifs; /* * When we have per-block dirty tracking, there can be * blocks within a folio which are marked uptodate * but not dirty. In that case it is necessary to punch * out such blocks to avoid leaking any delalloc blocks. */ ifs = folio->private; if (!ifs) return; last_byte = min_t(loff_t, end_byte - 1, folio_pos(folio) + folio_size(folio) - 1); first_blk = offset_in_folio(folio, start_byte) >> blkbits; last_blk = offset_in_folio(folio, last_byte) >> blkbits; for (i = first_blk; i <= last_blk; i++) { if (!ifs_block_is_dirty(folio, ifs, i)) punch(inode, folio_pos(folio) + (i << blkbits), 1 << blkbits, iomap); } } static void iomap_write_delalloc_punch(struct inode *inode, struct folio *folio, loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte, struct iomap *iomap, iomap_punch_t punch) { if (!folio_test_dirty(folio)) return; /* if dirty, punch up to offset */ if (start_byte > *punch_start_byte) { punch(inode, *punch_start_byte, start_byte - *punch_start_byte, iomap); } /* Punch non-dirty blocks within folio */ iomap_write_delalloc_ifs_punch(inode, folio, start_byte, end_byte, iomap, punch); /* * Make sure the next punch start is correctly bound to * the end of this data range, not the end of the folio. */ *punch_start_byte = min_t(loff_t, end_byte, folio_pos(folio) + folio_size(folio)); } /* * Scan the data range passed to us for dirty page cache folios. If we find a * dirty folio, punch out the preceding range and update the offset from which * the next punch will start from. * * We can punch out storage reservations under clean pages because they either * contain data that has been written back - in which case the delalloc punch * over that range is a no-op - or they have been read faults in which case they * contain zeroes and we can remove the delalloc backing range and any new * writes to those pages will do the normal hole filling operation... * * This makes the logic simple: we only need to keep the delalloc extents only * over the dirty ranges of the page cache. * * This function uses [start_byte, end_byte) intervals (i.e. open ended) to * simplify range iterations. */ static void iomap_write_delalloc_scan(struct inode *inode, loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte, struct iomap *iomap, iomap_punch_t punch) { while (start_byte < end_byte) { struct folio *folio; /* grab locked page */ folio = filemap_lock_folio(inode->i_mapping, start_byte >> PAGE_SHIFT); if (IS_ERR(folio)) { start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) + PAGE_SIZE; continue; } iomap_write_delalloc_punch(inode, folio, punch_start_byte, start_byte, end_byte, iomap, punch); /* move offset to start of next folio in range */ start_byte = folio_pos(folio) + folio_size(folio); folio_unlock(folio); folio_put(folio); } } /* * When a short write occurs, the filesystem might need to use ->iomap_end * to remove space reservations created in ->iomap_begin. * * For filesystems that use delayed allocation, there can be dirty pages over * the delalloc extent outside the range of a short write but still within the * delalloc extent allocated for this iomap if the write raced with page * faults. * * Punch out all the delalloc blocks in the range given except for those that * have dirty data still pending in the page cache - those are going to be * written and so must still retain the delalloc backing for writeback. * * The punch() callback *must* only punch delalloc extents in the range passed * to it. It must skip over all other types of extents in the range and leave * them completely unchanged. It must do this punch atomically with respect to * other extent modifications. * * The punch() callback may be called with a folio locked to prevent writeback * extent allocation racing at the edge of the range we are currently punching. * The locked folio may or may not cover the range being punched, so it is not * safe for the punch() callback to lock folios itself. * * Lock order is: * * inode->i_rwsem (shared or exclusive) * inode->i_mapping->invalidate_lock (exclusive) * folio_lock() * ->punch * internal filesystem allocation lock * * As we are scanning the page cache for data, we don't need to reimplement the * wheel - mapping_seek_hole_data() does exactly what we need to identify the * start and end of data ranges correctly even for sub-folio block sizes. This * byte range based iteration is especially convenient because it means we * don't have to care about variable size folios, nor where the start or end of * the data range lies within a folio, if they lie within the same folio or even * if there are multiple discontiguous data ranges within the folio. * * It should be noted that mapping_seek_hole_data() is not aware of EOF, and so * can return data ranges that exist in the cache beyond EOF. e.g. a page fault * spanning EOF will initialise the post-EOF data to zeroes and mark it up to * date. A write page fault can then mark it dirty. If we then fail a write() * beyond EOF into that up to date cached range, we allocate a delalloc block * beyond EOF and then have to punch it out. Because the range is up to date, * mapping_seek_hole_data() will return it, and we will skip the punch because * the folio is dirty. THis is incorrect - we always need to punch out delalloc * beyond EOF in this case as writeback will never write back and covert that * delalloc block beyond EOF. Hence we limit the cached data scan range to EOF, * resulting in always punching out the range from the EOF to the end of the * range the iomap spans. * * Intervals are of the form [start_byte, end_byte) (i.e. open ended) because it * matches the intervals returned by mapping_seek_hole_data(). i.e. SEEK_DATA * returns the start of a data range (start_byte), and SEEK_HOLE(start_byte) * returns the end of the data range (data_end). Using closed intervals would * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose * the code to subtle off-by-one bugs.... */ void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, loff_t end_byte, unsigned flags, struct iomap *iomap, iomap_punch_t punch) { loff_t punch_start_byte = start_byte; loff_t scan_end_byte = min(i_size_read(inode), end_byte); /* * The caller must hold invalidate_lock to avoid races with page faults * re-instantiating folios and dirtying them via ->page_mkwrite whilst * we walk the cache and perform delalloc extent removal. Failing to do * this can leave dirty pages with no space reservation in the cache. */ lockdep_assert_held_write(&inode->i_mapping->invalidate_lock); while (start_byte < scan_end_byte) { loff_t data_end; start_byte = mapping_seek_hole_data(inode->i_mapping, start_byte, scan_end_byte, SEEK_DATA); /* * If there is no more data to scan, all that is left is to * punch out the remaining range. * * Note that mapping_seek_hole_data is only supposed to return * either an offset or -ENXIO, so WARN on any other error as * that would be an API change without updating the callers. */ if (start_byte == -ENXIO || start_byte == scan_end_byte) break; if (WARN_ON_ONCE(start_byte < 0)) return; WARN_ON_ONCE(start_byte < punch_start_byte); WARN_ON_ONCE(start_byte > scan_end_byte); /* * We find the end of this contiguous cached data range by * seeking from start_byte to the beginning of the next hole. */ data_end = mapping_seek_hole_data(inode->i_mapping, start_byte, scan_end_byte, SEEK_HOLE); if (WARN_ON_ONCE(data_end < 0)) return; /* * If we race with post-direct I/O invalidation of the page cache, * there might be no data left at start_byte. */ if (data_end == start_byte) continue; WARN_ON_ONCE(data_end < start_byte); WARN_ON_ONCE(data_end > scan_end_byte); iomap_write_delalloc_scan(inode, &punch_start_byte, start_byte, data_end, iomap, punch); /* The next data search starts at the end of this one. */ start_byte = data_end; } if (punch_start_byte < end_byte) punch(inode, punch_start_byte, end_byte - punch_start_byte, iomap); } EXPORT_SYMBOL_GPL(iomap_write_delalloc_release); static loff_t iomap_unshare_iter(struct iomap_iter *iter) { struct iomap *iomap = &iter->iomap; loff_t pos = iter->pos; loff_t length = iomap_length(iter); loff_t written = 0; if (!iomap_want_unshare_iter(iter)) return length; do { struct folio *folio; int status; size_t offset; size_t bytes = min_t(u64, SIZE_MAX, length); bool ret; status = iomap_write_begin(iter, pos, bytes, &folio); if (unlikely(status)) return status; if (iomap->flags & IOMAP_F_STALE) break; offset = offset_in_folio(folio, pos); if (bytes > folio_size(folio) - offset) bytes = folio_size(folio) - offset; ret = iomap_write_end(iter, pos, bytes, bytes, folio); __iomap_put_folio(iter, pos, bytes, folio); if (WARN_ON_ONCE(!ret)) return -EIO; cond_resched(); pos += bytes; written += bytes; length -= bytes; balance_dirty_pages_ratelimited(iter->inode->i_mapping); } while (length > 0); return written; } int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops) { struct iomap_iter iter = { .inode = inode, .pos = pos, .flags = IOMAP_WRITE | IOMAP_UNSHARE, }; loff_t size = i_size_read(inode); int ret; if (pos < 0 || pos >= size) return 0; iter.len = min(len, size - pos); while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = iomap_unshare_iter(&iter); return ret; } EXPORT_SYMBOL_GPL(iomap_file_unshare); /* * Flush the remaining range of the iter and mark the current mapping stale. * This is used when zero range sees an unwritten mapping that may have had * dirty pagecache over it. */ static inline int iomap_zero_iter_flush_and_stale(struct iomap_iter *i) { struct address_space *mapping = i->inode->i_mapping; loff_t end = i->pos + i->len - 1; i->iomap.flags |= IOMAP_F_STALE; return filemap_write_and_wait_range(mapping, i->pos, end); } static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) { loff_t pos = iter->pos; loff_t length = iomap_length(iter); loff_t written = 0; do { struct folio *folio; int status; size_t offset; size_t bytes = min_t(u64, SIZE_MAX, length); bool ret; status = iomap_write_begin(iter, pos, bytes, &folio); if (status) return status; if (iter->iomap.flags & IOMAP_F_STALE) break; /* warn about zeroing folios beyond eof that won't write back */ WARN_ON_ONCE(folio_pos(folio) > iter->inode->i_size); offset = offset_in_folio(folio, pos); if (bytes > folio_size(folio) - offset) bytes = folio_size(folio) - offset; folio_zero_range(folio, offset, bytes); folio_mark_accessed(folio); ret = iomap_write_end(iter, pos, bytes, bytes, folio); __iomap_put_folio(iter, pos, bytes, folio); if (WARN_ON_ONCE(!ret)) return -EIO; pos += bytes; length -= bytes; written += bytes; } while (length > 0); if (did_zero) *did_zero = true; return written; } int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops) { struct iomap_iter iter = { .inode = inode, .pos = pos, .len = len, .flags = IOMAP_ZERO, }; struct address_space *mapping = inode->i_mapping; unsigned int blocksize = i_blocksize(inode); unsigned int off = pos & (blocksize - 1); loff_t plen = min_t(loff_t, len, blocksize - off); int ret; bool range_dirty; /* * Zero range can skip mappings that are zero on disk so long as * pagecache is clean. If pagecache was dirty prior to zero range, the * mapping converts on writeback completion and so must be zeroed. * * The simplest way to deal with this across a range is to flush * pagecache and process the updated mappings. To avoid excessive * flushing on partial eof zeroing, special case it to zero the * unaligned start portion if already dirty in pagecache. */ if (off && filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) { iter.len = plen; while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = iomap_zero_iter(&iter, did_zero); iter.len = len - (iter.pos - pos); if (ret || !iter.len) return ret; } /* * To avoid an unconditional flush, check pagecache state and only flush * if dirty and the fs returns a mapping that might convert on * writeback. */ range_dirty = filemap_range_needs_writeback(inode->i_mapping, iter.pos, iter.pos + iter.len - 1); while ((ret = iomap_iter(&iter, ops)) > 0) { const struct iomap *srcmap = iomap_iter_srcmap(&iter); if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) { loff_t proc = iomap_length(&iter); if (range_dirty) { range_dirty = false; proc = iomap_zero_iter_flush_and_stale(&iter); } iter.processed = proc; continue; } iter.processed = iomap_zero_iter(&iter, did_zero); } return ret; } EXPORT_SYMBOL_GPL(iomap_zero_range); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops) { unsigned int blocksize = i_blocksize(inode); unsigned int off = pos & (blocksize - 1); /* Block boundary? Nothing to do */ if (!off) return 0; return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops); } EXPORT_SYMBOL_GPL(iomap_truncate_page); static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter, struct folio *folio) { loff_t length = iomap_length(iter); int ret; if (iter->iomap.flags & IOMAP_F_BUFFER_HEAD) { ret = __block_write_begin_int(folio, iter->pos, length, NULL, &iter->iomap); if (ret) return ret; block_commit_write(&folio->page, 0, length); } else { WARN_ON_ONCE(!folio_test_uptodate(folio)); folio_mark_dirty(folio); } return length; } vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) { struct iomap_iter iter = { .inode = file_inode(vmf->vma->vm_file), .flags = IOMAP_WRITE | IOMAP_FAULT, }; struct folio *folio = page_folio(vmf->page); ssize_t ret; folio_lock(folio); ret = folio_mkwrite_check_truncate(folio, iter.inode); if (ret < 0) goto out_unlock; iter.pos = folio_pos(folio); iter.len = ret; while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = iomap_folio_mkwrite_iter(&iter, folio); if (ret < 0) goto out_unlock; folio_wait_stable(folio); return VM_FAULT_LOCKED; out_unlock: folio_unlock(folio); return vmf_fs_error(ret); } EXPORT_SYMBOL_GPL(iomap_page_mkwrite); static void iomap_finish_folio_write(struct inode *inode, struct folio *folio, size_t len) { struct iomap_folio_state *ifs = folio->private; WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs); WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0); if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending)) folio_end_writeback(folio); } /* * We're now finished for good with this ioend structure. Update the page * state, release holds on bios, and finally free up memory. Do not use the * ioend after this. */ static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error) { struct inode *inode = ioend->io_inode; struct bio *bio = &ioend->io_bio; struct folio_iter fi; u32 folio_count = 0; if (error) { mapping_set_error(inode->i_mapping, error); if (!bio_flagged(bio, BIO_QUIET)) { pr_err_ratelimited( "%s: writeback error on inode %lu, offset %lld, sector %llu", inode->i_sb->s_id, inode->i_ino, ioend->io_offset, ioend->io_sector); } } /* walk all folios in bio, ending page IO on them */ bio_for_each_folio_all(fi, bio) { iomap_finish_folio_write(inode, fi.folio, fi.length); folio_count++; } bio_put(bio); /* frees the ioend */ return folio_count; } /* * Ioend completion routine for merged bios. This can only be called from task * contexts as merged ioends can be of unbound length. Hence we have to break up * the writeback completions into manageable chunks to avoid long scheduler * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get * good batch processing throughput without creating adverse scheduler latency * conditions. */ void iomap_finish_ioends(struct iomap_ioend *ioend, int error) { struct list_head tmp; u32 completions; might_sleep(); list_replace_init(&ioend->io_list, &tmp); completions = iomap_finish_ioend(ioend, error); while (!list_empty(&tmp)) { if (completions > IOEND_BATCH_SIZE * 8) { cond_resched(); completions = 0; } ioend = list_first_entry(&tmp, struct iomap_ioend, io_list); list_del_init(&ioend->io_list); completions += iomap_finish_ioend(ioend, error); } } EXPORT_SYMBOL_GPL(iomap_finish_ioends); /* * We can merge two adjacent ioends if they have the same set of work to do. */ static bool iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next) { if (ioend->io_bio.bi_status != next->io_bio.bi_status) return false; if (next->io_flags & IOMAP_F_BOUNDARY) return false; if ((ioend->io_flags & IOMAP_F_SHARED) ^ (next->io_flags & IOMAP_F_SHARED)) return false; if ((ioend->io_type == IOMAP_UNWRITTEN) ^ (next->io_type == IOMAP_UNWRITTEN)) return false; if (ioend->io_offset + ioend->io_size != next->io_offset) return false; /* * Do not merge physically discontiguous ioends. The filesystem * completion functions will have to iterate the physical * discontiguities even if we merge the ioends at a logical level, so * we don't gain anything by merging physical discontiguities here. * * We cannot use bio->bi_iter.bi_sector here as it is modified during * submission so does not point to the start sector of the bio at * completion. */ if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector) return false; return true; } void iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends) { struct iomap_ioend *next; INIT_LIST_HEAD(&ioend->io_list); while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend, io_list))) { if (!iomap_ioend_can_merge(ioend, next)) break; list_move_tail(&next->io_list, &ioend->io_list); ioend->io_size += next->io_size; } } EXPORT_SYMBOL_GPL(iomap_ioend_try_merge); static int iomap_ioend_compare(void *priv, const struct list_head *a, const struct list_head *b) { struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list); struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list); if (ia->io_offset < ib->io_offset) return -1; if (ia->io_offset > ib->io_offset) return 1; return 0; } void iomap_sort_ioends(struct list_head *ioend_list) { list_sort(NULL, ioend_list, iomap_ioend_compare); } EXPORT_SYMBOL_GPL(iomap_sort_ioends); static void iomap_writepage_end_bio(struct bio *bio) { iomap_finish_ioend(iomap_ioend_from_bio(bio), blk_status_to_errno(bio->bi_status)); } /* * Submit the final bio for an ioend. * * If @error is non-zero, it means that we have a situation where some part of * the submission process has failed after we've marked pages for writeback. * We cannot cancel ioend directly in that case, so call the bio end I/O handler * with the error status here to run the normal I/O completion handler to clear * the writeback bit and let the file system proess the errors. */ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error) { if (!wpc->ioend) return error; /* * Let the file systems prepare the I/O submission and hook in an I/O * comletion handler. This also needs to happen in case after a * failure happened so that the file system end I/O handler gets called * to clean up. */ if (wpc->ops->prepare_ioend) error = wpc->ops->prepare_ioend(wpc->ioend, error); if (error) { wpc->ioend->io_bio.bi_status = errno_to_blk_status(error); bio_endio(&wpc->ioend->io_bio); } else { submit_bio(&wpc->ioend->io_bio); } wpc->ioend = NULL; return error; } static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct inode *inode, loff_t pos) { struct iomap_ioend *ioend; struct bio *bio; bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS, REQ_OP_WRITE | wbc_to_write_flags(wbc), GFP_NOFS, &iomap_ioend_bioset); bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos); bio->bi_end_io = iomap_writepage_end_bio; wbc_init_bio(wbc, bio); bio->bi_write_hint = inode->i_write_hint; ioend = iomap_ioend_from_bio(bio); INIT_LIST_HEAD(&ioend->io_list); ioend->io_type = wpc->iomap.type; ioend->io_flags = wpc->iomap.flags; if (pos > wpc->iomap.offset) wpc->iomap.flags &= ~IOMAP_F_BOUNDARY; ioend->io_inode = inode; ioend->io_size = 0; ioend->io_offset = pos; ioend->io_sector = bio->bi_iter.bi_sector; wpc->nr_folios = 0; return ioend; } static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos) { if (wpc->iomap.offset == pos && (wpc->iomap.flags & IOMAP_F_BOUNDARY)) return false; if ((wpc->iomap.flags & IOMAP_F_SHARED) != (wpc->ioend->io_flags & IOMAP_F_SHARED)) return false; if (wpc->iomap.type != wpc->ioend->io_type) return false; if (pos != wpc->ioend->io_offset + wpc->ioend->io_size) return false; if (iomap_sector(&wpc->iomap, pos) != bio_end_sector(&wpc->ioend->io_bio)) return false; /* * Limit ioend bio chain lengths to minimise IO completion latency. This * also prevents long tight loops ending page writeback on all the * folios in the ioend. */ if (wpc->nr_folios >= IOEND_BATCH_SIZE) return false; return true; } /* * Test to see if we have an existing ioend structure that we could append to * first; otherwise finish off the current ioend and start another. * * If a new ioend is created and cached, the old ioend is submitted to the block * layer instantly. Batching optimisations are provided by higher level block * plugging. * * At the end of a writeback pass, there will be a cached ioend remaining on the * writepage context that the caller will need to submit. */ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct folio *folio, struct inode *inode, loff_t pos, loff_t end_pos, unsigned len) { struct iomap_folio_state *ifs = folio->private; size_t poff = offset_in_folio(folio, pos); int error; if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) { new_ioend: error = iomap_submit_ioend(wpc, 0); if (error) return error; wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos); } if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff)) goto new_ioend; if (ifs) atomic_add(len, &ifs->write_bytes_pending); /* * Clamp io_offset and io_size to the incore EOF so that ondisk * file size updates in the ioend completion are byte-accurate. * This avoids recovering files with zeroed tail regions when * writeback races with appending writes: * * Thread 1: Thread 2: * ------------ ----------- * write [A, A+B] * update inode size to A+B * submit I/O [A, A+BS] * write [A+B, A+B+C] * update inode size to A+B+C * <I/O completes, updates disk size to min(A+B+C, A+BS)> * <power failure> * * After reboot: * 1) with A+B+C < A+BS, the file has zero padding in range * [A+B, A+B+C] * * |< Block Size (BS) >| * |DDDDDDDDDDDD0000000000000| * ^ ^ ^ * A A+B A+B+C * (EOF) * * 2) with A+B+C > A+BS, the file has zero padding in range * [A+B, A+BS] * * |< Block Size (BS) >|< Block Size (BS) >| * |DDDDDDDDDDDD0000000000000|00000000000000000000000000| * ^ ^ ^ ^ * A A+B A+BS A+B+C * (EOF) * * D = Valid Data * 0 = Zero Padding * * Note that this defeats the ability to chain the ioends of * appending writes. */ wpc->ioend->io_size += len; if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos) wpc->ioend->io_size = end_pos - wpc->ioend->io_offset; wbc_account_cgroup_owner(wbc, folio, len); return 0; } static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct folio *folio, struct inode *inode, u64 pos, u64 end_pos, unsigned dirty_len, unsigned *count) { int error; do { unsigned map_len; error = wpc->ops->map_blocks(wpc, inode, pos, dirty_len); if (error) break; trace_iomap_writepage_map(inode, pos, dirty_len, &wpc->iomap); map_len = min_t(u64, dirty_len, wpc->iomap.offset + wpc->iomap.length - pos); WARN_ON_ONCE(!folio->private && map_len < dirty_len); switch (wpc->iomap.type) { case IOMAP_INLINE: WARN_ON_ONCE(1); error = -EIO; break; case IOMAP_HOLE: break; default: error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos, end_pos, map_len); if (!error) (*count)++; break; } dirty_len -= map_len; pos += map_len; } while (dirty_len && !error); /* * We cannot cancel the ioend directly here on error. We may have * already set other pages under writeback and hence we have to run I/O * completion to mark the error state of the pages under writeback * appropriately. * * Just let the file system know what portion of the folio failed to * map. */ if (error && wpc->ops->discard_folio) wpc->ops->discard_folio(folio, pos); return error; } /* * Check interaction of the folio with the file end. * * If the folio is entirely beyond i_size, return false. If it straddles * i_size, adjust end_pos and zero all data beyond i_size. */ static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode, u64 *end_pos) { u64 isize = i_size_read(inode); if (*end_pos > isize) { size_t poff = offset_in_folio(folio, isize); pgoff_t end_index = isize >> PAGE_SHIFT; /* * If the folio is entirely ouside of i_size, skip it. * * This can happen due to a truncate operation that is in * progress and in that case truncate will finish it off once * we've dropped the folio lock. * * Note that the pgoff_t used for end_index is an unsigned long. * If the given offset is greater than 16TB on a 32-bit system, * then if we checked if the folio is fully outside i_size with * "if (folio->index >= end_index + 1)", "end_index + 1" would * overflow and evaluate to 0. Hence this folio would be * redirtied and written out repeatedly, which would result in * an infinite loop; the user program performing this operation * would hang. Instead, we can detect this situation by * checking if the folio is totally beyond i_size or if its * offset is just equal to the EOF. */ if (folio->index > end_index || (folio->index == end_index && poff == 0)) return false; /* * The folio straddles i_size. * * It must be zeroed out on each and every writepage invocation * because it may be mmapped: * * A file is mapped in multiples of the page size. For a * file that is not a multiple of the page size, the * remaining memory is zeroed when mapped, and writes to that * region are not written out to the file. * * Also adjust the end_pos to the end of file and skip writeback * for all blocks entirely beyond i_size. */ folio_zero_segment(folio, poff, folio_size(folio)); *end_pos = isize; } return true; } static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct folio *folio) { struct iomap_folio_state *ifs = folio->private; struct inode *inode = folio->mapping->host; u64 pos = folio_pos(folio); u64 end_pos = pos + folio_size(folio); u64 end_aligned = 0; unsigned count = 0; int error = 0; u32 rlen; WARN_ON_ONCE(!folio_test_locked(folio)); WARN_ON_ONCE(folio_test_dirty(folio)); WARN_ON_ONCE(folio_test_writeback(folio)); trace_iomap_writepage(inode, pos, folio_size(folio)); if (!iomap_writepage_handle_eof(folio, inode, &end_pos)) { folio_unlock(folio); return 0; } WARN_ON_ONCE(end_pos <= pos); if (i_blocks_per_folio(inode, folio) > 1) { if (!ifs) { ifs = ifs_alloc(inode, folio, 0); iomap_set_range_dirty(folio, 0, end_pos - pos); } /* * Keep the I/O completion handler from clearing the writeback * bit until we have submitted all blocks by adding a bias to * ifs->write_bytes_pending, which is dropped after submitting * all blocks. */ WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0); atomic_inc(&ifs->write_bytes_pending); } /* * Set the writeback bit ASAP, as the I/O completion for the single * block per folio case happen hit as soon as we're submitting the bio. */ folio_start_writeback(folio); /* * Walk through the folio to find dirty areas to write back. */ end_aligned = round_up(end_pos, i_blocksize(inode)); while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) { error = iomap_writepage_map_blocks(wpc, wbc, folio, inode, pos, end_pos, rlen, &count); if (error) break; pos += rlen; } if (count) wpc->nr_folios++; /* * We can have dirty bits set past end of file in page_mkwrite path * while mapping the last partial folio. Hence it's better to clear * all the dirty bits in the folio here. */ iomap_clear_range_dirty(folio, 0, folio_size(folio)); /* * Usually the writeback bit is cleared by the I/O completion handler. * But we may end up either not actually writing any blocks, or (when * there are multiple blocks in a folio) all I/O might have finished * already at this point. In that case we need to clear the writeback * bit ourselves right after unlocking the page. */ folio_unlock(folio); if (ifs) { if (atomic_dec_and_test(&ifs->write_bytes_pending)) folio_end_writeback(folio); } else { if (!count) folio_end_writeback(folio); } mapping_set_error(inode->i_mapping, error); return error; } int iomap_writepages(struct address_space *mapping, struct writeback_control *wbc, struct iomap_writepage_ctx *wpc, const struct iomap_writeback_ops *ops) { struct folio *folio = NULL; int error; /* * Writeback from reclaim context should never happen except in the case * of a VM regression so warn about it and refuse to write the data. */ if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC | PF_KSWAPD)) == PF_MEMALLOC)) return -EIO; wpc->ops = ops; while ((folio = writeback_iter(mapping, wbc, folio, &error))) error = iomap_writepage_map(wpc, wbc, folio); return iomap_submit_ioend(wpc, error); } EXPORT_SYMBOL_GPL(iomap_writepages); static int __init iomap_buffered_init(void) { return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), offsetof(struct iomap_ioend, io_bio), BIOSET_NEED_BVECS); } fs_initcall(iomap_buffered_init);
1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 // SPDX-License-Identifier: GPL-2.0-only /* * inode.c - securityfs * * Copyright (C) 2005 Greg Kroah-Hartman <gregkh@suse.de> * * Based on fs/debugfs/inode.c which had the following copyright notice: * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2004 IBM Inc. */ /* #define DEBUG */ #include <linux/sysfs.h> #include <linux/kobject.h> #include <linux/fs.h> #include <linux/fs_context.h> #include <linux/mount.h> #include <linux/pagemap.h> #include <linux/init.h> #include <linux/namei.h> #include <linux/security.h> #include <linux/lsm_hooks.h> #include <linux/magic.h> static struct vfsmount *mount; static int mount_count; static void securityfs_free_inode(struct inode *inode) { if (S_ISLNK(inode->i_mode)) kfree(inode->i_link); free_inode_nonrcu(inode); } static const struct super_operations securityfs_super_operations = { .statfs = simple_statfs, .free_inode = securityfs_free_inode, }; static int securityfs_fill_super(struct super_block *sb, struct fs_context *fc) { static const struct tree_descr files[] = {{""}}; int error; error = simple_fill_super(sb, SECURITYFS_MAGIC, files); if (error) return error; sb->s_op = &securityfs_super_operations; return 0; } static int securityfs_get_tree(struct fs_context *fc) { return get_tree_single(fc, securityfs_fill_super); } static const struct fs_context_operations securityfs_context_ops = { .get_tree = securityfs_get_tree, }; static int securityfs_init_fs_context(struct fs_context *fc) { fc->ops = &securityfs_context_ops; return 0; } static struct file_system_type fs_type = { .owner = THIS_MODULE, .name = "securityfs", .init_fs_context = securityfs_init_fs_context, .kill_sb = kill_litter_super, }; /** * securityfs_create_dentry - create a dentry in the securityfs filesystem * * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the securityfs filesystem. * @data: a pointer to something that the caller will want to get to later * on. The inode.i_private pointer will point to this value on * the open() call. * @fops: a pointer to a struct file_operations that should be used for * this file. * @iops: a point to a struct of inode_operations that should be used for * this file/dir * * This is the basic "create a file/dir/symlink" function for * securityfs. It allows for a wide range of flexibility in creating * a file, or a directory (if you want to create a directory, the * securityfs_create_dir() function is recommended to be used * instead). * * This function returns a pointer to a dentry if it succeeds. This * pointer must be passed to the securityfs_remove() function when the * file is to be removed (no automatic cleanup happens if your module * is unloaded, you are responsible here). If an error occurs, the * function will return the error value (via ERR_PTR). * * If securityfs is not enabled in the kernel, the value %-ENODEV is * returned. */ static struct dentry *securityfs_create_dentry(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops, const struct inode_operations *iops) { struct dentry *dentry; struct inode *dir, *inode; int error; if (!(mode & S_IFMT)) mode = (mode & S_IALLUGO) | S_IFREG; pr_debug("securityfs: creating file '%s'\n",name); error = simple_pin_fs(&fs_type, &mount, &mount_count); if (error) return ERR_PTR(error); if (!parent) parent = mount->mnt_root; dir = d_inode(parent); inode_lock(dir); dentry = lookup_one_len(name, parent, strlen(name)); if (IS_ERR(dentry)) goto out; if (d_really_is_positive(dentry)) { error = -EEXIST; goto out1; } inode = new_inode(dir->i_sb); if (!inode) { error = -ENOMEM; goto out1; } inode->i_ino = get_next_ino(); inode->i_mode = mode; simple_inode_init_ts(inode); inode->i_private = data; if (S_ISDIR(mode)) { inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; inc_nlink(inode); inc_nlink(dir); } else if (S_ISLNK(mode)) { inode->i_op = iops ? iops : &simple_symlink_inode_operations; inode->i_link = data; } else { inode->i_fop = fops; } d_instantiate(dentry, inode); dget(dentry); inode_unlock(dir); return dentry; out1: dput(dentry); dentry = ERR_PTR(error); out: inode_unlock(dir); simple_release_fs(&mount, &mount_count); return dentry; } /** * securityfs_create_file - create a file in the securityfs filesystem * * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the securityfs filesystem. * @data: a pointer to something that the caller will want to get to later * on. The inode.i_private pointer will point to this value on * the open() call. * @fops: a pointer to a struct file_operations that should be used for * this file. * * This function creates a file in securityfs with the given @name. * * This function returns a pointer to a dentry if it succeeds. This * pointer must be passed to the securityfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, * you are responsible here). If an error occurs, the function will return * the error value (via ERR_PTR). * * If securityfs is not enabled in the kernel, the value %-ENODEV is * returned. */ struct dentry *securityfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { return securityfs_create_dentry(name, mode, parent, data, fops, NULL); } EXPORT_SYMBOL_GPL(securityfs_create_file); /** * securityfs_create_dir - create a directory in the securityfs filesystem * * @name: a pointer to a string containing the name of the directory to * create. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * directory will be created in the root of the securityfs filesystem. * * This function creates a directory in securityfs with the given @name. * * This function returns a pointer to a dentry if it succeeds. This * pointer must be passed to the securityfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, * you are responsible here). If an error occurs, the function will return * the error value (via ERR_PTR). * * If securityfs is not enabled in the kernel, the value %-ENODEV is * returned. */ struct dentry *securityfs_create_dir(const char *name, struct dentry *parent) { return securityfs_create_file(name, S_IFDIR | 0755, parent, NULL, NULL); } EXPORT_SYMBOL_GPL(securityfs_create_dir); /** * securityfs_create_symlink - create a symlink in the securityfs filesystem * * @name: a pointer to a string containing the name of the symlink to * create. * @parent: a pointer to the parent dentry for the symlink. This should be a * directory dentry if set. If this parameter is %NULL, then the * directory will be created in the root of the securityfs filesystem. * @target: a pointer to a string containing the name of the symlink's target. * If this parameter is %NULL, then the @iops parameter needs to be * setup to handle .readlink and .get_link inode_operations. * @iops: a pointer to the struct inode_operations to use for the symlink. If * this parameter is %NULL, then the default simple_symlink_inode * operations will be used. * * This function creates a symlink in securityfs with the given @name. * * This function returns a pointer to a dentry if it succeeds. This * pointer must be passed to the securityfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, * you are responsible here). If an error occurs, the function will return * the error value (via ERR_PTR). * * If securityfs is not enabled in the kernel, the value %-ENODEV is * returned. */ struct dentry *securityfs_create_symlink(const char *name, struct dentry *parent, const char *target, const struct inode_operations *iops) { struct dentry *dent; char *link = NULL; if (target) { link = kstrdup(target, GFP_KERNEL); if (!link) return ERR_PTR(-ENOMEM); } dent = securityfs_create_dentry(name, S_IFLNK | 0444, parent, link, NULL, iops); if (IS_ERR(dent)) kfree(link); return dent; } EXPORT_SYMBOL_GPL(securityfs_create_symlink); /** * securityfs_remove - removes a file or directory from the securityfs filesystem * * @dentry: a pointer to a the dentry of the file or directory to be removed. * * This function removes a file or directory in securityfs that was previously * created with a call to another securityfs function (like * securityfs_create_file() or variants thereof.) * * This function is required to be called in order for the file to be * removed. No automatic cleanup of files will happen when a module is * removed; you are responsible here. */ void securityfs_remove(struct dentry *dentry) { struct inode *dir; if (IS_ERR_OR_NULL(dentry)) return; dir = d_inode(dentry->d_parent); inode_lock(dir); if (simple_positive(dentry)) { if (d_is_dir(dentry)) simple_rmdir(dir, dentry); else simple_unlink(dir, dentry); dput(dentry); } inode_unlock(dir); simple_release_fs(&mount, &mount_count); } EXPORT_SYMBOL_GPL(securityfs_remove); static void remove_one(struct dentry *victim) { simple_release_fs(&mount, &mount_count); } /** * securityfs_recursive_remove - recursively removes a file or directory * * @dentry: a pointer to a the dentry of the file or directory to be removed. * * This function recursively removes a file or directory in securityfs that was * previously created with a call to another securityfs function (like * securityfs_create_file() or variants thereof.) */ void securityfs_recursive_remove(struct dentry *dentry) { if (IS_ERR_OR_NULL(dentry)) return; simple_pin_fs(&fs_type, &mount, &mount_count); simple_recursive_removal(dentry, remove_one); simple_release_fs(&mount, &mount_count); } EXPORT_SYMBOL_GPL(securityfs_recursive_remove); #ifdef CONFIG_SECURITY static struct dentry *lsm_dentry; static ssize_t lsm_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { return simple_read_from_buffer(buf, count, ppos, lsm_names, strlen(lsm_names)); } static const struct file_operations lsm_ops = { .read = lsm_read, .llseek = generic_file_llseek, }; #endif static int __init securityfs_init(void) { int retval; retval = sysfs_create_mount_point(kernel_kobj, "security"); if (retval) return retval; retval = register_filesystem(&fs_type); if (retval) { sysfs_remove_mount_point(kernel_kobj, "security"); return retval; } #ifdef CONFIG_SECURITY lsm_dentry = securityfs_create_file("lsm", 0444, NULL, NULL, &lsm_ops); #endif return 0; } core_initcall(securityfs_init);
7 3 4 4 6 6 9 7 2 6 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/symlink.c * * Only fast symlinks left here - the rest is done by generic code. AV, 1999 * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/symlink.c * * Copyright (C) 1991, 1992 Linus Torvalds * * ext4 symlink handling code */ #include <linux/fs.h> #include <linux/namei.h> #include "ext4.h" #include "xattr.h" static const char *ext4_encrypted_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct buffer_head *bh = NULL; const void *caddr; unsigned int max_size; const char *paddr; if (!dentry) return ERR_PTR(-ECHILD); if (ext4_inode_is_fast_symlink(inode)) { caddr = EXT4_I(inode)->i_data; max_size = sizeof(EXT4_I(inode)->i_data); } else { bh = ext4_bread(NULL, inode, 0, 0); if (IS_ERR(bh)) return ERR_CAST(bh); if (!bh) { EXT4_ERROR_INODE(inode, "bad symlink."); return ERR_PTR(-EFSCORRUPTED); } caddr = bh->b_data; max_size = inode->i_sb->s_blocksize; } paddr = fscrypt_get_symlink(inode, caddr, max_size, done); brelse(bh); return paddr; } static int ext4_encrypted_symlink_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { ext4_getattr(idmap, path, stat, request_mask, query_flags); return fscrypt_symlink_getattr(path, stat); } static void ext4_free_link(void *bh) { brelse(bh); } static const char *ext4_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *callback) { struct buffer_head *bh; char *inline_link; /* * Create a new inlined symlink is not supported, just provide a * method to read the leftovers. */ if (ext4_has_inline_data(inode)) { if (!dentry) return ERR_PTR(-ECHILD); inline_link = ext4_read_inline_link(inode); if (!IS_ERR(inline_link)) set_delayed_call(callback, kfree_link, inline_link); return inline_link; } if (!dentry) { bh = ext4_getblk(NULL, inode, 0, EXT4_GET_BLOCKS_CACHED_NOWAIT); if (IS_ERR(bh) || !bh) return ERR_PTR(-ECHILD); if (!ext4_buffer_uptodate(bh)) { brelse(bh); return ERR_PTR(-ECHILD); } } else { bh = ext4_bread(NULL, inode, 0, 0); if (IS_ERR(bh)) return ERR_CAST(bh); if (!bh) { EXT4_ERROR_INODE(inode, "bad symlink."); return ERR_PTR(-EFSCORRUPTED); } } set_delayed_call(callback, ext4_free_link, bh); nd_terminate_link(bh->b_data, inode->i_size, inode->i_sb->s_blocksize - 1); return bh->b_data; } const struct inode_operations ext4_encrypted_symlink_inode_operations = { .get_link = ext4_encrypted_get_link, .setattr = ext4_setattr, .getattr = ext4_encrypted_symlink_getattr, .listxattr = ext4_listxattr, }; const struct inode_operations ext4_symlink_inode_operations = { .get_link = ext4_get_link, .setattr = ext4_setattr, .getattr = ext4_getattr, .listxattr = ext4_listxattr, }; const struct inode_operations ext4_fast_symlink_inode_operations = { .get_link = simple_get_link, .setattr = ext4_setattr, .getattr = ext4_getattr, .listxattr = ext4_listxattr, };
228 154 156 143 207 207 18 195 65 18 8 40 47 65 26 40 29 2 27 4 4 30 26 4 22 22 22 12 9 3 17 9 8 31 25 12 2 4 30 25 3 3 6 24 4 6 17 26 26 9 8 10 18 18 103 24 11 33 90 81 90 90 49 23 24 40 11 34 47 18 87 23 78 4 71 19 19 19 70 3 78 3 3 3 9 70 1 1 10 2 10 10 5 10 2 8 10 90 90 90 2 90 2 81 77 79 90 3 1 67 20 81 29 27 6 27 25 69 41 86 28 9 73 80 47 34 80 80 14 67 90 90 90 80 74 31 1 80 1 3 80 24 29 34 29 24 1 24 29 25 28 28 1 1 1 1 28 80 74 68 62 29 36 1 1 25 67 46 47 1 80 54 6 1 46 46 3 65 65 65 5 9 2 7 10 10 23 23 28 28 9 49 1 6 160 157 3 3 160 42 116 40 118 57 118 2 114 42 42 156 96 75 52 118 3 156 108 90 21 20 22 18 77 1 76 28 48 10 66 63 12 64 76 2 4 29 47 24 24 24 24 75 1 23 42 5 1 4 1 1 76 34 42 1 173 9 1 168 2 2 1 1 1 1 1 1 77 36 78 1 1 77 1 18 15 69 162 162 4 4 162 55 154 228 116 228 2 2 228 196 191 119 222 4 60 126 101 207 46 2 47 47 47 41 41 28 28 41 41 41 215 215 143 41 25 207 207 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 // SPDX-License-Identifier: GPL-2.0-only /* * inode.c * * PURPOSE * Inode handling routines for the OSTA-UDF(tm) filesystem. * * COPYRIGHT * (C) 1998 Dave Boynton * (C) 1998-2004 Ben Fennema * (C) 1999-2000 Stelias Computing Inc * * HISTORY * * 10/04/98 dgb Added rudimentary directory functions * 10/07/98 Fully working udf_block_map! It works! * 11/25/98 bmap altered to better support extents * 12/06/98 blf partition support in udf_iget, udf_block_map * and udf_read_inode * 12/12/98 rewrote udf_block_map to handle next extents and descs across * block boundaries (which is not actually allowed) * 12/20/98 added support for strategy 4096 * 03/07/99 rewrote udf_block_map (again) * New funcs, inode_bmap, udf_next_aext * 04/19/99 Support for writing device EA's for major/minor # */ #include "udfdecl.h" #include <linux/mm.h> #include <linux/module.h> #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/slab.h> #include <linux/crc-itu-t.h> #include <linux/mpage.h> #include <linux/uio.h> #include <linux/bio.h> #include "udf_i.h" #include "udf_sb.h" #define EXTENT_MERGE_SIZE 5 #define FE_MAPPED_PERMS (FE_PERM_U_READ | FE_PERM_U_WRITE | FE_PERM_U_EXEC | \ FE_PERM_G_READ | FE_PERM_G_WRITE | FE_PERM_G_EXEC | \ FE_PERM_O_READ | FE_PERM_O_WRITE | FE_PERM_O_EXEC) #define FE_DELETE_PERMS (FE_PERM_U_DELETE | FE_PERM_G_DELETE | \ FE_PERM_O_DELETE) struct udf_map_rq; static umode_t udf_convert_permissions(struct fileEntry *); static int udf_update_inode(struct inode *, int); static int udf_sync_inode(struct inode *inode); static int udf_alloc_i_data(struct inode *inode, size_t size); static int inode_getblk(struct inode *inode, struct udf_map_rq *map); static int udf_insert_aext(struct inode *, struct extent_position, struct kernel_lb_addr, uint32_t); static void udf_split_extents(struct inode *, int *, int, udf_pblk_t, struct kernel_long_ad *, int *); static void udf_prealloc_extents(struct inode *, int, int, struct kernel_long_ad *, int *); static void udf_merge_extents(struct inode *, struct kernel_long_ad *, int *); static int udf_update_extents(struct inode *, struct kernel_long_ad *, int, int, struct extent_position *); static int udf_get_block_wb(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create); static void __udf_clear_extent_cache(struct inode *inode) { struct udf_inode_info *iinfo = UDF_I(inode); if (iinfo->cached_extent.lstart != -1) { brelse(iinfo->cached_extent.epos.bh); iinfo->cached_extent.lstart = -1; } } /* Invalidate extent cache */ static void udf_clear_extent_cache(struct inode *inode) { struct udf_inode_info *iinfo = UDF_I(inode); spin_lock(&iinfo->i_extent_cache_lock); __udf_clear_extent_cache(inode); spin_unlock(&iinfo->i_extent_cache_lock); } /* Return contents of extent cache */ static int udf_read_extent_cache(struct inode *inode, loff_t bcount, loff_t *lbcount, struct extent_position *pos) { struct udf_inode_info *iinfo = UDF_I(inode); int ret = 0; spin_lock(&iinfo->i_extent_cache_lock); if ((iinfo->cached_extent.lstart <= bcount) && (iinfo->cached_extent.lstart != -1)) { /* Cache hit */ *lbcount = iinfo->cached_extent.lstart; memcpy(pos, &iinfo->cached_extent.epos, sizeof(struct extent_position)); if (pos->bh) get_bh(pos->bh); ret = 1; } spin_unlock(&iinfo->i_extent_cache_lock); return ret; } /* Add extent to extent cache */ static void udf_update_extent_cache(struct inode *inode, loff_t estart, struct extent_position *pos) { struct udf_inode_info *iinfo = UDF_I(inode); spin_lock(&iinfo->i_extent_cache_lock); /* Invalidate previously cached extent */ __udf_clear_extent_cache(inode); if (pos->bh) get_bh(pos->bh); memcpy(&iinfo->cached_extent.epos, pos, sizeof(*pos)); iinfo->cached_extent.lstart = estart; switch (iinfo->i_alloc_type) { case ICBTAG_FLAG_AD_SHORT: iinfo->cached_extent.epos.offset -= sizeof(struct short_ad); break; case ICBTAG_FLAG_AD_LONG: iinfo->cached_extent.epos.offset -= sizeof(struct long_ad); break; } spin_unlock(&iinfo->i_extent_cache_lock); } void udf_evict_inode(struct inode *inode) { struct udf_inode_info *iinfo = UDF_I(inode); int want_delete = 0; if (!is_bad_inode(inode)) { if (!inode->i_nlink) { want_delete = 1; udf_setsize(inode, 0); udf_update_inode(inode, IS_SYNC(inode)); } if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && inode->i_size != iinfo->i_lenExtents) { udf_warn(inode->i_sb, "Inode %lu (mode %o) has inode size %llu different from extent length %llu. Filesystem need not be standards compliant.\n", inode->i_ino, inode->i_mode, (unsigned long long)inode->i_size, (unsigned long long)iinfo->i_lenExtents); } } truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); clear_inode(inode); kfree(iinfo->i_data); iinfo->i_data = NULL; udf_clear_extent_cache(inode); if (want_delete) { udf_free_inode(inode); } } static void udf_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; struct udf_inode_info *iinfo = UDF_I(inode); loff_t isize = inode->i_size; if (to > isize) { truncate_pagecache(inode, isize); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { down_write(&iinfo->i_data_sem); udf_clear_extent_cache(inode); udf_truncate_extents(inode); up_write(&iinfo->i_data_sem); } } } static int udf_adinicb_writepage(struct folio *folio, struct writeback_control *wbc, void *data) { struct inode *inode = folio->mapping->host; struct udf_inode_info *iinfo = UDF_I(inode); BUG_ON(!folio_test_locked(folio)); BUG_ON(folio->index != 0); memcpy_from_file_folio(iinfo->i_data + iinfo->i_lenEAttr, folio, 0, i_size_read(inode)); folio_unlock(folio); mark_inode_dirty(inode); return 0; } static int udf_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; struct udf_inode_info *iinfo = UDF_I(inode); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) return mpage_writepages(mapping, wbc, udf_get_block_wb); return write_cache_pages(mapping, wbc, udf_adinicb_writepage, NULL); } static void udf_adinicb_read_folio(struct folio *folio) { struct inode *inode = folio->mapping->host; struct udf_inode_info *iinfo = UDF_I(inode); loff_t isize = i_size_read(inode); folio_fill_tail(folio, 0, iinfo->i_data + iinfo->i_lenEAttr, isize); folio_mark_uptodate(folio); } static int udf_read_folio(struct file *file, struct folio *folio) { struct udf_inode_info *iinfo = UDF_I(file_inode(file)); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { udf_adinicb_read_folio(folio); folio_unlock(folio); return 0; } return mpage_read_folio(folio, udf_get_block); } static void udf_readahead(struct readahead_control *rac) { struct udf_inode_info *iinfo = UDF_I(rac->mapping->host); /* * No readahead needed for in-ICB files and udf_get_block() would get * confused for such file anyway. */ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) return; mpage_readahead(rac, udf_get_block); } static int udf_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { struct udf_inode_info *iinfo = UDF_I(file_inode(file)); struct folio *folio; int ret; if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { ret = block_write_begin(mapping, pos, len, foliop, udf_get_block); if (unlikely(ret)) udf_write_failed(mapping, pos + len); return ret; } if (WARN_ON_ONCE(pos >= PAGE_SIZE)) return -EIO; folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); *foliop = folio; if (!folio_test_uptodate(folio)) udf_adinicb_read_folio(folio); return 0; } static int udf_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { struct inode *inode = file_inode(file); loff_t last_pos; if (UDF_I(inode)->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) return generic_write_end(file, mapping, pos, len, copied, folio, fsdata); last_pos = pos + copied; if (last_pos > inode->i_size) i_size_write(inode, last_pos); folio_mark_dirty(folio); folio_unlock(folio); folio_put(folio); return copied; } static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); ssize_t ret; /* Fallback to buffered IO for in-ICB files */ if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) return 0; ret = blockdev_direct_IO(iocb, inode, iter, udf_get_block); if (unlikely(ret < 0 && iov_iter_rw(iter) == WRITE)) udf_write_failed(mapping, iocb->ki_pos + count); return ret; } static sector_t udf_bmap(struct address_space *mapping, sector_t block) { struct udf_inode_info *iinfo = UDF_I(mapping->host); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) return -EINVAL; return generic_block_bmap(mapping, block, udf_get_block); } const struct address_space_operations udf_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = udf_read_folio, .readahead = udf_readahead, .writepages = udf_writepages, .write_begin = udf_write_begin, .write_end = udf_write_end, .direct_IO = udf_direct_IO, .bmap = udf_bmap, .migrate_folio = buffer_migrate_folio, }; /* * Expand file stored in ICB to a normal one-block-file * * This function requires i_mutex held */ int udf_expand_file_adinicb(struct inode *inode) { struct folio *folio; struct udf_inode_info *iinfo = UDF_I(inode); int err; WARN_ON_ONCE(!inode_is_locked(inode)); if (!iinfo->i_lenAlloc) { down_write(&iinfo->i_data_sem); if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; else iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; up_write(&iinfo->i_data_sem); mark_inode_dirty(inode); return 0; } folio = __filemap_get_folio(inode->i_mapping, 0, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_KERNEL); if (IS_ERR(folio)) return PTR_ERR(folio); if (!folio_test_uptodate(folio)) udf_adinicb_read_folio(folio); down_write(&iinfo->i_data_sem); memset(iinfo->i_data + iinfo->i_lenEAttr, 0x00, iinfo->i_lenAlloc); iinfo->i_lenAlloc = 0; if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; else iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; folio_mark_dirty(folio); folio_unlock(folio); up_write(&iinfo->i_data_sem); err = filemap_fdatawrite(inode->i_mapping); if (err) { /* Restore everything back so that we don't lose data... */ folio_lock(folio); down_write(&iinfo->i_data_sem); memcpy_from_folio(iinfo->i_data + iinfo->i_lenEAttr, folio, 0, inode->i_size); folio_unlock(folio); iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; iinfo->i_lenAlloc = inode->i_size; up_write(&iinfo->i_data_sem); } folio_put(folio); mark_inode_dirty(inode); return err; } #define UDF_MAP_CREATE 0x01 /* Mapping can allocate new blocks */ #define UDF_MAP_NOPREALLOC 0x02 /* Do not preallocate blocks */ #define UDF_BLK_MAPPED 0x01 /* Block was successfully mapped */ #define UDF_BLK_NEW 0x02 /* Block was freshly allocated */ struct udf_map_rq { sector_t lblk; udf_pblk_t pblk; int iflags; /* UDF_MAP_ flags determining behavior */ int oflags; /* UDF_BLK_ flags reporting results */ }; static int udf_map_block(struct inode *inode, struct udf_map_rq *map) { int ret; struct udf_inode_info *iinfo = UDF_I(inode); if (WARN_ON_ONCE(iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)) return -EFSCORRUPTED; map->oflags = 0; if (!(map->iflags & UDF_MAP_CREATE)) { struct kernel_lb_addr eloc; uint32_t elen; sector_t offset; struct extent_position epos = {}; int8_t etype; down_read(&iinfo->i_data_sem); ret = inode_bmap(inode, map->lblk, &epos, &eloc, &elen, &offset, &etype); if (ret < 0) goto out_read; if (ret > 0 && etype == (EXT_RECORDED_ALLOCATED >> 30)) { map->pblk = udf_get_lb_pblock(inode->i_sb, &eloc, offset); map->oflags |= UDF_BLK_MAPPED; ret = 0; } out_read: up_read(&iinfo->i_data_sem); brelse(epos.bh); return ret; } down_write(&iinfo->i_data_sem); /* * Block beyond EOF and prealloc extents? Just discard preallocation * as it is not useful and complicates things. */ if (((loff_t)map->lblk) << inode->i_blkbits >= iinfo->i_lenExtents) udf_discard_prealloc(inode); udf_clear_extent_cache(inode); ret = inode_getblk(inode, map); up_write(&iinfo->i_data_sem); return ret; } static int __udf_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_result, int flags) { int err; struct udf_map_rq map = { .lblk = block, .iflags = flags, }; err = udf_map_block(inode, &map); if (err < 0) return err; if (map.oflags & UDF_BLK_MAPPED) { map_bh(bh_result, inode->i_sb, map.pblk); if (map.oflags & UDF_BLK_NEW) set_buffer_new(bh_result); } return 0; } int udf_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create) { int flags = create ? UDF_MAP_CREATE : 0; /* * We preallocate blocks only for regular files. It also makes sense * for directories but there's a problem when to drop the * preallocation. We might use some delayed work for that but I feel * it's overengineering for a filesystem like UDF. */ if (!S_ISREG(inode->i_mode)) flags |= UDF_MAP_NOPREALLOC; return __udf_get_block(inode, block, bh_result, flags); } /* * We shouldn't be allocating blocks on page writeback since we allocate them * on page fault. We can spot dirty buffers without allocated blocks though * when truncate expands file. These however don't have valid data so we can * safely ignore them. So never allocate blocks from page writeback. */ static int udf_get_block_wb(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create) { return __udf_get_block(inode, block, bh_result, 0); } /* Extend the file with new blocks totaling 'new_block_bytes', * return the number of extents added */ static int udf_do_extend_file(struct inode *inode, struct extent_position *last_pos, struct kernel_long_ad *last_ext, loff_t new_block_bytes) { uint32_t add; int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK); struct super_block *sb = inode->i_sb; struct udf_inode_info *iinfo; int err; /* The previous extent is fake and we should not extend by anything * - there's nothing to do... */ if (!new_block_bytes && fake) return 0; iinfo = UDF_I(inode); /* Round the last extent up to a multiple of block size */ if (last_ext->extLength & (sb->s_blocksize - 1)) { last_ext->extLength = (last_ext->extLength & UDF_EXTENT_FLAG_MASK) | (((last_ext->extLength & UDF_EXTENT_LENGTH_MASK) + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1)); iinfo->i_lenExtents = (iinfo->i_lenExtents + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1); } add = 0; /* Can we merge with the previous extent? */ if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) == EXT_NOT_RECORDED_NOT_ALLOCATED) { add = (1 << 30) - sb->s_blocksize - (last_ext->extLength & UDF_EXTENT_LENGTH_MASK); if (add > new_block_bytes) add = new_block_bytes; new_block_bytes -= add; last_ext->extLength += add; } if (fake) { err = udf_add_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); if (err < 0) goto out_err; count++; } else { struct kernel_lb_addr tmploc; uint32_t tmplen; int8_t tmptype; udf_write_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); /* * We've rewritten the last extent. If we are going to add * more extents, we may need to enter possible following * empty indirect extent. */ if (new_block_bytes) { err = udf_next_aext(inode, last_pos, &tmploc, &tmplen, &tmptype, 0); if (err < 0) goto out_err; } } iinfo->i_lenExtents += add; /* Managed to do everything necessary? */ if (!new_block_bytes) goto out; /* All further extents will be NOT_RECORDED_NOT_ALLOCATED */ last_ext->extLocation.logicalBlockNum = 0; last_ext->extLocation.partitionReferenceNum = 0; add = (1 << 30) - sb->s_blocksize; last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | add; /* Create enough extents to cover the whole hole */ while (new_block_bytes > add) { new_block_bytes -= add; err = udf_add_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); if (err) goto out_err; iinfo->i_lenExtents += add; count++; } if (new_block_bytes) { last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | new_block_bytes; err = udf_add_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); if (err) goto out_err; iinfo->i_lenExtents += new_block_bytes; count++; } out: /* last_pos should point to the last written extent... */ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) last_pos->offset -= sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) last_pos->offset -= sizeof(struct long_ad); else return -EIO; return count; out_err: /* Remove extents we've created so far */ udf_clear_extent_cache(inode); udf_truncate_extents(inode); return err; } /* Extend the final block of the file to final_block_len bytes */ static void udf_do_extend_final_block(struct inode *inode, struct extent_position *last_pos, struct kernel_long_ad *last_ext, uint32_t new_elen) { uint32_t added_bytes; /* * Extent already large enough? It may be already rounded up to block * size... */ if (new_elen <= (last_ext->extLength & UDF_EXTENT_LENGTH_MASK)) return; added_bytes = new_elen - (last_ext->extLength & UDF_EXTENT_LENGTH_MASK); last_ext->extLength += added_bytes; UDF_I(inode)->i_lenExtents += added_bytes; udf_write_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); } static int udf_extend_file(struct inode *inode, loff_t newsize) { struct extent_position epos; struct kernel_lb_addr eloc; uint32_t elen; int8_t etype; struct super_block *sb = inode->i_sb; sector_t first_block = newsize >> sb->s_blocksize_bits, offset; loff_t new_elen; int adsize; struct udf_inode_info *iinfo = UDF_I(inode); struct kernel_long_ad extent; int err = 0; bool within_last_ext; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) adsize = sizeof(struct long_ad); else BUG(); down_write(&iinfo->i_data_sem); /* * When creating hole in file, just don't bother with preserving * preallocation. It likely won't be very useful anyway. */ udf_discard_prealloc(inode); err = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset, &etype); if (err < 0) goto out; within_last_ext = (err == 1); /* We don't expect extents past EOF... */ WARN_ON_ONCE(within_last_ext && elen > ((loff_t)offset + 1) << inode->i_blkbits); if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) || (epos.bh && epos.offset == sizeof(struct allocExtDesc))) { /* File has no extents at all or has empty last * indirect extent! Create a fake extent... */ extent.extLocation.logicalBlockNum = 0; extent.extLocation.partitionReferenceNum = 0; extent.extLength = EXT_NOT_RECORDED_NOT_ALLOCATED; } else { epos.offset -= adsize; err = udf_next_aext(inode, &epos, &extent.extLocation, &extent.extLength, &etype, 0); if (err <= 0) goto out; extent.extLength |= etype << 30; } new_elen = ((loff_t)offset << inode->i_blkbits) | (newsize & (sb->s_blocksize - 1)); /* File has extent covering the new size (could happen when extending * inside a block)? */ if (within_last_ext) { /* Extending file within the last file block */ udf_do_extend_final_block(inode, &epos, &extent, new_elen); } else { err = udf_do_extend_file(inode, &epos, &extent, new_elen); } if (err < 0) goto out; err = 0; out: brelse(epos.bh); up_write(&iinfo->i_data_sem); return err; } static int inode_getblk(struct inode *inode, struct udf_map_rq *map) { struct kernel_long_ad laarr[EXTENT_MERGE_SIZE]; struct extent_position prev_epos, cur_epos, next_epos; int count = 0, startnum = 0, endnum = 0; uint32_t elen = 0, tmpelen; struct kernel_lb_addr eloc, tmpeloc; int c = 1; loff_t lbcount = 0, b_off = 0; udf_pblk_t newblocknum; sector_t offset = 0; int8_t etype, tmpetype; struct udf_inode_info *iinfo = UDF_I(inode); udf_pblk_t goal = 0, pgoal = iinfo->i_location.logicalBlockNum; int lastblock = 0; bool isBeyondEOF = false; int ret = 0; prev_epos.offset = udf_file_entry_alloc_offset(inode); prev_epos.block = iinfo->i_location; prev_epos.bh = NULL; cur_epos = next_epos = prev_epos; b_off = (loff_t)map->lblk << inode->i_sb->s_blocksize_bits; /* find the extent which contains the block we are looking for. alternate between laarr[0] and laarr[1] for locations of the current extent, and the previous extent */ do { if (prev_epos.bh != cur_epos.bh) { brelse(prev_epos.bh); get_bh(cur_epos.bh); prev_epos.bh = cur_epos.bh; } if (cur_epos.bh != next_epos.bh) { brelse(cur_epos.bh); get_bh(next_epos.bh); cur_epos.bh = next_epos.bh; } lbcount += elen; prev_epos.block = cur_epos.block; cur_epos.block = next_epos.block; prev_epos.offset = cur_epos.offset; cur_epos.offset = next_epos.offset; ret = udf_next_aext(inode, &next_epos, &eloc, &elen, &etype, 1); if (ret < 0) { goto out_free; } else if (ret == 0) { isBeyondEOF = true; break; } c = !c; laarr[c].extLength = (etype << 30) | elen; laarr[c].extLocation = eloc; if (etype != (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) pgoal = eloc.logicalBlockNum + ((elen + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits); count++; } while (lbcount + elen <= b_off); b_off -= lbcount; offset = b_off >> inode->i_sb->s_blocksize_bits; /* * Move prev_epos and cur_epos into indirect extent if we are at * the pointer to it */ ret = udf_next_aext(inode, &prev_epos, &tmpeloc, &tmpelen, &tmpetype, 0); if (ret < 0) goto out_free; ret = udf_next_aext(inode, &cur_epos, &tmpeloc, &tmpelen, &tmpetype, 0); if (ret < 0) goto out_free; /* if the extent is allocated and recorded, return the block if the extent is not a multiple of the blocksize, round up */ if (!isBeyondEOF && etype == (EXT_RECORDED_ALLOCATED >> 30)) { if (elen & (inode->i_sb->s_blocksize - 1)) { elen = EXT_RECORDED_ALLOCATED | ((elen + inode->i_sb->s_blocksize - 1) & ~(inode->i_sb->s_blocksize - 1)); iinfo->i_lenExtents = ALIGN(iinfo->i_lenExtents, inode->i_sb->s_blocksize); udf_write_aext(inode, &cur_epos, &eloc, elen, 1); } map->oflags = UDF_BLK_MAPPED; map->pblk = udf_get_lb_pblock(inode->i_sb, &eloc, offset); goto out_free; } /* Are we beyond EOF and preallocated extent? */ if (isBeyondEOF) { loff_t hole_len; if (count) { if (c) laarr[0] = laarr[1]; startnum = 1; } else { /* Create a fake extent when there's not one */ memset(&laarr[0].extLocation, 0x00, sizeof(struct kernel_lb_addr)); laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED; /* Will udf_do_extend_file() create real extent from a fake one? */ startnum = (offset > 0); } /* Create extents for the hole between EOF and offset */ hole_len = (loff_t)offset << inode->i_blkbits; ret = udf_do_extend_file(inode, &prev_epos, laarr, hole_len); if (ret < 0) goto out_free; c = 0; offset = 0; count += ret; /* * Is there any real extent? - otherwise we overwrite the fake * one... */ if (count) c = !c; laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | inode->i_sb->s_blocksize; memset(&laarr[c].extLocation, 0x00, sizeof(struct kernel_lb_addr)); count++; endnum = c + 1; lastblock = 1; } else { endnum = startnum = ((count > 2) ? 2 : count); /* if the current extent is in position 0, swap it with the previous */ if (!c && count != 1) { laarr[2] = laarr[0]; laarr[0] = laarr[1]; laarr[1] = laarr[2]; c = 1; } /* if the current block is located in an extent, read the next extent */ ret = udf_next_aext(inode, &next_epos, &eloc, &elen, &etype, 0); if (ret > 0) { laarr[c + 1].extLength = (etype << 30) | elen; laarr[c + 1].extLocation = eloc; count++; startnum++; endnum++; } else if (ret == 0) lastblock = 1; else goto out_free; } /* if the current extent is not recorded but allocated, get the * block in the extent corresponding to the requested block */ if ((laarr[c].extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30)) newblocknum = laarr[c].extLocation.logicalBlockNum + offset; else { /* otherwise, allocate a new block */ if (iinfo->i_next_alloc_block == map->lblk) goal = iinfo->i_next_alloc_goal; if (!goal) { if (!(goal = pgoal)) /* XXX: what was intended here? */ goal = iinfo->i_location.logicalBlockNum + 1; } newblocknum = udf_new_block(inode->i_sb, inode, iinfo->i_location.partitionReferenceNum, goal, &ret); if (!newblocknum) goto out_free; if (isBeyondEOF) iinfo->i_lenExtents += inode->i_sb->s_blocksize; } /* if the extent the requsted block is located in contains multiple * blocks, split the extent into at most three extents. blocks prior * to requested block, requested block, and blocks after requested * block */ udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum); if (!(map->iflags & UDF_MAP_NOPREALLOC)) udf_prealloc_extents(inode, c, lastblock, laarr, &endnum); /* merge any continuous blocks in laarr */ udf_merge_extents(inode, laarr, &endnum); /* write back the new extents, inserting new extents if the new number * of extents is greater than the old number, and deleting extents if * the new number of extents is less than the old number */ ret = udf_update_extents(inode, laarr, startnum, endnum, &prev_epos); if (ret < 0) goto out_free; map->pblk = udf_get_pblock(inode->i_sb, newblocknum, iinfo->i_location.partitionReferenceNum, 0); if (!map->pblk) { ret = -EFSCORRUPTED; goto out_free; } map->oflags = UDF_BLK_NEW | UDF_BLK_MAPPED; iinfo->i_next_alloc_block = map->lblk + 1; iinfo->i_next_alloc_goal = newblocknum + 1; inode_set_ctime_current(inode); if (IS_SYNC(inode)) udf_sync_inode(inode); else mark_inode_dirty(inode); ret = 0; out_free: brelse(prev_epos.bh); brelse(cur_epos.bh); brelse(next_epos.bh); return ret; } static void udf_split_extents(struct inode *inode, int *c, int offset, udf_pblk_t newblocknum, struct kernel_long_ad *laarr, int *endnum) { unsigned long blocksize = inode->i_sb->s_blocksize; unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; if ((laarr[*c].extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30) || (laarr[*c].extLength >> 30) == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) { int curr = *c; int blen = ((laarr[curr].extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) >> blocksize_bits; int8_t etype = (laarr[curr].extLength >> 30); if (blen == 1) ; else if (!offset || blen == offset + 1) { laarr[curr + 2] = laarr[curr + 1]; laarr[curr + 1] = laarr[curr]; } else { laarr[curr + 3] = laarr[curr + 1]; laarr[curr + 2] = laarr[curr + 1] = laarr[curr]; } if (offset) { if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { udf_free_blocks(inode->i_sb, inode, &laarr[curr].extLocation, 0, offset); laarr[curr].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | (offset << blocksize_bits); laarr[curr].extLocation.logicalBlockNum = 0; laarr[curr].extLocation. partitionReferenceNum = 0; } else laarr[curr].extLength = (etype << 30) | (offset << blocksize_bits); curr++; (*c)++; (*endnum)++; } laarr[curr].extLocation.logicalBlockNum = newblocknum; if (etype == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) laarr[curr].extLocation.partitionReferenceNum = UDF_I(inode)->i_location.partitionReferenceNum; laarr[curr].extLength = EXT_RECORDED_ALLOCATED | blocksize; curr++; if (blen != offset + 1) { if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) laarr[curr].extLocation.logicalBlockNum += offset + 1; laarr[curr].extLength = (etype << 30) | ((blen - (offset + 1)) << blocksize_bits); curr++; (*endnum)++; } } } static void udf_prealloc_extents(struct inode *inode, int c, int lastblock, struct kernel_long_ad *laarr, int *endnum) { int start, length = 0, currlength = 0, i; if (*endnum >= (c + 1)) { if (!lastblock) return; else start = c; } else { if ((laarr[c + 1].extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { start = c + 1; length = currlength = (((laarr[c + 1].extLength & UDF_EXTENT_LENGTH_MASK) + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits); } else start = c; } for (i = start + 1; i <= *endnum; i++) { if (i == *endnum) { if (lastblock) length += UDF_DEFAULT_PREALLOC_BLOCKS; } else if ((laarr[i].extLength >> 30) == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) { length += (((laarr[i].extLength & UDF_EXTENT_LENGTH_MASK) + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits); } else break; } if (length) { int next = laarr[start].extLocation.logicalBlockNum + (((laarr[start].extLength & UDF_EXTENT_LENGTH_MASK) + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits); int numalloc = udf_prealloc_blocks(inode->i_sb, inode, laarr[start].extLocation.partitionReferenceNum, next, (UDF_DEFAULT_PREALLOC_BLOCKS > length ? length : UDF_DEFAULT_PREALLOC_BLOCKS) - currlength); if (numalloc) { if (start == (c + 1)) laarr[start].extLength += (numalloc << inode->i_sb->s_blocksize_bits); else { memmove(&laarr[c + 2], &laarr[c + 1], sizeof(struct long_ad) * (*endnum - (c + 1))); (*endnum)++; laarr[c + 1].extLocation.logicalBlockNum = next; laarr[c + 1].extLocation.partitionReferenceNum = laarr[c].extLocation. partitionReferenceNum; laarr[c + 1].extLength = EXT_NOT_RECORDED_ALLOCATED | (numalloc << inode->i_sb->s_blocksize_bits); start = c + 1; } for (i = start + 1; numalloc && i < *endnum; i++) { int elen = ((laarr[i].extLength & UDF_EXTENT_LENGTH_MASK) + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; if (elen > numalloc) { laarr[i].extLength -= (numalloc << inode->i_sb->s_blocksize_bits); numalloc = 0; } else { numalloc -= elen; if (*endnum > (i + 1)) memmove(&laarr[i], &laarr[i + 1], sizeof(struct long_ad) * (*endnum - (i + 1))); i--; (*endnum)--; } } UDF_I(inode)->i_lenExtents += numalloc << inode->i_sb->s_blocksize_bits; } } } static void udf_merge_extents(struct inode *inode, struct kernel_long_ad *laarr, int *endnum) { int i; unsigned long blocksize = inode->i_sb->s_blocksize; unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; for (i = 0; i < (*endnum - 1); i++) { struct kernel_long_ad *li /*l[i]*/ = &laarr[i]; struct kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1]; if (((li->extLength >> 30) == (lip1->extLength >> 30)) && (((li->extLength >> 30) == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) || ((lip1->extLocation.logicalBlockNum - li->extLocation.logicalBlockNum) == (((li->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) >> blocksize_bits)))) { if (((li->extLength & UDF_EXTENT_LENGTH_MASK) + (lip1->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) <= UDF_EXTENT_LENGTH_MASK) { li->extLength = lip1->extLength + (((li->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) & ~(blocksize - 1)); if (*endnum > (i + 2)) memmove(&laarr[i + 1], &laarr[i + 2], sizeof(struct long_ad) * (*endnum - (i + 2))); i--; (*endnum)--; } } else if (((li->extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30)) && ((lip1->extLength >> 30) == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))) { udf_free_blocks(inode->i_sb, inode, &li->extLocation, 0, ((li->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) >> blocksize_bits); li->extLocation.logicalBlockNum = 0; li->extLocation.partitionReferenceNum = 0; if (((li->extLength & UDF_EXTENT_LENGTH_MASK) + (lip1->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) & ~UDF_EXTENT_LENGTH_MASK) { lip1->extLength = (lip1->extLength - (li->extLength & UDF_EXTENT_LENGTH_MASK) + UDF_EXTENT_LENGTH_MASK) & ~(blocksize - 1); li->extLength = (li->extLength & UDF_EXTENT_FLAG_MASK) + (UDF_EXTENT_LENGTH_MASK + 1) - blocksize; } else { li->extLength = lip1->extLength + (((li->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) & ~(blocksize - 1)); if (*endnum > (i + 2)) memmove(&laarr[i + 1], &laarr[i + 2], sizeof(struct long_ad) * (*endnum - (i + 2))); i--; (*endnum)--; } } else if ((li->extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { udf_free_blocks(inode->i_sb, inode, &li->extLocation, 0, ((li->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) >> blocksize_bits); li->extLocation.logicalBlockNum = 0; li->extLocation.partitionReferenceNum = 0; li->extLength = (li->extLength & UDF_EXTENT_LENGTH_MASK) | EXT_NOT_RECORDED_NOT_ALLOCATED; } } } static int udf_update_extents(struct inode *inode, struct kernel_long_ad *laarr, int startnum, int endnum, struct extent_position *epos) { int start = 0, i; struct kernel_lb_addr tmploc; uint32_t tmplen; int8_t tmpetype; int err; if (startnum > endnum) { for (i = 0; i < (startnum - endnum); i++) udf_delete_aext(inode, *epos); } else if (startnum < endnum) { for (i = 0; i < (endnum - startnum); i++) { err = udf_insert_aext(inode, *epos, laarr[i].extLocation, laarr[i].extLength); /* * If we fail here, we are likely corrupting the extent * list and leaking blocks. At least stop early to * limit the damage. */ if (err < 0) return err; err = udf_next_aext(inode, epos, &laarr[i].extLocation, &laarr[i].extLength, &tmpetype, 1); if (err < 0) return err; start++; } } for (i = start; i < endnum; i++) { err = udf_next_aext(inode, epos, &tmploc, &tmplen, &tmpetype, 0); if (err < 0) return err; udf_write_aext(inode, epos, &laarr[i].extLocation, laarr[i].extLength, 1); } return 0; } struct buffer_head *udf_bread(struct inode *inode, udf_pblk_t block, int create, int *err) { struct buffer_head *bh = NULL; struct udf_map_rq map = { .lblk = block, .iflags = UDF_MAP_NOPREALLOC | (create ? UDF_MAP_CREATE : 0), }; *err = udf_map_block(inode, &map); if (*err || !(map.oflags & UDF_BLK_MAPPED)) return NULL; bh = sb_getblk(inode->i_sb, map.pblk); if (!bh) { *err = -ENOMEM; return NULL; } if (map.oflags & UDF_BLK_NEW) { lock_buffer(bh); memset(bh->b_data, 0x00, inode->i_sb->s_blocksize); set_buffer_uptodate(bh); unlock_buffer(bh); mark_buffer_dirty_inode(bh, inode); return bh; } if (bh_read(bh, 0) >= 0) return bh; brelse(bh); *err = -EIO; return NULL; } int udf_setsize(struct inode *inode, loff_t newsize) { int err = 0; struct udf_inode_info *iinfo; unsigned int bsize = i_blocksize(inode); if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) return -EINVAL; iinfo = UDF_I(inode); if (newsize > inode->i_size) { if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { if (bsize >= (udf_file_entry_alloc_offset(inode) + newsize)) { down_write(&iinfo->i_data_sem); iinfo->i_lenAlloc = newsize; up_write(&iinfo->i_data_sem); goto set_size; } err = udf_expand_file_adinicb(inode); if (err) return err; } err = udf_extend_file(inode, newsize); if (err) return err; set_size: truncate_setsize(inode, newsize); } else { if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { down_write(&iinfo->i_data_sem); udf_clear_extent_cache(inode); memset(iinfo->i_data + iinfo->i_lenEAttr + newsize, 0x00, bsize - newsize - udf_file_entry_alloc_offset(inode)); iinfo->i_lenAlloc = newsize; truncate_setsize(inode, newsize); up_write(&iinfo->i_data_sem); goto update_time; } err = block_truncate_page(inode->i_mapping, newsize, udf_get_block); if (err) return err; truncate_setsize(inode, newsize); down_write(&iinfo->i_data_sem); udf_clear_extent_cache(inode); err = udf_truncate_extents(inode); up_write(&iinfo->i_data_sem); if (err) return err; } update_time: inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (IS_SYNC(inode)) udf_sync_inode(inode); else mark_inode_dirty(inode); return err; } /* * Maximum length of linked list formed by ICB hierarchy. The chosen number is * arbitrary - just that we hopefully don't limit any real use of rewritten * inode on write-once media but avoid looping for too long on corrupted media. */ #define UDF_MAX_ICB_NESTING 1024 static int udf_read_inode(struct inode *inode, bool hidden_inode) { struct buffer_head *bh = NULL; struct fileEntry *fe; struct extendedFileEntry *efe; uint16_t ident; struct udf_inode_info *iinfo = UDF_I(inode); struct udf_sb_info *sbi = UDF_SB(inode->i_sb); struct kernel_lb_addr *iloc = &iinfo->i_location; unsigned int link_count; unsigned int indirections = 0; int bs = inode->i_sb->s_blocksize; int ret = -EIO; uint32_t uid, gid; struct timespec64 ts; reread: if (iloc->partitionReferenceNum >= sbi->s_partitions) { udf_debug("partition reference: %u > logical volume partitions: %u\n", iloc->partitionReferenceNum, sbi->s_partitions); return -EIO; } if (iloc->logicalBlockNum >= sbi->s_partmaps[iloc->partitionReferenceNum].s_partition_len) { udf_debug("block=%u, partition=%u out of range\n", iloc->logicalBlockNum, iloc->partitionReferenceNum); return -EIO; } /* * Set defaults, but the inode is still incomplete! * Note: get_new_inode() sets the following on a new inode: * i_sb = sb * i_no = ino * i_flags = sb->s_flags * i_state = 0 * clean_inode(): zero fills and sets * i_count = 1 * i_nlink = 1 * i_op = NULL; */ bh = udf_read_ptagged(inode->i_sb, iloc, 0, &ident); if (!bh) { udf_err(inode->i_sb, "(ino %lu) failed !bh\n", inode->i_ino); return -EIO; } if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE && ident != TAG_IDENT_USE) { udf_err(inode->i_sb, "(ino %lu) failed ident=%u\n", inode->i_ino, ident); goto out; } fe = (struct fileEntry *)bh->b_data; efe = (struct extendedFileEntry *)bh->b_data; if (fe->icbTag.strategyType == cpu_to_le16(4096)) { struct buffer_head *ibh; ibh = udf_read_ptagged(inode->i_sb, iloc, 1, &ident); if (ident == TAG_IDENT_IE && ibh) { struct kernel_lb_addr loc; struct indirectEntry *ie; ie = (struct indirectEntry *)ibh->b_data; loc = lelb_to_cpu(ie->indirectICB.extLocation); if (ie->indirectICB.extLength) { brelse(ibh); memcpy(&iinfo->i_location, &loc, sizeof(struct kernel_lb_addr)); if (++indirections > UDF_MAX_ICB_NESTING) { udf_err(inode->i_sb, "too many ICBs in ICB hierarchy" " (max %d supported)\n", UDF_MAX_ICB_NESTING); goto out; } brelse(bh); goto reread; } } brelse(ibh); } else if (fe->icbTag.strategyType != cpu_to_le16(4)) { udf_err(inode->i_sb, "unsupported strategy type: %u\n", le16_to_cpu(fe->icbTag.strategyType)); goto out; } if (fe->icbTag.strategyType == cpu_to_le16(4)) iinfo->i_strat4096 = 0; else /* if (fe->icbTag.strategyType == cpu_to_le16(4096)) */ iinfo->i_strat4096 = 1; iinfo->i_alloc_type = le16_to_cpu(fe->icbTag.flags) & ICBTAG_FLAG_AD_MASK; if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_SHORT && iinfo->i_alloc_type != ICBTAG_FLAG_AD_LONG && iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { ret = -EIO; goto out; } iinfo->i_hidden = hidden_inode; iinfo->i_unique = 0; iinfo->i_lenEAttr = 0; iinfo->i_lenExtents = 0; iinfo->i_lenAlloc = 0; iinfo->i_next_alloc_block = 0; iinfo->i_next_alloc_goal = 0; if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EFE)) { iinfo->i_efe = 1; iinfo->i_use = 0; ret = udf_alloc_i_data(inode, bs - sizeof(struct extendedFileEntry)); if (ret) goto out; memcpy(iinfo->i_data, bh->b_data + sizeof(struct extendedFileEntry), bs - sizeof(struct extendedFileEntry)); } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) { iinfo->i_efe = 0; iinfo->i_use = 0; ret = udf_alloc_i_data(inode, bs - sizeof(struct fileEntry)); if (ret) goto out; memcpy(iinfo->i_data, bh->b_data + sizeof(struct fileEntry), bs - sizeof(struct fileEntry)); } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) { iinfo->i_efe = 0; iinfo->i_use = 1; iinfo->i_lenAlloc = le32_to_cpu( ((struct unallocSpaceEntry *)bh->b_data)-> lengthAllocDescs); ret = udf_alloc_i_data(inode, bs - sizeof(struct unallocSpaceEntry)); if (ret) goto out; memcpy(iinfo->i_data, bh->b_data + sizeof(struct unallocSpaceEntry), bs - sizeof(struct unallocSpaceEntry)); return 0; } ret = -EIO; read_lock(&sbi->s_cred_lock); uid = le32_to_cpu(fe->uid); if (uid == UDF_INVALID_ID || UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_SET)) inode->i_uid = sbi->s_uid; else i_uid_write(inode, uid); gid = le32_to_cpu(fe->gid); if (gid == UDF_INVALID_ID || UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET)) inode->i_gid = sbi->s_gid; else i_gid_write(inode, gid); if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY && sbi->s_fmode != UDF_INVALID_MODE) inode->i_mode = sbi->s_fmode; else if (fe->icbTag.fileType == ICBTAG_FILE_TYPE_DIRECTORY && sbi->s_dmode != UDF_INVALID_MODE) inode->i_mode = sbi->s_dmode; else inode->i_mode = udf_convert_permissions(fe); inode->i_mode &= ~sbi->s_umask; iinfo->i_extraPerms = le32_to_cpu(fe->permissions) & ~FE_MAPPED_PERMS; read_unlock(&sbi->s_cred_lock); link_count = le16_to_cpu(fe->fileLinkCount); if (!link_count) { if (!hidden_inode) { ret = -ESTALE; goto out; } link_count = 1; } set_nlink(inode, link_count); inode->i_size = le64_to_cpu(fe->informationLength); iinfo->i_lenExtents = inode->i_size; if (iinfo->i_efe == 0) { inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) << (inode->i_sb->s_blocksize_bits - 9); udf_disk_stamp_to_time(&ts, fe->accessTime); inode_set_atime_to_ts(inode, ts); udf_disk_stamp_to_time(&ts, fe->modificationTime); inode_set_mtime_to_ts(inode, ts); udf_disk_stamp_to_time(&ts, fe->attrTime); inode_set_ctime_to_ts(inode, ts); iinfo->i_unique = le64_to_cpu(fe->uniqueID); iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr); iinfo->i_lenAlloc = le32_to_cpu(fe->lengthAllocDescs); iinfo->i_checkpoint = le32_to_cpu(fe->checkpoint); iinfo->i_streamdir = 0; iinfo->i_lenStreams = 0; } else { inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) << (inode->i_sb->s_blocksize_bits - 9); udf_disk_stamp_to_time(&ts, efe->accessTime); inode_set_atime_to_ts(inode, ts); udf_disk_stamp_to_time(&ts, efe->modificationTime); inode_set_mtime_to_ts(inode, ts); udf_disk_stamp_to_time(&ts, efe->attrTime); inode_set_ctime_to_ts(inode, ts); udf_disk_stamp_to_time(&iinfo->i_crtime, efe->createTime); iinfo->i_unique = le64_to_cpu(efe->uniqueID); iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr); iinfo->i_lenAlloc = le32_to_cpu(efe->lengthAllocDescs); iinfo->i_checkpoint = le32_to_cpu(efe->checkpoint); /* Named streams */ iinfo->i_streamdir = (efe->streamDirectoryICB.extLength != 0); iinfo->i_locStreamdir = lelb_to_cpu(efe->streamDirectoryICB.extLocation); iinfo->i_lenStreams = le64_to_cpu(efe->objectSize); if (iinfo->i_lenStreams >= inode->i_size) iinfo->i_lenStreams -= inode->i_size; else iinfo->i_lenStreams = 0; } inode->i_generation = iinfo->i_unique; /* * Sanity check length of allocation descriptors and extended attrs to * avoid integer overflows */ if (iinfo->i_lenEAttr > bs || iinfo->i_lenAlloc > bs) goto out; /* Now do exact checks */ if (udf_file_entry_alloc_offset(inode) + iinfo->i_lenAlloc > bs) goto out; /* Sanity checks for files in ICB so that we don't get confused later */ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { /* * For file in ICB data is stored in allocation descriptor * so sizes should match */ if (iinfo->i_lenAlloc != inode->i_size) goto out; /* File in ICB has to fit in there... */ if (inode->i_size > bs - udf_file_entry_alloc_offset(inode)) goto out; } switch (fe->icbTag.fileType) { case ICBTAG_FILE_TYPE_DIRECTORY: inode->i_op = &udf_dir_inode_operations; inode->i_fop = &udf_dir_operations; inode->i_mode |= S_IFDIR; inc_nlink(inode); break; case ICBTAG_FILE_TYPE_REALTIME: case ICBTAG_FILE_TYPE_REGULAR: case ICBTAG_FILE_TYPE_UNDEF: case ICBTAG_FILE_TYPE_VAT20: inode->i_data.a_ops = &udf_aops; inode->i_op = &udf_file_inode_operations; inode->i_fop = &udf_file_operations; inode->i_mode |= S_IFREG; break; case ICBTAG_FILE_TYPE_BLOCK: inode->i_mode |= S_IFBLK; break; case ICBTAG_FILE_TYPE_CHAR: inode->i_mode |= S_IFCHR; break; case ICBTAG_FILE_TYPE_FIFO: init_special_inode(inode, inode->i_mode | S_IFIFO, 0); break; case ICBTAG_FILE_TYPE_SOCKET: init_special_inode(inode, inode->i_mode | S_IFSOCK, 0); break; case ICBTAG_FILE_TYPE_SYMLINK: inode->i_data.a_ops = &udf_symlink_aops; inode->i_op = &udf_symlink_inode_operations; inode_nohighmem(inode); inode->i_mode = S_IFLNK | 0777; break; case ICBTAG_FILE_TYPE_MAIN: udf_debug("METADATA FILE-----\n"); break; case ICBTAG_FILE_TYPE_MIRROR: udf_debug("METADATA MIRROR FILE-----\n"); break; case ICBTAG_FILE_TYPE_BITMAP: udf_debug("METADATA BITMAP FILE-----\n"); break; default: udf_err(inode->i_sb, "(ino %lu) failed unknown file type=%u\n", inode->i_ino, fe->icbTag.fileType); goto out; } if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { struct deviceSpec *dsea = (struct deviceSpec *)udf_get_extendedattr(inode, 12, 1); if (dsea) { init_special_inode(inode, inode->i_mode, MKDEV(le32_to_cpu(dsea->majorDeviceIdent), le32_to_cpu(dsea->minorDeviceIdent))); /* Developer ID ??? */ } else goto out; } ret = 0; out: brelse(bh); return ret; } static int udf_alloc_i_data(struct inode *inode, size_t size) { struct udf_inode_info *iinfo = UDF_I(inode); iinfo->i_data = kmalloc(size, GFP_KERNEL); if (!iinfo->i_data) return -ENOMEM; return 0; } static umode_t udf_convert_permissions(struct fileEntry *fe) { umode_t mode; uint32_t permissions; uint32_t flags; permissions = le32_to_cpu(fe->permissions); flags = le16_to_cpu(fe->icbTag.flags); mode = ((permissions) & 0007) | ((permissions >> 2) & 0070) | ((permissions >> 4) & 0700) | ((flags & ICBTAG_FLAG_SETUID) ? S_ISUID : 0) | ((flags & ICBTAG_FLAG_SETGID) ? S_ISGID : 0) | ((flags & ICBTAG_FLAG_STICKY) ? S_ISVTX : 0); return mode; } void udf_update_extra_perms(struct inode *inode, umode_t mode) { struct udf_inode_info *iinfo = UDF_I(inode); /* * UDF 2.01 sec. 3.3.3.3 Note 2: * In Unix, delete permission tracks write */ iinfo->i_extraPerms &= ~FE_DELETE_PERMS; if (mode & 0200) iinfo->i_extraPerms |= FE_PERM_U_DELETE; if (mode & 0020) iinfo->i_extraPerms |= FE_PERM_G_DELETE; if (mode & 0002) iinfo->i_extraPerms |= FE_PERM_O_DELETE; } int udf_write_inode(struct inode *inode, struct writeback_control *wbc) { return udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); } static int udf_sync_inode(struct inode *inode) { return udf_update_inode(inode, 1); } static void udf_adjust_time(struct udf_inode_info *iinfo, struct timespec64 time) { if (iinfo->i_crtime.tv_sec > time.tv_sec || (iinfo->i_crtime.tv_sec == time.tv_sec && iinfo->i_crtime.tv_nsec > time.tv_nsec)) iinfo->i_crtime = time; } static int udf_update_inode(struct inode *inode, int do_sync) { struct buffer_head *bh = NULL; struct fileEntry *fe; struct extendedFileEntry *efe; uint64_t lb_recorded; uint32_t udfperms; uint16_t icbflags; uint16_t crclen; int err = 0; struct udf_sb_info *sbi = UDF_SB(inode->i_sb); unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; struct udf_inode_info *iinfo = UDF_I(inode); bh = sb_getblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb, &iinfo->i_location, 0)); if (!bh) { udf_debug("getblk failure\n"); return -EIO; } lock_buffer(bh); memset(bh->b_data, 0, inode->i_sb->s_blocksize); fe = (struct fileEntry *)bh->b_data; efe = (struct extendedFileEntry *)bh->b_data; if (iinfo->i_use) { struct unallocSpaceEntry *use = (struct unallocSpaceEntry *)bh->b_data; use->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); memcpy(bh->b_data + sizeof(struct unallocSpaceEntry), iinfo->i_data, inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry)); use->descTag.tagIdent = cpu_to_le16(TAG_IDENT_USE); crclen = sizeof(struct unallocSpaceEntry); goto finish; } if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET)) fe->uid = cpu_to_le32(UDF_INVALID_ID); else fe->uid = cpu_to_le32(i_uid_read(inode)); if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_FORGET)) fe->gid = cpu_to_le32(UDF_INVALID_ID); else fe->gid = cpu_to_le32(i_gid_read(inode)); udfperms = ((inode->i_mode & 0007)) | ((inode->i_mode & 0070) << 2) | ((inode->i_mode & 0700) << 4); udfperms |= iinfo->i_extraPerms; fe->permissions = cpu_to_le32(udfperms); if (S_ISDIR(inode->i_mode) && inode->i_nlink > 0) fe->fileLinkCount = cpu_to_le16(inode->i_nlink - 1); else { if (iinfo->i_hidden) fe->fileLinkCount = cpu_to_le16(0); else fe->fileLinkCount = cpu_to_le16(inode->i_nlink); } fe->informationLength = cpu_to_le64(inode->i_size); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { struct regid *eid; struct deviceSpec *dsea = (struct deviceSpec *)udf_get_extendedattr(inode, 12, 1); if (!dsea) { dsea = (struct deviceSpec *) udf_add_extendedattr(inode, sizeof(struct deviceSpec) + sizeof(struct regid), 12, 0x3); dsea->attrType = cpu_to_le32(12); dsea->attrSubtype = 1; dsea->attrLength = cpu_to_le32( sizeof(struct deviceSpec) + sizeof(struct regid)); dsea->impUseLength = cpu_to_le32(sizeof(struct regid)); } eid = (struct regid *)dsea->impUse; memset(eid, 0, sizeof(*eid)); strcpy(eid->ident, UDF_ID_DEVELOPER); eid->identSuffix[0] = UDF_OS_CLASS_UNIX; eid->identSuffix[1] = UDF_OS_ID_LINUX; dsea->majorDeviceIdent = cpu_to_le32(imajor(inode)); dsea->minorDeviceIdent = cpu_to_le32(iminor(inode)); } if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) lb_recorded = 0; /* No extents => no blocks! */ else lb_recorded = (inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >> (blocksize_bits - 9); if (iinfo->i_efe == 0) { memcpy(bh->b_data + sizeof(struct fileEntry), iinfo->i_data, inode->i_sb->s_blocksize - sizeof(struct fileEntry)); fe->logicalBlocksRecorded = cpu_to_le64(lb_recorded); udf_time_to_disk_stamp(&fe->accessTime, inode_get_atime(inode)); udf_time_to_disk_stamp(&fe->modificationTime, inode_get_mtime(inode)); udf_time_to_disk_stamp(&fe->attrTime, inode_get_ctime(inode)); memset(&(fe->impIdent), 0, sizeof(struct regid)); strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER); fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; fe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; fe->uniqueID = cpu_to_le64(iinfo->i_unique); fe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr); fe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); fe->checkpoint = cpu_to_le32(iinfo->i_checkpoint); fe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_FE); crclen = sizeof(struct fileEntry); } else { memcpy(bh->b_data + sizeof(struct extendedFileEntry), iinfo->i_data, inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry)); efe->objectSize = cpu_to_le64(inode->i_size + iinfo->i_lenStreams); efe->logicalBlocksRecorded = cpu_to_le64(lb_recorded); if (iinfo->i_streamdir) { struct long_ad *icb_lad = &efe->streamDirectoryICB; icb_lad->extLocation = cpu_to_lelb(iinfo->i_locStreamdir); icb_lad->extLength = cpu_to_le32(inode->i_sb->s_blocksize); } udf_adjust_time(iinfo, inode_get_atime(inode)); udf_adjust_time(iinfo, inode_get_mtime(inode)); udf_adjust_time(iinfo, inode_get_ctime(inode)); udf_time_to_disk_stamp(&efe->accessTime, inode_get_atime(inode)); udf_time_to_disk_stamp(&efe->modificationTime, inode_get_mtime(inode)); udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime); udf_time_to_disk_stamp(&efe->attrTime, inode_get_ctime(inode)); memset(&(efe->impIdent), 0, sizeof(efe->impIdent)); strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER); efe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; efe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; efe->uniqueID = cpu_to_le64(iinfo->i_unique); efe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr); efe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); efe->checkpoint = cpu_to_le32(iinfo->i_checkpoint); efe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EFE); crclen = sizeof(struct extendedFileEntry); } finish: if (iinfo->i_strat4096) { fe->icbTag.strategyType = cpu_to_le16(4096); fe->icbTag.strategyParameter = cpu_to_le16(1); fe->icbTag.numEntries = cpu_to_le16(2); } else { fe->icbTag.strategyType = cpu_to_le16(4); fe->icbTag.numEntries = cpu_to_le16(1); } if (iinfo->i_use) fe->icbTag.fileType = ICBTAG_FILE_TYPE_USE; else if (S_ISDIR(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_DIRECTORY; else if (S_ISREG(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_REGULAR; else if (S_ISLNK(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_SYMLINK; else if (S_ISBLK(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_BLOCK; else if (S_ISCHR(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_CHAR; else if (S_ISFIFO(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_FIFO; else if (S_ISSOCK(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_SOCKET; icbflags = iinfo->i_alloc_type | ((inode->i_mode & S_ISUID) ? ICBTAG_FLAG_SETUID : 0) | ((inode->i_mode & S_ISGID) ? ICBTAG_FLAG_SETGID : 0) | ((inode->i_mode & S_ISVTX) ? ICBTAG_FLAG_STICKY : 0) | (le16_to_cpu(fe->icbTag.flags) & ~(ICBTAG_FLAG_AD_MASK | ICBTAG_FLAG_SETUID | ICBTAG_FLAG_SETGID | ICBTAG_FLAG_STICKY)); fe->icbTag.flags = cpu_to_le16(icbflags); if (sbi->s_udfrev >= 0x0200) fe->descTag.descVersion = cpu_to_le16(3); else fe->descTag.descVersion = cpu_to_le16(2); fe->descTag.tagSerialNum = cpu_to_le16(sbi->s_serial_number); fe->descTag.tagLocation = cpu_to_le32( iinfo->i_location.logicalBlockNum); crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - sizeof(struct tag); fe->descTag.descCRCLength = cpu_to_le16(crclen); fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag), crclen)); fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag); set_buffer_uptodate(bh); unlock_buffer(bh); /* write the data blocks */ mark_buffer_dirty(bh); if (do_sync) { sync_dirty_buffer(bh); if (buffer_write_io_error(bh)) { udf_warn(inode->i_sb, "IO error syncing udf inode [%08lx]\n", inode->i_ino); err = -EIO; } } brelse(bh); return err; } struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino, bool hidden_inode) { unsigned long block = udf_get_lb_pblock(sb, ino, 0); struct inode *inode = iget_locked(sb, block); int err; if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) { if (UDF_I(inode)->i_hidden != hidden_inode) { iput(inode); return ERR_PTR(-EFSCORRUPTED); } return inode; } memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr)); err = udf_read_inode(inode, hidden_inode); if (err < 0) { iget_failed(inode); return ERR_PTR(err); } unlock_new_inode(inode); return inode; } int udf_setup_indirect_aext(struct inode *inode, udf_pblk_t block, struct extent_position *epos) { struct super_block *sb = inode->i_sb; struct buffer_head *bh; struct allocExtDesc *aed; struct extent_position nepos; struct kernel_lb_addr neloc; int ver, adsize; int err = 0; if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); else if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_LONG) adsize = sizeof(struct long_ad); else return -EIO; neloc.logicalBlockNum = block; neloc.partitionReferenceNum = epos->block.partitionReferenceNum; bh = sb_getblk(sb, udf_get_lb_pblock(sb, &neloc, 0)); if (!bh) return -EIO; lock_buffer(bh); memset(bh->b_data, 0x00, sb->s_blocksize); set_buffer_uptodate(bh); unlock_buffer(bh); mark_buffer_dirty_inode(bh, inode); aed = (struct allocExtDesc *)(bh->b_data); if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT)) { aed->previousAllocExtLocation = cpu_to_le32(epos->block.logicalBlockNum); } aed->lengthAllocDescs = cpu_to_le32(0); if (UDF_SB(sb)->s_udfrev >= 0x0200) ver = 3; else ver = 2; udf_new_tag(bh->b_data, TAG_IDENT_AED, ver, 1, block, sizeof(struct tag)); nepos.block = neloc; nepos.offset = sizeof(struct allocExtDesc); nepos.bh = bh; /* * Do we have to copy current last extent to make space for indirect * one? */ if (epos->offset + adsize > sb->s_blocksize) { struct kernel_lb_addr cp_loc; uint32_t cp_len; int8_t cp_type; epos->offset -= adsize; err = udf_current_aext(inode, epos, &cp_loc, &cp_len, &cp_type, 0); if (err <= 0) goto err_out; cp_len |= ((uint32_t)cp_type) << 30; __udf_add_aext(inode, &nepos, &cp_loc, cp_len, 1); udf_write_aext(inode, epos, &nepos.block, sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDESCS, 0); } else { __udf_add_aext(inode, epos, &nepos.block, sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDESCS, 0); } brelse(epos->bh); *epos = nepos; return 0; err_out: brelse(bh); return err; } /* * Append extent at the given position - should be the first free one in inode * / indirect extent. This function assumes there is enough space in the inode * or indirect extent. Use udf_add_aext() if you didn't check for this before. */ int __udf_add_aext(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, uint32_t elen, int inc) { struct udf_inode_info *iinfo = UDF_I(inode); struct allocExtDesc *aed; int adsize; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) adsize = sizeof(struct long_ad); else return -EIO; if (!epos->bh) { WARN_ON(iinfo->i_lenAlloc != epos->offset - udf_file_entry_alloc_offset(inode)); } else { aed = (struct allocExtDesc *)epos->bh->b_data; WARN_ON(le32_to_cpu(aed->lengthAllocDescs) != epos->offset - sizeof(struct allocExtDesc)); WARN_ON(epos->offset + adsize > inode->i_sb->s_blocksize); } udf_write_aext(inode, epos, eloc, elen, inc); if (!epos->bh) { iinfo->i_lenAlloc += adsize; mark_inode_dirty(inode); } else { aed = (struct allocExtDesc *)epos->bh->b_data; le32_add_cpu(&aed->lengthAllocDescs, adsize); if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) udf_update_tag(epos->bh->b_data, epos->offset + (inc ? 0 : adsize)); else udf_update_tag(epos->bh->b_data, sizeof(struct allocExtDesc)); mark_buffer_dirty_inode(epos->bh, inode); } return 0; } /* * Append extent at given position - should be the first free one in inode * / indirect extent. Takes care of allocating and linking indirect blocks. */ int udf_add_aext(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, uint32_t elen, int inc) { int adsize; struct super_block *sb = inode->i_sb; if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); else if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_LONG) adsize = sizeof(struct long_ad); else return -EIO; if (epos->offset + (2 * adsize) > sb->s_blocksize) { int err; udf_pblk_t new_block; new_block = udf_new_block(sb, NULL, epos->block.partitionReferenceNum, epos->block.logicalBlockNum, &err); if (!new_block) return -ENOSPC; err = udf_setup_indirect_aext(inode, new_block, epos); if (err) return err; } return __udf_add_aext(inode, epos, eloc, elen, inc); } void udf_write_aext(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, uint32_t elen, int inc) { int adsize; uint8_t *ptr; struct short_ad *sad; struct long_ad *lad; struct udf_inode_info *iinfo = UDF_I(inode); if (!epos->bh) ptr = iinfo->i_data + epos->offset - udf_file_entry_alloc_offset(inode) + iinfo->i_lenEAttr; else ptr = epos->bh->b_data + epos->offset; switch (iinfo->i_alloc_type) { case ICBTAG_FLAG_AD_SHORT: sad = (struct short_ad *)ptr; sad->extLength = cpu_to_le32(elen); sad->extPosition = cpu_to_le32(eloc->logicalBlockNum); adsize = sizeof(struct short_ad); break; case ICBTAG_FLAG_AD_LONG: lad = (struct long_ad *)ptr; lad->extLength = cpu_to_le32(elen); lad->extLocation = cpu_to_lelb(*eloc); memset(lad->impUse, 0x00, sizeof(lad->impUse)); adsize = sizeof(struct long_ad); break; default: return; } if (epos->bh) { if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) { struct allocExtDesc *aed = (struct allocExtDesc *)epos->bh->b_data; udf_update_tag(epos->bh->b_data, le32_to_cpu(aed->lengthAllocDescs) + sizeof(struct allocExtDesc)); } mark_buffer_dirty_inode(epos->bh, inode); } else { mark_inode_dirty(inode); } if (inc) epos->offset += adsize; } /* * Only 1 indirect extent in a row really makes sense but allow upto 16 in case * someone does some weird stuff. */ #define UDF_MAX_INDIR_EXTS 16 /* * Returns 1 on success, -errno on error, 0 on hit EOF. */ int udf_next_aext(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, uint32_t *elen, int8_t *etype, int inc) { unsigned int indirections = 0; int ret = 0; udf_pblk_t block; while (1) { ret = udf_current_aext(inode, epos, eloc, elen, etype, inc); if (ret <= 0) return ret; if (*etype != (EXT_NEXT_EXTENT_ALLOCDESCS >> 30)) return ret; if (++indirections > UDF_MAX_INDIR_EXTS) { udf_err(inode->i_sb, "too many indirect extents in inode %lu\n", inode->i_ino); return -EFSCORRUPTED; } epos->block = *eloc; epos->offset = sizeof(struct allocExtDesc); brelse(epos->bh); block = udf_get_lb_pblock(inode->i_sb, &epos->block, 0); epos->bh = sb_bread(inode->i_sb, block); if (!epos->bh) { udf_debug("reading block %u failed!\n", block); return -EIO; } } } /* * Returns 1 on success, -errno on error, 0 on hit EOF. */ int udf_current_aext(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, uint32_t *elen, int8_t *etype, int inc) { int alen; uint8_t *ptr; struct short_ad *sad; struct long_ad *lad; struct udf_inode_info *iinfo = UDF_I(inode); if (!epos->bh) { if (!epos->offset) epos->offset = udf_file_entry_alloc_offset(inode); ptr = iinfo->i_data + epos->offset - udf_file_entry_alloc_offset(inode) + iinfo->i_lenEAttr; alen = udf_file_entry_alloc_offset(inode) + iinfo->i_lenAlloc; } else { struct allocExtDesc *header = (struct allocExtDesc *)epos->bh->b_data; if (!epos->offset) epos->offset = sizeof(struct allocExtDesc); ptr = epos->bh->b_data + epos->offset; if (check_add_overflow(sizeof(struct allocExtDesc), le32_to_cpu(header->lengthAllocDescs), &alen)) return -1; } switch (iinfo->i_alloc_type) { case ICBTAG_FLAG_AD_SHORT: sad = udf_get_fileshortad(ptr, alen, &epos->offset, inc); if (!sad) return 0; *etype = le32_to_cpu(sad->extLength) >> 30; eloc->logicalBlockNum = le32_to_cpu(sad->extPosition); eloc->partitionReferenceNum = iinfo->i_location.partitionReferenceNum; *elen = le32_to_cpu(sad->extLength) & UDF_EXTENT_LENGTH_MASK; break; case ICBTAG_FLAG_AD_LONG: lad = udf_get_filelongad(ptr, alen, &epos->offset, inc); if (!lad) return 0; *etype = le32_to_cpu(lad->extLength) >> 30; *eloc = lelb_to_cpu(lad->extLocation); *elen = le32_to_cpu(lad->extLength) & UDF_EXTENT_LENGTH_MASK; break; default: udf_debug("alloc_type = %u unsupported\n", iinfo->i_alloc_type); return -EINVAL; } return 1; } static int udf_insert_aext(struct inode *inode, struct extent_position epos, struct kernel_lb_addr neloc, uint32_t nelen) { struct kernel_lb_addr oeloc; uint32_t oelen; int8_t etype; int ret; if (epos.bh) get_bh(epos.bh); while (1) { ret = udf_next_aext(inode, &epos, &oeloc, &oelen, &etype, 0); if (ret <= 0) break; udf_write_aext(inode, &epos, &neloc, nelen, 1); neloc = oeloc; nelen = (etype << 30) | oelen; } if (ret == 0) ret = udf_add_aext(inode, &epos, &neloc, nelen, 1); brelse(epos.bh); return ret; } int8_t udf_delete_aext(struct inode *inode, struct extent_position epos) { struct extent_position oepos; int adsize; int8_t etype; struct allocExtDesc *aed; struct udf_inode_info *iinfo; struct kernel_lb_addr eloc; uint32_t elen; int ret; if (epos.bh) { get_bh(epos.bh); get_bh(epos.bh); } iinfo = UDF_I(inode); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) adsize = sizeof(struct long_ad); else adsize = 0; oepos = epos; if (udf_next_aext(inode, &epos, &eloc, &elen, &etype, 1) <= 0) return -1; while (1) { ret = udf_next_aext(inode, &epos, &eloc, &elen, &etype, 1); if (ret < 0) { brelse(epos.bh); brelse(oepos.bh); return -1; } if (ret == 0) break; udf_write_aext(inode, &oepos, &eloc, (etype << 30) | elen, 1); if (oepos.bh != epos.bh) { oepos.block = epos.block; brelse(oepos.bh); get_bh(epos.bh); oepos.bh = epos.bh; oepos.offset = epos.offset - adsize; } } memset(&eloc, 0x00, sizeof(struct kernel_lb_addr)); elen = 0; if (epos.bh != oepos.bh) { udf_free_blocks(inode->i_sb, inode, &epos.block, 0, 1); udf_write_aext(inode, &oepos, &eloc, elen, 1); udf_write_aext(inode, &oepos, &eloc, elen, 1); if (!oepos.bh) { iinfo->i_lenAlloc -= (adsize * 2); mark_inode_dirty(inode); } else { aed = (struct allocExtDesc *)oepos.bh->b_data; le32_add_cpu(&aed->lengthAllocDescs, -(2 * adsize)); if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) udf_update_tag(oepos.bh->b_data, oepos.offset - (2 * adsize)); else udf_update_tag(oepos.bh->b_data, sizeof(struct allocExtDesc)); mark_buffer_dirty_inode(oepos.bh, inode); } } else { udf_write_aext(inode, &oepos, &eloc, elen, 1); if (!oepos.bh) { iinfo->i_lenAlloc -= adsize; mark_inode_dirty(inode); } else { aed = (struct allocExtDesc *)oepos.bh->b_data; le32_add_cpu(&aed->lengthAllocDescs, -adsize); if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) udf_update_tag(oepos.bh->b_data, epos.offset - adsize); else udf_update_tag(oepos.bh->b_data, sizeof(struct allocExtDesc)); mark_buffer_dirty_inode(oepos.bh, inode); } } brelse(epos.bh); brelse(oepos.bh); return (elen >> 30); } /* * Returns 1 on success, -errno on error, 0 on hit EOF. */ int inode_bmap(struct inode *inode, sector_t block, struct extent_position *pos, struct kernel_lb_addr *eloc, uint32_t *elen, sector_t *offset, int8_t *etype) { unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; loff_t lbcount = 0, bcount = (loff_t) block << blocksize_bits; struct udf_inode_info *iinfo; int err = 0; iinfo = UDF_I(inode); if (!udf_read_extent_cache(inode, bcount, &lbcount, pos)) { pos->offset = 0; pos->block = iinfo->i_location; pos->bh = NULL; } *elen = 0; do { err = udf_next_aext(inode, pos, eloc, elen, etype, 1); if (err <= 0) { if (err == 0) { *offset = (bcount - lbcount) >> blocksize_bits; iinfo->i_lenExtents = lbcount; } return err; } lbcount += *elen; } while (lbcount <= bcount); /* update extent cache */ udf_update_extent_cache(inode, lbcount - *elen, pos); *offset = (bcount + *elen - lbcount) >> blocksize_bits; return 1; }
3 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM snd_pcm #define TRACE_INCLUDE_FILE pcm_trace #if !defined(_PCM_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) #define _PCM_TRACE_H #include <linux/tracepoint.h> TRACE_EVENT(hwptr, TP_PROTO(struct snd_pcm_substream *substream, snd_pcm_uframes_t pos, bool irq), TP_ARGS(substream, pos, irq), TP_STRUCT__entry( __field( bool, in_interrupt ) __field( unsigned int, card ) __field( unsigned int, device ) __field( unsigned int, number ) __field( unsigned int, stream ) __field( snd_pcm_uframes_t, pos ) __field( snd_pcm_uframes_t, period_size ) __field( snd_pcm_uframes_t, buffer_size ) __field( snd_pcm_uframes_t, old_hw_ptr ) __field( snd_pcm_uframes_t, hw_ptr_base ) ), TP_fast_assign( __entry->in_interrupt = (irq); __entry->card = (substream)->pcm->card->number; __entry->device = (substream)->pcm->device; __entry->number = (substream)->number; __entry->stream = (substream)->stream; __entry->pos = (pos); __entry->period_size = (substream)->runtime->period_size; __entry->buffer_size = (substream)->runtime->buffer_size; __entry->old_hw_ptr = (substream)->runtime->status->hw_ptr; __entry->hw_ptr_base = (substream)->runtime->hw_ptr_base; ), TP_printk("pcmC%dD%d%s/sub%d: %s: pos=%lu, old=%lu, base=%lu, period=%lu, buf=%lu", __entry->card, __entry->device, __entry->stream == SNDRV_PCM_STREAM_PLAYBACK ? "p" : "c", __entry->number, __entry->in_interrupt ? "IRQ" : "POS", (unsigned long)__entry->pos, (unsigned long)__entry->old_hw_ptr, (unsigned long)__entry->hw_ptr_base, (unsigned long)__entry->period_size, (unsigned long)__entry->buffer_size) ); TRACE_EVENT(xrun, TP_PROTO(struct snd_pcm_substream *substream), TP_ARGS(substream), TP_STRUCT__entry( __field( unsigned int, card ) __field( unsigned int, device ) __field( unsigned int, number ) __field( unsigned int, stream ) __field( snd_pcm_uframes_t, period_size ) __field( snd_pcm_uframes_t, buffer_size ) __field( snd_pcm_uframes_t, old_hw_ptr ) __field( snd_pcm_uframes_t, hw_ptr_base ) ), TP_fast_assign( __entry->card = (substream)->pcm->card->number; __entry->device = (substream)->pcm->device; __entry->number = (substream)->number; __entry->stream = (substream)->stream; __entry->period_size = (substream)->runtime->period_size; __entry->buffer_size = (substream)->runtime->buffer_size; __entry->old_hw_ptr = (substream)->runtime->status->hw_ptr; __entry->hw_ptr_base = (substream)->runtime->hw_ptr_base; ), TP_printk("pcmC%dD%d%s/sub%d: XRUN: old=%lu, base=%lu, period=%lu, buf=%lu", __entry->card, __entry->device, __entry->stream == SNDRV_PCM_STREAM_PLAYBACK ? "p" : "c", __entry->number, (unsigned long)__entry->old_hw_ptr, (unsigned long)__entry->hw_ptr_base, (unsigned long)__entry->period_size, (unsigned long)__entry->buffer_size) ); TRACE_EVENT(hw_ptr_error, TP_PROTO(struct snd_pcm_substream *substream, const char *why), TP_ARGS(substream, why), TP_STRUCT__entry( __field( unsigned int, card ) __field( unsigned int, device ) __field( unsigned int, number ) __field( unsigned int, stream ) __string( reason, why ) ), TP_fast_assign( __entry->card = (substream)->pcm->card->number; __entry->device = (substream)->pcm->device; __entry->number = (substream)->number; __entry->stream = (substream)->stream; __assign_str(reason); ), TP_printk("pcmC%dD%d%s/sub%d: ERROR: %s", __entry->card, __entry->device, __entry->stream == SNDRV_PCM_STREAM_PLAYBACK ? "p" : "c", __entry->number, __get_str(reason)) ); TRACE_EVENT(applptr, TP_PROTO(struct snd_pcm_substream *substream, snd_pcm_uframes_t prev, snd_pcm_uframes_t curr), TP_ARGS(substream, prev, curr), TP_STRUCT__entry( __field( unsigned int, card ) __field( unsigned int, device ) __field( unsigned int, number ) __field( unsigned int, stream ) __field( snd_pcm_uframes_t, prev ) __field( snd_pcm_uframes_t, curr ) __field( snd_pcm_uframes_t, avail ) __field( snd_pcm_uframes_t, period_size ) __field( snd_pcm_uframes_t, buffer_size ) ), TP_fast_assign( __entry->card = (substream)->pcm->card->number; __entry->device = (substream)->pcm->device; __entry->number = (substream)->number; __entry->stream = (substream)->stream; __entry->prev = (prev); __entry->curr = (curr); __entry->avail = (substream)->stream ? snd_pcm_capture_avail(substream->runtime) : snd_pcm_playback_avail(substream->runtime); __entry->period_size = (substream)->runtime->period_size; __entry->buffer_size = (substream)->runtime->buffer_size; ), TP_printk("pcmC%dD%d%s/sub%d: prev=%lu, curr=%lu, avail=%lu, period=%lu, buf=%lu", __entry->card, __entry->device, __entry->stream ? "c" : "p", __entry->number, __entry->prev, __entry->curr, __entry->avail, __entry->period_size, __entry->buffer_size ) ); #endif /* _PCM_TRACE_H */ /* This part must be outside protection */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #include <trace/define_trace.h>
206 27 1104 154 23 1 4 1 271 501 838 345 499 846 1202 150 32 120 156 1313 1304 80 80 346 15 17 52 53 1046 1047 1046 1047 104 103 18 80 82 68 81 460 465 365 104 326 226 99 32 4 3 7 316 316 273 274 17 17 1 1 2 120 17 54 2 17 17 14 3 17 90 90 282 17 180 3 90 48 311 239 1 23 91 90 91 6 90 19 19 18 48 48 220 39 19 48 4 710 5 259 274 18 2 217 36 260 257 2 2 715 5 710 964 722 4 251 3 3 227 244 971 965 1 2 709 48 48 48 14 14 48 48 1 1 1 172 173 37 220 173 1 48 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 // SPDX-License-Identifier: GPL-2.0-only #include <linux/export.h> #include <linux/bvec.h> #include <linux/fault-inject-usercopy.h> #include <linux/uio.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/splice.h> #include <linux/compat.h> #include <linux/scatterlist.h> #include <linux/instrumented.h> #include <linux/iov_iter.h> static __always_inline size_t copy_to_user_iter(void __user *iter_to, size_t progress, size_t len, void *from, void *priv2) { if (should_fail_usercopy()) return len; if (access_ok(iter_to, len)) { from += progress; instrument_copy_to_user(iter_to, from, len); len = raw_copy_to_user(iter_to, from, len); } return len; } static __always_inline size_t copy_to_user_iter_nofault(void __user *iter_to, size_t progress, size_t len, void *from, void *priv2) { ssize_t res; if (should_fail_usercopy()) return len; from += progress; res = copy_to_user_nofault(iter_to, from, len); return res < 0 ? len : res; } static __always_inline size_t copy_from_user_iter(void __user *iter_from, size_t progress, size_t len, void *to, void *priv2) { size_t res = len; if (should_fail_usercopy()) return len; if (access_ok(iter_from, len)) { to += progress; instrument_copy_from_user_before(to, iter_from, len); res = raw_copy_from_user(to, iter_from, len); instrument_copy_from_user_after(to, iter_from, len, res); } return res; } static __always_inline size_t memcpy_to_iter(void *iter_to, size_t progress, size_t len, void *from, void *priv2) { memcpy(iter_to, from + progress, len); return 0; } static __always_inline size_t memcpy_from_iter(void *iter_from, size_t progress, size_t len, void *to, void *priv2) { memcpy(to + progress, iter_from, len); return 0; } /* * fault_in_iov_iter_readable - fault in iov iterator for reading * @i: iterator * @size: maximum length * * Fault in one or more iovecs of the given iov_iter, to a maximum length of * @size. For each iovec, fault in each page that constitutes the iovec. * * Returns the number of bytes not faulted in (like copy_to_user() and * copy_from_user()). * * Always returns 0 for non-userspace iterators. */ size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size) { if (iter_is_ubuf(i)) { size_t n = min(size, iov_iter_count(i)); n -= fault_in_readable(i->ubuf + i->iov_offset, n); return size - n; } else if (iter_is_iovec(i)) { size_t count = min(size, iov_iter_count(i)); const struct iovec *p; size_t skip; size -= count; for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) { size_t len = min(count, p->iov_len - skip); size_t ret; if (unlikely(!len)) continue; ret = fault_in_readable(p->iov_base + skip, len); count -= len - ret; if (ret) break; } return count + size; } return 0; } EXPORT_SYMBOL(fault_in_iov_iter_readable); /* * fault_in_iov_iter_writeable - fault in iov iterator for writing * @i: iterator * @size: maximum length * * Faults in the iterator using get_user_pages(), i.e., without triggering * hardware page faults. This is primarily useful when we already know that * some or all of the pages in @i aren't in memory. * * Returns the number of bytes not faulted in, like copy_to_user() and * copy_from_user(). * * Always returns 0 for non-user-space iterators. */ size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size) { if (iter_is_ubuf(i)) { size_t n = min(size, iov_iter_count(i)); n -= fault_in_safe_writeable(i->ubuf + i->iov_offset, n); return size - n; } else if (iter_is_iovec(i)) { size_t count = min(size, iov_iter_count(i)); const struct iovec *p; size_t skip; size -= count; for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) { size_t len = min(count, p->iov_len - skip); size_t ret; if (unlikely(!len)) continue; ret = fault_in_safe_writeable(p->iov_base + skip, len); count -= len - ret; if (ret) break; } return count + size; } return 0; } EXPORT_SYMBOL(fault_in_iov_iter_writeable); void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, unsigned long nr_segs, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter) { .iter_type = ITER_IOVEC, .nofault = false, .data_source = direction, .__iov = iov, .nr_segs = nr_segs, .iov_offset = 0, .count = count }; } EXPORT_SYMBOL(iov_iter_init); size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(i->data_source)) return 0; if (user_backed_iter(i)) might_fault(); return iterate_and_advance(i, bytes, (void *)addr, copy_to_user_iter, memcpy_to_iter); } EXPORT_SYMBOL(_copy_to_iter); #ifdef CONFIG_ARCH_HAS_COPY_MC static __always_inline size_t copy_to_user_iter_mc(void __user *iter_to, size_t progress, size_t len, void *from, void *priv2) { if (access_ok(iter_to, len)) { from += progress; instrument_copy_to_user(iter_to, from, len); len = copy_mc_to_user(iter_to, from, len); } return len; } static __always_inline size_t memcpy_to_iter_mc(void *iter_to, size_t progress, size_t len, void *from, void *priv2) { return copy_mc_to_kernel(iter_to, from + progress, len); } /** * _copy_mc_to_iter - copy to iter with source memory error exception handling * @addr: source kernel address * @bytes: total transfer length * @i: destination iterator * * The pmem driver deploys this for the dax operation * (dax_copy_to_iter()) for dax reads (bypass page-cache and the * block-layer). Upon #MC read(2) aborts and returns EIO or the bytes * successfully copied. * * The main differences between this and typical _copy_to_iter(). * * * Typical tail/residue handling after a fault retries the copy * byte-by-byte until the fault happens again. Re-triggering machine * checks is potentially fatal so the implementation uses source * alignment and poison alignment assumptions to avoid re-triggering * hardware exceptions. * * * ITER_KVEC and ITER_BVEC can return short copies. Compare to * copy_to_iter() where only ITER_IOVEC attempts might return a short copy. * * Return: number of bytes copied (may be %0) */ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(i->data_source)) return 0; if (user_backed_iter(i)) might_fault(); return iterate_and_advance(i, bytes, (void *)addr, copy_to_user_iter_mc, memcpy_to_iter_mc); } EXPORT_SYMBOL_GPL(_copy_mc_to_iter); #endif /* CONFIG_ARCH_HAS_COPY_MC */ static __always_inline size_t __copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { return iterate_and_advance(i, bytes, addr, copy_from_user_iter, memcpy_from_iter); } size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(!i->data_source)) return 0; if (user_backed_iter(i)) might_fault(); return __copy_from_iter(addr, bytes, i); } EXPORT_SYMBOL(_copy_from_iter); static __always_inline size_t copy_from_user_iter_nocache(void __user *iter_from, size_t progress, size_t len, void *to, void *priv2) { return __copy_from_user_inatomic_nocache(to + progress, iter_from, len); } size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(!i->data_source)) return 0; return iterate_and_advance(i, bytes, addr, copy_from_user_iter_nocache, memcpy_from_iter); } EXPORT_SYMBOL(_copy_from_iter_nocache); #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE static __always_inline size_t copy_from_user_iter_flushcache(void __user *iter_from, size_t progress, size_t len, void *to, void *priv2) { return __copy_from_user_flushcache(to + progress, iter_from, len); } static __always_inline size_t memcpy_from_iter_flushcache(void *iter_from, size_t progress, size_t len, void *to, void *priv2) { memcpy_flushcache(to + progress, iter_from, len); return 0; } /** * _copy_from_iter_flushcache - write destination through cpu cache * @addr: destination kernel address * @bytes: total transfer length * @i: source iterator * * The pmem driver arranges for filesystem-dax to use this facility via * dax_copy_from_iter() for ensuring that writes to persistent memory * are flushed through the CPU cache. It is differentiated from * _copy_from_iter_nocache() in that guarantees all data is flushed for * all iterator types. The _copy_from_iter_nocache() only attempts to * bypass the cache for the ITER_IOVEC case, and on some archs may use * instructions that strand dirty-data in the cache. * * Return: number of bytes copied (may be %0) */ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(!i->data_source)) return 0; return iterate_and_advance(i, bytes, addr, copy_from_user_iter_flushcache, memcpy_from_iter_flushcache); } EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache); #endif static inline bool page_copy_sane(struct page *page, size_t offset, size_t n) { struct page *head; size_t v = n + offset; /* * The general case needs to access the page order in order * to compute the page size. * However, we mostly deal with order-0 pages and thus can * avoid a possible cache line miss for requests that fit all * page orders. */ if (n <= v && v <= PAGE_SIZE) return true; head = compound_head(page); v += (page - head) << PAGE_SHIFT; if (WARN_ON(n > v || v > page_size(head))) return false; return true; } size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { size_t res = 0; if (!page_copy_sane(page, offset, bytes)) return 0; if (WARN_ON_ONCE(i->data_source)) return 0; page += offset / PAGE_SIZE; // first subpage offset %= PAGE_SIZE; while (1) { void *kaddr = kmap_local_page(page); size_t n = min(bytes, (size_t)PAGE_SIZE - offset); n = _copy_to_iter(kaddr + offset, n, i); kunmap_local(kaddr); res += n; bytes -= n; if (!bytes || !n) break; offset += n; if (offset == PAGE_SIZE) { page++; offset = 0; } } return res; } EXPORT_SYMBOL(copy_page_to_iter); size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes, struct iov_iter *i) { size_t res = 0; if (!page_copy_sane(page, offset, bytes)) return 0; if (WARN_ON_ONCE(i->data_source)) return 0; page += offset / PAGE_SIZE; // first subpage offset %= PAGE_SIZE; while (1) { void *kaddr = kmap_local_page(page); size_t n = min(bytes, (size_t)PAGE_SIZE - offset); n = iterate_and_advance(i, n, kaddr + offset, copy_to_user_iter_nofault, memcpy_to_iter); kunmap_local(kaddr); res += n; bytes -= n; if (!bytes || !n) break; offset += n; if (offset == PAGE_SIZE) { page++; offset = 0; } } return res; } EXPORT_SYMBOL(copy_page_to_iter_nofault); size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { size_t res = 0; if (!page_copy_sane(page, offset, bytes)) return 0; page += offset / PAGE_SIZE; // first subpage offset %= PAGE_SIZE; while (1) { void *kaddr = kmap_local_page(page); size_t n = min(bytes, (size_t)PAGE_SIZE - offset); n = _copy_from_iter(kaddr + offset, n, i); kunmap_local(kaddr); res += n; bytes -= n; if (!bytes || !n) break; offset += n; if (offset == PAGE_SIZE) { page++; offset = 0; } } return res; } EXPORT_SYMBOL(copy_page_from_iter); static __always_inline size_t zero_to_user_iter(void __user *iter_to, size_t progress, size_t len, void *priv, void *priv2) { return clear_user(iter_to, len); } static __always_inline size_t zero_to_iter(void *iter_to, size_t progress, size_t len, void *priv, void *priv2) { memset(iter_to, 0, len); return 0; } size_t iov_iter_zero(size_t bytes, struct iov_iter *i) { return iterate_and_advance(i, bytes, NULL, zero_to_user_iter, zero_to_iter); } EXPORT_SYMBOL(iov_iter_zero); size_t copy_page_from_iter_atomic(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { size_t n, copied = 0; bool uses_kmap = IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP) || PageHighMem(page); if (!page_copy_sane(page, offset, bytes)) return 0; if (WARN_ON_ONCE(!i->data_source)) return 0; do { char *p; n = bytes - copied; if (uses_kmap) { page += offset / PAGE_SIZE; offset %= PAGE_SIZE; n = min_t(size_t, n, PAGE_SIZE - offset); } p = kmap_atomic(page) + offset; n = __copy_from_iter(p, n, i); kunmap_atomic(p); copied += n; offset += n; } while (uses_kmap && copied != bytes && n > 0); return copied; } EXPORT_SYMBOL(copy_page_from_iter_atomic); static void iov_iter_bvec_advance(struct iov_iter *i, size_t size) { const struct bio_vec *bvec, *end; if (!i->count) return; i->count -= size; size += i->iov_offset; for (bvec = i->bvec, end = bvec + i->nr_segs; bvec < end; bvec++) { if (likely(size < bvec->bv_len)) break; size -= bvec->bv_len; } i->iov_offset = size; i->nr_segs -= bvec - i->bvec; i->bvec = bvec; } static void iov_iter_iovec_advance(struct iov_iter *i, size_t size) { const struct iovec *iov, *end; if (!i->count) return; i->count -= size; size += i->iov_offset; // from beginning of current segment for (iov = iter_iov(i), end = iov + i->nr_segs; iov < end; iov++) { if (likely(size < iov->iov_len)) break; size -= iov->iov_len; } i->iov_offset = size; i->nr_segs -= iov - iter_iov(i); i->__iov = iov; } static void iov_iter_folioq_advance(struct iov_iter *i, size_t size) { const struct folio_queue *folioq = i->folioq; unsigned int slot = i->folioq_slot; if (!i->count) return; i->count -= size; if (slot >= folioq_nr_slots(folioq)) { folioq = folioq->next; slot = 0; } size += i->iov_offset; /* From beginning of current segment. */ do { size_t fsize = folioq_folio_size(folioq, slot); if (likely(size < fsize)) break; size -= fsize; slot++; if (slot >= folioq_nr_slots(folioq) && folioq->next) { folioq = folioq->next; slot = 0; } } while (size); i->iov_offset = size; i->folioq_slot = slot; i->folioq = folioq; } void iov_iter_advance(struct iov_iter *i, size_t size) { if (unlikely(i->count < size)) size = i->count; if (likely(iter_is_ubuf(i)) || unlikely(iov_iter_is_xarray(i))) { i->iov_offset += size; i->count -= size; } else if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) { /* iovec and kvec have identical layouts */ iov_iter_iovec_advance(i, size); } else if (iov_iter_is_bvec(i)) { iov_iter_bvec_advance(i, size); } else if (iov_iter_is_folioq(i)) { iov_iter_folioq_advance(i, size); } else if (iov_iter_is_discard(i)) { i->count -= size; } } EXPORT_SYMBOL(iov_iter_advance); static void iov_iter_folioq_revert(struct iov_iter *i, size_t unroll) { const struct folio_queue *folioq = i->folioq; unsigned int slot = i->folioq_slot; for (;;) { size_t fsize; if (slot == 0) { folioq = folioq->prev; slot = folioq_nr_slots(folioq); } slot--; fsize = folioq_folio_size(folioq, slot); if (unroll <= fsize) { i->iov_offset = fsize - unroll; break; } unroll -= fsize; } i->folioq_slot = slot; i->folioq = folioq; } void iov_iter_revert(struct iov_iter *i, size_t unroll) { if (!unroll) return; if (WARN_ON(unroll > MAX_RW_COUNT)) return; i->count += unroll; if (unlikely(iov_iter_is_discard(i))) return; if (unroll <= i->iov_offset) { i->iov_offset -= unroll; return; } unroll -= i->iov_offset; if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) { BUG(); /* We should never go beyond the start of the specified * range since we might then be straying into pages that * aren't pinned. */ } else if (iov_iter_is_bvec(i)) { const struct bio_vec *bvec = i->bvec; while (1) { size_t n = (--bvec)->bv_len; i->nr_segs++; if (unroll <= n) { i->bvec = bvec; i->iov_offset = n - unroll; return; } unroll -= n; } } else if (iov_iter_is_folioq(i)) { i->iov_offset = 0; iov_iter_folioq_revert(i, unroll); } else { /* same logics for iovec and kvec */ const struct iovec *iov = iter_iov(i); while (1) { size_t n = (--iov)->iov_len; i->nr_segs++; if (unroll <= n) { i->__iov = iov; i->iov_offset = n - unroll; return; } unroll -= n; } } } EXPORT_SYMBOL(iov_iter_revert); /* * Return the count of just the current iov_iter segment. */ size_t iov_iter_single_seg_count(const struct iov_iter *i) { if (i->nr_segs > 1) { if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return min(i->count, iter_iov(i)->iov_len - i->iov_offset); if (iov_iter_is_bvec(i)) return min(i->count, i->bvec->bv_len - i->iov_offset); } if (unlikely(iov_iter_is_folioq(i))) return !i->count ? 0 : umin(folioq_folio_size(i->folioq, i->folioq_slot), i->count); return i->count; } EXPORT_SYMBOL(iov_iter_single_seg_count); void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec, unsigned long nr_segs, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter){ .iter_type = ITER_KVEC, .data_source = direction, .kvec = kvec, .nr_segs = nr_segs, .iov_offset = 0, .count = count }; } EXPORT_SYMBOL(iov_iter_kvec); void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter){ .iter_type = ITER_BVEC, .data_source = direction, .bvec = bvec, .nr_segs = nr_segs, .iov_offset = 0, .count = count }; } EXPORT_SYMBOL(iov_iter_bvec); /** * iov_iter_folio_queue - Initialise an I/O iterator to use the folios in a folio queue * @i: The iterator to initialise. * @direction: The direction of the transfer. * @folioq: The starting point in the folio queue. * @first_slot: The first slot in the folio queue to use * @offset: The offset into the folio in the first slot to start at * @count: The size of the I/O buffer in bytes. * * Set up an I/O iterator to either draw data out of the pages attached to an * inode or to inject data into those pages. The pages *must* be prevented * from evaporation, either by taking a ref on them or locking them by the * caller. */ void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction, const struct folio_queue *folioq, unsigned int first_slot, unsigned int offset, size_t count) { BUG_ON(direction & ~1); *i = (struct iov_iter) { .iter_type = ITER_FOLIOQ, .data_source = direction, .folioq = folioq, .folioq_slot = first_slot, .count = count, .iov_offset = offset, }; } EXPORT_SYMBOL(iov_iter_folio_queue); /** * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray * @i: The iterator to initialise. * @direction: The direction of the transfer. * @xarray: The xarray to access. * @start: The start file position. * @count: The size of the I/O buffer in bytes. * * Set up an I/O iterator to either draw data out of the pages attached to an * inode or to inject data into those pages. The pages *must* be prevented * from evaporation, either by taking a ref on them or locking them by the * caller. */ void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray, loff_t start, size_t count) { BUG_ON(direction & ~1); *i = (struct iov_iter) { .iter_type = ITER_XARRAY, .data_source = direction, .xarray = xarray, .xarray_start = start, .count = count, .iov_offset = 0 }; } EXPORT_SYMBOL(iov_iter_xarray); /** * iov_iter_discard - Initialise an I/O iterator that discards data * @i: The iterator to initialise. * @direction: The direction of the transfer. * @count: The size of the I/O buffer in bytes. * * Set up an I/O iterator that just discards everything that's written to it. * It's only available as a READ iterator. */ void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count) { BUG_ON(direction != READ); *i = (struct iov_iter){ .iter_type = ITER_DISCARD, .data_source = false, .count = count, .iov_offset = 0 }; } EXPORT_SYMBOL(iov_iter_discard); static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask) { const struct iovec *iov = iter_iov(i); size_t size = i->count; size_t skip = i->iov_offset; do { size_t len = iov->iov_len - skip; if (len > size) len = size; if (len & len_mask) return false; if ((unsigned long)(iov->iov_base + skip) & addr_mask) return false; iov++; size -= len; skip = 0; } while (size); return true; } static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask) { const struct bio_vec *bvec = i->bvec; unsigned skip = i->iov_offset; size_t size = i->count; do { size_t len = bvec->bv_len; if (len > size) len = size; if (len & len_mask) return false; if ((unsigned long)(bvec->bv_offset + skip) & addr_mask) return false; bvec++; size -= len; skip = 0; } while (size); return true; } /** * iov_iter_is_aligned() - Check if the addresses and lengths of each segments * are aligned to the parameters. * * @i: &struct iov_iter to restore * @addr_mask: bit mask to check against the iov element's addresses * @len_mask: bit mask to check against the iov element's lengths * * Return: false if any addresses or lengths intersect with the provided masks */ bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask) { if (likely(iter_is_ubuf(i))) { if (i->count & len_mask) return false; if ((unsigned long)(i->ubuf + i->iov_offset) & addr_mask) return false; return true; } if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return iov_iter_aligned_iovec(i, addr_mask, len_mask); if (iov_iter_is_bvec(i)) return iov_iter_aligned_bvec(i, addr_mask, len_mask); /* With both xarray and folioq types, we're dealing with whole folios. */ if (iov_iter_is_xarray(i)) { if (i->count & len_mask) return false; if ((i->xarray_start + i->iov_offset) & addr_mask) return false; } if (iov_iter_is_folioq(i)) { if (i->count & len_mask) return false; if (i->iov_offset & addr_mask) return false; } return true; } EXPORT_SYMBOL_GPL(iov_iter_is_aligned); static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i) { const struct iovec *iov = iter_iov(i); unsigned long res = 0; size_t size = i->count; size_t skip = i->iov_offset; do { size_t len = iov->iov_len - skip; if (len) { res |= (unsigned long)iov->iov_base + skip; if (len > size) len = size; res |= len; size -= len; } iov++; skip = 0; } while (size); return res; } static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i) { const struct bio_vec *bvec = i->bvec; unsigned res = 0; size_t size = i->count; unsigned skip = i->iov_offset; do { size_t len = bvec->bv_len - skip; res |= (unsigned long)bvec->bv_offset + skip; if (len > size) len = size; res |= len; bvec++; size -= len; skip = 0; } while (size); return res; } unsigned long iov_iter_alignment(const struct iov_iter *i) { if (likely(iter_is_ubuf(i))) { size_t size = i->count; if (size) return ((unsigned long)i->ubuf + i->iov_offset) | size; return 0; } /* iovec and kvec have identical layouts */ if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return iov_iter_alignment_iovec(i); if (iov_iter_is_bvec(i)) return iov_iter_alignment_bvec(i); /* With both xarray and folioq types, we're dealing with whole folios. */ if (iov_iter_is_folioq(i)) return i->iov_offset | i->count; if (iov_iter_is_xarray(i)) return (i->xarray_start + i->iov_offset) | i->count; return 0; } EXPORT_SYMBOL(iov_iter_alignment); unsigned long iov_iter_gap_alignment(const struct iov_iter *i) { unsigned long res = 0; unsigned long v = 0; size_t size = i->count; unsigned k; if (iter_is_ubuf(i)) return 0; if (WARN_ON(!iter_is_iovec(i))) return ~0U; for (k = 0; k < i->nr_segs; k++) { const struct iovec *iov = iter_iov(i) + k; if (iov->iov_len) { unsigned long base = (unsigned long)iov->iov_base; if (v) // if not the first one res |= base | v; // this start | previous end v = base + iov->iov_len; if (size <= iov->iov_len) break; size -= iov->iov_len; } } return res; } EXPORT_SYMBOL(iov_iter_gap_alignment); static int want_pages_array(struct page ***res, size_t size, size_t start, unsigned int maxpages) { unsigned int count = DIV_ROUND_UP(size + start, PAGE_SIZE); if (count > maxpages) count = maxpages; WARN_ON(!count); // caller should've prevented that if (!*res) { *res = kvmalloc_array(count, sizeof(struct page *), GFP_KERNEL); if (!*res) return 0; } return count; } static ssize_t iter_folioq_get_pages(struct iov_iter *iter, struct page ***ppages, size_t maxsize, unsigned maxpages, size_t *_start_offset) { const struct folio_queue *folioq = iter->folioq; struct page **pages; unsigned int slot = iter->folioq_slot; size_t extracted = 0, count = iter->count, iov_offset = iter->iov_offset; if (slot >= folioq_nr_slots(folioq)) { folioq = folioq->next; slot = 0; if (WARN_ON(iov_offset != 0)) return -EIO; } maxpages = want_pages_array(ppages, maxsize, iov_offset & ~PAGE_MASK, maxpages); if (!maxpages) return -ENOMEM; *_start_offset = iov_offset & ~PAGE_MASK; pages = *ppages; for (;;) { struct folio *folio = folioq_folio(folioq, slot); size_t offset = iov_offset, fsize = folioq_folio_size(folioq, slot); size_t part = PAGE_SIZE - offset % PAGE_SIZE; if (offset < fsize) { part = umin(part, umin(maxsize - extracted, fsize - offset)); count -= part; iov_offset += part; extracted += part; *pages = folio_page(folio, offset / PAGE_SIZE); get_page(*pages); pages++; maxpages--; } if (maxpages == 0 || extracted >= maxsize) break; if (iov_offset >= fsize) { iov_offset = 0; slot++; if (slot == folioq_nr_slots(folioq) && folioq->next) { folioq = folioq->next; slot = 0; } } } iter->count = count; iter->iov_offset = iov_offset; iter->folioq = folioq; iter->folioq_slot = slot; return extracted; } static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa, pgoff_t index, unsigned int nr_pages) { XA_STATE(xas, xa, index); struct page *page; unsigned int ret = 0; rcu_read_lock(); for (page = xas_load(&xas); page; page = xas_next(&xas)) { if (xas_retry(&xas, page)) continue; /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) { xas_reset(&xas); continue; } pages[ret] = find_subpage(page, xas.xa_index); get_page(pages[ret]); if (++ret == nr_pages) break; } rcu_read_unlock(); return ret; } static ssize_t iter_xarray_get_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned maxpages, size_t *_start_offset) { unsigned nr, offset, count; pgoff_t index; loff_t pos; pos = i->xarray_start + i->iov_offset; index = pos >> PAGE_SHIFT; offset = pos & ~PAGE_MASK; *_start_offset = offset; count = want_pages_array(pages, maxsize, offset, maxpages); if (!count) return -ENOMEM; nr = iter_xarray_populate_pages(*pages, i->xarray, index, count); if (nr == 0) return 0; maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize); i->iov_offset += maxsize; i->count -= maxsize; return maxsize; } /* must be done on non-empty ITER_UBUF or ITER_IOVEC one */ static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size) { size_t skip; long k; if (iter_is_ubuf(i)) return (unsigned long)i->ubuf + i->iov_offset; for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) { const struct iovec *iov = iter_iov(i) + k; size_t len = iov->iov_len - skip; if (unlikely(!len)) continue; if (*size > len) *size = len; return (unsigned long)iov->iov_base + skip; } BUG(); // if it had been empty, we wouldn't get called } /* must be done on non-empty ITER_BVEC one */ static struct page *first_bvec_segment(const struct iov_iter *i, size_t *size, size_t *start) { struct page *page; size_t skip = i->iov_offset, len; len = i->bvec->bv_len - skip; if (*size > len) *size = len; skip += i->bvec->bv_offset; page = i->bvec->bv_page + skip / PAGE_SIZE; *start = skip % PAGE_SIZE; return page; } static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, size_t *start) { unsigned int n, gup_flags = 0; if (maxsize > i->count) maxsize = i->count; if (!maxsize) return 0; if (maxsize > MAX_RW_COUNT) maxsize = MAX_RW_COUNT; if (likely(user_backed_iter(i))) { unsigned long addr; int res; if (iov_iter_rw(i) != WRITE) gup_flags |= FOLL_WRITE; if (i->nofault) gup_flags |= FOLL_NOFAULT; addr = first_iovec_segment(i, &maxsize); *start = addr % PAGE_SIZE; addr &= PAGE_MASK; n = want_pages_array(pages, maxsize, *start, maxpages); if (!n) return -ENOMEM; res = get_user_pages_fast(addr, n, gup_flags, *pages); if (unlikely(res <= 0)) return res; maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - *start); iov_iter_advance(i, maxsize); return maxsize; } if (iov_iter_is_bvec(i)) { struct page **p; struct page *page; page = first_bvec_segment(i, &maxsize, start); n = want_pages_array(pages, maxsize, *start, maxpages); if (!n) return -ENOMEM; p = *pages; for (int k = 0; k < n; k++) get_page(p[k] = page + k); maxsize = min_t(size_t, maxsize, n * PAGE_SIZE - *start); i->count -= maxsize; i->iov_offset += maxsize; if (i->iov_offset == i->bvec->bv_len) { i->iov_offset = 0; i->bvec++; i->nr_segs--; } return maxsize; } if (iov_iter_is_folioq(i)) return iter_folioq_get_pages(i, pages, maxsize, maxpages, start); if (iov_iter_is_xarray(i)) return iter_xarray_get_pages(i, pages, maxsize, maxpages, start); return -EFAULT; } ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start) { if (!maxpages) return 0; BUG_ON(!pages); return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, start); } EXPORT_SYMBOL(iov_iter_get_pages2); ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start) { ssize_t len; *pages = NULL; len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start); if (len <= 0) { kvfree(*pages); *pages = NULL; } return len; } EXPORT_SYMBOL(iov_iter_get_pages_alloc2); static int iov_npages(const struct iov_iter *i, int maxpages) { size_t skip = i->iov_offset, size = i->count; const struct iovec *p; int npages = 0; for (p = iter_iov(i); size; skip = 0, p++) { unsigned offs = offset_in_page(p->iov_base + skip); size_t len = min(p->iov_len - skip, size); if (len) { size -= len; npages += DIV_ROUND_UP(offs + len, PAGE_SIZE); if (unlikely(npages > maxpages)) return maxpages; } } return npages; } static int bvec_npages(const struct iov_iter *i, int maxpages) { size_t skip = i->iov_offset, size = i->count; const struct bio_vec *p; int npages = 0; for (p = i->bvec; size; skip = 0, p++) { unsigned offs = (p->bv_offset + skip) % PAGE_SIZE; size_t len = min(p->bv_len - skip, size); size -= len; npages += DIV_ROUND_UP(offs + len, PAGE_SIZE); if (unlikely(npages > maxpages)) return maxpages; } return npages; } int iov_iter_npages(const struct iov_iter *i, int maxpages) { if (unlikely(!i->count)) return 0; if (likely(iter_is_ubuf(i))) { unsigned offs = offset_in_page(i->ubuf + i->iov_offset); int npages = DIV_ROUND_UP(offs + i->count, PAGE_SIZE); return min(npages, maxpages); } /* iovec and kvec have identical layouts */ if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return iov_npages(i, maxpages); if (iov_iter_is_bvec(i)) return bvec_npages(i, maxpages); if (iov_iter_is_folioq(i)) { unsigned offset = i->iov_offset % PAGE_SIZE; int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE); return min(npages, maxpages); } if (iov_iter_is_xarray(i)) { unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE; int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE); return min(npages, maxpages); } return 0; } EXPORT_SYMBOL(iov_iter_npages); const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) { *new = *old; if (iov_iter_is_bvec(new)) return new->bvec = kmemdup(new->bvec, new->nr_segs * sizeof(struct bio_vec), flags); else if (iov_iter_is_kvec(new) || iter_is_iovec(new)) /* iovec and kvec have identical layout */ return new->__iov = kmemdup(new->__iov, new->nr_segs * sizeof(struct iovec), flags); return NULL; } EXPORT_SYMBOL(dup_iter); static __noclone int copy_compat_iovec_from_user(struct iovec *iov, const struct iovec __user *uvec, u32 nr_segs) { const struct compat_iovec __user *uiov = (const struct compat_iovec __user *)uvec; int ret = -EFAULT; u32 i; if (!user_access_begin(uiov, nr_segs * sizeof(*uiov))) return -EFAULT; for (i = 0; i < nr_segs; i++) { compat_uptr_t buf; compat_ssize_t len; unsafe_get_user(len, &uiov[i].iov_len, uaccess_end); unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end); /* check for compat_size_t not fitting in compat_ssize_t .. */ if (len < 0) { ret = -EINVAL; goto uaccess_end; } iov[i].iov_base = compat_ptr(buf); iov[i].iov_len = len; } ret = 0; uaccess_end: user_access_end(); return ret; } static __noclone int copy_iovec_from_user(struct iovec *iov, const struct iovec __user *uiov, unsigned long nr_segs) { int ret = -EFAULT; if (!user_access_begin(uiov, nr_segs * sizeof(*uiov))) return -EFAULT; do { void __user *buf; ssize_t len; unsafe_get_user(len, &uiov->iov_len, uaccess_end); unsafe_get_user(buf, &uiov->iov_base, uaccess_end); /* check for size_t not fitting in ssize_t .. */ if (unlikely(len < 0)) { ret = -EINVAL; goto uaccess_end; } iov->iov_base = buf; iov->iov_len = len; uiov++; iov++; } while (--nr_segs); ret = 0; uaccess_end: user_access_end(); return ret; } struct iovec *iovec_from_user(const struct iovec __user *uvec, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_iov, bool compat) { struct iovec *iov = fast_iov; int ret; /* * SuS says "The readv() function *may* fail if the iovcnt argument was * less than or equal to 0, or greater than {IOV_MAX}. Linux has * traditionally returned zero for zero segments, so... */ if (nr_segs == 0) return iov; if (nr_segs > UIO_MAXIOV) return ERR_PTR(-EINVAL); if (nr_segs > fast_segs) { iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL); if (!iov) return ERR_PTR(-ENOMEM); } if (unlikely(compat)) ret = copy_compat_iovec_from_user(iov, uvec, nr_segs); else ret = copy_iovec_from_user(iov, uvec, nr_segs); if (ret) { if (iov != fast_iov) kfree(iov); return ERR_PTR(ret); } return iov; } /* * Single segment iovec supplied by the user, import it as ITER_UBUF. */ static ssize_t __import_iovec_ubuf(int type, const struct iovec __user *uvec, struct iovec **iovp, struct iov_iter *i, bool compat) { struct iovec *iov = *iovp; ssize_t ret; *iovp = NULL; if (compat) ret = copy_compat_iovec_from_user(iov, uvec, 1); else ret = copy_iovec_from_user(iov, uvec, 1); if (unlikely(ret)) return ret; ret = import_ubuf(type, iov->iov_base, iov->iov_len, i); if (unlikely(ret)) return ret; return i->count; } ssize_t __import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i, bool compat) { ssize_t total_len = 0; unsigned long seg; struct iovec *iov; if (nr_segs == 1) return __import_iovec_ubuf(type, uvec, iovp, i, compat); iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat); if (IS_ERR(iov)) { *iovp = NULL; return PTR_ERR(iov); } /* * According to the Single Unix Specification we should return EINVAL if * an element length is < 0 when cast to ssize_t or if the total length * would overflow the ssize_t return value of the system call. * * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the * overflow case. */ for (seg = 0; seg < nr_segs; seg++) { ssize_t len = (ssize_t)iov[seg].iov_len; if (!access_ok(iov[seg].iov_base, len)) { if (iov != *iovp) kfree(iov); *iovp = NULL; return -EFAULT; } if (len > MAX_RW_COUNT - total_len) { len = MAX_RW_COUNT - total_len; iov[seg].iov_len = len; } total_len += len; } iov_iter_init(i, type, iov, nr_segs, total_len); if (iov == *iovp) *iovp = NULL; else *iovp = iov; return total_len; } /** * import_iovec() - Copy an array of &struct iovec from userspace * into the kernel, check that it is valid, and initialize a new * &struct iov_iter iterator to access it. * * @type: One of %READ or %WRITE. * @uvec: Pointer to the userspace array. * @nr_segs: Number of elements in userspace array. * @fast_segs: Number of elements in @iov. * @iovp: (input and output parameter) Pointer to pointer to (usually small * on-stack) kernel array. * @i: Pointer to iterator that will be initialized on success. * * If the array pointed to by *@iov is large enough to hold all @nr_segs, * then this function places %NULL in *@iov on return. Otherwise, a new * array will be allocated and the result placed in *@iov. This means that * the caller may call kfree() on *@iov regardless of whether the small * on-stack array was used or not (and regardless of whether this function * returns an error or not). * * Return: Negative error code on error, bytes imported on success */ ssize_t import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i) { return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i, in_compat_syscall()); } EXPORT_SYMBOL(import_iovec); int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i) { if (len > MAX_RW_COUNT) len = MAX_RW_COUNT; if (unlikely(!access_ok(buf, len))) return -EFAULT; iov_iter_ubuf(i, rw, buf, len); return 0; } EXPORT_SYMBOL_GPL(import_ubuf); /** * iov_iter_restore() - Restore a &struct iov_iter to the same state as when * iov_iter_save_state() was called. * * @i: &struct iov_iter to restore * @state: state to restore from * * Used after iov_iter_save_state() to bring restore @i, if operations may * have advanced it. * * Note: only works on ITER_IOVEC, ITER_BVEC, and ITER_KVEC */ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state) { if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) && !iter_is_ubuf(i)) && !iov_iter_is_kvec(i)) return; i->iov_offset = state->iov_offset; i->count = state->count; if (iter_is_ubuf(i)) return; /* * For the *vec iters, nr_segs + iov is constant - if we increment * the vec, then we also decrement the nr_segs count. Hence we don't * need to track both of these, just one is enough and we can deduct * the other from that. ITER_KVEC and ITER_IOVEC are the same struct * size, so we can just increment the iov pointer as they are unionzed. * ITER_BVEC _may_ be the same size on some archs, but on others it is * not. Be safe and handle it separately. */ BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec)); if (iov_iter_is_bvec(i)) i->bvec -= state->nr_segs - i->nr_segs; else i->__iov -= state->nr_segs - i->nr_segs; i->nr_segs = state->nr_segs; } /* * Extract a list of contiguous pages from an ITER_FOLIOQ iterator. This does * not get references on the pages, nor does it get a pin on them. */ static ssize_t iov_iter_extract_folioq_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { const struct folio_queue *folioq = i->folioq; struct page **p; unsigned int nr = 0; size_t extracted = 0, offset, slot = i->folioq_slot; if (slot >= folioq_nr_slots(folioq)) { folioq = folioq->next; slot = 0; if (WARN_ON(i->iov_offset != 0)) return -EIO; } offset = i->iov_offset & ~PAGE_MASK; *offset0 = offset; maxpages = want_pages_array(pages, maxsize, offset, maxpages); if (!maxpages) return -ENOMEM; p = *pages; for (;;) { struct folio *folio = folioq_folio(folioq, slot); size_t offset = i->iov_offset, fsize = folioq_folio_size(folioq, slot); size_t part = PAGE_SIZE - offset % PAGE_SIZE; if (offset < fsize) { part = umin(part, umin(maxsize - extracted, fsize - offset)); i->count -= part; i->iov_offset += part; extracted += part; p[nr++] = folio_page(folio, offset / PAGE_SIZE); } if (nr >= maxpages || extracted >= maxsize) break; if (i->iov_offset >= fsize) { i->iov_offset = 0; slot++; if (slot == folioq_nr_slots(folioq) && folioq->next) { folioq = folioq->next; slot = 0; } } } i->folioq = folioq; i->folioq_slot = slot; return extracted; } /* * Extract a list of contiguous pages from an ITER_XARRAY iterator. This does not * get references on the pages, nor does it get a pin on them. */ static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { struct page *page, **p; unsigned int nr = 0, offset; loff_t pos = i->xarray_start + i->iov_offset; pgoff_t index = pos >> PAGE_SHIFT; XA_STATE(xas, i->xarray, index); offset = pos & ~PAGE_MASK; *offset0 = offset; maxpages = want_pages_array(pages, maxsize, offset, maxpages); if (!maxpages) return -ENOMEM; p = *pages; rcu_read_lock(); for (page = xas_load(&xas); page; page = xas_next(&xas)) { if (xas_retry(&xas, page)) continue; /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) { xas_reset(&xas); continue; } p[nr++] = find_subpage(page, xas.xa_index); if (nr == maxpages) break; } rcu_read_unlock(); maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize); iov_iter_advance(i, maxsize); return maxsize; } /* * Extract a list of virtually contiguous pages from an ITER_BVEC iterator. * This does not get references on the pages, nor does it get a pin on them. */ static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { size_t skip = i->iov_offset, size = 0; struct bvec_iter bi; int k = 0; if (i->nr_segs == 0) return 0; if (i->iov_offset == i->bvec->bv_len) { i->iov_offset = 0; i->nr_segs--; i->bvec++; skip = 0; } bi.bi_idx = 0; bi.bi_size = maxsize; bi.bi_bvec_done = skip; maxpages = want_pages_array(pages, maxsize, skip, maxpages); while (bi.bi_size && bi.bi_idx < i->nr_segs) { struct bio_vec bv = bvec_iter_bvec(i->bvec, bi); /* * The iov_iter_extract_pages interface only allows an offset * into the first page. Break out of the loop if we see an * offset into subsequent pages, the caller will have to call * iov_iter_extract_pages again for the reminder. */ if (k) { if (bv.bv_offset) break; } else { *offset0 = bv.bv_offset; } (*pages)[k++] = bv.bv_page; size += bv.bv_len; if (k >= maxpages) break; /* * We are done when the end of the bvec doesn't align to a page * boundary as that would create a hole in the returned space. * The caller will handle this with another call to * iov_iter_extract_pages. */ if (bv.bv_offset + bv.bv_len != PAGE_SIZE) break; bvec_iter_advance_single(i->bvec, &bi, bv.bv_len); } iov_iter_advance(i, size); return size; } /* * Extract a list of virtually contiguous pages from an ITER_KVEC iterator. * This does not get references on the pages, nor does it get a pin on them. */ static ssize_t iov_iter_extract_kvec_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { struct page **p, *page; const void *kaddr; size_t skip = i->iov_offset, offset, len, size; int k; for (;;) { if (i->nr_segs == 0) return 0; size = min(maxsize, i->kvec->iov_len - skip); if (size) break; i->iov_offset = 0; i->nr_segs--; i->kvec++; skip = 0; } kaddr = i->kvec->iov_base + skip; offset = (unsigned long)kaddr & ~PAGE_MASK; *offset0 = offset; maxpages = want_pages_array(pages, size, offset, maxpages); if (!maxpages) return -ENOMEM; p = *pages; kaddr -= offset; len = offset + size; for (k = 0; k < maxpages; k++) { size_t seg = min_t(size_t, len, PAGE_SIZE); if (is_vmalloc_or_module_addr(kaddr)) page = vmalloc_to_page(kaddr); else page = virt_to_page(kaddr); p[k] = page; len -= seg; kaddr += PAGE_SIZE; } size = min_t(size_t, size, maxpages * PAGE_SIZE - offset); iov_iter_advance(i, size); return size; } /* * Extract a list of contiguous pages from a user iterator and get a pin on * each of them. This should only be used if the iterator is user-backed * (IOBUF/UBUF). * * It does not get refs on the pages, but the pages must be unpinned by the * caller once the transfer is complete. * * This is safe to be used where background IO/DMA *is* going to be modifying * the buffer; using a pin rather than a ref makes forces fork() to give the * child a copy of the page. */ static ssize_t iov_iter_extract_user_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { unsigned long addr; unsigned int gup_flags = 0; size_t offset; int res; if (i->data_source == ITER_DEST) gup_flags |= FOLL_WRITE; if (extraction_flags & ITER_ALLOW_P2PDMA) gup_flags |= FOLL_PCI_P2PDMA; if (i->nofault) gup_flags |= FOLL_NOFAULT; addr = first_iovec_segment(i, &maxsize); *offset0 = offset = addr % PAGE_SIZE; addr &= PAGE_MASK; maxpages = want_pages_array(pages, maxsize, offset, maxpages); if (!maxpages) return -ENOMEM; res = pin_user_pages_fast(addr, maxpages, gup_flags, *pages); if (unlikely(res <= 0)) return res; maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - offset); iov_iter_advance(i, maxsize); return maxsize; } /** * iov_iter_extract_pages - Extract a list of contiguous pages from an iterator * @i: The iterator to extract from * @pages: Where to return the list of pages * @maxsize: The maximum amount of iterator to extract * @maxpages: The maximum size of the list of pages * @extraction_flags: Flags to qualify request * @offset0: Where to return the starting offset into (*@pages)[0] * * Extract a list of contiguous pages from the current point of the iterator, * advancing the iterator. The maximum number of pages and the maximum amount * of page contents can be set. * * If *@pages is NULL, a page list will be allocated to the required size and * *@pages will be set to its base. If *@pages is not NULL, it will be assumed * that the caller allocated a page list at least @maxpages in size and this * will be filled in. * * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA * be allowed on the pages extracted. * * The iov_iter_extract_will_pin() function can be used to query how cleanup * should be performed. * * Extra refs or pins on the pages may be obtained as follows: * * (*) If the iterator is user-backed (ITER_IOVEC/ITER_UBUF), pins will be * added to the pages, but refs will not be taken. * iov_iter_extract_will_pin() will return true. * * (*) If the iterator is ITER_KVEC, ITER_BVEC, ITER_FOLIOQ or ITER_XARRAY, the * pages are merely listed; no extra refs or pins are obtained. * iov_iter_extract_will_pin() will return 0. * * Note also: * * (*) Use with ITER_DISCARD is not supported as that has no content. * * On success, the function sets *@pages to the new pagelist, if allocated, and * sets *offset0 to the offset into the first page. * * It may also return -ENOMEM and -EFAULT. */ ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { maxsize = min_t(size_t, min_t(size_t, maxsize, i->count), MAX_RW_COUNT); if (!maxsize) return 0; if (likely(user_backed_iter(i))) return iov_iter_extract_user_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); if (iov_iter_is_kvec(i)) return iov_iter_extract_kvec_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); if (iov_iter_is_bvec(i)) return iov_iter_extract_bvec_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); if (iov_iter_is_folioq(i)) return iov_iter_extract_folioq_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); if (iov_iter_is_xarray(i)) return iov_iter_extract_xarray_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); return -EFAULT; } EXPORT_SYMBOL_GPL(iov_iter_extract_pages);
20 23 23 23 23 8 20 5 16 14 15 2 15 15 8 6 6 3 3 1 5 5 1 30 30 1 29 23 5 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_trace.h" #include "xfs_bmap.h" #include "xfs_trans.h" #include "xfs_error.h" #include "xfs_health.h" /* * Directory file type support functions */ static unsigned char xfs_dir3_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK, DT_WHT, }; unsigned char xfs_dir3_get_dtype( struct xfs_mount *mp, uint8_t filetype) { if (!xfs_has_ftype(mp)) return DT_UNKNOWN; if (filetype >= XFS_DIR3_FT_MAX) return DT_UNKNOWN; return xfs_dir3_filetype_table[filetype]; } STATIC int xfs_dir2_sf_getdents( struct xfs_da_args *args, struct dir_context *ctx) { int i; /* shortform entry number */ struct xfs_inode *dp = args->dp; /* incore directory inode */ struct xfs_mount *mp = dp->i_mount; xfs_dir2_dataptr_t off; /* current entry's offset */ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; xfs_dir2_dataptr_t dot_offset; xfs_dir2_dataptr_t dotdot_offset; xfs_ino_t ino; struct xfs_da_geometry *geo = args->geo; ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); ASSERT(dp->i_df.if_bytes == dp->i_disk_size); ASSERT(sfp != NULL); /* * If the block number in the offset is out of range, we're done. */ if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk) return 0; /* * Precalculate offsets for "." and ".." as we will always need them. * This relies on the fact that directories always start with the * entries for "." and "..". */ dot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, geo->data_entry_offset); dotdot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, geo->data_entry_offset + xfs_dir2_data_entsize(mp, sizeof(".") - 1)); /* * Put . entry unless we're starting past it. */ if (ctx->pos <= dot_offset) { ctx->pos = dot_offset & 0x7fffffff; if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR)) return 0; } /* * Put .. entry unless we're starting past it. */ if (ctx->pos <= dotdot_offset) { ino = xfs_dir2_sf_get_parent_ino(sfp); ctx->pos = dotdot_offset & 0x7fffffff; if (!dir_emit(ctx, "..", 2, ino, DT_DIR)) return 0; } /* * Loop while there are more entries and put'ing works. */ sfep = xfs_dir2_sf_firstentry(sfp); for (i = 0; i < sfp->count; i++) { uint8_t filetype; off = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, xfs_dir2_sf_get_offset(sfep)); if (ctx->pos > off) { sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep); continue; } ino = xfs_dir2_sf_get_ino(mp, sfp, sfep); filetype = xfs_dir2_sf_get_ftype(mp, sfep); ctx->pos = off & 0x7fffffff; if (XFS_IS_CORRUPT(dp->i_mount, !xfs_dir2_namecheck(sfep->name, sfep->namelen))) { xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); return -EFSCORRUPTED; } if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino, xfs_dir3_get_dtype(mp, filetype))) return 0; sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep); } ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) & 0x7fffffff; return 0; } /* * Readdir for block directories. */ STATIC int xfs_dir2_block_getdents( struct xfs_da_args *args, struct dir_context *ctx, unsigned int *lock_mode) { struct xfs_inode *dp = args->dp; /* incore directory inode */ struct xfs_buf *bp; /* buffer for block */ int error; /* error return value */ int wantoff; /* starting block offset */ xfs_off_t cook; struct xfs_da_geometry *geo = args->geo; unsigned int offset, next_offset; unsigned int end; /* * If the block number in the offset is out of range, we're done. */ if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk) return 0; error = xfs_dir3_block_read(args->trans, dp, args->owner, &bp); if (error) return error; xfs_iunlock(dp, *lock_mode); *lock_mode = 0; /* * Extract the byte offset we start at from the seek pointer. * We'll skip entries before this. */ wantoff = xfs_dir2_dataptr_to_off(geo, ctx->pos); xfs_dir3_data_check(dp, bp); /* * Loop over the data portion of the block. * Each object is a real entry (dep) or an unused one (dup). */ end = xfs_dir3_data_end_offset(geo, bp->b_addr); for (offset = geo->data_entry_offset; offset < end; offset = next_offset) { struct xfs_dir2_data_unused *dup = bp->b_addr + offset; struct xfs_dir2_data_entry *dep = bp->b_addr + offset; uint8_t filetype; /* * Unused, skip it. */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { next_offset = offset + be16_to_cpu(dup->length); continue; } /* * Bump pointer for the next iteration. */ next_offset = offset + xfs_dir2_data_entsize(dp->i_mount, dep->namelen); /* * The entry is before the desired starting point, skip it. */ if (offset < wantoff) continue; cook = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, offset); ctx->pos = cook & 0x7fffffff; filetype = xfs_dir2_data_get_ftype(dp->i_mount, dep); /* * If it didn't fit, set the final offset to here & return. */ if (XFS_IS_CORRUPT(dp->i_mount, !xfs_dir2_namecheck(dep->name, dep->namelen))) { xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); error = -EFSCORRUPTED; goto out_rele; } if (!dir_emit(ctx, (char *)dep->name, dep->namelen, be64_to_cpu(dep->inumber), xfs_dir3_get_dtype(dp->i_mount, filetype))) goto out_rele; } /* * Reached the end of the block. * Set the offset to a non-existent block 1 and return. */ ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) & 0x7fffffff; out_rele: xfs_trans_brelse(args->trans, bp); return error; } /* * Read a directory block and initiate readahead for blocks beyond that. * We maintain a sliding readahead window of the remaining space in the * buffer rounded up to the nearest block. */ STATIC int xfs_dir2_leaf_readbuf( struct xfs_da_args *args, size_t bufsize, xfs_dir2_off_t *cur_off, xfs_dablk_t *ra_blk, struct xfs_buf **bpp) { struct xfs_inode *dp = args->dp; struct xfs_buf *bp = NULL; struct xfs_da_geometry *geo = args->geo; struct xfs_ifork *ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK); struct xfs_bmbt_irec map; struct blk_plug plug; xfs_dir2_off_t new_off; xfs_dablk_t next_ra; xfs_dablk_t map_off; xfs_dablk_t last_da; struct xfs_iext_cursor icur; int ra_want; int error = 0; error = xfs_iread_extents(args->trans, dp, XFS_DATA_FORK); if (error) goto out; /* * Look for mapped directory blocks at or above the current offset. * Truncate down to the nearest directory block to start the scanning * operation. */ last_da = xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET); map_off = xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, *cur_off)); if (!xfs_iext_lookup_extent(dp, ifp, map_off, &icur, &map)) goto out; if (map.br_startoff >= last_da) goto out; xfs_trim_extent(&map, map_off, last_da - map_off); /* Read the directory block of that first mapping. */ new_off = xfs_dir2_da_to_byte(geo, map.br_startoff); if (new_off > *cur_off) *cur_off = new_off; error = xfs_dir3_data_read(args->trans, dp, args->owner, map.br_startoff, 0, &bp); if (error) goto out; /* * Start readahead for the next bufsize's worth of dir data blocks. * We may have already issued readahead for some of that range; * ra_blk tracks the last block we tried to read(ahead). */ ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog)); if (*ra_blk >= last_da) goto out; else if (*ra_blk == 0) *ra_blk = map.br_startoff; next_ra = map.br_startoff + geo->fsbcount; if (next_ra >= last_da) goto out_no_ra; if (map.br_blockcount < geo->fsbcount && !xfs_iext_next_extent(ifp, &icur, &map)) goto out_no_ra; if (map.br_startoff >= last_da) goto out_no_ra; xfs_trim_extent(&map, next_ra, last_da - next_ra); /* Start ra for each dir (not fs) block that has a mapping. */ blk_start_plug(&plug); while (ra_want > 0) { next_ra = roundup((xfs_dablk_t)map.br_startoff, geo->fsbcount); while (ra_want > 0 && next_ra < map.br_startoff + map.br_blockcount) { if (next_ra >= last_da) { *ra_blk = last_da; break; } if (next_ra > *ra_blk) { xfs_dir3_data_readahead(dp, next_ra, XFS_DABUF_MAP_HOLE_OK); *ra_blk = next_ra; } ra_want -= geo->fsbcount; next_ra += geo->fsbcount; } if (!xfs_iext_next_extent(ifp, &icur, &map)) { *ra_blk = last_da; break; } } blk_finish_plug(&plug); out: *bpp = bp; return error; out_no_ra: *ra_blk = last_da; goto out; } /* * Getdents (readdir) for leaf and node directories. * This reads the data blocks only, so is the same for both forms. */ STATIC int xfs_dir2_leaf_getdents( struct xfs_da_args *args, struct dir_context *ctx, size_t bufsize, unsigned int *lock_mode) { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; struct xfs_buf *bp = NULL; /* data block buffer */ xfs_dir2_data_entry_t *dep; /* data entry */ xfs_dir2_data_unused_t *dup; /* unused entry */ struct xfs_da_geometry *geo = args->geo; xfs_dablk_t rablk = 0; /* current readahead block */ xfs_dir2_off_t curoff; /* current overall offset */ int length; /* temporary length value */ int byteoff; /* offset in current block */ unsigned int offset = 0; int error = 0; /* error return value */ /* * If the offset is at or past the largest allowed value, * give up right away. */ if (ctx->pos >= XFS_DIR2_MAX_DATAPTR) return 0; /* * Inside the loop we keep the main offset value as a byte offset * in the directory file. */ curoff = xfs_dir2_dataptr_to_byte(ctx->pos); /* * Loop over directory entries until we reach the end offset. * Get more blocks and readahead as necessary. */ while (curoff < XFS_DIR2_LEAF_OFFSET) { uint8_t filetype; /* * If we have no buffer, or we're off the end of the * current buffer, need to get another one. */ if (!bp || offset >= geo->blksize) { if (bp) { xfs_trans_brelse(args->trans, bp); bp = NULL; } if (*lock_mode == 0) *lock_mode = xfs_ilock_data_map_shared(dp); error = xfs_dir2_leaf_readbuf(args, bufsize, &curoff, &rablk, &bp); if (error || !bp) break; xfs_iunlock(dp, *lock_mode); *lock_mode = 0; xfs_dir3_data_check(dp, bp); /* * Find our position in the block. */ offset = geo->data_entry_offset; byteoff = xfs_dir2_byte_to_off(geo, curoff); /* * Skip past the header. */ if (byteoff == 0) curoff += geo->data_entry_offset; /* * Skip past entries until we reach our offset. */ else { while (offset < byteoff) { dup = bp->b_addr + offset; if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { length = be16_to_cpu(dup->length); offset += length; continue; } dep = bp->b_addr + offset; length = xfs_dir2_data_entsize(mp, dep->namelen); offset += length; } /* * Now set our real offset. */ curoff = xfs_dir2_db_off_to_byte(geo, xfs_dir2_byte_to_db(geo, curoff), offset); if (offset >= geo->blksize) continue; } } /* * We have a pointer to an entry. Is it a live one? */ dup = bp->b_addr + offset; /* * No, it's unused, skip over it. */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { length = be16_to_cpu(dup->length); offset += length; curoff += length; continue; } dep = bp->b_addr + offset; length = xfs_dir2_data_entsize(mp, dep->namelen); filetype = xfs_dir2_data_get_ftype(mp, dep); ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff; if (XFS_IS_CORRUPT(dp->i_mount, !xfs_dir2_namecheck(dep->name, dep->namelen))) { xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); error = -EFSCORRUPTED; break; } if (!dir_emit(ctx, (char *)dep->name, dep->namelen, be64_to_cpu(dep->inumber), xfs_dir3_get_dtype(dp->i_mount, filetype))) break; /* * Advance to next entry in the block. */ offset += length; curoff += length; /* bufsize may have just been a guess; don't go negative */ bufsize = bufsize > length ? bufsize - length : 0; } /* * All done. Set output offset value to current offset. */ if (curoff > xfs_dir2_dataptr_to_byte(XFS_DIR2_MAX_DATAPTR)) ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff; else ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff; if (bp) xfs_trans_brelse(args->trans, bp); return error; } /* * Read a directory. * * If supplied, the transaction collects locked dir buffers to avoid * nested buffer deadlocks. This function does not dirty the * transaction. The caller must hold the IOLOCK (shared or exclusive) * before calling this function. */ int xfs_readdir( struct xfs_trans *tp, struct xfs_inode *dp, struct dir_context *ctx, size_t bufsize) { struct xfs_da_args args = { NULL }; unsigned int lock_mode; int error; trace_xfs_readdir(dp); if (xfs_is_shutdown(dp->i_mount)) return -EIO; if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) return -EIO; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); xfs_assert_ilocked(dp, XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL); XFS_STATS_INC(dp->i_mount, xs_dir_getdents); args.dp = dp; args.geo = dp->i_mount->m_dir_geo; args.trans = tp; args.owner = dp->i_ino; if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) return xfs_dir2_sf_getdents(&args, ctx); lock_mode = xfs_ilock_data_map_shared(dp); switch (xfs_dir2_format(&args, &error)) { case XFS_DIR2_FMT_BLOCK: error = xfs_dir2_block_getdents(&args, ctx, &lock_mode); break; case XFS_DIR2_FMT_LEAF: case XFS_DIR2_FMT_NODE: error = xfs_dir2_leaf_getdents(&args, ctx, bufsize, &lock_mode); break; default: break; } if (lock_mode) xfs_iunlock(dp, lock_mode); return error; }
1 1 2 2 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 // SPDX-License-Identifier: GPL-2.0-or-later /* * ALSA sequencer FIFO * Copyright (c) 1998 by Frank van de Pol <fvdpol@coil.demon.nl> */ #include <sound/core.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include "seq_fifo.h" #include "seq_lock.h" /* FIFO */ /* create new fifo */ struct snd_seq_fifo *snd_seq_fifo_new(int poolsize) { struct snd_seq_fifo *f; f = kzalloc(sizeof(*f), GFP_KERNEL); if (!f) return NULL; f->pool = snd_seq_pool_new(poolsize); if (f->pool == NULL) { kfree(f); return NULL; } if (snd_seq_pool_init(f->pool) < 0) { snd_seq_pool_delete(&f->pool); kfree(f); return NULL; } spin_lock_init(&f->lock); snd_use_lock_init(&f->use_lock); init_waitqueue_head(&f->input_sleep); atomic_set(&f->overflow, 0); f->head = NULL; f->tail = NULL; f->cells = 0; return f; } void snd_seq_fifo_delete(struct snd_seq_fifo **fifo) { struct snd_seq_fifo *f; if (snd_BUG_ON(!fifo)) return; f = *fifo; if (snd_BUG_ON(!f)) return; *fifo = NULL; if (f->pool) snd_seq_pool_mark_closing(f->pool); snd_seq_fifo_clear(f); /* wake up clients if any */ if (waitqueue_active(&f->input_sleep)) wake_up(&f->input_sleep); /* release resources...*/ /*....................*/ if (f->pool) { snd_seq_pool_done(f->pool); snd_seq_pool_delete(&f->pool); } kfree(f); } static struct snd_seq_event_cell *fifo_cell_out(struct snd_seq_fifo *f); /* clear queue */ void snd_seq_fifo_clear(struct snd_seq_fifo *f) { struct snd_seq_event_cell *cell; /* clear overflow flag */ atomic_set(&f->overflow, 0); snd_use_lock_sync(&f->use_lock); guard(spinlock_irq)(&f->lock); /* drain the fifo */ while ((cell = fifo_cell_out(f)) != NULL) { snd_seq_cell_free(cell); } } /* enqueue event to fifo */ int snd_seq_fifo_event_in(struct snd_seq_fifo *f, struct snd_seq_event *event) { struct snd_seq_event_cell *cell; int err; if (snd_BUG_ON(!f)) return -EINVAL; snd_use_lock_use(&f->use_lock); err = snd_seq_event_dup(f->pool, event, &cell, 1, NULL, NULL); /* always non-blocking */ if (err < 0) { if ((err == -ENOMEM) || (err == -EAGAIN)) atomic_inc(&f->overflow); snd_use_lock_free(&f->use_lock); return err; } /* append new cells to fifo */ scoped_guard(spinlock_irqsave, &f->lock) { if (f->tail != NULL) f->tail->next = cell; f->tail = cell; if (f->head == NULL) f->head = cell; cell->next = NULL; f->cells++; } /* wakeup client */ if (waitqueue_active(&f->input_sleep)) wake_up(&f->input_sleep); snd_use_lock_free(&f->use_lock); return 0; /* success */ } /* dequeue cell from fifo */ static struct snd_seq_event_cell *fifo_cell_out(struct snd_seq_fifo *f) { struct snd_seq_event_cell *cell; cell = f->head; if (cell) { f->head = cell->next; /* reset tail if this was the last element */ if (f->tail == cell) f->tail = NULL; cell->next = NULL; f->cells--; } return cell; } /* dequeue cell from fifo and copy on user space */ int snd_seq_fifo_cell_out(struct snd_seq_fifo *f, struct snd_seq_event_cell **cellp, int nonblock) { struct snd_seq_event_cell *cell; unsigned long flags; wait_queue_entry_t wait; if (snd_BUG_ON(!f)) return -EINVAL; *cellp = NULL; init_waitqueue_entry(&wait, current); spin_lock_irqsave(&f->lock, flags); while ((cell = fifo_cell_out(f)) == NULL) { if (nonblock) { /* non-blocking - return immediately */ spin_unlock_irqrestore(&f->lock, flags); return -EAGAIN; } set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&f->input_sleep, &wait); spin_unlock_irqrestore(&f->lock, flags); schedule(); spin_lock_irqsave(&f->lock, flags); remove_wait_queue(&f->input_sleep, &wait); if (signal_pending(current)) { spin_unlock_irqrestore(&f->lock, flags); return -ERESTARTSYS; } } spin_unlock_irqrestore(&f->lock, flags); *cellp = cell; return 0; } void snd_seq_fifo_cell_putback(struct snd_seq_fifo *f, struct snd_seq_event_cell *cell) { if (cell) { guard(spinlock_irqsave)(&f->lock); cell->next = f->head; f->head = cell; if (!f->tail) f->tail = cell; f->cells++; } } /* polling; return non-zero if queue is available */ int snd_seq_fifo_poll_wait(struct snd_seq_fifo *f, struct file *file, poll_table *wait) { poll_wait(file, &f->input_sleep, wait); return (f->cells > 0); } /* change the size of pool; all old events are removed */ int snd_seq_fifo_resize(struct snd_seq_fifo *f, int poolsize) { struct snd_seq_pool *newpool, *oldpool; struct snd_seq_event_cell *cell, *next, *oldhead; if (snd_BUG_ON(!f || !f->pool)) return -EINVAL; /* allocate new pool */ newpool = snd_seq_pool_new(poolsize); if (newpool == NULL) return -ENOMEM; if (snd_seq_pool_init(newpool) < 0) { snd_seq_pool_delete(&newpool); return -ENOMEM; } scoped_guard(spinlock_irq, &f->lock) { /* remember old pool */ oldpool = f->pool; oldhead = f->head; /* exchange pools */ f->pool = newpool; f->head = NULL; f->tail = NULL; f->cells = 0; /* NOTE: overflow flag is not cleared */ } /* close the old pool and wait until all users are gone */ snd_seq_pool_mark_closing(oldpool); snd_use_lock_sync(&f->use_lock); /* release cells in old pool */ for (cell = oldhead; cell; cell = next) { next = cell->next; snd_seq_cell_free(cell); } snd_seq_pool_delete(&oldpool); return 0; } /* get the number of unused cells safely */ int snd_seq_fifo_unused_cells(struct snd_seq_fifo *f) { int cells; if (!f) return 0; snd_use_lock_use(&f->use_lock); scoped_guard(spinlock_irqsave, &f->lock) cells = snd_seq_unused_cells(f->pool); snd_use_lock_free(&f->use_lock); return cells; }
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 // SPDX-License-Identifier: GPL-2.0-only /* * Input Multitouch Library * * Copyright (c) 2008-2010 Henrik Rydberg */ #include <linux/input/mt.h> #include <linux/export.h> #include <linux/slab.h> #include "input-core-private.h" #define TRKID_SGN ((TRKID_MAX + 1) >> 1) static void copy_abs(struct input_dev *dev, unsigned int dst, unsigned int src) { if (dev->absinfo && test_bit(src, dev->absbit)) { dev->absinfo[dst] = dev->absinfo[src]; dev->absinfo[dst].fuzz = 0; __set_bit(dst, dev->absbit); } } /** * input_mt_init_slots() - initialize MT input slots * @dev: input device supporting MT events and finger tracking * @num_slots: number of slots used by the device * @flags: mt tasks to handle in core * * This function allocates all necessary memory for MT slot handling * in the input device, prepares the ABS_MT_SLOT and * ABS_MT_TRACKING_ID events for use and sets up appropriate buffers. * Depending on the flags set, it also performs pointer emulation and * frame synchronization. * * May be called repeatedly. Returns -EINVAL if attempting to * reinitialize with a different number of slots. */ int input_mt_init_slots(struct input_dev *dev, unsigned int num_slots, unsigned int flags) { if (!num_slots) return 0; if (dev->mt) return dev->mt->num_slots != num_slots ? -EINVAL : 0; /* Arbitrary limit for avoiding too large memory allocation. */ if (num_slots > 1024) return -EINVAL; struct input_mt *mt __free(kfree) = kzalloc(struct_size(mt, slots, num_slots), GFP_KERNEL); if (!mt) return -ENOMEM; mt->num_slots = num_slots; mt->flags = flags; input_set_abs_params(dev, ABS_MT_SLOT, 0, num_slots - 1, 0, 0); input_set_abs_params(dev, ABS_MT_TRACKING_ID, 0, TRKID_MAX, 0, 0); if (flags & (INPUT_MT_POINTER | INPUT_MT_DIRECT)) { __set_bit(EV_KEY, dev->evbit); __set_bit(BTN_TOUCH, dev->keybit); copy_abs(dev, ABS_X, ABS_MT_POSITION_X); copy_abs(dev, ABS_Y, ABS_MT_POSITION_Y); copy_abs(dev, ABS_PRESSURE, ABS_MT_PRESSURE); } if (flags & INPUT_MT_POINTER) { __set_bit(BTN_TOOL_FINGER, dev->keybit); __set_bit(BTN_TOOL_DOUBLETAP, dev->keybit); if (num_slots >= 3) __set_bit(BTN_TOOL_TRIPLETAP, dev->keybit); if (num_slots >= 4) __set_bit(BTN_TOOL_QUADTAP, dev->keybit); if (num_slots >= 5) __set_bit(BTN_TOOL_QUINTTAP, dev->keybit); __set_bit(INPUT_PROP_POINTER, dev->propbit); } if (flags & INPUT_MT_DIRECT) __set_bit(INPUT_PROP_DIRECT, dev->propbit); if (flags & INPUT_MT_SEMI_MT) __set_bit(INPUT_PROP_SEMI_MT, dev->propbit); if (flags & INPUT_MT_TRACK) { unsigned int n2 = num_slots * num_slots; mt->red = kcalloc(n2, sizeof(*mt->red), GFP_KERNEL); if (!mt->red) return -ENOMEM; } /* Mark slots as 'inactive' */ for (unsigned int i = 0; i < num_slots; i++) input_mt_set_value(&mt->slots[i], ABS_MT_TRACKING_ID, -1); /* Mark slots as 'unused' */ mt->frame = 1; dev->mt = no_free_ptr(mt); return 0; } EXPORT_SYMBOL(input_mt_init_slots); /** * input_mt_destroy_slots() - frees the MT slots of the input device * @dev: input device with allocated MT slots * * This function is only needed in error path as the input core will * automatically free the MT slots when the device is destroyed. */ void input_mt_destroy_slots(struct input_dev *dev) { if (dev->mt) { kfree(dev->mt->red); kfree(dev->mt); } dev->mt = NULL; } EXPORT_SYMBOL(input_mt_destroy_slots); /** * input_mt_report_slot_state() - report contact state * @dev: input device with allocated MT slots * @tool_type: the tool type to use in this slot * @active: true if contact is active, false otherwise * * Reports a contact via ABS_MT_TRACKING_ID, and optionally * ABS_MT_TOOL_TYPE. If active is true and the slot is currently * inactive, or if the tool type is changed, a new tracking id is * assigned to the slot. The tool type is only reported if the * corresponding absbit field is set. * * Returns true if contact is active. */ bool input_mt_report_slot_state(struct input_dev *dev, unsigned int tool_type, bool active) { struct input_mt *mt = dev->mt; struct input_mt_slot *slot; int id; if (!mt) return false; slot = &mt->slots[mt->slot]; slot->frame = mt->frame; if (!active) { input_event(dev, EV_ABS, ABS_MT_TRACKING_ID, -1); return false; } id = input_mt_get_value(slot, ABS_MT_TRACKING_ID); if (id < 0) id = input_mt_new_trkid(mt); input_event(dev, EV_ABS, ABS_MT_TRACKING_ID, id); input_event(dev, EV_ABS, ABS_MT_TOOL_TYPE, tool_type); return true; } EXPORT_SYMBOL(input_mt_report_slot_state); /** * input_mt_report_finger_count() - report contact count * @dev: input device with allocated MT slots * @count: the number of contacts * * Reports the contact count via BTN_TOOL_FINGER, BTN_TOOL_DOUBLETAP, * BTN_TOOL_TRIPLETAP and BTN_TOOL_QUADTAP. * * The input core ensures only the KEY events already setup for * this device will produce output. */ void input_mt_report_finger_count(struct input_dev *dev, int count) { input_event(dev, EV_KEY, BTN_TOOL_FINGER, count == 1); input_event(dev, EV_KEY, BTN_TOOL_DOUBLETAP, count == 2); input_event(dev, EV_KEY, BTN_TOOL_TRIPLETAP, count == 3); input_event(dev, EV_KEY, BTN_TOOL_QUADTAP, count == 4); input_event(dev, EV_KEY, BTN_TOOL_QUINTTAP, count == 5); } EXPORT_SYMBOL(input_mt_report_finger_count); /** * input_mt_report_pointer_emulation() - common pointer emulation * @dev: input device with allocated MT slots * @use_count: report number of active contacts as finger count * * Performs legacy pointer emulation via BTN_TOUCH, ABS_X, ABS_Y and * ABS_PRESSURE. Touchpad finger count is emulated if use_count is true. * * The input core ensures only the KEY and ABS axes already setup for * this device will produce output. */ void input_mt_report_pointer_emulation(struct input_dev *dev, bool use_count) { struct input_mt *mt = dev->mt; struct input_mt_slot *oldest; int oldid, count, i; if (!mt) return; oldest = NULL; oldid = mt->trkid; count = 0; for (i = 0; i < mt->num_slots; ++i) { struct input_mt_slot *ps = &mt->slots[i]; int id = input_mt_get_value(ps, ABS_MT_TRACKING_ID); if (id < 0) continue; if ((id - oldid) & TRKID_SGN) { oldest = ps; oldid = id; } count++; } input_event(dev, EV_KEY, BTN_TOUCH, count > 0); if (use_count) { if (count == 0 && !test_bit(ABS_MT_DISTANCE, dev->absbit) && test_bit(ABS_DISTANCE, dev->absbit) && input_abs_get_val(dev, ABS_DISTANCE) != 0) { /* * Force reporting BTN_TOOL_FINGER for devices that * only report general hover (and not per-contact * distance) when contact is in proximity but not * on the surface. */ count = 1; } input_mt_report_finger_count(dev, count); } if (oldest) { int x = input_mt_get_value(oldest, ABS_MT_POSITION_X); int y = input_mt_get_value(oldest, ABS_MT_POSITION_Y); input_event(dev, EV_ABS, ABS_X, x); input_event(dev, EV_ABS, ABS_Y, y); if (test_bit(ABS_MT_PRESSURE, dev->absbit)) { int p = input_mt_get_value(oldest, ABS_MT_PRESSURE); input_event(dev, EV_ABS, ABS_PRESSURE, p); } } else { if (test_bit(ABS_MT_PRESSURE, dev->absbit)) input_event(dev, EV_ABS, ABS_PRESSURE, 0); } } EXPORT_SYMBOL(input_mt_report_pointer_emulation); static void __input_mt_drop_unused(struct input_dev *dev, struct input_mt *mt) { int i; lockdep_assert_held(&dev->event_lock); for (i = 0; i < mt->num_slots; i++) { if (input_mt_is_active(&mt->slots[i]) && !input_mt_is_used(mt, &mt->slots[i])) { input_handle_event(dev, EV_ABS, ABS_MT_SLOT, i); input_handle_event(dev, EV_ABS, ABS_MT_TRACKING_ID, -1); } } } /** * input_mt_drop_unused() - Inactivate slots not seen in this frame * @dev: input device with allocated MT slots * * Lift all slots not seen since the last call to this function. */ void input_mt_drop_unused(struct input_dev *dev) { struct input_mt *mt = dev->mt; if (mt) { guard(spinlock_irqsave)(&dev->event_lock); __input_mt_drop_unused(dev, mt); mt->frame++; } } EXPORT_SYMBOL(input_mt_drop_unused); /** * input_mt_release_slots() - Deactivate all slots * @dev: input device with allocated MT slots * * Lift all active slots. */ void input_mt_release_slots(struct input_dev *dev) { struct input_mt *mt = dev->mt; lockdep_assert_held(&dev->event_lock); if (mt) { /* This will effectively mark all slots unused. */ mt->frame++; __input_mt_drop_unused(dev, mt); if (test_bit(ABS_PRESSURE, dev->absbit)) input_handle_event(dev, EV_ABS, ABS_PRESSURE, 0); mt->frame++; } } /** * input_mt_sync_frame() - synchronize mt frame * @dev: input device with allocated MT slots * * Close the frame and prepare the internal state for a new one. * Depending on the flags, marks unused slots as inactive and performs * pointer emulation. */ void input_mt_sync_frame(struct input_dev *dev) { struct input_mt *mt = dev->mt; bool use_count = false; if (!mt) return; if (mt->flags & INPUT_MT_DROP_UNUSED) { guard(spinlock_irqsave)(&dev->event_lock); __input_mt_drop_unused(dev, mt); } if ((mt->flags & INPUT_MT_POINTER) && !(mt->flags & INPUT_MT_SEMI_MT)) use_count = true; input_mt_report_pointer_emulation(dev, use_count); mt->frame++; } EXPORT_SYMBOL(input_mt_sync_frame); static int adjust_dual(int *begin, int step, int *end, int eq, int mu) { int f, *p, s, c; if (begin == end) return 0; f = *begin; p = begin + step; s = p == end ? f + 1 : *p; for (; p != end; p += step) { if (*p < f) { s = f; f = *p; } else if (*p < s) { s = *p; } } c = (f + s + 1) / 2; if (c == 0 || (c > mu && (!eq || mu > 0))) return 0; /* Improve convergence for positive matrices by penalizing overcovers */ if (s < 0 && mu <= 0) c *= 2; for (p = begin; p != end; p += step) *p -= c; return (c < s && s <= 0) || (f >= 0 && f < c); } static void find_reduced_matrix(int *w, int nr, int nc, int nrc, int mu) { int i, k, sum; for (k = 0; k < nrc; k++) { for (i = 0; i < nr; i++) adjust_dual(w + i, nr, w + i + nrc, nr <= nc, mu); sum = 0; for (i = 0; i < nrc; i += nr) sum += adjust_dual(w + i, 1, w + i + nr, nc <= nr, mu); if (!sum) break; } } static int input_mt_set_matrix(struct input_mt *mt, const struct input_mt_pos *pos, int num_pos, int mu) { const struct input_mt_pos *p; struct input_mt_slot *s; int *w = mt->red; int x, y; for (s = mt->slots; s != mt->slots + mt->num_slots; s++) { if (!input_mt_is_active(s)) continue; x = input_mt_get_value(s, ABS_MT_POSITION_X); y = input_mt_get_value(s, ABS_MT_POSITION_Y); for (p = pos; p != pos + num_pos; p++) { int dx = x - p->x, dy = y - p->y; *w++ = dx * dx + dy * dy - mu; } } return w - mt->red; } static void input_mt_set_slots(struct input_mt *mt, int *slots, int num_pos) { struct input_mt_slot *s; int *w = mt->red, j; for (j = 0; j != num_pos; j++) slots[j] = -1; for (s = mt->slots; s != mt->slots + mt->num_slots; s++) { if (!input_mt_is_active(s)) continue; for (j = 0; j != num_pos; j++) { if (w[j] < 0) { slots[j] = s - mt->slots; break; } } w += num_pos; } for (s = mt->slots; s != mt->slots + mt->num_slots; s++) { if (input_mt_is_active(s)) continue; for (j = 0; j != num_pos; j++) { if (slots[j] < 0) { slots[j] = s - mt->slots; break; } } } } /** * input_mt_assign_slots() - perform a best-match assignment * @dev: input device with allocated MT slots * @slots: the slot assignment to be filled * @pos: the position array to match * @num_pos: number of positions * @dmax: maximum ABS_MT_POSITION displacement (zero for infinite) * * Performs a best match against the current contacts and returns * the slot assignment list. New contacts are assigned to unused * slots. * * The assignments are balanced so that all coordinate displacements are * below the euclidian distance dmax. If no such assignment can be found, * some contacts are assigned to unused slots. * * Returns zero on success, or negative error in case of failure. */ int input_mt_assign_slots(struct input_dev *dev, int *slots, const struct input_mt_pos *pos, int num_pos, int dmax) { struct input_mt *mt = dev->mt; int mu = 2 * dmax * dmax; int nrc; if (!mt || !mt->red) return -ENXIO; if (num_pos > mt->num_slots) return -EINVAL; if (num_pos < 1) return 0; nrc = input_mt_set_matrix(mt, pos, num_pos, mu); find_reduced_matrix(mt->red, num_pos, nrc / num_pos, nrc, mu); input_mt_set_slots(mt, slots, num_pos); return 0; } EXPORT_SYMBOL(input_mt_assign_slots); /** * input_mt_get_slot_by_key() - return slot matching key * @dev: input device with allocated MT slots * @key: the key of the sought slot * * Returns the slot of the given key, if it exists, otherwise * set the key on the first unused slot and return. * * If no available slot can be found, -1 is returned. * Note that for this function to work properly, input_mt_sync_frame() has * to be called at each frame. */ int input_mt_get_slot_by_key(struct input_dev *dev, int key) { struct input_mt *mt = dev->mt; struct input_mt_slot *s; if (!mt) return -1; for (s = mt->slots; s != mt->slots + mt->num_slots; s++) if (input_mt_is_active(s) && s->key == key) return s - mt->slots; for (s = mt->slots; s != mt->slots + mt->num_slots; s++) if (!input_mt_is_active(s) && !input_mt_is_used(mt, s)) { s->key = key; return s - mt->slots; } return -1; } EXPORT_SYMBOL(input_mt_get_slot_by_key);
3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 5 1 1 8 7 2 5 1 5 5 1 5 5 3 3 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 // SPDX-License-Identifier: GPL-2.0-only /* * Input driver to ExplorerPS/2 device driver module. * * Copyright (c) 1999-2002 Vojtech Pavlik * Copyright (c) 2004 Dmitry Torokhov */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define MOUSEDEV_MINOR_BASE 32 #define MOUSEDEV_MINORS 31 #define MOUSEDEV_MIX 63 #include <linux/bitops.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/module.h> #include <linux/init.h> #include <linux/input.h> #include <linux/random.h> #include <linux/major.h> #include <linux/device.h> #include <linux/cdev.h> #include <linux/kernel.h> MODULE_AUTHOR("Vojtech Pavlik <vojtech@ucw.cz>"); MODULE_DESCRIPTION("Mouse (ExplorerPS/2) device interfaces"); MODULE_LICENSE("GPL"); #ifndef CONFIG_INPUT_MOUSEDEV_SCREEN_X #define CONFIG_INPUT_MOUSEDEV_SCREEN_X 1024 #endif #ifndef CONFIG_INPUT_MOUSEDEV_SCREEN_Y #define CONFIG_INPUT_MOUSEDEV_SCREEN_Y 768 #endif static int xres = CONFIG_INPUT_MOUSEDEV_SCREEN_X; module_param(xres, uint, 0644); MODULE_PARM_DESC(xres, "Horizontal screen resolution"); static int yres = CONFIG_INPUT_MOUSEDEV_SCREEN_Y; module_param(yres, uint, 0644); MODULE_PARM_DESC(yres, "Vertical screen resolution"); static unsigned tap_time = 200; module_param(tap_time, uint, 0644); MODULE_PARM_DESC(tap_time, "Tap time for touchpads in absolute mode (msecs)"); struct mousedev_hw_data { int dx, dy, dz; int x, y; int abs_event; unsigned long buttons; }; struct mousedev { int open; struct input_handle handle; wait_queue_head_t wait; struct list_head client_list; spinlock_t client_lock; /* protects client_list */ struct mutex mutex; struct device dev; struct cdev cdev; bool exist; struct list_head mixdev_node; bool opened_by_mixdev; struct mousedev_hw_data packet; unsigned int pkt_count; int old_x[4], old_y[4]; int frac_dx, frac_dy; unsigned long touch; int (*open_device)(struct mousedev *mousedev); void (*close_device)(struct mousedev *mousedev); }; enum mousedev_emul { MOUSEDEV_EMUL_PS2, MOUSEDEV_EMUL_IMPS, MOUSEDEV_EMUL_EXPS }; struct mousedev_motion { int dx, dy, dz; unsigned long buttons; }; #define PACKET_QUEUE_LEN 16 struct mousedev_client { struct fasync_struct *fasync; struct mousedev *mousedev; struct list_head node; struct mousedev_motion packets[PACKET_QUEUE_LEN]; unsigned int head, tail; spinlock_t packet_lock; int pos_x, pos_y; u8 ps2[6]; unsigned char ready, buffer, bufsiz; unsigned char imexseq, impsseq; enum mousedev_emul mode; unsigned long last_buttons; }; #define MOUSEDEV_SEQ_LEN 6 static unsigned char mousedev_imps_seq[] = { 0xf3, 200, 0xf3, 100, 0xf3, 80 }; static unsigned char mousedev_imex_seq[] = { 0xf3, 200, 0xf3, 200, 0xf3, 80 }; static struct mousedev *mousedev_mix; static LIST_HEAD(mousedev_mix_list); #define fx(i) (mousedev->old_x[(mousedev->pkt_count - (i)) & 03]) #define fy(i) (mousedev->old_y[(mousedev->pkt_count - (i)) & 03]) static void mousedev_touchpad_event(struct input_dev *dev, struct mousedev *mousedev, unsigned int code, int value) { int size, tmp; enum { FRACTION_DENOM = 128 }; switch (code) { case ABS_X: fx(0) = value; if (mousedev->touch && mousedev->pkt_count >= 2) { size = input_abs_get_max(dev, ABS_X) - input_abs_get_min(dev, ABS_X); if (size == 0) size = 256 * 2; tmp = ((value - fx(2)) * 256 * FRACTION_DENOM) / size; tmp += mousedev->frac_dx; mousedev->packet.dx = tmp / FRACTION_DENOM; mousedev->frac_dx = tmp - mousedev->packet.dx * FRACTION_DENOM; } break; case ABS_Y: fy(0) = value; if (mousedev->touch && mousedev->pkt_count >= 2) { /* use X size for ABS_Y to keep the same scale */ size = input_abs_get_max(dev, ABS_X) - input_abs_get_min(dev, ABS_X); if (size == 0) size = 256 * 2; tmp = -((value - fy(2)) * 256 * FRACTION_DENOM) / size; tmp += mousedev->frac_dy; mousedev->packet.dy = tmp / FRACTION_DENOM; mousedev->frac_dy = tmp - mousedev->packet.dy * FRACTION_DENOM; } break; } } static void mousedev_abs_event(struct input_dev *dev, struct mousedev *mousedev, unsigned int code, int value) { int min, max, size; switch (code) { case ABS_X: min = input_abs_get_min(dev, ABS_X); max = input_abs_get_max(dev, ABS_X); size = max - min; if (size == 0) size = xres ? : 1; value = clamp(value, min, max); mousedev->packet.x = ((value - min) * xres) / size; mousedev->packet.abs_event = 1; break; case ABS_Y: min = input_abs_get_min(dev, ABS_Y); max = input_abs_get_max(dev, ABS_Y); size = max - min; if (size == 0) size = yres ? : 1; value = clamp(value, min, max); mousedev->packet.y = yres - ((value - min) * yres) / size; mousedev->packet.abs_event = 1; break; } } static void mousedev_rel_event(struct mousedev *mousedev, unsigned int code, int value) { switch (code) { case REL_X: mousedev->packet.dx += value; break; case REL_Y: mousedev->packet.dy -= value; break; case REL_WHEEL: mousedev->packet.dz -= value; break; } } static void mousedev_key_event(struct mousedev *mousedev, unsigned int code, int value) { int index; switch (code) { case BTN_TOUCH: case BTN_0: case BTN_LEFT: index = 0; break; case BTN_STYLUS: case BTN_1: case BTN_RIGHT: index = 1; break; case BTN_2: case BTN_FORWARD: case BTN_STYLUS2: case BTN_MIDDLE: index = 2; break; case BTN_3: case BTN_BACK: case BTN_SIDE: index = 3; break; case BTN_4: case BTN_EXTRA: index = 4; break; default: return; } if (value) { set_bit(index, &mousedev->packet.buttons); set_bit(index, &mousedev_mix->packet.buttons); } else { clear_bit(index, &mousedev->packet.buttons); clear_bit(index, &mousedev_mix->packet.buttons); } } static void mousedev_notify_readers(struct mousedev *mousedev, struct mousedev_hw_data *packet) { struct mousedev_client *client; struct mousedev_motion *p; unsigned int new_head; int wake_readers = 0; rcu_read_lock(); list_for_each_entry_rcu(client, &mousedev->client_list, node) { /* Just acquire the lock, interrupts already disabled */ spin_lock(&client->packet_lock); p = &client->packets[client->head]; if (client->ready && p->buttons != mousedev->packet.buttons) { new_head = (client->head + 1) % PACKET_QUEUE_LEN; if (new_head != client->tail) { p = &client->packets[client->head = new_head]; memset(p, 0, sizeof(struct mousedev_motion)); } } if (packet->abs_event) { p->dx += packet->x - client->pos_x; p->dy += packet->y - client->pos_y; client->pos_x = packet->x; client->pos_y = packet->y; } client->pos_x += packet->dx; client->pos_x = clamp_val(client->pos_x, 0, xres); client->pos_y += packet->dy; client->pos_y = clamp_val(client->pos_y, 0, yres); p->dx += packet->dx; p->dy += packet->dy; p->dz += packet->dz; p->buttons = mousedev->packet.buttons; if (p->dx || p->dy || p->dz || p->buttons != client->last_buttons) client->ready = 1; spin_unlock(&client->packet_lock); if (client->ready) { kill_fasync(&client->fasync, SIGIO, POLL_IN); wake_readers = 1; } } rcu_read_unlock(); if (wake_readers) wake_up_interruptible(&mousedev->wait); } static void mousedev_touchpad_touch(struct mousedev *mousedev, int value) { if (!value) { if (mousedev->touch && time_before(jiffies, mousedev->touch + msecs_to_jiffies(tap_time))) { /* * Toggle left button to emulate tap. * We rely on the fact that mousedev_mix always has 0 * motion packet so we won't mess current position. */ set_bit(0, &mousedev->packet.buttons); set_bit(0, &mousedev_mix->packet.buttons); mousedev_notify_readers(mousedev, &mousedev_mix->packet); mousedev_notify_readers(mousedev_mix, &mousedev_mix->packet); clear_bit(0, &mousedev->packet.buttons); clear_bit(0, &mousedev_mix->packet.buttons); } mousedev->touch = mousedev->pkt_count = 0; mousedev->frac_dx = 0; mousedev->frac_dy = 0; } else if (!mousedev->touch) mousedev->touch = jiffies; } static void mousedev_event(struct input_handle *handle, unsigned int type, unsigned int code, int value) { struct mousedev *mousedev = handle->private; switch (type) { case EV_ABS: /* Ignore joysticks */ if (test_bit(BTN_TRIGGER, handle->dev->keybit)) return; if (test_bit(BTN_TOOL_FINGER, handle->dev->keybit)) mousedev_touchpad_event(handle->dev, mousedev, code, value); else mousedev_abs_event(handle->dev, mousedev, code, value); break; case EV_REL: mousedev_rel_event(mousedev, code, value); break; case EV_KEY: if (value != 2) { if (code == BTN_TOUCH && test_bit(BTN_TOOL_FINGER, handle->dev->keybit)) mousedev_touchpad_touch(mousedev, value); else mousedev_key_event(mousedev, code, value); } break; case EV_SYN: if (code == SYN_REPORT) { if (mousedev->touch) { mousedev->pkt_count++; /* * Input system eats duplicate events, * but we need all of them to do correct * averaging so apply present one forward */ fx(0) = fx(1); fy(0) = fy(1); } mousedev_notify_readers(mousedev, &mousedev->packet); mousedev_notify_readers(mousedev_mix, &mousedev->packet); mousedev->packet.dx = mousedev->packet.dy = mousedev->packet.dz = 0; mousedev->packet.abs_event = 0; } break; } } static int mousedev_fasync(int fd, struct file *file, int on) { struct mousedev_client *client = file->private_data; return fasync_helper(fd, file, on, &client->fasync); } static void mousedev_free(struct device *dev) { struct mousedev *mousedev = container_of(dev, struct mousedev, dev); input_put_device(mousedev->handle.dev); kfree(mousedev); } static int mousedev_open_device(struct mousedev *mousedev) { int retval; retval = mutex_lock_interruptible(&mousedev->mutex); if (retval) return retval; if (!mousedev->exist) retval = -ENODEV; else if (!mousedev->open++) { retval = input_open_device(&mousedev->handle); if (retval) mousedev->open--; } mutex_unlock(&mousedev->mutex); return retval; } static void mousedev_close_device(struct mousedev *mousedev) { mutex_lock(&mousedev->mutex); if (mousedev->exist && !--mousedev->open) input_close_device(&mousedev->handle); mutex_unlock(&mousedev->mutex); } /* * Open all available devices so they can all be multiplexed in one. * stream. Note that this function is called with mousedev_mix->mutex * held. */ static int mixdev_open_devices(struct mousedev *mixdev) { int error; error = mutex_lock_interruptible(&mixdev->mutex); if (error) return error; if (!mixdev->open++) { struct mousedev *mousedev; list_for_each_entry(mousedev, &mousedev_mix_list, mixdev_node) { if (!mousedev->opened_by_mixdev) { if (mousedev_open_device(mousedev)) continue; mousedev->opened_by_mixdev = true; } } } mutex_unlock(&mixdev->mutex); return 0; } /* * Close all devices that were opened as part of multiplexed * device. Note that this function is called with mousedev_mix->mutex * held. */ static void mixdev_close_devices(struct mousedev *mixdev) { mutex_lock(&mixdev->mutex); if (!--mixdev->open) { struct mousedev *mousedev; list_for_each_entry(mousedev, &mousedev_mix_list, mixdev_node) { if (mousedev->opened_by_mixdev) { mousedev->opened_by_mixdev = false; mousedev_close_device(mousedev); } } } mutex_unlock(&mixdev->mutex); } static void mousedev_attach_client(struct mousedev *mousedev, struct mousedev_client *client) { spin_lock(&mousedev->client_lock); list_add_tail_rcu(&client->node, &mousedev->client_list); spin_unlock(&mousedev->client_lock); } static void mousedev_detach_client(struct mousedev *mousedev, struct mousedev_client *client) { spin_lock(&mousedev->client_lock); list_del_rcu(&client->node); spin_unlock(&mousedev->client_lock); synchronize_rcu(); } static int mousedev_release(struct inode *inode, struct file *file) { struct mousedev_client *client = file->private_data; struct mousedev *mousedev = client->mousedev; mousedev_detach_client(mousedev, client); kfree(client); mousedev->close_device(mousedev); return 0; } static int mousedev_open(struct inode *inode, struct file *file) { struct mousedev_client *client; struct mousedev *mousedev; int error; #ifdef CONFIG_INPUT_MOUSEDEV_PSAUX if (imajor(inode) == MISC_MAJOR) mousedev = mousedev_mix; else #endif mousedev = container_of(inode->i_cdev, struct mousedev, cdev); client = kzalloc(sizeof(struct mousedev_client), GFP_KERNEL); if (!client) return -ENOMEM; spin_lock_init(&client->packet_lock); client->pos_x = xres / 2; client->pos_y = yres / 2; client->mousedev = mousedev; mousedev_attach_client(mousedev, client); error = mousedev->open_device(mousedev); if (error) goto err_free_client; file->private_data = client; stream_open(inode, file); return 0; err_free_client: mousedev_detach_client(mousedev, client); kfree(client); return error; } static void mousedev_packet(struct mousedev_client *client, u8 *ps2_data) { struct mousedev_motion *p = &client->packets[client->tail]; s8 dx, dy, dz; dx = clamp_val(p->dx, -127, 127); p->dx -= dx; dy = clamp_val(p->dy, -127, 127); p->dy -= dy; ps2_data[0] = BIT(3); ps2_data[0] |= ((dx & BIT(7)) >> 3) | ((dy & BIT(7)) >> 2); ps2_data[0] |= p->buttons & 0x07; ps2_data[1] = dx; ps2_data[2] = dy; switch (client->mode) { case MOUSEDEV_EMUL_EXPS: dz = clamp_val(p->dz, -7, 7); p->dz -= dz; ps2_data[3] = (dz & 0x0f) | ((p->buttons & 0x18) << 1); client->bufsiz = 4; break; case MOUSEDEV_EMUL_IMPS: dz = clamp_val(p->dz, -127, 127); p->dz -= dz; ps2_data[0] |= ((p->buttons & 0x10) >> 3) | ((p->buttons & 0x08) >> 1); ps2_data[3] = dz; client->bufsiz = 4; break; case MOUSEDEV_EMUL_PS2: default: p->dz = 0; ps2_data[0] |= ((p->buttons & 0x10) >> 3) | ((p->buttons & 0x08) >> 1); client->bufsiz = 3; break; } if (!p->dx && !p->dy && !p->dz) { if (client->tail == client->head) { client->ready = 0; client->last_buttons = p->buttons; } else client->tail = (client->tail + 1) % PACKET_QUEUE_LEN; } } static void mousedev_generate_response(struct mousedev_client *client, int command) { client->ps2[0] = 0xfa; /* ACK */ switch (command) { case 0xeb: /* Poll */ mousedev_packet(client, &client->ps2[1]); client->bufsiz++; /* account for leading ACK */ break; case 0xf2: /* Get ID */ switch (client->mode) { case MOUSEDEV_EMUL_PS2: client->ps2[1] = 0; break; case MOUSEDEV_EMUL_IMPS: client->ps2[1] = 3; break; case MOUSEDEV_EMUL_EXPS: client->ps2[1] = 4; break; } client->bufsiz = 2; break; case 0xe9: /* Get info */ client->ps2[1] = 0x60; client->ps2[2] = 3; client->ps2[3] = 200; client->bufsiz = 4; break; case 0xff: /* Reset */ client->impsseq = client->imexseq = 0; client->mode = MOUSEDEV_EMUL_PS2; client->ps2[1] = 0xaa; client->ps2[2] = 0x00; client->bufsiz = 3; break; default: client->bufsiz = 1; break; } client->buffer = client->bufsiz; } static ssize_t mousedev_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { struct mousedev_client *client = file->private_data; unsigned char c; unsigned int i; for (i = 0; i < count; i++) { if (get_user(c, buffer + i)) return -EFAULT; spin_lock_irq(&client->packet_lock); if (c == mousedev_imex_seq[client->imexseq]) { if (++client->imexseq == MOUSEDEV_SEQ_LEN) { client->imexseq = 0; client->mode = MOUSEDEV_EMUL_EXPS; } } else client->imexseq = 0; if (c == mousedev_imps_seq[client->impsseq]) { if (++client->impsseq == MOUSEDEV_SEQ_LEN) { client->impsseq = 0; client->mode = MOUSEDEV_EMUL_IMPS; } } else client->impsseq = 0; mousedev_generate_response(client, c); spin_unlock_irq(&client->packet_lock); cond_resched(); } kill_fasync(&client->fasync, SIGIO, POLL_IN); wake_up_interruptible(&client->mousedev->wait); return count; } static ssize_t mousedev_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) { struct mousedev_client *client = file->private_data; struct mousedev *mousedev = client->mousedev; u8 data[sizeof(client->ps2)]; int retval = 0; if (!client->ready && !client->buffer && mousedev->exist && (file->f_flags & O_NONBLOCK)) return -EAGAIN; retval = wait_event_interruptible(mousedev->wait, !mousedev->exist || client->ready || client->buffer); if (retval) return retval; if (!mousedev->exist) return -ENODEV; spin_lock_irq(&client->packet_lock); if (!client->buffer && client->ready) { mousedev_packet(client, client->ps2); client->buffer = client->bufsiz; } if (count > client->buffer) count = client->buffer; memcpy(data, client->ps2 + client->bufsiz - client->buffer, count); client->buffer -= count; spin_unlock_irq(&client->packet_lock); if (copy_to_user(buffer, data, count)) return -EFAULT; return count; } /* No kernel lock - fine */ static __poll_t mousedev_poll(struct file *file, poll_table *wait) { struct mousedev_client *client = file->private_data; struct mousedev *mousedev = client->mousedev; __poll_t mask; poll_wait(file, &mousedev->wait, wait); mask = mousedev->exist ? EPOLLOUT | EPOLLWRNORM : EPOLLHUP | EPOLLERR; if (client->ready || client->buffer) mask |= EPOLLIN | EPOLLRDNORM; return mask; } static const struct file_operations mousedev_fops = { .owner = THIS_MODULE, .read = mousedev_read, .write = mousedev_write, .poll = mousedev_poll, .open = mousedev_open, .release = mousedev_release, .fasync = mousedev_fasync, .llseek = noop_llseek, }; /* * Mark device non-existent. This disables writes, ioctls and * prevents new users from opening the device. Already posted * blocking reads will stay, however new ones will fail. */ static void mousedev_mark_dead(struct mousedev *mousedev) { mutex_lock(&mousedev->mutex); mousedev->exist = false; mutex_unlock(&mousedev->mutex); } /* * Wake up users waiting for IO so they can disconnect from * dead device. */ static void mousedev_hangup(struct mousedev *mousedev) { struct mousedev_client *client; spin_lock(&mousedev->client_lock); list_for_each_entry(client, &mousedev->client_list, node) kill_fasync(&client->fasync, SIGIO, POLL_HUP); spin_unlock(&mousedev->client_lock); wake_up_interruptible(&mousedev->wait); } static void mousedev_cleanup(struct mousedev *mousedev) { struct input_handle *handle = &mousedev->handle; mousedev_mark_dead(mousedev); mousedev_hangup(mousedev); /* mousedev is marked dead so no one else accesses mousedev->open */ if (mousedev->open) input_close_device(handle); } static int mousedev_reserve_minor(bool mixdev) { int minor; if (mixdev) { minor = input_get_new_minor(MOUSEDEV_MIX, 1, false); if (minor < 0) pr_err("failed to reserve mixdev minor: %d\n", minor); } else { minor = input_get_new_minor(MOUSEDEV_MINOR_BASE, MOUSEDEV_MINORS, true); if (minor < 0) pr_err("failed to reserve new minor: %d\n", minor); } return minor; } static struct mousedev *mousedev_create(struct input_dev *dev, struct input_handler *handler, bool mixdev) { struct mousedev *mousedev; int minor; int error; minor = mousedev_reserve_minor(mixdev); if (minor < 0) { error = minor; goto err_out; } mousedev = kzalloc(sizeof(struct mousedev), GFP_KERNEL); if (!mousedev) { error = -ENOMEM; goto err_free_minor; } INIT_LIST_HEAD(&mousedev->client_list); INIT_LIST_HEAD(&mousedev->mixdev_node); spin_lock_init(&mousedev->client_lock); mutex_init(&mousedev->mutex); lockdep_set_subclass(&mousedev->mutex, mixdev ? SINGLE_DEPTH_NESTING : 0); init_waitqueue_head(&mousedev->wait); if (mixdev) { dev_set_name(&mousedev->dev, "mice"); mousedev->open_device = mixdev_open_devices; mousedev->close_device = mixdev_close_devices; } else { int dev_no = minor; /* Normalize device number if it falls into legacy range */ if (dev_no < MOUSEDEV_MINOR_BASE + MOUSEDEV_MINORS) dev_no -= MOUSEDEV_MINOR_BASE; dev_set_name(&mousedev->dev, "mouse%d", dev_no); mousedev->open_device = mousedev_open_device; mousedev->close_device = mousedev_close_device; } mousedev->exist = true; mousedev->handle.dev = input_get_device(dev); mousedev->handle.name = dev_name(&mousedev->dev); mousedev->handle.handler = handler; mousedev->handle.private = mousedev; mousedev->dev.class = &input_class; if (dev) mousedev->dev.parent = &dev->dev; mousedev->dev.devt = MKDEV(INPUT_MAJOR, minor); mousedev->dev.release = mousedev_free; device_initialize(&mousedev->dev); if (!mixdev) { error = input_register_handle(&mousedev->handle); if (error) goto err_free_mousedev; } cdev_init(&mousedev->cdev, &mousedev_fops); error = cdev_device_add(&mousedev->cdev, &mousedev->dev); if (error) goto err_cleanup_mousedev; return mousedev; err_cleanup_mousedev: mousedev_cleanup(mousedev); if (!mixdev) input_unregister_handle(&mousedev->handle); err_free_mousedev: put_device(&mousedev->dev); err_free_minor: input_free_minor(minor); err_out: return ERR_PTR(error); } static void mousedev_destroy(struct mousedev *mousedev) { cdev_device_del(&mousedev->cdev, &mousedev->dev); mousedev_cleanup(mousedev); input_free_minor(MINOR(mousedev->dev.devt)); if (mousedev != mousedev_mix) input_unregister_handle(&mousedev->handle); put_device(&mousedev->dev); } static int mixdev_add_device(struct mousedev *mousedev) { int retval; retval = mutex_lock_interruptible(&mousedev_mix->mutex); if (retval) return retval; if (mousedev_mix->open) { retval = mousedev_open_device(mousedev); if (retval) goto out; mousedev->opened_by_mixdev = true; } get_device(&mousedev->dev); list_add_tail(&mousedev->mixdev_node, &mousedev_mix_list); out: mutex_unlock(&mousedev_mix->mutex); return retval; } static void mixdev_remove_device(struct mousedev *mousedev) { mutex_lock(&mousedev_mix->mutex); if (mousedev->opened_by_mixdev) { mousedev->opened_by_mixdev = false; mousedev_close_device(mousedev); } list_del_init(&mousedev->mixdev_node); mutex_unlock(&mousedev_mix->mutex); put_device(&mousedev->dev); } static int mousedev_connect(struct input_handler *handler, struct input_dev *dev, const struct input_device_id *id) { struct mousedev *mousedev; int error; mousedev = mousedev_create(dev, handler, false); if (IS_ERR(mousedev)) return PTR_ERR(mousedev); error = mixdev_add_device(mousedev); if (error) { mousedev_destroy(mousedev); return error; } return 0; } static void mousedev_disconnect(struct input_handle *handle) { struct mousedev *mousedev = handle->private; mixdev_remove_device(mousedev); mousedev_destroy(mousedev); } static const struct input_device_id mousedev_ids[] = { { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT | INPUT_DEVICE_ID_MATCH_RELBIT, .evbit = { BIT_MASK(EV_KEY) | BIT_MASK(EV_REL) }, .keybit = { [BIT_WORD(BTN_LEFT)] = BIT_MASK(BTN_LEFT) }, .relbit = { BIT_MASK(REL_X) | BIT_MASK(REL_Y) }, }, /* A mouse like device, at least one button, two relative axes */ { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_RELBIT, .evbit = { BIT_MASK(EV_KEY) | BIT_MASK(EV_REL) }, .relbit = { BIT_MASK(REL_WHEEL) }, }, /* A separate scrollwheel */ { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT | INPUT_DEVICE_ID_MATCH_ABSBIT, .evbit = { BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS) }, .keybit = { [BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH) }, .absbit = { BIT_MASK(ABS_X) | BIT_MASK(ABS_Y) }, }, /* A tablet like device, at least touch detection, two absolute axes */ { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT | INPUT_DEVICE_ID_MATCH_ABSBIT, .evbit = { BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS) }, .keybit = { [BIT_WORD(BTN_TOOL_FINGER)] = BIT_MASK(BTN_TOOL_FINGER) }, .absbit = { BIT_MASK(ABS_X) | BIT_MASK(ABS_Y) | BIT_MASK(ABS_PRESSURE) | BIT_MASK(ABS_TOOL_WIDTH) }, }, /* A touchpad */ { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT | INPUT_DEVICE_ID_MATCH_ABSBIT, .evbit = { BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS) }, .keybit = { [BIT_WORD(BTN_LEFT)] = BIT_MASK(BTN_LEFT) }, .absbit = { BIT_MASK(ABS_X) | BIT_MASK(ABS_Y) }, }, /* Mouse-like device with absolute X and Y but ordinary clicks, like hp ILO2 High Performance mouse */ { }, /* Terminating entry */ }; MODULE_DEVICE_TABLE(input, mousedev_ids); static struct input_handler mousedev_handler = { .event = mousedev_event, .connect = mousedev_connect, .disconnect = mousedev_disconnect, .legacy_minors = true, .minor = MOUSEDEV_MINOR_BASE, .name = "mousedev", .id_table = mousedev_ids, }; #ifdef CONFIG_INPUT_MOUSEDEV_PSAUX #include <linux/miscdevice.h> static struct miscdevice psaux_mouse = { .minor = PSMOUSE_MINOR, .name = "psaux", .fops = &mousedev_fops, }; static bool psaux_registered; static void __init mousedev_psaux_register(void) { int error; error = misc_register(&psaux_mouse); if (error) pr_warn("could not register psaux device, error: %d\n", error); else psaux_registered = true; } static void __exit mousedev_psaux_unregister(void) { if (psaux_registered) misc_deregister(&psaux_mouse); } #else static inline void mousedev_psaux_register(void) { } static inline void mousedev_psaux_unregister(void) { } #endif static int __init mousedev_init(void) { int error; mousedev_mix = mousedev_create(NULL, &mousedev_handler, true); if (IS_ERR(mousedev_mix)) return PTR_ERR(mousedev_mix); error = input_register_handler(&mousedev_handler); if (error) { mousedev_destroy(mousedev_mix); return error; } mousedev_psaux_register(); pr_info("PS/2 mouse device common for all mice\n"); return 0; } static void __exit mousedev_exit(void) { mousedev_psaux_unregister(); input_unregister_handler(&mousedev_handler); mousedev_destroy(mousedev_mix); } module_init(mousedev_init); module_exit(mousedev_exit);
6 6 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 5 6 6 6 6 6 6 6 6 5 5 4 4 6 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 5 5 5 5 4 4 3 5 5 3 3 5 5 5 5 5 5 5 5 5 3 3 3 5 5 5 5 3 5 3 5 5 5 5 3 3 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux INET6 implementation * Forwarding Information Database * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Changes: * Yuji SEKIYA @USAGI: Support default route on router node; * remove ip6_null_entry from the top of * routing table. * Ville Nuorvala: Fixed routing subtrees. */ #define pr_fmt(fmt) "IPv6: " fmt #include <linux/bpf.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/net.h> #include <linux/route.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/lwtunnel.h> #include <net/fib_notifier.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> static struct kmem_cache *fib6_node_kmem __read_mostly; struct fib6_cleaner { struct fib6_walker w; struct net *net; int (*func)(struct fib6_info *, void *arg); int sernum; void *arg; bool skip_notify; }; #ifdef CONFIG_IPV6_SUBTREES #define FWS_INIT FWS_S #else #define FWS_INIT FWS_L #endif static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn); static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_table *table, struct fib6_node *fn); static int fib6_walk(struct net *net, struct fib6_walker *w); static int fib6_walk_continue(struct fib6_walker *w); /* * A routing update causes an increase of the serial number on the * affected subtree. This allows for cached routes to be asynchronously * tested when modifications are made to the destination cache as a * result of redirects, path MTU changes, etc. */ static void fib6_gc_timer_cb(struct timer_list *t); #define FOR_WALKERS(net, w) \ list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh) static void fib6_walker_link(struct net *net, struct fib6_walker *w) { write_lock_bh(&net->ipv6.fib6_walker_lock); list_add(&w->lh, &net->ipv6.fib6_walkers); write_unlock_bh(&net->ipv6.fib6_walker_lock); } static void fib6_walker_unlink(struct net *net, struct fib6_walker *w) { write_lock_bh(&net->ipv6.fib6_walker_lock); list_del(&w->lh); write_unlock_bh(&net->ipv6.fib6_walker_lock); } static int fib6_new_sernum(struct net *net) { int new, old = atomic_read(&net->ipv6.fib6_sernum); do { new = old < INT_MAX ? old + 1 : 1; } while (!atomic_try_cmpxchg(&net->ipv6.fib6_sernum, &old, new)); return new; } enum { FIB6_NO_SERNUM_CHANGE = 0, }; void fib6_update_sernum(struct net *net, struct fib6_info *f6i) { struct fib6_node *fn; fn = rcu_dereference_protected(f6i->fib6_node, lockdep_is_held(&f6i->fib6_table->tb6_lock)); if (fn) WRITE_ONCE(fn->fn_sernum, fib6_new_sernum(net)); } /* * Auxiliary address test functions for the radix tree. * * These assume a 32bit processor (although it will work on * 64bit processors) */ /* * test bit */ #if defined(__LITTLE_ENDIAN) # define BITOP_BE32_SWIZZLE (0x1F & ~7) #else # define BITOP_BE32_SWIZZLE 0 #endif static __be32 addr_bit_set(const void *token, int fn_bit) { const __be32 *addr = token; /* * Here, * 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f) * is optimized version of * htonl(1 << ((~fn_bit)&0x1F)) * See include/asm-generic/bitops/le.h. */ return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) & addr[fn_bit >> 5]; } struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh) { struct fib6_info *f6i; size_t sz = sizeof(*f6i); if (with_fib6_nh) sz += sizeof(struct fib6_nh); f6i = kzalloc(sz, gfp_flags); if (!f6i) return NULL; /* fib6_siblings is a union with nh_list, so this initializes both */ INIT_LIST_HEAD(&f6i->fib6_siblings); refcount_set(&f6i->fib6_ref, 1); INIT_HLIST_NODE(&f6i->gc_link); return f6i; } void fib6_info_destroy_rcu(struct rcu_head *head) { struct fib6_info *f6i = container_of(head, struct fib6_info, rcu); WARN_ON(f6i->fib6_node); if (f6i->nh) nexthop_put(f6i->nh); else fib6_nh_release(f6i->fib6_nh); ip_fib_metrics_put(f6i->fib6_metrics); kfree(f6i); } EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu); static struct fib6_node *node_alloc(struct net *net) { struct fib6_node *fn; fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC); if (fn) net->ipv6.rt6_stats->fib_nodes++; return fn; } static void node_free_immediate(struct net *net, struct fib6_node *fn) { kmem_cache_free(fib6_node_kmem, fn); net->ipv6.rt6_stats->fib_nodes--; } static void node_free(struct net *net, struct fib6_node *fn) { kfree_rcu(fn, rcu); net->ipv6.rt6_stats->fib_nodes--; } static void fib6_free_table(struct fib6_table *table) { inetpeer_invalidate_tree(&table->tb6_peers); kfree(table); } static void fib6_link_table(struct net *net, struct fib6_table *tb) { unsigned int h; /* * Initialize table lock at a single place to give lockdep a key, * tables aren't visible prior to being linked to the list. */ spin_lock_init(&tb->tb6_lock); h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1); /* * No protection necessary, this is the only list mutatation * operation, tables never disappear once they exist. */ hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]); } #ifdef CONFIG_IPV6_MULTIPLE_TABLES static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) { struct fib6_table *table; table = kzalloc(sizeof(*table), GFP_ATOMIC); if (table) { table->tb6_id = id; rcu_assign_pointer(table->tb6_root.leaf, net->ipv6.fib6_null_entry); table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); INIT_HLIST_HEAD(&table->tb6_gc_hlist); } return table; } struct fib6_table *fib6_new_table(struct net *net, u32 id) { struct fib6_table *tb; if (id == 0) id = RT6_TABLE_MAIN; tb = fib6_get_table(net, id); if (tb) return tb; tb = fib6_alloc_table(net, id); if (tb) fib6_link_table(net, tb); return tb; } EXPORT_SYMBOL_GPL(fib6_new_table); struct fib6_table *fib6_get_table(struct net *net, u32 id) { struct fib6_table *tb; struct hlist_head *head; unsigned int h; if (id == 0) id = RT6_TABLE_MAIN; h = id & (FIB6_TABLE_HASHSZ - 1); rcu_read_lock(); head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (tb->tb6_id == id) { rcu_read_unlock(); return tb; } } rcu_read_unlock(); return NULL; } EXPORT_SYMBOL_GPL(fib6_get_table); static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); fib6_link_table(net, net->ipv6.fib6_local_tbl); } #else struct fib6_table *fib6_new_table(struct net *net, u32 id) { return fib6_get_table(net, id); } struct fib6_table *fib6_get_table(struct net *net, u32 id) { return net->ipv6.fib6_main_tbl; } struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup) { struct rt6_info *rt; rt = pol_lookup_func(lookup, net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error == -EAGAIN) { ip6_rt_put_flags(rt, flags); rt = net->ipv6.ip6_null_entry; if (!(flags & RT6_LOOKUP_F_DST_NOREF)) dst_hold(&rt->dst); } return &rt->dst; } /* called with rcu lock held; no reference taken on fib6_info */ int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, struct fib6_result *res, int flags) { return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, res, flags); } static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); } #endif unsigned int fib6_tables_seq_read(const struct net *net) { unsigned int h, fib_seq = 0; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { const struct hlist_head *head = &net->ipv6.fib_table_hash[h]; const struct fib6_table *tb; hlist_for_each_entry_rcu(tb, head, tb6_hlist) fib_seq += READ_ONCE(tb->fib_seq); } rcu_read_unlock(); return fib_seq; } static int call_fib6_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib6_info *rt, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, }; return call_fib6_notifier(nb, event_type, &info.info); } static int call_fib6_multipath_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, .nsiblings = nsiblings, }; return call_fib6_notifier(nb, event_type, &info.info); } int call_fib6_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, }; WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); return call_fib6_notifiers(net, event_type, &info.info); } int call_fib6_multipath_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, .nsiblings = nsiblings, }; WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); return call_fib6_notifiers(net, event_type, &info.info); } int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt) { struct fib6_entry_notifier_info info = { .rt = rt, .nsiblings = rt->fib6_nsiblings, }; WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info); } struct fib6_dump_arg { struct net *net; struct notifier_block *nb; struct netlink_ext_ack *extack; }; static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) { enum fib_event_type fib_event = FIB_EVENT_ENTRY_REPLACE; int err; if (!rt || rt == arg->net->ipv6.fib6_null_entry) return 0; if (rt->fib6_nsiblings) err = call_fib6_multipath_entry_notifier(arg->nb, fib_event, rt, rt->fib6_nsiblings, arg->extack); else err = call_fib6_entry_notifier(arg->nb, fib_event, rt, arg->extack); return err; } static int fib6_node_dump(struct fib6_walker *w) { int err; err = fib6_rt_dump(w->leaf, w->args); w->leaf = NULL; return err; } static int fib6_table_dump(struct net *net, struct fib6_table *tb, struct fib6_walker *w) { int err; w->root = &tb->tb6_root; spin_lock_bh(&tb->tb6_lock); err = fib6_walk(net, w); spin_unlock_bh(&tb->tb6_lock); return err; } /* Called with rcu_read_lock() */ int fib6_tables_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct fib6_dump_arg arg; struct fib6_walker *w; unsigned int h; int err = 0; w = kzalloc(sizeof(*w), GFP_ATOMIC); if (!w) return -ENOMEM; w->func = fib6_node_dump; arg.net = net; arg.nb = nb; arg.extack = extack; w->args = &arg; for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv6.fib_table_hash[h]; struct fib6_table *tb; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { err = fib6_table_dump(net, tb, w); if (err) goto out; } } out: kfree(w); /* The tree traversal function should never return a positive value. */ return err > 0 ? -EINVAL : err; } static int fib6_dump_node(struct fib6_walker *w) { int res; struct fib6_info *rt; for_each_fib6_walker_rt(w) { res = rt6_dump_route(rt, w->args, w->skip_in_node); if (res >= 0) { /* Frame is full, suspend walking */ w->leaf = rt; /* We'll restart from this node, so if some routes were * already dumped, skip them next time. */ w->skip_in_node += res; return 1; } w->skip_in_node = 0; /* Multipath routes are dumped in one route with the * RTA_MULTIPATH attribute. Jump 'rt' to point to the * last sibling of this route (no need to dump the * sibling routes again) */ if (rt->fib6_nsiblings) rt = list_last_entry(&rt->fib6_siblings, struct fib6_info, fib6_siblings); } w->leaf = NULL; return 0; } static void fib6_dump_end(struct netlink_callback *cb) { struct net *net = sock_net(cb->skb->sk); struct fib6_walker *w = (void *)cb->args[2]; if (w) { if (cb->args[4]) { cb->args[4] = 0; fib6_walker_unlink(net, w); } cb->args[2] = 0; kfree(w); } cb->done = (void *)cb->args[3]; cb->args[1] = 3; } static int fib6_dump_done(struct netlink_callback *cb) { fib6_dump_end(cb); return cb->done ? cb->done(cb) : 0; } static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct fib6_walker *w; int res; w = (void *)cb->args[2]; w->root = &table->tb6_root; if (cb->args[4] == 0) { w->count = 0; w->skip = 0; w->skip_in_node = 0; spin_lock_bh(&table->tb6_lock); res = fib6_walk(net, w); spin_unlock_bh(&table->tb6_lock); if (res > 0) { cb->args[4] = 1; cb->args[5] = READ_ONCE(w->root->fn_sernum); } } else { int sernum = READ_ONCE(w->root->fn_sernum); if (cb->args[5] != sernum) { /* Begin at the root if the tree changed */ cb->args[5] = sernum; w->state = FWS_INIT; w->node = w->root; w->skip = w->count; w->skip_in_node = 0; } else w->skip = 0; spin_lock_bh(&table->tb6_lock); res = fib6_walk_continue(w); spin_unlock_bh(&table->tb6_lock); if (res <= 0) { fib6_walker_unlink(net, w); cb->args[4] = 0; } } return res; } static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true, .filter.dump_routes = true, .filter.rtnl_held = false, }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int e = 0, s_e; struct hlist_head *head; struct fib6_walker *w; struct fib6_table *tb; unsigned int h, s_h; int err = 0; rcu_read_lock(); if (cb->strict_check) { err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb); if (err < 0) goto unlock; } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); if (rtm->rtm_flags & RTM_F_PREFIX) arg.filter.flags = RTM_F_PREFIX; } w = (void *)cb->args[2]; if (!w) { /* New dump: * * 1. allocate and initialize walker. */ w = kzalloc(sizeof(*w), GFP_ATOMIC); if (!w) { err = -ENOMEM; goto unlock; } w->func = fib6_dump_node; cb->args[2] = (long)w; /* 2. hook callback destructor. */ cb->args[3] = (long)cb->done; cb->done = fib6_dump_done; } arg.skb = skb; arg.cb = cb; arg.net = net; w->args = &arg; if (arg.filter.table_id) { tb = fib6_get_table(net, arg.filter.table_id); if (!tb) { if (rtnl_msg_family(cb->nlh) != PF_INET6) goto unlock; NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist"); err = -ENOENT; goto unlock; } if (!cb->args[0]) { err = fib6_dump_table(tb, skb, cb); if (!err) cb->args[0] = 1; } goto unlock; } s_h = cb->args[0]; s_e = cb->args[1]; for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (e < s_e) goto next; err = fib6_dump_table(tb, skb, cb); if (err != 0) goto out; next: e++; } } out: cb->args[1] = e; cb->args[0] = h; unlock: rcu_read_unlock(); if (err <= 0) fib6_dump_end(cb); return err; } void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val) { if (!f6i) return; if (f6i->fib6_metrics == &dst_default_metrics) { struct dst_metrics *p = kzalloc(sizeof(*p), GFP_ATOMIC); if (!p) return; refcount_set(&p->refcnt, 1); f6i->fib6_metrics = p; } f6i->fib6_metrics->metrics[metric - 1] = val; } /* * Routing Table * * return the appropriate node for a routing tree "add" operation * by either creating and inserting or by returning an existing * node. */ static struct fib6_node *fib6_add_1(struct net *net, struct fib6_table *table, struct fib6_node *root, struct in6_addr *addr, int plen, int offset, int allow_create, int replace_required, struct netlink_ext_ack *extack) { struct fib6_node *fn, *in, *ln; struct fib6_node *pn = NULL; struct rt6key *key; int bit; __be32 dir = 0; /* insert node in tree */ fn = root; do { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) { if (!allow_create) { if (replace_required) { NL_SET_ERR_MSG(extack, "Can not replace route - no match found"); pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } pr_warn("NLM_F_CREATE should be set when creating new route\n"); } goto insert_above; } /* * Exact match ? */ if (plen == fn->fn_bit) { /* clean up an intermediate node */ if (!(fn->fn_flags & RTN_RTINFO)) { RCU_INIT_POINTER(fn->leaf, NULL); fib6_info_release(leaf); /* remove null_entry in the root node */ } else if (fn->fn_flags & RTN_TL_ROOT && rcu_access_pointer(fn->leaf) == net->ipv6.fib6_null_entry) { RCU_INIT_POINTER(fn->leaf, NULL); } return fn; } /* * We have more bits to go */ /* Try to walk down on tree. */ dir = addr_bit_set(addr, fn->fn_bit); pn = fn; fn = dir ? rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)) : rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); } while (fn); if (!allow_create) { /* We should not create new node because * NLM_F_REPLACE was specified without NLM_F_CREATE * I assume it is safe to require NLM_F_CREATE when * REPLACE flag is used! Later we may want to remove the * check for replace_required, because according * to netlink specification, NLM_F_CREATE * MUST be specified if new route is created. * That would keep IPv6 consistent with IPv4 */ if (replace_required) { NL_SET_ERR_MSG(extack, "Can not replace route - no match found"); pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } pr_warn("NLM_F_CREATE should be set when creating new route\n"); } /* * We walked to the bottom of tree. * Create new leaf node without children. */ ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, pn); if (dir) rcu_assign_pointer(pn->right, ln); else rcu_assign_pointer(pn->left, ln); return ln; insert_above: /* * split since we don't have a common prefix anymore or * we have a less significant route. * we've to insert an intermediate node on the list * this new node will point to the one we need to create * and the current */ pn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); /* find 1st bit in difference between the 2 addrs. See comment in __ipv6_addr_diff: bit may be an invalid value, but if it is >= plen, the value is ignored in any case. */ bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr)); /* * (intermediate)[in] * / \ * (new leaf node)[ln] (old node)[fn] */ if (plen > bit) { in = node_alloc(net); ln = node_alloc(net); if (!in || !ln) { if (in) node_free_immediate(net, in); if (ln) node_free_immediate(net, ln); return ERR_PTR(-ENOMEM); } /* * new intermediate node. * RTN_RTINFO will * be off since that an address that chooses one of * the branches would not match less specific routes * in the other branch */ in->fn_bit = bit; RCU_INIT_POINTER(in->parent, pn); in->leaf = fn->leaf; fib6_info_hold(rcu_dereference_protected(in->leaf, lockdep_is_held(&table->tb6_lock))); /* update parent pointer */ if (dir) rcu_assign_pointer(pn->right, in); else rcu_assign_pointer(pn->left, in); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, in); rcu_assign_pointer(fn->parent, in); if (addr_bit_set(addr, bit)) { rcu_assign_pointer(in->right, ln); rcu_assign_pointer(in->left, fn); } else { rcu_assign_pointer(in->left, ln); rcu_assign_pointer(in->right, fn); } } else { /* plen <= bit */ /* * (new leaf node)[ln] * / \ * (old node)[fn] NULL */ ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, pn); if (addr_bit_set(&key->addr, plen)) RCU_INIT_POINTER(ln->right, fn); else RCU_INIT_POINTER(ln->left, fn); rcu_assign_pointer(fn->parent, ln); if (dir) rcu_assign_pointer(pn->right, ln); else rcu_assign_pointer(pn->left, ln); } return ln; } static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, const struct fib6_info *match, const struct fib6_table *table) { int cpu; if (!fib6_nh->rt6i_pcpu) return; rcu_read_lock(); /* release the reference to this fib entry from * all of its cached pcpu routes */ for_each_possible_cpu(cpu) { struct rt6_info **ppcpu_rt; struct rt6_info *pcpu_rt; ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); /* Paired with xchg() in rt6_get_pcpu_route() */ pcpu_rt = READ_ONCE(*ppcpu_rt); /* only dropping the 'from' reference if the cached route * is using 'match'. The cached pcpu_rt->from only changes * from a fib6_info to NULL (ip6_dst_destroy); it can never * change from one fib6_info reference to another */ if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) { struct fib6_info *from; from = unrcu_pointer(xchg(&pcpu_rt->from, NULL)); fib6_info_release(from); } } rcu_read_unlock(); } struct fib6_nh_pcpu_arg { struct fib6_info *from; const struct fib6_table *table; }; static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg) { struct fib6_nh_pcpu_arg *arg = _arg; __fib6_drop_pcpu_from(nh, arg->from, arg->table); return 0; } static void fib6_drop_pcpu_from(struct fib6_info *f6i, const struct fib6_table *table) { /* Make sure rt6_make_pcpu_route() wont add other percpu routes * while we are cleaning them here. */ f6i->fib6_destroying = 1; mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ if (f6i->nh) { struct fib6_nh_pcpu_arg arg = { .from = f6i, .table = table }; nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, &arg); } else { struct fib6_nh *fib6_nh; fib6_nh = f6i->fib6_nh; __fib6_drop_pcpu_from(fib6_nh, f6i, table); } } static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct net *net) { struct fib6_table *table = rt->fib6_table; /* Flush all cached dst in exception table */ rt6_flush_exceptions(rt); fib6_drop_pcpu_from(rt, table); if (rt->nh && !list_empty(&rt->nh_list)) list_del_init(&rt->nh_list); if (refcount_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split * nodes. It is not leaked, but it still holds other resources, * which must be released in time. So, scan ascendant nodes * and replace dummy references to this route with references * to still alive ones. */ while (fn) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *new_leaf; if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { new_leaf = fib6_find_prefix(net, table, fn); fib6_info_hold(new_leaf); rcu_assign_pointer(fn->leaf, new_leaf); fib6_info_release(rt); } fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); } } fib6_clean_expires(rt); fib6_remove_gc_list(rt); } /* * Insert routing information in a node. */ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); struct fib6_info *iter = NULL; struct fib6_info __rcu **ins; struct fib6_info __rcu **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); int add = (!info->nlh || (info->nlh->nlmsg_flags & NLM_F_CREATE)); int found = 0; bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); bool notify_sibling_rt = false; u16 nlflags = NLM_F_EXCL; int err; if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND)) nlflags |= NLM_F_APPEND; ins = &fn->leaf; for (iter = leaf; iter; iter = rcu_dereference_protected(iter->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock))) { /* * Search for duplicates */ if (iter->fib6_metric == rt->fib6_metric) { /* * Same priority level */ if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_EXCL)) return -EEXIST; nlflags &= ~NLM_F_EXCL; if (replace) { if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) { found++; break; } fallback_ins = fallback_ins ?: ins; goto next_iter; } if (rt6_duplicate_nexthop(iter, rt)) { if (rt->fib6_nsiblings) rt->fib6_nsiblings = 0; if (!(iter->fib6_flags & RTF_EXPIRES)) return -EEXIST; if (!(rt->fib6_flags & RTF_EXPIRES)) { fib6_clean_expires(iter); fib6_remove_gc_list(iter); } else { fib6_set_expires(iter, rt->expires); fib6_add_gc_list(iter); } if (rt->fib6_pmtu) fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu); return -EEXIST; } /* If we have the same destination and the same metric, * but not the same gateway, then the route we try to * add is sibling to this route, increment our counter * of siblings, and later we will add our route to the * list. * Only static routes (which don't have flag * RTF_EXPIRES) are used for ECMPv6. * * To avoid long list, we only had siblings if the * route have a gateway. */ if (rt_can_ecmp && rt6_qualify_for_ecmp(iter)) rt->fib6_nsiblings++; } if (iter->fib6_metric > rt->fib6_metric) break; next_iter: ins = &iter->fib6_next; } if (fallback_ins && !found) { /* No matching route with same ecmp-able-ness found, replace * first matching route */ ins = fallback_ins; iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); found++; } /* Reset round-robin state, if necessary */ if (ins == &fn->leaf) fn->rr_ptr = NULL; /* Link this route to others same route. */ if (rt->fib6_nsiblings) { unsigned int fib6_nsiblings; struct fib6_info *sibling, *temp_sibling; /* Find the first route that have the same metric */ sibling = leaf; notify_sibling_rt = true; while (sibling) { if (sibling->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(sibling)) { list_add_tail_rcu(&rt->fib6_siblings, &sibling->fib6_siblings); break; } sibling = rcu_dereference_protected(sibling->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock)); notify_sibling_rt = false; } /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings * is broken! */ fib6_nsiblings = 0; list_for_each_entry_safe(sibling, temp_sibling, &rt->fib6_siblings, fib6_siblings) { sibling->fib6_nsiblings++; BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings); fib6_nsiblings++; } BUG_ON(fib6_nsiblings != rt->fib6_nsiblings); rt6_multipath_rebalance(temp_sibling); } /* * insert node */ if (!replace) { if (!add) pr_warn("NLM_F_CREATE should be set when creating new route\n"); add: nlflags |= NLM_F_CREATE; /* The route should only be notified if it is the first * route in the node or if it is added as a sibling * route to the first route in the node. */ if (!info->skip_notify_kernel && (notify_sibling_rt || ins == &fn->leaf)) { enum fib_event_type fib_event; if (notify_sibling_rt) fib_event = FIB_EVENT_ENTRY_APPEND; else fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib6_entry_notifiers(info->nl_net, fib_event, rt, extack); if (err) { struct fib6_info *sibling, *next_sibling; /* If the route has siblings, then it first * needs to be unlinked from them. */ if (!rt->fib6_nsiblings) return err; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) sibling->fib6_nsiblings--; rt->fib6_nsiblings = 0; list_del_rcu(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); return err; } } rcu_assign_pointer(rt->fib6_next, iter); fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); rcu_assign_pointer(*ins, rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } } else { int nsiblings; if (!found) { if (add) goto add; pr_warn("NLM_F_REPLACE set, but no existing node found!\n"); return -ENOENT; } if (!info->skip_notify_kernel && ins == &fn->leaf) { err = call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, rt, extack); if (err) return err; } fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); rt->fib6_next = iter->fib6_next; rcu_assign_pointer(*ins, rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } nsiblings = iter->fib6_nsiblings; iter->fib6_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; fib6_info_release(iter); if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ ins = &rt->fib6_next; iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { if (iter->fib6_metric > rt->fib6_metric) break; if (rt6_qualify_for_ecmp(iter)) { *ins = iter->fib6_next; iter->fib6_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; fib6_info_release(iter); nsiblings--; info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { ins = &iter->fib6_next; } iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); } WARN_ON(nsiblings != 0); } } return 0; } static void fib6_start_gc(struct net *net, struct fib6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && (rt->fib6_flags & RTF_EXPIRES)) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } void fib6_force_start_gc(struct net *net) { if (!timer_pending(&net->ipv6.ip6_fib_timer)) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } static void __fib6_update_sernum_upto_root(struct fib6_info *rt, int sernum) { struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&rt->fib6_table->tb6_lock)); /* paired with smp_rmb() in fib6_get_cookie_safe() */ smp_wmb(); while (fn) { WRITE_ONCE(fn->fn_sernum, sernum); fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&rt->fib6_table->tb6_lock)); } } void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt) { __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net)); } /* allow ipv4 to update sernum via ipv6_stub */ void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i) { spin_lock_bh(&f6i->fib6_table->tb6_lock); fib6_update_sernum_upto_root(net, f6i); spin_unlock_bh(&f6i->fib6_table->tb6_lock); } /* * Add routing information to the routing tree. * <destination addr>/<source addr> * with source addr info in sub-trees * Need to own table->tb6_lock */ int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_table *table = rt->fib6_table; struct fib6_node *fn; #ifdef CONFIG_IPV6_SUBTREES struct fib6_node *pn = NULL; #endif int err = -ENOMEM; int allow_create = 1; int replace_required = 0; if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) allow_create = 0; if (info->nlh->nlmsg_flags & NLM_F_REPLACE) replace_required = 1; } if (!allow_create && !replace_required) pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); fn = fib6_add_1(info->nl_net, table, root, &rt->fib6_dst.addr, rt->fib6_dst.plen, offsetof(struct fib6_info, fib6_dst), allow_create, replace_required, extack); if (IS_ERR(fn)) { err = PTR_ERR(fn); fn = NULL; goto out; } #ifdef CONFIG_IPV6_SUBTREES pn = fn; if (rt->fib6_src.plen) { struct fib6_node *sn; if (!rcu_access_pointer(fn->subtree)) { struct fib6_node *sfn; /* * Create subtree. * * fn[main tree] * | * sfn[subtree root] * \ * sn[new leaf node] */ /* Create subtree root node */ sfn = node_alloc(info->nl_net); if (!sfn) goto failure; fib6_info_hold(info->nl_net->ipv6.fib6_null_entry); rcu_assign_pointer(sfn->leaf, info->nl_net->ipv6.fib6_null_entry); sfn->fn_flags = RTN_ROOT; /* Now add the first leaf node to new subtree */ sn = fib6_add_1(info->nl_net, table, sfn, &rt->fib6_src.addr, rt->fib6_src.plen, offsetof(struct fib6_info, fib6_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { /* If it is failed, discard just allocated root, and then (in failure) stale node in main tree. */ node_free_immediate(info->nl_net, sfn); err = PTR_ERR(sn); goto failure; } /* Now link new subtree to main tree */ rcu_assign_pointer(sfn->parent, fn); rcu_assign_pointer(fn->subtree, sfn); } else { sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn), &rt->fib6_src.addr, rt->fib6_src.plen, offsetof(struct fib6_info, fib6_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { err = PTR_ERR(sn); goto failure; } } if (!rcu_access_pointer(fn->leaf)) { if (fn->fn_flags & RTN_TL_ROOT) { /* put back null_entry for root node */ rcu_assign_pointer(fn->leaf, info->nl_net->ipv6.fib6_null_entry); } else { fib6_info_hold(rt); rcu_assign_pointer(fn->leaf, rt); } } fn = sn; } #endif err = fib6_add_rt2node(fn, rt, info, extack); if (!err) { if (rt->nh) list_add(&rt->nh_list, &rt->nh->f6i_list); __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net)); if (rt->fib6_flags & RTF_EXPIRES) fib6_add_gc_list(rt); fib6_start_gc(info->nl_net, rt); } out: if (err) { #ifdef CONFIG_IPV6_SUBTREES /* * If fib6_add_1 has cleared the old leaf pointer in the * super-tree leaf node we have to find a new one for it. */ if (pn != fn) { struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); if (pn_leaf == rt) { pn_leaf = NULL; RCU_INIT_POINTER(pn->leaf, NULL); fib6_info_release(rt); } if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { pn_leaf = fib6_find_prefix(info->nl_net, table, pn); if (!pn_leaf) pn_leaf = info->nl_net->ipv6.fib6_null_entry; fib6_info_hold(pn_leaf); rcu_assign_pointer(pn->leaf, pn_leaf); } } #endif goto failure; } else if (fib6_requires_src(rt)) { fib6_routes_require_src_inc(info->nl_net); } return err; failure: /* fn->leaf could be NULL and fib6_repair_tree() needs to be called if: * 1. fn is an intermediate node and we failed to add the new * route to it in both subtree creation failure and fib6_add_rt2node() * failure case. * 2. fn is the root node in the table and we fail to add the first * default route to it. */ if (fn && (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) || (fn->fn_flags & RTN_TL_ROOT && !rcu_access_pointer(fn->leaf)))) fib6_repair_tree(info->nl_net, table, fn); return err; } /* * Routing tree lookup * */ struct lookup_args { int offset; /* key offset on fib6_info */ const struct in6_addr *addr; /* search key */ }; static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root, struct lookup_args *args) { struct fib6_node *fn; __be32 dir; if (unlikely(args->offset == 0)) return NULL; /* * Descend on a tree */ fn = root; for (;;) { struct fib6_node *next; dir = addr_bit_set(args->addr, fn->fn_bit); next = dir ? rcu_dereference(fn->right) : rcu_dereference(fn->left); if (next) { fn = next; continue; } break; } while (fn) { struct fib6_node *subtree = FIB6_SUBTREE(fn); if (subtree || fn->fn_flags & RTN_RTINFO) { struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; if (!leaf) goto backtrack; key = (struct rt6key *) ((u8 *)leaf + args->offset); if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { #ifdef CONFIG_IPV6_SUBTREES if (subtree) { struct fib6_node *sfn; sfn = fib6_node_lookup_1(subtree, args + 1); if (!sfn) goto backtrack; fn = sfn; } #endif if (fn->fn_flags & RTN_RTINFO) return fn; } } backtrack: if (fn->fn_flags & RTN_ROOT) break; fn = rcu_dereference(fn->parent); } return NULL; } /* called with rcu_read_lock() held */ struct fib6_node *fib6_node_lookup(struct fib6_node *root, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct fib6_node *fn; struct lookup_args args[] = { { .offset = offsetof(struct fib6_info, fib6_dst), .addr = daddr, }, #ifdef CONFIG_IPV6_SUBTREES { .offset = offsetof(struct fib6_info, fib6_src), .addr = saddr, }, #endif { .offset = 0, /* sentinel */ } }; fn = fib6_node_lookup_1(root, daddr ? args : args + 1); if (!fn || fn->fn_flags & RTN_TL_ROOT) fn = root; return fn; } /* * Get node with specified destination prefix (and source prefix, * if subtrees are used) * exact_match == true means we try to find fn with exact match of * the passed in prefix addr * exact_match == false means we try to find fn with longest prefix * match of the passed in prefix addr. This is useful for finding fn * for cached route as it will be stored in the exception table under * the node with longest prefix length. */ static struct fib6_node *fib6_locate_1(struct fib6_node *root, const struct in6_addr *addr, int plen, int offset, bool exact_match) { struct fib6_node *fn, *prev = NULL; for (fn = root; fn ; ) { struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; /* This node is being deleted */ if (!leaf) { if (plen <= fn->fn_bit) goto out; else goto next; } key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) goto out; if (plen == fn->fn_bit) return fn; if (fn->fn_flags & RTN_RTINFO) prev = fn; next: /* * We have more bits to go */ if (addr_bit_set(addr, fn->fn_bit)) fn = rcu_dereference(fn->right); else fn = rcu_dereference(fn->left); } out: if (exact_match) return NULL; else return prev; } struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, const struct in6_addr *saddr, int src_len, bool exact_match) { struct fib6_node *fn; fn = fib6_locate_1(root, daddr, dst_len, offsetof(struct fib6_info, fib6_dst), exact_match); #ifdef CONFIG_IPV6_SUBTREES if (src_len) { WARN_ON(saddr == NULL); if (fn) { struct fib6_node *subtree = FIB6_SUBTREE(fn); if (subtree) { fn = fib6_locate_1(subtree, saddr, src_len, offsetof(struct fib6_info, fib6_src), exact_match); } } } #endif if (fn && fn->fn_flags & RTN_RTINFO) return fn; return NULL; } /* * Deletion * */ static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn) { struct fib6_node *child_left, *child_right; if (fn->fn_flags & RTN_ROOT) return net->ipv6.fib6_null_entry; while (fn) { child_left = rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); child_right = rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)); if (child_left) return rcu_dereference_protected(child_left->leaf, lockdep_is_held(&table->tb6_lock)); if (child_right) return rcu_dereference_protected(child_right->leaf, lockdep_is_held(&table->tb6_lock)); fn = FIB6_SUBTREE(fn); } return NULL; } /* * Called to trim the tree of intermediate nodes when possible. "fn" * is the node we want to try and remove. * Need to own table->tb6_lock */ static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_table *table, struct fib6_node *fn) { int children; int nstate; struct fib6_node *child; struct fib6_walker *w; int iter = 0; /* Set fn->leaf to null_entry for root node. */ if (fn->fn_flags & RTN_TL_ROOT) { rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry); return fn; } for (;;) { struct fib6_node *fn_r = rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)); struct fib6_node *fn_l = rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn_r = rcu_dereference_protected(pn->right, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn_l = rcu_dereference_protected(pn->left, lockdep_is_held(&table->tb6_lock)); struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *new_fn_leaf; pr_debug("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); iter++; WARN_ON(fn->fn_flags & RTN_RTINFO); WARN_ON(fn->fn_flags & RTN_TL_ROOT); WARN_ON(fn_leaf); children = 0; child = NULL; if (fn_r) { child = fn_r; children |= 1; } if (fn_l) { child = fn_l; children |= 2; } if (children == 3 || FIB6_SUBTREE(fn) #ifdef CONFIG_IPV6_SUBTREES /* Subtree root (i.e. fn) may have one child */ || (children && fn->fn_flags & RTN_ROOT) #endif ) { new_fn_leaf = fib6_find_prefix(net, table, fn); #if RT6_DEBUG >= 2 if (!new_fn_leaf) { WARN_ON(!new_fn_leaf); new_fn_leaf = net->ipv6.fib6_null_entry; } #endif fib6_info_hold(new_fn_leaf); rcu_assign_pointer(fn->leaf, new_fn_leaf); return pn; } #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); RCU_INIT_POINTER(pn->subtree, NULL); nstate = FWS_L; } else { WARN_ON(fn->fn_flags & RTN_ROOT); #endif if (pn_r == fn) rcu_assign_pointer(pn->right, child); else if (pn_l == fn) rcu_assign_pointer(pn->left, child); #if RT6_DEBUG >= 2 else WARN_ON(1); #endif if (child) rcu_assign_pointer(child->parent, pn); nstate = FWS_R; #ifdef CONFIG_IPV6_SUBTREES } #endif read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (!child) { if (w->node == fn) { pr_debug("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); w->node = pn; w->state = nstate; } } else { if (w->node == fn) { w->node = child; if (children&2) { pr_debug("W %p adjusted by delnode 2, s=%d\n", w, w->state); w->state = w->state >= FWS_R ? FWS_U : FWS_INIT; } else { pr_debug("W %p adjusted by delnode 2, s=%d\n", w, w->state); w->state = w->state >= FWS_C ? FWS_U : FWS_INIT; } } } } read_unlock(&net->ipv6.fib6_walker_lock); node_free(net, fn); if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) return pn; RCU_INIT_POINTER(pn->leaf, NULL); fib6_info_release(pn_leaf); fn = pn; } } static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, struct fib6_info __rcu **rtp, struct nl_info *info) { struct fib6_info *leaf, *replace_rt = NULL; struct fib6_walker *w; struct fib6_info *rt = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); struct net *net = info->nl_net; bool notify_del = false; /* If the deleted route is the first in the node and it is not part of * a multipath route, then we need to replace it with the next route * in the node, if exists. */ leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); if (leaf == rt && !rt->fib6_nsiblings) { if (rcu_access_pointer(rt->fib6_next)) replace_rt = rcu_dereference_protected(rt->fib6_next, lockdep_is_held(&table->tb6_lock)); else notify_del = true; } /* Unlink it */ *rtp = rt->fib6_next; rt->fib6_node = NULL; net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; /* Reset round-robin state, if necessary */ if (rcu_access_pointer(fn->rr_ptr) == rt) fn->rr_ptr = NULL; /* Remove this entry from other siblings */ if (rt->fib6_nsiblings) { struct fib6_info *sibling, *next_sibling; /* The route is deleted from a multipath route. If this * multipath route is the first route in the node, then we need * to emit a delete notification. Otherwise, we need to skip * the notification. */ if (rt->fib6_metric == leaf->fib6_metric && rt6_qualify_for_ecmp(leaf)) notify_del = true; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) sibling->fib6_nsiblings--; rt->fib6_nsiblings = 0; list_del_rcu(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); } /* Adjust walkers */ read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { pr_debug("walker %p adjusted by delroute\n", w); w->leaf = rcu_dereference_protected(rt->fib6_next, lockdep_is_held(&table->tb6_lock)); if (!w->leaf) w->state = FWS_U; } } read_unlock(&net->ipv6.fib6_walker_lock); /* If it was last route, call fib6_repair_tree() to: * 1. For root node, put back null_entry as how the table was created. * 2. For other nodes, expunge its radix tree node. */ if (!rcu_access_pointer(fn->leaf)) { if (!(fn->fn_flags & RTN_TL_ROOT)) { fn->fn_flags &= ~RTN_RTINFO; net->ipv6.rt6_stats->fib_route_nodes--; } fn = fib6_repair_tree(net, table, fn); } fib6_purge_rt(rt, fn, net); if (!info->skip_notify_kernel) { if (notify_del) call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); else if (replace_rt) call_fib6_entry_notifiers_replace(net, replace_rt); } if (!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, info, 0); fib6_info_release(rt); } /* Need to own table->tb6_lock */ int fib6_del(struct fib6_info *rt, struct nl_info *info) { struct net *net = info->nl_net; struct fib6_info __rcu **rtp; struct fib6_info __rcu **rtp_next; struct fib6_table *table; struct fib6_node *fn; if (rt == net->ipv6.fib6_null_entry) return -ENOENT; table = rt->fib6_table; fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&table->tb6_lock)); if (!fn) return -ENOENT; WARN_ON(!(fn->fn_flags & RTN_RTINFO)); /* * Walk the leaf entries looking for ourself */ for (rtp = &fn->leaf; *rtp; rtp = rtp_next) { struct fib6_info *cur = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); if (rt == cur) { if (fib6_requires_src(cur)) fib6_routes_require_src_dec(info->nl_net); fib6_del_route(table, fn, rtp, info); return 0; } rtp_next = &cur->fib6_next; } return -ENOENT; } /* * Tree traversal function. * * Certainly, it is not interrupt safe. * However, it is internally reenterable wrt itself and fib6_add/fib6_del. * It means, that we can modify tree during walking * and use this function for garbage collection, clone pruning, * cleaning tree when a device goes down etc. etc. * * It guarantees that every node will be traversed, * and that it will be traversed only once. * * Callback function w->func may return: * 0 -> continue walking. * positive value -> walking is suspended (used by tree dumps, * and probably by gc, if it will be split to several slices) * negative value -> terminate walking. * * The function itself returns: * 0 -> walk is complete. * >0 -> walk is incomplete (i.e. suspended) * <0 -> walk is terminated by an error. * * This function is called with tb6_lock held. */ static int fib6_walk_continue(struct fib6_walker *w) { struct fib6_node *fn, *pn, *left, *right; /* w->root should always be table->tb6_root */ WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT)); for (;;) { fn = w->node; if (!fn) return 0; switch (w->state) { #ifdef CONFIG_IPV6_SUBTREES case FWS_S: if (FIB6_SUBTREE(fn)) { w->node = FIB6_SUBTREE(fn); continue; } w->state = FWS_L; fallthrough; #endif case FWS_L: left = rcu_dereference_protected(fn->left, 1); if (left) { w->node = left; w->state = FWS_INIT; continue; } w->state = FWS_R; fallthrough; case FWS_R: right = rcu_dereference_protected(fn->right, 1); if (right) { w->node = right; w->state = FWS_INIT; continue; } w->state = FWS_C; w->leaf = rcu_dereference_protected(fn->leaf, 1); fallthrough; case FWS_C: if (w->leaf && fn->fn_flags & RTN_RTINFO) { int err; if (w->skip) { w->skip--; goto skip; } err = w->func(w); if (err) return err; w->count++; continue; } skip: w->state = FWS_U; fallthrough; case FWS_U: if (fn == w->root) return 0; pn = rcu_dereference_protected(fn->parent, 1); left = rcu_dereference_protected(pn->left, 1); right = rcu_dereference_protected(pn->right, 1); w->node = pn; #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); w->state = FWS_L; continue; } #endif if (left == fn) { w->state = FWS_R; continue; } if (right == fn) { w->state = FWS_C; w->leaf = rcu_dereference_protected(w->node->leaf, 1); continue; } #if RT6_DEBUG >= 2 WARN_ON(1); #endif } } } static int fib6_walk(struct net *net, struct fib6_walker *w) { int res; w->state = FWS_INIT; w->node = w->root; fib6_walker_link(net, w); res = fib6_walk_continue(w); if (res <= 0) fib6_walker_unlink(net, w); return res; } static int fib6_clean_node(struct fib6_walker *w) { int res; struct fib6_info *rt; struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w); struct nl_info info = { .nl_net = c->net, .skip_notify = c->skip_notify, }; if (c->sernum != FIB6_NO_SERNUM_CHANGE && READ_ONCE(w->node->fn_sernum) != c->sernum) WRITE_ONCE(w->node->fn_sernum, c->sernum); if (!c->func) { WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE); w->leaf = NULL; return 0; } for_each_fib6_walker_rt(w) { res = c->func(rt, c->arg); if (res == -1) { w->leaf = rt; res = fib6_del(rt, &info); if (res) { #if RT6_DEBUG >= 2 pr_debug("%s: del failed: rt=%p@%p err=%d\n", __func__, rt, rcu_access_pointer(rt->fib6_node), res); #endif continue; } return 0; } else if (res == -2) { if (WARN_ON(!rt->fib6_nsiblings)) continue; rt = list_last_entry(&rt->fib6_siblings, struct fib6_info, fib6_siblings); continue; } WARN_ON(res != 0); } w->leaf = rt; return 0; } /* * Convenient frontend to tree walker. * * func is called on each route. * It may return -2 -> skip multipath route. * -1 -> delete this route. * 0 -> continue walking */ static void fib6_clean_tree(struct net *net, struct fib6_node *root, int (*func)(struct fib6_info *, void *arg), int sernum, void *arg, bool skip_notify) { struct fib6_cleaner c; c.w.root = root; c.w.func = fib6_clean_node; c.w.count = 0; c.w.skip = 0; c.w.skip_in_node = 0; c.func = func; c.sernum = sernum; c.arg = arg; c.net = net; c.skip_notify = skip_notify; fib6_walk(net, &c.w); } static void __fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), int sernum, void *arg, bool skip_notify) { struct fib6_table *table; struct hlist_head *head; unsigned int h; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { spin_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, func, sernum, arg, skip_notify); spin_unlock_bh(&table->tb6_lock); } } rcu_read_unlock(); } void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), void *arg) { __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false); } void fib6_clean_all_skip_notify(struct net *net, int (*func)(struct fib6_info *, void *), void *arg) { __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true); } static void fib6_flush_trees(struct net *net) { int new_sernum = fib6_new_sernum(net); __fib6_clean_all(net, NULL, new_sernum, NULL, false); } /* * Garbage collection */ static int fib6_age(struct fib6_info *rt, struct fib6_gc_args *gc_args) { unsigned long now = jiffies; /* * check addrconf expiration here. * Routes are expired even if they are in use. */ if (rt->fib6_flags & RTF_EXPIRES && rt->expires) { if (time_after(now, rt->expires)) { pr_debug("expiring %p\n", rt); return -1; } gc_args->more++; } /* Also age clones in the exception table. * Note, that clones are aged out * only if they are not in use now. */ rt6_age_exceptions(rt, gc_args, now); return 0; } static void fib6_gc_table(struct net *net, struct fib6_table *tb6, struct fib6_gc_args *gc_args) { struct fib6_info *rt; struct hlist_node *n; struct nl_info info = { .nl_net = net, .skip_notify = false, }; hlist_for_each_entry_safe(rt, n, &tb6->tb6_gc_hlist, gc_link) if (fib6_age(rt, gc_args) == -1) fib6_del(rt, &info); } static void fib6_gc_all(struct net *net, struct fib6_gc_args *gc_args) { struct fib6_table *table; struct hlist_head *head; unsigned int h; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { spin_lock_bh(&table->tb6_lock); fib6_gc_table(net, table, gc_args); spin_unlock_bh(&table->tb6_lock); } } rcu_read_unlock(); } void fib6_run_gc(unsigned long expires, struct net *net, bool force) { struct fib6_gc_args gc_args; unsigned long now; if (force) { spin_lock_bh(&net->ipv6.fib6_gc_lock); } else if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) { mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ); return; } gc_args.timeout = expires ? (int)expires : net->ipv6.sysctl.ip6_rt_gc_interval; gc_args.more = 0; fib6_gc_all(net, &gc_args); now = jiffies; net->ipv6.ip6_rt_last_gc = now; if (gc_args.more) mod_timer(&net->ipv6.ip6_fib_timer, round_jiffies(now + net->ipv6.sysctl.ip6_rt_gc_interval)); else del_timer(&net->ipv6.ip6_fib_timer); spin_unlock_bh(&net->ipv6.fib6_gc_lock); } static void fib6_gc_timer_cb(struct timer_list *t) { struct net *arg = from_timer(arg, t, ipv6.ip6_fib_timer); fib6_run_gc(0, arg, true); } static int __net_init fib6_net_init(struct net *net) { size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ; int err; err = fib6_notifier_init(net); if (err) return err; /* Default to 3-tuple */ net->ipv6.sysctl.multipath_hash_fields = FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK; spin_lock_init(&net->ipv6.fib6_gc_lock); rwlock_init(&net->ipv6.fib6_walker_lock); INIT_LIST_HEAD(&net->ipv6.fib6_walkers); timer_setup(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, 0); net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL); if (!net->ipv6.rt6_stats) goto out_notifier; /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL); if (!net->ipv6.fib_table_hash) goto out_rt6_stats; net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl), GFP_KERNEL); if (!net->ipv6.fib6_main_tbl) goto out_fib_table_hash; net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf, net->ipv6.fib6_null_entry); net->ipv6.fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); INIT_HLIST_HEAD(&net->ipv6.fib6_main_tbl->tb6_gc_hlist); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl), GFP_KERNEL); if (!net->ipv6.fib6_local_tbl) goto out_fib6_main_tbl; net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf, net->ipv6.fib6_null_entry); net->ipv6.fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); INIT_HLIST_HEAD(&net->ipv6.fib6_local_tbl->tb6_gc_hlist); #endif fib6_tables_init(net); return 0; #ifdef CONFIG_IPV6_MULTIPLE_TABLES out_fib6_main_tbl: kfree(net->ipv6.fib6_main_tbl); #endif out_fib_table_hash: kfree(net->ipv6.fib_table_hash); out_rt6_stats: kfree(net->ipv6.rt6_stats); out_notifier: fib6_notifier_exit(net); return -ENOMEM; } static void fib6_net_exit(struct net *net) { unsigned int i; del_timer_sync(&net->ipv6.ip6_fib_timer); for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { struct hlist_head *head = &net->ipv6.fib_table_hash[i]; struct hlist_node *tmp; struct fib6_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) { hlist_del(&tb->tb6_hlist); fib6_free_table(tb); } } kfree(net->ipv6.fib_table_hash); kfree(net->ipv6.rt6_stats); fib6_notifier_exit(net); } static struct pernet_operations fib6_net_ops = { .init = fib6_net_init, .exit = fib6_net_exit, }; static const struct rtnl_msg_handler fib6_rtnl_msg_handlers[] __initconst_or_module = { {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE, .dumpit = inet6_dump_fib, .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, }; int __init fib6_init(void) { int ret = -ENOMEM; fib6_node_kmem = KMEM_CACHE(fib6_node, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT); if (!fib6_node_kmem) goto out; ret = register_pernet_subsys(&fib6_net_ops); if (ret) goto out_kmem_cache_create; ret = rtnl_register_many(fib6_rtnl_msg_handlers); if (ret) goto out_unregister_subsys; __fib6_flush_trees = fib6_flush_trees; out: return ret; out_unregister_subsys: unregister_pernet_subsys(&fib6_net_ops); out_kmem_cache_create: kmem_cache_destroy(fib6_node_kmem); goto out; } void fib6_gc_cleanup(void) { unregister_pernet_subsys(&fib6_net_ops); kmem_cache_destroy(fib6_node_kmem); } #ifdef CONFIG_PROC_FS static int ipv6_route_native_seq_show(struct seq_file *seq, void *v) { struct fib6_info *rt = v; struct ipv6_route_iter *iter = seq->private; struct fib6_nh *fib6_nh = rt->fib6_nh; unsigned int flags = rt->fib6_flags; const struct net_device *dev; if (rt->nh) fib6_nh = nexthop_fib6_nh(rt->nh); seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); #ifdef CONFIG_IPV6_SUBTREES seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen); #else seq_puts(seq, "00000000000000000000000000000000 00 "); #endif if (fib6_nh->fib_nh_gw_family) { flags |= RTF_GATEWAY; seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6); } else { seq_puts(seq, "00000000000000000000000000000000"); } dev = fib6_nh->fib_nh_dev; seq_printf(seq, " %08x %08x %08x %08x %8s\n", rt->fib6_metric, refcount_read(&rt->fib6_ref), 0, flags, dev ? dev->name : ""); iter->w.leaf = NULL; return 0; } static int ipv6_route_yield(struct fib6_walker *w) { struct ipv6_route_iter *iter = w->args; if (!iter->skip) return 1; do { iter->w.leaf = rcu_dereference_protected( iter->w.leaf->fib6_next, lockdep_is_held(&iter->tbl->tb6_lock)); iter->skip--; if (!iter->skip && iter->w.leaf) return 1; } while (iter->w.leaf); return 0; } static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter, struct net *net) { memset(&iter->w, 0, sizeof(iter->w)); iter->w.func = ipv6_route_yield; iter->w.root = &iter->tbl->tb6_root; iter->w.state = FWS_INIT; iter->w.node = iter->w.root; iter->w.args = iter; iter->sernum = READ_ONCE(iter->w.root->fn_sernum); INIT_LIST_HEAD(&iter->w.lh); fib6_walker_link(net, &iter->w); } static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl, struct net *net) { unsigned int h; struct hlist_node *node; if (tbl) { h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1; node = rcu_dereference(hlist_next_rcu(&tbl->tb6_hlist)); } else { h = 0; node = NULL; } while (!node && h < FIB6_TABLE_HASHSZ) { node = rcu_dereference( hlist_first_rcu(&net->ipv6.fib_table_hash[h++])); } return hlist_entry_safe(node, struct fib6_table, tb6_hlist); } static void ipv6_route_check_sernum(struct ipv6_route_iter *iter) { int sernum = READ_ONCE(iter->w.root->fn_sernum); if (iter->sernum != sernum) { iter->sernum = sernum; iter->w.state = FWS_INIT; iter->w.node = iter->w.root; WARN_ON(iter->w.skip); iter->w.skip = iter->w.count; } } static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) { int r; struct fib6_info *n; struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; ++(*pos); if (!v) goto iter_table; n = rcu_dereference(((struct fib6_info *)v)->fib6_next); if (n) return n; iter_table: ipv6_route_check_sernum(iter); spin_lock_bh(&iter->tbl->tb6_lock); r = fib6_walk_continue(&iter->w); spin_unlock_bh(&iter->tbl->tb6_lock); if (r > 0) { return iter->w.leaf; } else if (r < 0) { fib6_walker_unlink(net, &iter->w); return NULL; } fib6_walker_unlink(net, &iter->w); iter->tbl = ipv6_route_seq_next_table(iter->tbl, net); if (!iter->tbl) return NULL; ipv6_route_seq_setup_walk(iter, net); goto iter_table; } static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; rcu_read_lock(); iter->tbl = ipv6_route_seq_next_table(NULL, net); iter->skip = *pos; if (iter->tbl) { loff_t p = 0; ipv6_route_seq_setup_walk(iter, net); return ipv6_route_seq_next(seq, NULL, &p); } else { return NULL; } } static bool ipv6_route_iter_active(struct ipv6_route_iter *iter) { struct fib6_walker *w = &iter->w; return w->node && !(w->state == FWS_U && w->node == w->root); } static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; if (ipv6_route_iter_active(iter)) fib6_walker_unlink(net, &iter->w); rcu_read_unlock(); } #if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) static int ipv6_route_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, void *v) { struct bpf_iter__ipv6_route ctx; ctx.meta = meta; ctx.rt = v; return bpf_iter_run_prog(prog, &ctx); } static int ipv6_route_seq_show(struct seq_file *seq, void *v) { struct ipv6_route_iter *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; int ret; meta.seq = seq; prog = bpf_iter_get_info(&meta, false); if (!prog) return ipv6_route_native_seq_show(seq, v); ret = ipv6_route_prog_seq_show(prog, &meta, v); iter->w.leaf = NULL; return ret; } static void ipv6_route_seq_stop(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; if (!v) { meta.seq = seq; prog = bpf_iter_get_info(&meta, true); if (prog) (void)ipv6_route_prog_seq_show(prog, &meta, v); } ipv6_route_native_seq_stop(seq, v); } #else static int ipv6_route_seq_show(struct seq_file *seq, void *v) { return ipv6_route_native_seq_show(seq, v); } static void ipv6_route_seq_stop(struct seq_file *seq, void *v) { ipv6_route_native_seq_stop(seq, v); } #endif const struct seq_operations ipv6_route_seq_ops = { .start = ipv6_route_seq_start, .next = ipv6_route_seq_next, .stop = ipv6_route_seq_stop, .show = ipv6_route_seq_show }; #endif /* CONFIG_PROC_FS */
22 14 20 43 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 /* * linux/fs/nls/nls_cp869.c * * Charset cp869 translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0386, 0x0000, 0x00b7, 0x00ac, 0x00a6, 0x2018, 0x2019, 0x0388, 0x2015, 0x0389, /* 0x90*/ 0x038a, 0x03aa, 0x038c, 0x0000, 0x0000, 0x038e, 0x03ab, 0x00a9, 0x038f, 0x00b2, 0x00b3, 0x03ac, 0x00a3, 0x03ad, 0x03ae, 0x03af, /* 0xa0*/ 0x03ca, 0x0390, 0x03cc, 0x03cd, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, 0x00bd, 0x0398, 0x0399, 0x00ab, 0x00bb, /* 0xb0*/ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x039a, 0x039b, 0x039c, 0x039d, 0x2563, 0x2551, 0x2557, 0x255d, 0x039e, 0x039f, 0x2510, /* 0xc0*/ 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x03a0, 0x03a1, 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x03a3, /* 0xd0*/ 0x03a4, 0x03a5, 0x03a6, 0x03a7, 0x03a8, 0x03a9, 0x03b1, 0x03b2, 0x03b3, 0x2518, 0x250c, 0x2588, 0x2584, 0x03b4, 0x03b5, 0x2580, /* 0xe0*/ 0x03b6, 0x03b7, 0x03b8, 0x03b9, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03be, 0x03bf, 0x03c0, 0x03c1, 0x03c3, 0x03c2, 0x03c4, 0x0384, /* 0xf0*/ 0x00ad, 0x00b1, 0x03c5, 0x03c6, 0x03c7, 0x00a7, 0x03c8, 0x0385, 0x00b0, 0x00a8, 0x03c9, 0x03cb, 0x03b0, 0x03ce, 0x25a0, 0x00a0, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xff, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x8a, 0xf5, /* 0xa0-0xa7 */ 0xf9, 0x97, 0x00, 0xae, 0x89, 0xf0, 0x00, 0x00, /* 0xa8-0xaf */ 0xf8, 0xf1, 0x99, 0x9a, 0x00, 0x00, 0x00, 0x88, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0xaf, 0x00, 0xab, 0x00, 0x00, /* 0xb8-0xbf */ }; static const unsigned char page03[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0xef, 0xf7, 0x86, 0x00, /* 0x80-0x87 */ 0x8d, 0x8f, 0x90, 0x00, 0x92, 0x00, 0x95, 0x98, /* 0x88-0x8f */ 0xa1, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, /* 0x90-0x97 */ 0xac, 0xad, 0xb5, 0xb6, 0xb7, 0xb8, 0xbd, 0xbe, /* 0x98-0x9f */ 0xc6, 0xc7, 0x00, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, /* 0xa0-0xa7 */ 0xd4, 0xd5, 0x91, 0x96, 0x9b, 0x9d, 0x9e, 0x9f, /* 0xa8-0xaf */ 0xfc, 0xd6, 0xd7, 0xd8, 0xdd, 0xde, 0xe0, 0xe1, /* 0xb0-0xb7 */ 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, /* 0xb8-0xbf */ 0xea, 0xeb, 0xed, 0xec, 0xee, 0xf2, 0xf3, 0xf4, /* 0xc0-0xc7 */ 0xf6, 0xfa, 0xa0, 0xfb, 0xa2, 0xa3, 0xfd, 0x00, /* 0xc8-0xcf */ }; static const unsigned char page20[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, /* 0x10-0x17 */ 0x8b, 0x8c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ }; static const unsigned char page25[256] = { 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0xcd, 0xba, 0x00, 0x00, 0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */ 0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */ 0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */ 0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0xdb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ }; static const unsigned char *const page_uni2charset[256] = { page00, NULL, NULL, page03, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page20, NULL, NULL, NULL, NULL, page25, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9b, 0x00, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x9d, 0x8e, 0x9e, /* 0x88-0x8f */ 0x9f, 0xa0, 0xa2, 0x00, 0x00, 0xa3, 0xfb, 0x97, /* 0x90-0x97 */ 0xfd, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xd6, 0xd7, 0xd8, 0xdd, /* 0xa0-0xa7 */ 0xde, 0xe0, 0xe1, 0xab, 0xe2, 0xe3, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xe4, 0xe5, 0xe6, /* 0xb0-0xb7 */ 0xe7, 0xb9, 0xba, 0xbb, 0xbc, 0xe8, 0xe9, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xea, 0xeb, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xec, /* 0xc8-0xcf */ 0xee, 0xf2, 0xf3, 0xf4, 0xf6, 0xfa, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x86, 0x00, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x00, 0x00, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x86, 0x9c, 0x8d, 0x8f, 0x90, /* 0x98-0x9f */ 0x91, 0xa1, 0x92, 0x95, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xa4, 0xa5, /* 0xd0-0xd7 */ 0xa6, 0xd9, 0xda, 0xdb, 0xdc, 0xa7, 0xa8, 0xdf, /* 0xd8-0xdf */ 0xa9, 0xaa, 0xac, 0xad, 0xb5, 0xb6, 0xb7, 0xb8, /* 0xe0-0xe7 */ 0xbd, 0xbe, 0xc6, 0xc7, 0xcf, 0xcf, 0xd0, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xd1, 0xd2, 0xd3, 0xf5, 0xd4, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xd5, 0x96, 0xfc, 0x98, 0xfe, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "cp869", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_cp869(void) { return register_nls(&table); } static void __exit exit_nls_cp869(void) { unregister_nls(&table); } module_init(init_nls_cp869) module_exit(exit_nls_cp869) MODULE_DESCRIPTION("NLS Codepage 869 (Greek)"); MODULE_LICENSE("Dual BSD/GPL");
3 56 333 49 333 333 42 5 52 99 96 56 56 63 45 333 333 333 333 333 2 272 333 330 124 270 124 330 329 272 272 436 21 436 435 436 21 21 31 333 333 331 333 255 271 255 272 272 272 272 2 255 255 4 272 272 272 253 272 315 239 238 332 330 117 63 330 315 272 21 21 239 333 333 272 333 333 333 333 333 51 51 49 51 559 291 466 467 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) Qu Wenruo 2017. All rights reserved. */ /* * The module is used to catch unexpected/corrupted tree block data. * Such behavior can be caused either by a fuzzed image or bugs. * * The objective is to do leaf/node validation checks when tree block is read * from disk, and check *every* possible member, so other code won't * need to checking them again. * * Due to the potential and unwanted damage, every checker needs to be * carefully reviewed otherwise so it does not prevent mount of valid images. */ #include <linux/types.h> #include <linux/stddef.h> #include <linux/error-injection.h> #include "messages.h" #include "ctree.h" #include "tree-checker.h" #include "compression.h" #include "volumes.h" #include "misc.h" #include "fs.h" #include "accessors.h" #include "file-item.h" #include "inode-item.h" #include "dir-item.h" #include "extent-tree.h" /* * Error message should follow the following format: * corrupt <type>: <identifier>, <reason>[, <bad_value>] * * @type: leaf or node * @identifier: the necessary info to locate the leaf/node. * It's recommended to decode key.objecitd/offset if it's * meaningful. * @reason: describe the error * @bad_value: optional, it's recommended to output bad value and its * expected value (range). * * Since comma is used to separate the components, only space is allowed * inside each component. */ /* * Append generic "corrupt leaf/node root=%llu block=%llu slot=%d: " to @fmt. * Allows callers to customize the output. */ __printf(3, 4) __cold static void generic_err(const struct extent_buffer *eb, int slot, const char *fmt, ...) { const struct btrfs_fs_info *fs_info = eb->fs_info; struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, &vaf); va_end(args); } /* * Customized reporter for extent data item, since its key objectid and * offset has its own meaning. */ __printf(3, 4) __cold static void file_extent_err(const struct extent_buffer *eb, int slot, const char *fmt, ...) { const struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_key key; struct va_format vaf; va_list args; btrfs_item_key_to_cpu(eb, &key, slot); va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d ino=%llu file_offset=%llu, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, key.objectid, key.offset, &vaf); va_end(args); } /* * Return 0 if the btrfs_file_extent_##name is aligned to @alignment * Else return 1 */ #define CHECK_FE_ALIGNED(leaf, slot, fi, name, alignment) \ ({ \ if (unlikely(!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), \ (alignment)))) \ file_extent_err((leaf), (slot), \ "invalid %s for file extent, have %llu, should be aligned to %u", \ (#name), btrfs_file_extent_##name((leaf), (fi)), \ (alignment)); \ (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))); \ }) static u64 file_extent_end(struct extent_buffer *leaf, struct btrfs_key *key, struct btrfs_file_extent_item *extent) { u64 end; u64 len; if (btrfs_file_extent_type(leaf, extent) == BTRFS_FILE_EXTENT_INLINE) { len = btrfs_file_extent_ram_bytes(leaf, extent); end = ALIGN(key->offset + len, leaf->fs_info->sectorsize); } else { len = btrfs_file_extent_num_bytes(leaf, extent); end = key->offset + len; } return end; } /* * Customized report for dir_item, the only new important information is * key->objectid, which represents inode number */ __printf(3, 4) __cold static void dir_item_err(const struct extent_buffer *eb, int slot, const char *fmt, ...) { const struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_key key; struct va_format vaf; va_list args; btrfs_item_key_to_cpu(eb, &key, slot); va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, key.objectid, &vaf); va_end(args); } /* * This functions checks prev_key->objectid, to ensure current key and prev_key * share the same objectid as inode number. * * This is to detect missing INODE_ITEM in subvolume trees. * * Return true if everything is OK or we don't need to check. * Return false if anything is wrong. */ static bool check_prev_ino(struct extent_buffer *leaf, struct btrfs_key *key, int slot, struct btrfs_key *prev_key) { /* No prev key, skip check */ if (slot == 0) return true; /* Only these key->types needs to be checked */ ASSERT(key->type == BTRFS_XATTR_ITEM_KEY || key->type == BTRFS_INODE_REF_KEY || key->type == BTRFS_DIR_INDEX_KEY || key->type == BTRFS_DIR_ITEM_KEY || key->type == BTRFS_EXTENT_DATA_KEY); /* * Only subvolume trees along with their reloc trees need this check. * Things like log tree doesn't follow this ino requirement. */ if (!is_fstree(btrfs_header_owner(leaf))) return true; if (key->objectid == prev_key->objectid) return true; /* Error found */ dir_item_err(leaf, slot, "invalid previous key objectid, have %llu expect %llu", prev_key->objectid, key->objectid); return false; } static int check_extent_data_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot, struct btrfs_key *prev_key) { struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_file_extent_item *fi; u32 sectorsize = fs_info->sectorsize; u32 item_size = btrfs_item_size(leaf, slot); u64 extent_end; if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) { file_extent_err(leaf, slot, "unaligned file_offset for file extent, have %llu should be aligned to %u", key->offset, sectorsize); return -EUCLEAN; } /* * Previous key must have the same key->objectid (ino). * It can be XATTR_ITEM, INODE_ITEM or just another EXTENT_DATA. * But if objectids mismatch, it means we have a missing * INODE_ITEM. */ if (unlikely(!check_prev_ino(leaf, key, slot, prev_key))) return -EUCLEAN; fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); /* * Make sure the item contains at least inline header, so the file * extent type is not some garbage. */ if (unlikely(item_size < BTRFS_FILE_EXTENT_INLINE_DATA_START)) { file_extent_err(leaf, slot, "invalid item size, have %u expect [%zu, %u)", item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START, SZ_4K); return -EUCLEAN; } if (unlikely(btrfs_file_extent_type(leaf, fi) >= BTRFS_NR_FILE_EXTENT_TYPES)) { file_extent_err(leaf, slot, "invalid type for file extent, have %u expect range [0, %u]", btrfs_file_extent_type(leaf, fi), BTRFS_NR_FILE_EXTENT_TYPES - 1); return -EUCLEAN; } /* * Support for new compression/encryption must introduce incompat flag, * and must be caught in open_ctree(). */ if (unlikely(btrfs_file_extent_compression(leaf, fi) >= BTRFS_NR_COMPRESS_TYPES)) { file_extent_err(leaf, slot, "invalid compression for file extent, have %u expect range [0, %u]", btrfs_file_extent_compression(leaf, fi), BTRFS_NR_COMPRESS_TYPES - 1); return -EUCLEAN; } if (unlikely(btrfs_file_extent_encryption(leaf, fi))) { file_extent_err(leaf, slot, "invalid encryption for file extent, have %u expect 0", btrfs_file_extent_encryption(leaf, fi)); return -EUCLEAN; } if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { /* Inline extent must have 0 as key offset */ if (unlikely(key->offset)) { file_extent_err(leaf, slot, "invalid file_offset for inline file extent, have %llu expect 0", key->offset); return -EUCLEAN; } /* Compressed inline extent has no on-disk size, skip it */ if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) return 0; /* Uncompressed inline extent size must match item size */ if (unlikely(item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START + btrfs_file_extent_ram_bytes(leaf, fi))) { file_extent_err(leaf, slot, "invalid ram_bytes for uncompressed inline extent, have %u expect %llu", item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START + btrfs_file_extent_ram_bytes(leaf, fi)); return -EUCLEAN; } return 0; } /* Regular or preallocated extent has fixed item size */ if (unlikely(item_size != sizeof(*fi))) { file_extent_err(leaf, slot, "invalid item size for reg/prealloc file extent, have %u expect %zu", item_size, sizeof(*fi)); return -EUCLEAN; } if (unlikely(CHECK_FE_ALIGNED(leaf, slot, fi, ram_bytes, sectorsize) || CHECK_FE_ALIGNED(leaf, slot, fi, disk_bytenr, sectorsize) || CHECK_FE_ALIGNED(leaf, slot, fi, disk_num_bytes, sectorsize) || CHECK_FE_ALIGNED(leaf, slot, fi, offset, sectorsize) || CHECK_FE_ALIGNED(leaf, slot, fi, num_bytes, sectorsize))) return -EUCLEAN; /* Catch extent end overflow */ if (unlikely(check_add_overflow(btrfs_file_extent_num_bytes(leaf, fi), key->offset, &extent_end))) { file_extent_err(leaf, slot, "extent end overflow, have file offset %llu extent num bytes %llu", key->offset, btrfs_file_extent_num_bytes(leaf, fi)); return -EUCLEAN; } /* * Check that no two consecutive file extent items, in the same leaf, * present ranges that overlap each other. */ if (slot > 0 && prev_key->objectid == key->objectid && prev_key->type == BTRFS_EXTENT_DATA_KEY) { struct btrfs_file_extent_item *prev_fi; u64 prev_end; prev_fi = btrfs_item_ptr(leaf, slot - 1, struct btrfs_file_extent_item); prev_end = file_extent_end(leaf, prev_key, prev_fi); if (unlikely(prev_end > key->offset)) { file_extent_err(leaf, slot - 1, "file extent end range (%llu) goes beyond start offset (%llu) of the next file extent", prev_end, key->offset); return -EUCLEAN; } } /* * For non-compressed data extents, ram_bytes should match its * disk_num_bytes. * However we do not really utilize ram_bytes in this case, so this check * is only optional for DEBUG builds for developers to catch the * unexpected behaviors. */ if (IS_ENABLED(CONFIG_BTRFS_DEBUG) && btrfs_file_extent_compression(leaf, fi) == BTRFS_COMPRESS_NONE && btrfs_file_extent_disk_bytenr(leaf, fi)) { if (WARN_ON(btrfs_file_extent_ram_bytes(leaf, fi) != btrfs_file_extent_disk_num_bytes(leaf, fi))) file_extent_err(leaf, slot, "mismatch ram_bytes (%llu) and disk_num_bytes (%llu) for non-compressed extent", btrfs_file_extent_ram_bytes(leaf, fi), btrfs_file_extent_disk_num_bytes(leaf, fi)); } return 0; } static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot, struct btrfs_key *prev_key) { struct btrfs_fs_info *fs_info = leaf->fs_info; u32 sectorsize = fs_info->sectorsize; const u32 csumsize = fs_info->csum_size; if (unlikely(key->objectid != BTRFS_EXTENT_CSUM_OBJECTID)) { generic_err(leaf, slot, "invalid key objectid for csum item, have %llu expect %llu", key->objectid, BTRFS_EXTENT_CSUM_OBJECTID); return -EUCLEAN; } if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) { generic_err(leaf, slot, "unaligned key offset for csum item, have %llu should be aligned to %u", key->offset, sectorsize); return -EUCLEAN; } if (unlikely(!IS_ALIGNED(btrfs_item_size(leaf, slot), csumsize))) { generic_err(leaf, slot, "unaligned item size for csum item, have %u should be aligned to %u", btrfs_item_size(leaf, slot), csumsize); return -EUCLEAN; } if (slot > 0 && prev_key->type == BTRFS_EXTENT_CSUM_KEY) { u64 prev_csum_end; u32 prev_item_size; prev_item_size = btrfs_item_size(leaf, slot - 1); prev_csum_end = (prev_item_size / csumsize) * sectorsize; prev_csum_end += prev_key->offset; if (unlikely(prev_csum_end > key->offset)) { generic_err(leaf, slot - 1, "csum end range (%llu) goes beyond the start range (%llu) of the next csum item", prev_csum_end, key->offset); return -EUCLEAN; } } return 0; } /* Inode item error output has the same format as dir_item_err() */ #define inode_item_err(eb, slot, fmt, ...) \ dir_item_err(eb, slot, fmt, __VA_ARGS__) static int check_inode_key(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { struct btrfs_key item_key; bool is_inode_item; btrfs_item_key_to_cpu(leaf, &item_key, slot); is_inode_item = (item_key.type == BTRFS_INODE_ITEM_KEY); /* For XATTR_ITEM, location key should be all 0 */ if (item_key.type == BTRFS_XATTR_ITEM_KEY) { if (unlikely(key->objectid != 0 || key->type != 0 || key->offset != 0)) return -EUCLEAN; return 0; } if (unlikely((key->objectid < BTRFS_FIRST_FREE_OBJECTID || key->objectid > BTRFS_LAST_FREE_OBJECTID) && key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID && key->objectid != BTRFS_FREE_INO_OBJECTID)) { if (is_inode_item) { generic_err(leaf, slot, "invalid key objectid: has %llu expect %llu or [%llu, %llu] or %llu", key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID, BTRFS_FIRST_FREE_OBJECTID, BTRFS_LAST_FREE_OBJECTID, BTRFS_FREE_INO_OBJECTID); } else { dir_item_err(leaf, slot, "invalid location key objectid: has %llu expect %llu or [%llu, %llu] or %llu", key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID, BTRFS_FIRST_FREE_OBJECTID, BTRFS_LAST_FREE_OBJECTID, BTRFS_FREE_INO_OBJECTID); } return -EUCLEAN; } if (unlikely(key->offset != 0)) { if (is_inode_item) inode_item_err(leaf, slot, "invalid key offset: has %llu expect 0", key->offset); else dir_item_err(leaf, slot, "invalid location key offset:has %llu expect 0", key->offset); return -EUCLEAN; } return 0; } static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { struct btrfs_key item_key; bool is_root_item; btrfs_item_key_to_cpu(leaf, &item_key, slot); is_root_item = (item_key.type == BTRFS_ROOT_ITEM_KEY); /* * Bad rootid for reloc trees. * * Reloc trees are only for subvolume trees, other trees only need * to be COWed to be relocated. */ if (unlikely(is_root_item && key->objectid == BTRFS_TREE_RELOC_OBJECTID && !is_fstree(key->offset))) { generic_err(leaf, slot, "invalid reloc tree for root %lld, root id is not a subvolume tree", key->offset); return -EUCLEAN; } /* No such tree id */ if (unlikely(key->objectid == 0)) { if (is_root_item) generic_err(leaf, slot, "invalid root id 0"); else dir_item_err(leaf, slot, "invalid location key root id 0"); return -EUCLEAN; } /* DIR_ITEM/INDEX/INODE_REF is not allowed to point to non-fs trees */ if (unlikely(!is_fstree(key->objectid) && !is_root_item)) { dir_item_err(leaf, slot, "invalid location key objectid, have %llu expect [%llu, %llu]", key->objectid, BTRFS_FIRST_FREE_OBJECTID, BTRFS_LAST_FREE_OBJECTID); return -EUCLEAN; } /* * ROOT_ITEM with non-zero offset means this is a snapshot, created at * @offset transid. * Furthermore, for location key in DIR_ITEM, its offset is always -1. * * So here we only check offset for reloc tree whose key->offset must * be a valid tree. */ if (unlikely(key->objectid == BTRFS_TREE_RELOC_OBJECTID && key->offset == 0)) { generic_err(leaf, slot, "invalid root id 0 for reloc tree"); return -EUCLEAN; } return 0; } static int check_dir_item(struct extent_buffer *leaf, struct btrfs_key *key, struct btrfs_key *prev_key, int slot) { struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_dir_item *di; u32 item_size = btrfs_item_size(leaf, slot); u32 cur = 0; if (unlikely(!check_prev_ino(leaf, key, slot, prev_key))) return -EUCLEAN; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); while (cur < item_size) { struct btrfs_key location_key; u32 name_len; u32 data_len; u32 max_name_len; u32 total_size; u32 name_hash; u8 dir_type; int ret; /* header itself should not cross item boundary */ if (unlikely(cur + sizeof(*di) > item_size)) { dir_item_err(leaf, slot, "dir item header crosses item boundary, have %zu boundary %u", cur + sizeof(*di), item_size); return -EUCLEAN; } /* Location key check */ btrfs_dir_item_key_to_cpu(leaf, di, &location_key); if (location_key.type == BTRFS_ROOT_ITEM_KEY) { ret = check_root_key(leaf, &location_key, slot); if (unlikely(ret < 0)) return ret; } else if (location_key.type == BTRFS_INODE_ITEM_KEY || location_key.type == 0) { ret = check_inode_key(leaf, &location_key, slot); if (unlikely(ret < 0)) return ret; } else { dir_item_err(leaf, slot, "invalid location key type, have %u, expect %u or %u", location_key.type, BTRFS_ROOT_ITEM_KEY, BTRFS_INODE_ITEM_KEY); return -EUCLEAN; } /* dir type check */ dir_type = btrfs_dir_ftype(leaf, di); if (unlikely(dir_type <= BTRFS_FT_UNKNOWN || dir_type >= BTRFS_FT_MAX)) { dir_item_err(leaf, slot, "invalid dir item type, have %u expect (0, %u)", dir_type, BTRFS_FT_MAX); return -EUCLEAN; } if (unlikely(key->type == BTRFS_XATTR_ITEM_KEY && dir_type != BTRFS_FT_XATTR)) { dir_item_err(leaf, slot, "invalid dir item type for XATTR key, have %u expect %u", dir_type, BTRFS_FT_XATTR); return -EUCLEAN; } if (unlikely(dir_type == BTRFS_FT_XATTR && key->type != BTRFS_XATTR_ITEM_KEY)) { dir_item_err(leaf, slot, "xattr dir type found for non-XATTR key"); return -EUCLEAN; } if (dir_type == BTRFS_FT_XATTR) max_name_len = XATTR_NAME_MAX; else max_name_len = BTRFS_NAME_LEN; /* Name/data length check */ name_len = btrfs_dir_name_len(leaf, di); data_len = btrfs_dir_data_len(leaf, di); if (unlikely(name_len > max_name_len)) { dir_item_err(leaf, slot, "dir item name len too long, have %u max %u", name_len, max_name_len); return -EUCLEAN; } if (unlikely(name_len + data_len > BTRFS_MAX_XATTR_SIZE(fs_info))) { dir_item_err(leaf, slot, "dir item name and data len too long, have %u max %u", name_len + data_len, BTRFS_MAX_XATTR_SIZE(fs_info)); return -EUCLEAN; } if (unlikely(data_len && dir_type != BTRFS_FT_XATTR)) { dir_item_err(leaf, slot, "dir item with invalid data len, have %u expect 0", data_len); return -EUCLEAN; } total_size = sizeof(*di) + name_len + data_len; /* header and name/data should not cross item boundary */ if (unlikely(cur + total_size > item_size)) { dir_item_err(leaf, slot, "dir item data crosses item boundary, have %u boundary %u", cur + total_size, item_size); return -EUCLEAN; } /* * Special check for XATTR/DIR_ITEM, as key->offset is name * hash, should match its name */ if (key->type == BTRFS_DIR_ITEM_KEY || key->type == BTRFS_XATTR_ITEM_KEY) { char namebuf[MAX(BTRFS_NAME_LEN, XATTR_NAME_MAX)]; read_extent_buffer(leaf, namebuf, (unsigned long)(di + 1), name_len); name_hash = btrfs_name_hash(namebuf, name_len); if (unlikely(key->offset != name_hash)) { dir_item_err(leaf, slot, "name hash mismatch with key, have 0x%016x expect 0x%016llx", name_hash, key->offset); return -EUCLEAN; } } cur += total_size; di = (struct btrfs_dir_item *)((void *)di + total_size); } return 0; } __printf(3, 4) __cold static void block_group_err(const struct extent_buffer *eb, int slot, const char *fmt, ...) { const struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_key key; struct va_format vaf; va_list args; btrfs_item_key_to_cpu(eb, &key, slot); va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d bg_start=%llu bg_len=%llu, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, key.objectid, key.offset, &vaf); va_end(args); } static int check_block_group_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_block_group_item bgi; u32 item_size = btrfs_item_size(leaf, slot); u64 chunk_objectid; u64 flags; u64 type; /* * Here we don't really care about alignment since extent allocator can * handle it. We care more about the size. */ if (unlikely(key->offset == 0)) { block_group_err(leaf, slot, "invalid block group size 0"); return -EUCLEAN; } if (unlikely(item_size != sizeof(bgi))) { block_group_err(leaf, slot, "invalid item size, have %u expect %zu", item_size, sizeof(bgi)); return -EUCLEAN; } read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), sizeof(bgi)); chunk_objectid = btrfs_stack_block_group_chunk_objectid(&bgi); if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { /* * We don't init the nr_global_roots until we load the global * roots, so this could be 0 at mount time. If it's 0 we'll * just assume we're fine, and later we'll check against our * actual value. */ if (unlikely(fs_info->nr_global_roots && chunk_objectid >= fs_info->nr_global_roots)) { block_group_err(leaf, slot, "invalid block group global root id, have %llu, needs to be <= %llu", chunk_objectid, fs_info->nr_global_roots); return -EUCLEAN; } } else if (unlikely(chunk_objectid != BTRFS_FIRST_CHUNK_TREE_OBJECTID)) { block_group_err(leaf, slot, "invalid block group chunk objectid, have %llu expect %llu", btrfs_stack_block_group_chunk_objectid(&bgi), BTRFS_FIRST_CHUNK_TREE_OBJECTID); return -EUCLEAN; } if (unlikely(btrfs_stack_block_group_used(&bgi) > key->offset)) { block_group_err(leaf, slot, "invalid block group used, have %llu expect [0, %llu)", btrfs_stack_block_group_used(&bgi), key->offset); return -EUCLEAN; } flags = btrfs_stack_block_group_flags(&bgi); if (unlikely(hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1)) { block_group_err(leaf, slot, "invalid profile flags, have 0x%llx (%lu bits set) expect no more than 1 bit set", flags & BTRFS_BLOCK_GROUP_PROFILE_MASK, hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)); return -EUCLEAN; } type = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; if (unlikely(type != BTRFS_BLOCK_GROUP_DATA && type != BTRFS_BLOCK_GROUP_METADATA && type != BTRFS_BLOCK_GROUP_SYSTEM && type != (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) { block_group_err(leaf, slot, "invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx", type, hweight64(type), BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA, BTRFS_BLOCK_GROUP_SYSTEM, BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA); return -EUCLEAN; } return 0; } __printf(5, 6) __cold static void chunk_err(const struct btrfs_fs_info *fs_info, const struct extent_buffer *leaf, const struct btrfs_chunk *chunk, u64 logical, const char *fmt, ...) { bool is_sb = !leaf; struct va_format vaf; va_list args; int i; int slot = -1; if (!is_sb) { /* * Get the slot number by iterating through all slots, this * would provide better readability. */ for (i = 0; i < btrfs_header_nritems(leaf); i++) { if (btrfs_item_ptr_offset(leaf, i) == (unsigned long)chunk) { slot = i; break; } } } va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (is_sb) btrfs_crit(fs_info, "corrupt superblock syschunk array: chunk_start=%llu, %pV", logical, &vaf); else btrfs_crit(fs_info, "corrupt leaf: root=%llu block=%llu slot=%d chunk_start=%llu, %pV", BTRFS_CHUNK_TREE_OBJECTID, leaf->start, slot, logical, &vaf); va_end(args); } /* * The common chunk check which could also work on super block sys chunk array. * * If @leaf is NULL, then @chunk must be an on-stack chunk item. * (For superblock sys_chunk array, and fs_info->sectorsize is unreliable) * * Return -EUCLEAN if anything is corrupted. * Return 0 if everything is OK. */ int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info, const struct extent_buffer *leaf, const struct btrfs_chunk *chunk, u64 logical, u32 sectorsize) { u64 length; u64 chunk_end; u64 stripe_len; u16 num_stripes; u16 sub_stripes; u64 type; u64 features; u32 chunk_sector_size; bool mixed = false; int raid_index; int nparity; int ncopies; if (leaf) { length = btrfs_chunk_length(leaf, chunk); stripe_len = btrfs_chunk_stripe_len(leaf, chunk); num_stripes = btrfs_chunk_num_stripes(leaf, chunk); sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); type = btrfs_chunk_type(leaf, chunk); chunk_sector_size = btrfs_chunk_sector_size(leaf, chunk); } else { length = btrfs_stack_chunk_length(chunk); stripe_len = btrfs_stack_chunk_stripe_len(chunk); num_stripes = btrfs_stack_chunk_num_stripes(chunk); sub_stripes = btrfs_stack_chunk_sub_stripes(chunk); type = btrfs_stack_chunk_type(chunk); chunk_sector_size = btrfs_stack_chunk_sector_size(chunk); } raid_index = btrfs_bg_flags_to_raid_index(type); ncopies = btrfs_raid_array[raid_index].ncopies; nparity = btrfs_raid_array[raid_index].nparity; if (unlikely(!num_stripes)) { chunk_err(fs_info, leaf, chunk, logical, "invalid chunk num_stripes, have %u", num_stripes); return -EUCLEAN; } if (unlikely(num_stripes < ncopies)) { chunk_err(fs_info, leaf, chunk, logical, "invalid chunk num_stripes < ncopies, have %u < %d", num_stripes, ncopies); return -EUCLEAN; } if (unlikely(nparity && num_stripes == nparity)) { chunk_err(fs_info, leaf, chunk, logical, "invalid chunk num_stripes == nparity, have %u == %d", num_stripes, nparity); return -EUCLEAN; } if (unlikely(!IS_ALIGNED(logical, sectorsize))) { chunk_err(fs_info, leaf, chunk, logical, "invalid chunk logical, have %llu should aligned to %u", logical, sectorsize); return -EUCLEAN; } if (unlikely(chunk_sector_size != sectorsize)) { chunk_err(fs_info, leaf, chunk, logical, "invalid chunk sectorsize, have %u expect %u", chunk_sector_size, sectorsize); return -EUCLEAN; } if (unlikely(!length || !IS_ALIGNED(length, sectorsize))) { chunk_err(fs_info, leaf, chunk, logical, "invalid chunk length, have %llu", length); return -EUCLEAN; } if (unlikely(check_add_overflow(logical, length, &chunk_end))) { chunk_err(fs_info, leaf, chunk, logical, "invalid chunk logical start and length, have logical start %llu length %llu", logical, length); return -EUCLEAN; } if (unlikely(!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN)) { chunk_err(fs_info, leaf, chunk, logical, "invalid chunk stripe length: %llu", stripe_len); return -EUCLEAN; } /* * We artificially limit the chunk size, so that the number of stripes * inside a chunk can be fit into a U32. The current limit (256G) is * way too large for real world usage anyway, and it's also much larger * than our existing limit (10G). * * Thus it should be a good way to catch obvious bitflips. */ if (unlikely(length >= btrfs_stripe_nr_to_offset(U32_MAX))) { chunk_err(fs_info, leaf, chunk, logical, "chunk length too large: have %llu limit %llu", length, btrfs_stripe_nr_to_offset(U32_MAX)); return -EUCLEAN; } if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK))) { chunk_err(fs_info, leaf, chunk, logical, "unrecognized chunk type: 0x%llx", ~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & type); return -EUCLEAN; } if (unlikely(!has_single_bit_set(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) && (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0)) { chunk_err(fs_info, leaf, chunk, logical, "invalid chunk profile flag: 0x%llx, expect 0 or 1 bit set", type & BTRFS_BLOCK_GROUP_PROFILE_MASK); return -EUCLEAN; } if (unlikely((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0)) { chunk_err(fs_info, leaf, chunk, logical, "missing chunk type flag, have 0x%llx one bit must be set in 0x%llx", type, BTRFS_BLOCK_GROUP_TYPE_MASK); return -EUCLEAN; } if (unlikely((type & BTRFS_BLOCK_GROUP_SYSTEM) && (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA)))) { chunk_err(fs_info, leaf, chunk, logical, "system chunk with data or metadata type: 0x%llx", type); return -EUCLEAN; } features = btrfs_super_incompat_flags(fs_info->super_copy); if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) mixed = true; if (!mixed) { if (unlikely((type & BTRFS_BLOCK_GROUP_METADATA) && (type & BTRFS_BLOCK_GROUP_DATA))) { chunk_err(fs_info, leaf, chunk, logical, "mixed chunk type in non-mixed mode: 0x%llx", type); return -EUCLEAN; } } if (unlikely((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != btrfs_raid_array[BTRFS_RAID_RAID10].sub_stripes) || (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1].devs_min) || (type & BTRFS_BLOCK_GROUP_RAID1C3 && num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C3].devs_min) || (type & BTRFS_BLOCK_GROUP_RAID1C4 && num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C4].devs_min) || (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < btrfs_raid_array[BTRFS_RAID_RAID5].devs_min) || (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < btrfs_raid_array[BTRFS_RAID_RAID6].devs_min) || (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes) || ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && num_stripes != btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes))) { chunk_err(fs_info, leaf, chunk, logical, "invalid num_stripes:sub_stripes %u:%u for profile %llu", num_stripes, sub_stripes, type & BTRFS_BLOCK_GROUP_PROFILE_MASK); return -EUCLEAN; } return 0; } /* * Enhanced version of chunk item checker. * * The common btrfs_check_chunk_valid() doesn't check item size since it needs * to work on super block sys_chunk_array which doesn't have full item ptr. */ static int check_leaf_chunk_item(struct extent_buffer *leaf, struct btrfs_chunk *chunk, struct btrfs_key *key, int slot) { struct btrfs_fs_info *fs_info = leaf->fs_info; int num_stripes; if (unlikely(btrfs_item_size(leaf, slot) < sizeof(struct btrfs_chunk))) { chunk_err(fs_info, leaf, chunk, key->offset, "invalid chunk item size: have %u expect [%zu, %u)", btrfs_item_size(leaf, slot), sizeof(struct btrfs_chunk), BTRFS_LEAF_DATA_SIZE(fs_info)); return -EUCLEAN; } num_stripes = btrfs_chunk_num_stripes(leaf, chunk); /* Let btrfs_check_chunk_valid() handle this error type */ if (num_stripes == 0) goto out; if (unlikely(btrfs_chunk_item_size(num_stripes) != btrfs_item_size(leaf, slot))) { chunk_err(fs_info, leaf, chunk, key->offset, "invalid chunk item size: have %u expect %lu", btrfs_item_size(leaf, slot), btrfs_chunk_item_size(num_stripes)); return -EUCLEAN; } out: return btrfs_check_chunk_valid(fs_info, leaf, chunk, key->offset, fs_info->sectorsize); } __printf(3, 4) __cold static void dev_item_err(const struct extent_buffer *eb, int slot, const char *fmt, ...) { struct btrfs_key key; struct va_format vaf; va_list args; btrfs_item_key_to_cpu(eb, &key, slot); va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(eb->fs_info, "corrupt %s: root=%llu block=%llu slot=%d devid=%llu %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, key.objectid, &vaf); va_end(args); } static int check_dev_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { struct btrfs_dev_item *ditem; const u32 item_size = btrfs_item_size(leaf, slot); if (unlikely(key->objectid != BTRFS_DEV_ITEMS_OBJECTID)) { dev_item_err(leaf, slot, "invalid objectid: has=%llu expect=%llu", key->objectid, BTRFS_DEV_ITEMS_OBJECTID); return -EUCLEAN; } if (unlikely(item_size != sizeof(*ditem))) { dev_item_err(leaf, slot, "invalid item size: has %u expect %zu", item_size, sizeof(*ditem)); return -EUCLEAN; } ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item); if (unlikely(btrfs_device_id(leaf, ditem) != key->offset)) { dev_item_err(leaf, slot, "devid mismatch: key has=%llu item has=%llu", key->offset, btrfs_device_id(leaf, ditem)); return -EUCLEAN; } /* * For device total_bytes, we don't have reliable way to check it, as * it can be 0 for device removal. Device size check can only be done * by dev extents check. */ if (unlikely(btrfs_device_bytes_used(leaf, ditem) > btrfs_device_total_bytes(leaf, ditem))) { dev_item_err(leaf, slot, "invalid bytes used: have %llu expect [0, %llu]", btrfs_device_bytes_used(leaf, ditem), btrfs_device_total_bytes(leaf, ditem)); return -EUCLEAN; } /* * Remaining members like io_align/type/gen/dev_group aren't really * utilized. Skip them to make later usage of them easier. */ return 0; } static int check_inode_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_inode_item *iitem; u64 super_gen = btrfs_super_generation(fs_info->super_copy); u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777); const u32 item_size = btrfs_item_size(leaf, slot); u32 mode; int ret; u32 flags; u32 ro_flags; ret = check_inode_key(leaf, key, slot); if (unlikely(ret < 0)) return ret; if (unlikely(item_size != sizeof(*iitem))) { generic_err(leaf, slot, "invalid item size: has %u expect %zu", item_size, sizeof(*iitem)); return -EUCLEAN; } iitem = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item); /* Here we use super block generation + 1 to handle log tree */ if (unlikely(btrfs_inode_generation(leaf, iitem) > super_gen + 1)) { inode_item_err(leaf, slot, "invalid inode generation: has %llu expect (0, %llu]", btrfs_inode_generation(leaf, iitem), super_gen + 1); return -EUCLEAN; } /* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */ if (unlikely(btrfs_inode_transid(leaf, iitem) > super_gen + 1)) { inode_item_err(leaf, slot, "invalid inode transid: has %llu expect [0, %llu]", btrfs_inode_transid(leaf, iitem), super_gen + 1); return -EUCLEAN; } /* * For size and nbytes it's better not to be too strict, as for dir * item its size/nbytes can easily get wrong, but doesn't affect * anything in the fs. So here we skip the check. */ mode = btrfs_inode_mode(leaf, iitem); if (unlikely(mode & ~valid_mask)) { inode_item_err(leaf, slot, "unknown mode bit detected: 0x%x", mode & ~valid_mask); return -EUCLEAN; } /* * S_IFMT is not bit mapped so we can't completely rely on * is_power_of_2/has_single_bit_set, but it can save us from checking * FIFO/CHR/DIR/REG. Only needs to check BLK, LNK and SOCKS */ if (!has_single_bit_set(mode & S_IFMT)) { if (unlikely(!S_ISLNK(mode) && !S_ISBLK(mode) && !S_ISSOCK(mode))) { inode_item_err(leaf, slot, "invalid mode: has 0%o expect valid S_IF* bit(s)", mode & S_IFMT); return -EUCLEAN; } } if (unlikely(S_ISDIR(mode) && btrfs_inode_nlink(leaf, iitem) > 1)) { inode_item_err(leaf, slot, "invalid nlink: has %u expect no more than 1 for dir", btrfs_inode_nlink(leaf, iitem)); return -EUCLEAN; } btrfs_inode_split_flags(btrfs_inode_flags(leaf, iitem), &flags, &ro_flags); if (unlikely(flags & ~BTRFS_INODE_FLAG_MASK)) { inode_item_err(leaf, slot, "unknown incompat flags detected: 0x%x", flags); return -EUCLEAN; } if (unlikely(!sb_rdonly(fs_info->sb) && (ro_flags & ~BTRFS_INODE_RO_FLAG_MASK))) { inode_item_err(leaf, slot, "unknown ro-compat flags detected on writeable mount: 0x%x", ro_flags); return -EUCLEAN; } return 0; } static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_root_item ri = { 0 }; const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY | BTRFS_ROOT_SUBVOL_DEAD; int ret; ret = check_root_key(leaf, key, slot); if (unlikely(ret < 0)) return ret; if (unlikely(btrfs_item_size(leaf, slot) != sizeof(ri) && btrfs_item_size(leaf, slot) != btrfs_legacy_root_item_size())) { generic_err(leaf, slot, "invalid root item size, have %u expect %zu or %u", btrfs_item_size(leaf, slot), sizeof(ri), btrfs_legacy_root_item_size()); return -EUCLEAN; } /* * For legacy root item, the members starting at generation_v2 will be * all filled with 0. * And since we allow geneartion_v2 as 0, it will still pass the check. */ read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot), btrfs_item_size(leaf, slot)); /* Generation related */ if (unlikely(btrfs_root_generation(&ri) > btrfs_super_generation(fs_info->super_copy) + 1)) { generic_err(leaf, slot, "invalid root generation, have %llu expect (0, %llu]", btrfs_root_generation(&ri), btrfs_super_generation(fs_info->super_copy) + 1); return -EUCLEAN; } if (unlikely(btrfs_root_generation_v2(&ri) > btrfs_super_generation(fs_info->super_copy) + 1)) { generic_err(leaf, slot, "invalid root v2 generation, have %llu expect (0, %llu]", btrfs_root_generation_v2(&ri), btrfs_super_generation(fs_info->super_copy) + 1); return -EUCLEAN; } if (unlikely(btrfs_root_last_snapshot(&ri) > btrfs_super_generation(fs_info->super_copy) + 1)) { generic_err(leaf, slot, "invalid root last_snapshot, have %llu expect (0, %llu]", btrfs_root_last_snapshot(&ri), btrfs_super_generation(fs_info->super_copy) + 1); return -EUCLEAN; } /* Alignment and level check */ if (unlikely(!IS_ALIGNED(btrfs_root_bytenr(&ri), fs_info->sectorsize))) { generic_err(leaf, slot, "invalid root bytenr, have %llu expect to be aligned to %u", btrfs_root_bytenr(&ri), fs_info->sectorsize); return -EUCLEAN; } if (unlikely(btrfs_root_level(&ri) >= BTRFS_MAX_LEVEL)) { generic_err(leaf, slot, "invalid root level, have %u expect [0, %u]", btrfs_root_level(&ri), BTRFS_MAX_LEVEL - 1); return -EUCLEAN; } if (unlikely(btrfs_root_drop_level(&ri) >= BTRFS_MAX_LEVEL)) { generic_err(leaf, slot, "invalid root level, have %u expect [0, %u]", btrfs_root_drop_level(&ri), BTRFS_MAX_LEVEL - 1); return -EUCLEAN; } /* Flags check */ if (unlikely(btrfs_root_flags(&ri) & ~valid_root_flags)) { generic_err(leaf, slot, "invalid root flags, have 0x%llx expect mask 0x%llx", btrfs_root_flags(&ri), valid_root_flags); return -EUCLEAN; } return 0; } __printf(3,4) __cold static void extent_err(const struct extent_buffer *eb, int slot, const char *fmt, ...) { struct btrfs_key key; struct va_format vaf; va_list args; u64 bytenr; u64 len; btrfs_item_key_to_cpu(eb, &key, slot); bytenr = key.objectid; if (key.type == BTRFS_METADATA_ITEM_KEY || key.type == BTRFS_TREE_BLOCK_REF_KEY || key.type == BTRFS_SHARED_BLOCK_REF_KEY) len = eb->fs_info->nodesize; else len = key.offset; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; dump_page(folio_page(eb->folios[0], 0), "eb page dump"); btrfs_crit(eb->fs_info, "corrupt %s: block=%llu slot=%d extent bytenr=%llu len=%llu %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", eb->start, slot, bytenr, len, &vaf); va_end(args); } static bool is_valid_dref_root(u64 rootid) { /* * The following tree root objectids are allowed to have a data backref: * - subvolume trees * - data reloc tree * - tree root * For v1 space cache */ return is_fstree(rootid) || rootid == BTRFS_DATA_RELOC_TREE_OBJECTID || rootid == BTRFS_ROOT_TREE_OBJECTID; } static int check_extent_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot, struct btrfs_key *prev_key) { struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_extent_item *ei; bool is_tree_block = false; unsigned long ptr; /* Current pointer inside inline refs */ unsigned long end; /* Extent item end */ const u32 item_size = btrfs_item_size(leaf, slot); u8 last_type = 0; u64 last_seq = U64_MAX; u64 flags; u64 generation; u64 total_refs; /* Total refs in btrfs_extent_item */ u64 inline_refs = 0; /* found total inline refs */ if (unlikely(key->type == BTRFS_METADATA_ITEM_KEY && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))) { generic_err(leaf, slot, "invalid key type, METADATA_ITEM type invalid when SKINNY_METADATA feature disabled"); return -EUCLEAN; } /* key->objectid is the bytenr for both key types */ if (unlikely(!IS_ALIGNED(key->objectid, fs_info->sectorsize))) { generic_err(leaf, slot, "invalid key objectid, have %llu expect to be aligned to %u", key->objectid, fs_info->sectorsize); return -EUCLEAN; } /* key->offset is tree level for METADATA_ITEM_KEY */ if (unlikely(key->type == BTRFS_METADATA_ITEM_KEY && key->offset >= BTRFS_MAX_LEVEL)) { extent_err(leaf, slot, "invalid tree level, have %llu expect [0, %u]", key->offset, BTRFS_MAX_LEVEL - 1); return -EUCLEAN; } /* * EXTENT/METADATA_ITEM consists of: * 1) One btrfs_extent_item * Records the total refs, type and generation of the extent. * * 2) One btrfs_tree_block_info (for EXTENT_ITEM and tree backref only) * Records the first key and level of the tree block. * * 2) Zero or more btrfs_extent_inline_ref(s) * Each inline ref has one btrfs_extent_inline_ref shows: * 2.1) The ref type, one of the 4 * TREE_BLOCK_REF Tree block only * SHARED_BLOCK_REF Tree block only * EXTENT_DATA_REF Data only * SHARED_DATA_REF Data only * 2.2) Ref type specific data * Either using btrfs_extent_inline_ref::offset, or specific * data structure. * * All above inline items should follow the order: * * - All btrfs_extent_inline_ref::type should be in an ascending * order * * - Within the same type, the items should follow a descending * order by their sequence number. The sequence number is * determined by: * * btrfs_extent_inline_ref::offset for all types other than * EXTENT_DATA_REF * * hash_extent_data_ref() for EXTENT_DATA_REF */ if (unlikely(item_size < sizeof(*ei))) { extent_err(leaf, slot, "invalid item size, have %u expect [%zu, %u)", item_size, sizeof(*ei), BTRFS_LEAF_DATA_SIZE(fs_info)); return -EUCLEAN; } end = item_size + btrfs_item_ptr_offset(leaf, slot); /* Checks against extent_item */ ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); flags = btrfs_extent_flags(leaf, ei); total_refs = btrfs_extent_refs(leaf, ei); generation = btrfs_extent_generation(leaf, ei); if (unlikely(generation > btrfs_super_generation(fs_info->super_copy) + 1)) { extent_err(leaf, slot, "invalid generation, have %llu expect (0, %llu]", generation, btrfs_super_generation(fs_info->super_copy) + 1); return -EUCLEAN; } if (unlikely(!has_single_bit_set(flags & (BTRFS_EXTENT_FLAG_DATA | BTRFS_EXTENT_FLAG_TREE_BLOCK)))) { extent_err(leaf, slot, "invalid extent flag, have 0x%llx expect 1 bit set in 0x%llx", flags, BTRFS_EXTENT_FLAG_DATA | BTRFS_EXTENT_FLAG_TREE_BLOCK); return -EUCLEAN; } is_tree_block = !!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK); if (is_tree_block) { if (unlikely(key->type == BTRFS_EXTENT_ITEM_KEY && key->offset != fs_info->nodesize)) { extent_err(leaf, slot, "invalid extent length, have %llu expect %u", key->offset, fs_info->nodesize); return -EUCLEAN; } } else { if (unlikely(key->type != BTRFS_EXTENT_ITEM_KEY)) { extent_err(leaf, slot, "invalid key type, have %u expect %u for data backref", key->type, BTRFS_EXTENT_ITEM_KEY); return -EUCLEAN; } if (unlikely(!IS_ALIGNED(key->offset, fs_info->sectorsize))) { extent_err(leaf, slot, "invalid extent length, have %llu expect aligned to %u", key->offset, fs_info->sectorsize); return -EUCLEAN; } if (unlikely(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { extent_err(leaf, slot, "invalid extent flag, data has full backref set"); return -EUCLEAN; } } ptr = (unsigned long)(struct btrfs_extent_item *)(ei + 1); /* Check the special case of btrfs_tree_block_info */ if (is_tree_block && key->type != BTRFS_METADATA_ITEM_KEY) { struct btrfs_tree_block_info *info; info = (struct btrfs_tree_block_info *)ptr; if (unlikely(btrfs_tree_block_level(leaf, info) >= BTRFS_MAX_LEVEL)) { extent_err(leaf, slot, "invalid tree block info level, have %u expect [0, %u]", btrfs_tree_block_level(leaf, info), BTRFS_MAX_LEVEL - 1); return -EUCLEAN; } ptr = (unsigned long)(struct btrfs_tree_block_info *)(info + 1); } /* Check inline refs */ while (ptr < end) { struct btrfs_extent_inline_ref *iref; struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; u64 seq; u64 dref_root; u64 dref_objectid; u64 dref_offset; u64 inline_offset; u8 inline_type; if (unlikely(ptr + sizeof(*iref) > end)) { extent_err(leaf, slot, "inline ref item overflows extent item, ptr %lu iref size %zu end %lu", ptr, sizeof(*iref), end); return -EUCLEAN; } iref = (struct btrfs_extent_inline_ref *)ptr; inline_type = btrfs_extent_inline_ref_type(leaf, iref); inline_offset = btrfs_extent_inline_ref_offset(leaf, iref); seq = inline_offset; if (unlikely(ptr + btrfs_extent_inline_ref_size(inline_type) > end)) { extent_err(leaf, slot, "inline ref item overflows extent item, ptr %lu iref size %u end %lu", ptr, btrfs_extent_inline_ref_size(inline_type), end); return -EUCLEAN; } switch (inline_type) { /* inline_offset is subvolid of the owner, no need to check */ case BTRFS_TREE_BLOCK_REF_KEY: inline_refs++; break; /* Contains parent bytenr */ case BTRFS_SHARED_BLOCK_REF_KEY: if (unlikely(!IS_ALIGNED(inline_offset, fs_info->sectorsize))) { extent_err(leaf, slot, "invalid tree parent bytenr, have %llu expect aligned to %u", inline_offset, fs_info->sectorsize); return -EUCLEAN; } inline_refs++; break; /* * Contains owner subvolid, owner key objectid, adjusted offset. * The only obvious corruption can happen in that offset. */ case BTRFS_EXTENT_DATA_REF_KEY: dref = (struct btrfs_extent_data_ref *)(&iref->offset); dref_root = btrfs_extent_data_ref_root(leaf, dref); dref_objectid = btrfs_extent_data_ref_objectid(leaf, dref); dref_offset = btrfs_extent_data_ref_offset(leaf, dref); seq = hash_extent_data_ref( btrfs_extent_data_ref_root(leaf, dref), btrfs_extent_data_ref_objectid(leaf, dref), btrfs_extent_data_ref_offset(leaf, dref)); if (unlikely(!is_valid_dref_root(dref_root))) { extent_err(leaf, slot, "invalid data ref root value %llu", dref_root); return -EUCLEAN; } if (unlikely(dref_objectid < BTRFS_FIRST_FREE_OBJECTID || dref_objectid > BTRFS_LAST_FREE_OBJECTID)) { extent_err(leaf, slot, "invalid data ref objectid value %llu", dref_objectid); return -EUCLEAN; } if (unlikely(!IS_ALIGNED(dref_offset, fs_info->sectorsize))) { extent_err(leaf, slot, "invalid data ref offset, have %llu expect aligned to %u", dref_offset, fs_info->sectorsize); return -EUCLEAN; } if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) { extent_err(leaf, slot, "invalid data ref count, should have non-zero value"); return -EUCLEAN; } inline_refs += btrfs_extent_data_ref_count(leaf, dref); break; /* Contains parent bytenr and ref count */ case BTRFS_SHARED_DATA_REF_KEY: sref = (struct btrfs_shared_data_ref *)(iref + 1); if (unlikely(!IS_ALIGNED(inline_offset, fs_info->sectorsize))) { extent_err(leaf, slot, "invalid data parent bytenr, have %llu expect aligned to %u", inline_offset, fs_info->sectorsize); return -EUCLEAN; } if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) { extent_err(leaf, slot, "invalid shared data ref count, should have non-zero value"); return -EUCLEAN; } inline_refs += btrfs_shared_data_ref_count(leaf, sref); break; case BTRFS_EXTENT_OWNER_REF_KEY: WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); break; default: extent_err(leaf, slot, "unknown inline ref type: %u", inline_type); return -EUCLEAN; } if (inline_type < last_type) { extent_err(leaf, slot, "inline ref out-of-order: has type %u, prev type %u", inline_type, last_type); return -EUCLEAN; } /* Type changed, allow the sequence starts from U64_MAX again. */ if (inline_type > last_type) last_seq = U64_MAX; if (seq > last_seq) { extent_err(leaf, slot, "inline ref out-of-order: has type %u offset %llu seq 0x%llx, prev type %u seq 0x%llx", inline_type, inline_offset, seq, last_type, last_seq); return -EUCLEAN; } last_type = inline_type; last_seq = seq; ptr += btrfs_extent_inline_ref_size(inline_type); } /* No padding is allowed */ if (unlikely(ptr != end)) { extent_err(leaf, slot, "invalid extent item size, padding bytes found"); return -EUCLEAN; } /* Finally, check the inline refs against total refs */ if (unlikely(inline_refs > total_refs)) { extent_err(leaf, slot, "invalid extent refs, have %llu expect >= inline %llu", total_refs, inline_refs); return -EUCLEAN; } if ((prev_key->type == BTRFS_EXTENT_ITEM_KEY) || (prev_key->type == BTRFS_METADATA_ITEM_KEY)) { u64 prev_end = prev_key->objectid; if (prev_key->type == BTRFS_METADATA_ITEM_KEY) prev_end += fs_info->nodesize; else prev_end += prev_key->offset; if (unlikely(prev_end > key->objectid)) { extent_err(leaf, slot, "previous extent [%llu %u %llu] overlaps current extent [%llu %u %llu]", prev_key->objectid, prev_key->type, prev_key->offset, key->objectid, key->type, key->offset); return -EUCLEAN; } } return 0; } static int check_simple_keyed_refs(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { u32 expect_item_size = 0; if (key->type == BTRFS_SHARED_DATA_REF_KEY) { struct btrfs_shared_data_ref *sref; sref = btrfs_item_ptr(leaf, slot, struct btrfs_shared_data_ref); if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) { extent_err(leaf, slot, "invalid shared data backref count, should have non-zero value"); return -EUCLEAN; } expect_item_size = sizeof(struct btrfs_shared_data_ref); } if (unlikely(btrfs_item_size(leaf, slot) != expect_item_size)) { generic_err(leaf, slot, "invalid item size, have %u expect %u for key type %u", btrfs_item_size(leaf, slot), expect_item_size, key->type); return -EUCLEAN; } if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) { generic_err(leaf, slot, "invalid key objectid for shared block ref, have %llu expect aligned to %u", key->objectid, leaf->fs_info->sectorsize); return -EUCLEAN; } if (unlikely(key->type != BTRFS_TREE_BLOCK_REF_KEY && !IS_ALIGNED(key->offset, leaf->fs_info->sectorsize))) { extent_err(leaf, slot, "invalid tree parent bytenr, have %llu expect aligned to %u", key->offset, leaf->fs_info->sectorsize); return -EUCLEAN; } return 0; } static int check_extent_data_ref(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { struct btrfs_extent_data_ref *dref; unsigned long ptr = btrfs_item_ptr_offset(leaf, slot); const unsigned long end = ptr + btrfs_item_size(leaf, slot); if (unlikely(btrfs_item_size(leaf, slot) % sizeof(*dref) != 0)) { generic_err(leaf, slot, "invalid item size, have %u expect aligned to %zu for key type %u", btrfs_item_size(leaf, slot), sizeof(*dref), key->type); return -EUCLEAN; } if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) { generic_err(leaf, slot, "invalid key objectid for shared block ref, have %llu expect aligned to %u", key->objectid, leaf->fs_info->sectorsize); return -EUCLEAN; } for (; ptr < end; ptr += sizeof(*dref)) { u64 root; u64 objectid; u64 offset; /* * We cannot check the extent_data_ref hash due to possible * overflow from the leaf due to hash collisions. */ dref = (struct btrfs_extent_data_ref *)ptr; root = btrfs_extent_data_ref_root(leaf, dref); objectid = btrfs_extent_data_ref_objectid(leaf, dref); offset = btrfs_extent_data_ref_offset(leaf, dref); if (unlikely(!is_valid_dref_root(root))) { extent_err(leaf, slot, "invalid extent data backref root value %llu", root); return -EUCLEAN; } if (unlikely(objectid < BTRFS_FIRST_FREE_OBJECTID || objectid > BTRFS_LAST_FREE_OBJECTID)) { extent_err(leaf, slot, "invalid extent data backref objectid value %llu", root); return -EUCLEAN; } if (unlikely(!IS_ALIGNED(offset, leaf->fs_info->sectorsize))) { extent_err(leaf, slot, "invalid extent data backref offset, have %llu expect aligned to %u", offset, leaf->fs_info->sectorsize); return -EUCLEAN; } if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) { extent_err(leaf, slot, "invalid extent data backref count, should have non-zero value"); return -EUCLEAN; } } return 0; } #define inode_ref_err(eb, slot, fmt, args...) \ inode_item_err(eb, slot, fmt, ##args) static int check_inode_ref(struct extent_buffer *leaf, struct btrfs_key *key, struct btrfs_key *prev_key, int slot) { struct btrfs_inode_ref *iref; unsigned long ptr; unsigned long end; if (unlikely(!check_prev_ino(leaf, key, slot, prev_key))) return -EUCLEAN; /* namelen can't be 0, so item_size == sizeof() is also invalid */ if (unlikely(btrfs_item_size(leaf, slot) <= sizeof(*iref))) { inode_ref_err(leaf, slot, "invalid item size, have %u expect (%zu, %u)", btrfs_item_size(leaf, slot), sizeof(*iref), BTRFS_LEAF_DATA_SIZE(leaf->fs_info)); return -EUCLEAN; } ptr = btrfs_item_ptr_offset(leaf, slot); end = ptr + btrfs_item_size(leaf, slot); while (ptr < end) { u16 namelen; if (unlikely(ptr + sizeof(iref) > end)) { inode_ref_err(leaf, slot, "inode ref overflow, ptr %lu end %lu inode_ref_size %zu", ptr, end, sizeof(iref)); return -EUCLEAN; } iref = (struct btrfs_inode_ref *)ptr; namelen = btrfs_inode_ref_name_len(leaf, iref); if (unlikely(ptr + sizeof(*iref) + namelen > end)) { inode_ref_err(leaf, slot, "inode ref overflow, ptr %lu end %lu namelen %u", ptr, end, namelen); return -EUCLEAN; } /* * NOTE: In theory we should record all found index numbers * to find any duplicated indexes, but that will be too time * consuming for inodes with too many hard links. */ ptr += sizeof(*iref) + namelen; } return 0; } static int check_raid_stripe_extent(const struct extent_buffer *leaf, const struct btrfs_key *key, int slot) { if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) { generic_err(leaf, slot, "invalid key objectid for raid stripe extent, have %llu expect aligned to %u", key->objectid, leaf->fs_info->sectorsize); return -EUCLEAN; } if (unlikely(!btrfs_fs_incompat(leaf->fs_info, RAID_STRIPE_TREE))) { generic_err(leaf, slot, "RAID_STRIPE_EXTENT present but RAID_STRIPE_TREE incompat bit unset"); return -EUCLEAN; } return 0; } static int check_dev_extent_item(const struct extent_buffer *leaf, const struct btrfs_key *key, int slot, struct btrfs_key *prev_key) { struct btrfs_dev_extent *de; const u32 sectorsize = leaf->fs_info->sectorsize; de = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent); /* Basic fixed member checks. */ if (unlikely(btrfs_dev_extent_chunk_tree(leaf, de) != BTRFS_CHUNK_TREE_OBJECTID)) { generic_err(leaf, slot, "invalid dev extent chunk tree id, has %llu expect %llu", btrfs_dev_extent_chunk_tree(leaf, de), BTRFS_CHUNK_TREE_OBJECTID); return -EUCLEAN; } if (unlikely(btrfs_dev_extent_chunk_objectid(leaf, de) != BTRFS_FIRST_CHUNK_TREE_OBJECTID)) { generic_err(leaf, slot, "invalid dev extent chunk objectid, has %llu expect %llu", btrfs_dev_extent_chunk_objectid(leaf, de), BTRFS_FIRST_CHUNK_TREE_OBJECTID); return -EUCLEAN; } /* Alignment check. */ if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) { generic_err(leaf, slot, "invalid dev extent key.offset, has %llu not aligned to %u", key->offset, sectorsize); return -EUCLEAN; } if (unlikely(!IS_ALIGNED(btrfs_dev_extent_chunk_offset(leaf, de), sectorsize))) { generic_err(leaf, slot, "invalid dev extent chunk offset, has %llu not aligned to %u", btrfs_dev_extent_chunk_objectid(leaf, de), sectorsize); return -EUCLEAN; } if (unlikely(!IS_ALIGNED(btrfs_dev_extent_length(leaf, de), sectorsize))) { generic_err(leaf, slot, "invalid dev extent length, has %llu not aligned to %u", btrfs_dev_extent_length(leaf, de), sectorsize); return -EUCLEAN; } /* Overlap check with previous dev extent. */ if (slot && prev_key->objectid == key->objectid && prev_key->type == key->type) { struct btrfs_dev_extent *prev_de; u64 prev_len; prev_de = btrfs_item_ptr(leaf, slot - 1, struct btrfs_dev_extent); prev_len = btrfs_dev_extent_length(leaf, prev_de); if (unlikely(prev_key->offset + prev_len > key->offset)) { generic_err(leaf, slot, "dev extent overlap, prev offset %llu len %llu current offset %llu", prev_key->objectid, prev_len, key->offset); return -EUCLEAN; } } return 0; } /* * Common point to switch the item-specific validation. */ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot, struct btrfs_key *prev_key) { int ret = 0; struct btrfs_chunk *chunk; switch (key->type) { case BTRFS_EXTENT_DATA_KEY: ret = check_extent_data_item(leaf, key, slot, prev_key); break; case BTRFS_EXTENT_CSUM_KEY: ret = check_csum_item(leaf, key, slot, prev_key); break; case BTRFS_DIR_ITEM_KEY: case BTRFS_DIR_INDEX_KEY: case BTRFS_XATTR_ITEM_KEY: ret = check_dir_item(leaf, key, prev_key, slot); break; case BTRFS_INODE_REF_KEY: ret = check_inode_ref(leaf, key, prev_key, slot); break; case BTRFS_BLOCK_GROUP_ITEM_KEY: ret = check_block_group_item(leaf, key, slot); break; case BTRFS_CHUNK_ITEM_KEY: chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); ret = check_leaf_chunk_item(leaf, chunk, key, slot); break; case BTRFS_DEV_ITEM_KEY: ret = check_dev_item(leaf, key, slot); break; case BTRFS_DEV_EXTENT_KEY: ret = check_dev_extent_item(leaf, key, slot, prev_key); break; case BTRFS_INODE_ITEM_KEY: ret = check_inode_item(leaf, key, slot); break; case BTRFS_ROOT_ITEM_KEY: ret = check_root_item(leaf, key, slot); break; case BTRFS_EXTENT_ITEM_KEY: case BTRFS_METADATA_ITEM_KEY: ret = check_extent_item(leaf, key, slot, prev_key); break; case BTRFS_TREE_BLOCK_REF_KEY: case BTRFS_SHARED_DATA_REF_KEY: case BTRFS_SHARED_BLOCK_REF_KEY: ret = check_simple_keyed_refs(leaf, key, slot); break; case BTRFS_EXTENT_DATA_REF_KEY: ret = check_extent_data_ref(leaf, key, slot); break; case BTRFS_RAID_STRIPE_KEY: ret = check_raid_stripe_extent(leaf, key, slot); break; } if (ret) return BTRFS_TREE_BLOCK_INVALID_ITEM; return BTRFS_TREE_BLOCK_CLEAN; } enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf) { struct btrfs_fs_info *fs_info = leaf->fs_info; /* No valid key type is 0, so all key should be larger than this key */ struct btrfs_key prev_key = {0, 0, 0}; struct btrfs_key key; u32 nritems = btrfs_header_nritems(leaf); int slot; if (unlikely(btrfs_header_level(leaf) != 0)) { generic_err(leaf, 0, "invalid level for leaf, have %d expect 0", btrfs_header_level(leaf)); return BTRFS_TREE_BLOCK_INVALID_LEVEL; } if (unlikely(!btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_WRITTEN))) { generic_err(leaf, 0, "invalid flag for leaf, WRITTEN not set"); return BTRFS_TREE_BLOCK_WRITTEN_NOT_SET; } /* * Extent buffers from a relocation tree have a owner field that * corresponds to the subvolume tree they are based on. So just from an * extent buffer alone we can not find out what is the id of the * corresponding subvolume tree, so we can not figure out if the extent * buffer corresponds to the root of the relocation tree or not. So * skip this check for relocation trees. */ if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) { u64 owner = btrfs_header_owner(leaf); /* These trees must never be empty */ if (unlikely(owner == BTRFS_ROOT_TREE_OBJECTID || owner == BTRFS_CHUNK_TREE_OBJECTID || owner == BTRFS_DEV_TREE_OBJECTID || owner == BTRFS_FS_TREE_OBJECTID || owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) { generic_err(leaf, 0, "invalid root, root %llu must never be empty", owner); return BTRFS_TREE_BLOCK_INVALID_NRITEMS; } /* Unknown tree */ if (unlikely(owner == 0)) { generic_err(leaf, 0, "invalid owner, root 0 is not defined"); return BTRFS_TREE_BLOCK_INVALID_OWNER; } /* EXTENT_TREE_V2 can have empty extent trees. */ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) return BTRFS_TREE_BLOCK_CLEAN; if (unlikely(owner == BTRFS_EXTENT_TREE_OBJECTID)) { generic_err(leaf, 0, "invalid root, root %llu must never be empty", owner); return BTRFS_TREE_BLOCK_INVALID_NRITEMS; } return BTRFS_TREE_BLOCK_CLEAN; } if (unlikely(nritems == 0)) return BTRFS_TREE_BLOCK_CLEAN; /* * Check the following things to make sure this is a good leaf, and * leaf users won't need to bother with similar sanity checks: * * 1) key ordering * 2) item offset and size * No overlap, no hole, all inside the leaf. * 3) item content * If possible, do comprehensive sanity check. * NOTE: All checks must only rely on the item data itself. */ for (slot = 0; slot < nritems; slot++) { u32 item_end_expected; u64 item_data_end; enum btrfs_tree_block_status ret; btrfs_item_key_to_cpu(leaf, &key, slot); /* Make sure the keys are in the right order */ if (unlikely(btrfs_comp_cpu_keys(&prev_key, &key) >= 0)) { generic_err(leaf, slot, "bad key order, prev (%llu %u %llu) current (%llu %u %llu)", prev_key.objectid, prev_key.type, prev_key.offset, key.objectid, key.type, key.offset); return BTRFS_TREE_BLOCK_BAD_KEY_ORDER; } item_data_end = (u64)btrfs_item_offset(leaf, slot) + btrfs_item_size(leaf, slot); /* * Make sure the offset and ends are right, remember that the * item data starts at the end of the leaf and grows towards the * front. */ if (slot == 0) item_end_expected = BTRFS_LEAF_DATA_SIZE(fs_info); else item_end_expected = btrfs_item_offset(leaf, slot - 1); if (unlikely(item_data_end != item_end_expected)) { generic_err(leaf, slot, "unexpected item end, have %llu expect %u", item_data_end, item_end_expected); return BTRFS_TREE_BLOCK_INVALID_OFFSETS; } /* * Check to make sure that we don't point outside of the leaf, * just in case all the items are consistent to each other, but * all point outside of the leaf. */ if (unlikely(item_data_end > BTRFS_LEAF_DATA_SIZE(fs_info))) { generic_err(leaf, slot, "slot end outside of leaf, have %llu expect range [0, %u]", item_data_end, BTRFS_LEAF_DATA_SIZE(fs_info)); return BTRFS_TREE_BLOCK_INVALID_OFFSETS; } /* Also check if the item pointer overlaps with btrfs item. */ if (unlikely(btrfs_item_ptr_offset(leaf, slot) < btrfs_item_nr_offset(leaf, slot) + sizeof(struct btrfs_item))) { generic_err(leaf, slot, "slot overlaps with its data, item end %lu data start %lu", btrfs_item_nr_offset(leaf, slot) + sizeof(struct btrfs_item), btrfs_item_ptr_offset(leaf, slot)); return BTRFS_TREE_BLOCK_INVALID_OFFSETS; } /* Check if the item size and content meet other criteria. */ ret = check_leaf_item(leaf, &key, slot, &prev_key); if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN)) return ret; prev_key.objectid = key.objectid; prev_key.type = key.type; prev_key.offset = key.offset; } return BTRFS_TREE_BLOCK_CLEAN; } int btrfs_check_leaf(struct extent_buffer *leaf) { enum btrfs_tree_block_status ret; ret = __btrfs_check_leaf(leaf); if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN)) return -EUCLEAN; return 0; } ALLOW_ERROR_INJECTION(btrfs_check_leaf, ERRNO); enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node) { struct btrfs_fs_info *fs_info = node->fs_info; unsigned long nr = btrfs_header_nritems(node); struct btrfs_key key, next_key; int slot; int level = btrfs_header_level(node); u64 bytenr; if (unlikely(!btrfs_header_flag(node, BTRFS_HEADER_FLAG_WRITTEN))) { generic_err(node, 0, "invalid flag for node, WRITTEN not set"); return BTRFS_TREE_BLOCK_WRITTEN_NOT_SET; } if (unlikely(level <= 0 || level >= BTRFS_MAX_LEVEL)) { generic_err(node, 0, "invalid level for node, have %d expect [1, %d]", level, BTRFS_MAX_LEVEL - 1); return BTRFS_TREE_BLOCK_INVALID_LEVEL; } if (unlikely(nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info))) { btrfs_crit(fs_info, "corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]", btrfs_header_owner(node), node->start, nr == 0 ? "small" : "large", nr, BTRFS_NODEPTRS_PER_BLOCK(fs_info)); return BTRFS_TREE_BLOCK_INVALID_NRITEMS; } for (slot = 0; slot < nr - 1; slot++) { bytenr = btrfs_node_blockptr(node, slot); btrfs_node_key_to_cpu(node, &key, slot); btrfs_node_key_to_cpu(node, &next_key, slot + 1); if (unlikely(!bytenr)) { generic_err(node, slot, "invalid NULL node pointer"); return BTRFS_TREE_BLOCK_INVALID_BLOCKPTR; } if (unlikely(!IS_ALIGNED(bytenr, fs_info->sectorsize))) { generic_err(node, slot, "unaligned pointer, have %llu should be aligned to %u", bytenr, fs_info->sectorsize); return BTRFS_TREE_BLOCK_INVALID_BLOCKPTR; } if (unlikely(btrfs_comp_cpu_keys(&key, &next_key) >= 0)) { generic_err(node, slot, "bad key order, current (%llu %u %llu) next (%llu %u %llu)", key.objectid, key.type, key.offset, next_key.objectid, next_key.type, next_key.offset); return BTRFS_TREE_BLOCK_BAD_KEY_ORDER; } } return BTRFS_TREE_BLOCK_CLEAN; } int btrfs_check_node(struct extent_buffer *node) { enum btrfs_tree_block_status ret; ret = __btrfs_check_node(node); if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN)) return -EUCLEAN; return 0; } ALLOW_ERROR_INJECTION(btrfs_check_node, ERRNO); int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner) { const bool is_subvol = is_fstree(root_owner); const u64 eb_owner = btrfs_header_owner(eb); /* * Skip dummy fs, as selftests don't create unique ebs for each dummy * root. */ if (btrfs_is_testing(eb->fs_info)) return 0; /* * There are several call sites (backref walking, qgroup, and data * reloc) passing 0 as @root_owner, as they are not holding the * tree root. In that case, we can not do a reliable ownership check, * so just exit. */ if (root_owner == 0) return 0; /* * These trees use key.offset as their owner, our callers don't have * the extra capacity to pass key.offset here. So we just skip them. */ if (root_owner == BTRFS_TREE_LOG_OBJECTID || root_owner == BTRFS_TREE_RELOC_OBJECTID) return 0; if (!is_subvol) { /* For non-subvolume trees, the eb owner should match root owner */ if (unlikely(root_owner != eb_owner)) { btrfs_crit(eb->fs_info, "corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect %llu", btrfs_header_level(eb) == 0 ? "leaf" : "node", root_owner, btrfs_header_bytenr(eb), eb_owner, root_owner); return -EUCLEAN; } return 0; } /* * For subvolume trees, owners can mismatch, but they should all belong * to subvolume trees. */ if (unlikely(is_subvol != is_fstree(eb_owner))) { btrfs_crit(eb->fs_info, "corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect [%llu, %llu]", btrfs_header_level(eb) == 0 ? "leaf" : "node", root_owner, btrfs_header_bytenr(eb), eb_owner, BTRFS_FIRST_FREE_OBJECTID, BTRFS_LAST_FREE_OBJECTID); return -EUCLEAN; } return 0; } int btrfs_verify_level_key(struct extent_buffer *eb, const struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; int found_level; struct btrfs_key found_key; int ret; found_level = btrfs_header_level(eb); if (found_level != check->level) { WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), KERN_ERR "BTRFS: tree level check failed\n"); btrfs_err(fs_info, "tree level mismatch detected, bytenr=%llu level expected=%u has=%u", eb->start, check->level, found_level); return -EIO; } if (!check->has_first_key) return 0; /* * For live tree block (new tree blocks in current transaction), * we need proper lock context to avoid race, which is impossible here. * So we only checks tree blocks which is read from disk, whose * generation <= fs_info->last_trans_committed. */ if (btrfs_header_generation(eb) > btrfs_get_last_trans_committed(fs_info)) return 0; /* We have @first_key, so this @eb must have at least one item */ if (btrfs_header_nritems(eb) == 0) { btrfs_err(fs_info, "invalid tree nritems, bytenr=%llu nritems=0 expect >0", eb->start); WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); return -EUCLEAN; } if (found_level) btrfs_node_key_to_cpu(eb, &found_key, 0); else btrfs_item_key_to_cpu(eb, &found_key, 0); ret = btrfs_comp_cpu_keys(&check->first_key, &found_key); if (ret) { WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), KERN_ERR "BTRFS: tree first key check failed\n"); btrfs_err(fs_info, "tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", eb->start, check->transid, check->first_key.objectid, check->first_key.type, check->first_key.offset, found_key.objectid, found_key.type, found_key.offset); } return ret; }
3 1 9 1 9 9 9 1 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 // SPDX-License-Identifier: GPL-2.0-only /* * common LSM auditing functions * * Based on code written for SELinux by : * Stephen Smalley, <sds@tycho.nsa.gov> * James Morris <jmorris@redhat.com> * Author : Etienne Basset, <etienne.basset@ensta.org> */ #include <linux/types.h> #include <linux/stddef.h> #include <linux/kernel.h> #include <linux/gfp.h> #include <linux/fs.h> #include <linux/init.h> #include <net/sock.h> #include <linux/un.h> #include <net/af_unix.h> #include <linux/audit.h> #include <linux/ipv6.h> #include <linux/ip.h> #include <net/ip.h> #include <net/ipv6.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/dccp.h> #include <linux/sctp.h> #include <linux/lsm_audit.h> #include <linux/security.h> /** * ipv4_skb_to_auditdata : fill auditdata from skb * @skb : the skb * @ad : the audit data to fill * @proto : the layer 4 protocol * * return 0 on success */ int ipv4_skb_to_auditdata(struct sk_buff *skb, struct common_audit_data *ad, u8 *proto) { int ret = 0; struct iphdr *ih; ih = ip_hdr(skb); ad->u.net->v4info.saddr = ih->saddr; ad->u.net->v4info.daddr = ih->daddr; if (proto) *proto = ih->protocol; /* non initial fragment */ if (ntohs(ih->frag_off) & IP_OFFSET) return 0; switch (ih->protocol) { case IPPROTO_TCP: { struct tcphdr *th = tcp_hdr(skb); ad->u.net->sport = th->source; ad->u.net->dport = th->dest; break; } case IPPROTO_UDP: { struct udphdr *uh = udp_hdr(skb); ad->u.net->sport = uh->source; ad->u.net->dport = uh->dest; break; } case IPPROTO_DCCP: { struct dccp_hdr *dh = dccp_hdr(skb); ad->u.net->sport = dh->dccph_sport; ad->u.net->dport = dh->dccph_dport; break; } case IPPROTO_SCTP: { struct sctphdr *sh = sctp_hdr(skb); ad->u.net->sport = sh->source; ad->u.net->dport = sh->dest; break; } default: ret = -EINVAL; } return ret; } #if IS_ENABLED(CONFIG_IPV6) /** * ipv6_skb_to_auditdata : fill auditdata from skb * @skb : the skb * @ad : the audit data to fill * @proto : the layer 4 protocol * * return 0 on success */ int ipv6_skb_to_auditdata(struct sk_buff *skb, struct common_audit_data *ad, u8 *proto) { int offset, ret = 0; struct ipv6hdr *ip6; u8 nexthdr; __be16 frag_off; ip6 = ipv6_hdr(skb); ad->u.net->v6info.saddr = ip6->saddr; ad->u.net->v6info.daddr = ip6->daddr; /* IPv6 can have several extension header before the Transport header * skip them */ offset = skb_network_offset(skb); offset += sizeof(*ip6); nexthdr = ip6->nexthdr; offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off); if (offset < 0) return 0; if (proto) *proto = nexthdr; switch (nexthdr) { case IPPROTO_TCP: { struct tcphdr _tcph, *th; th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); if (th == NULL) break; ad->u.net->sport = th->source; ad->u.net->dport = th->dest; break; } case IPPROTO_UDP: { struct udphdr _udph, *uh; uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); if (uh == NULL) break; ad->u.net->sport = uh->source; ad->u.net->dport = uh->dest; break; } case IPPROTO_DCCP: { struct dccp_hdr _dccph, *dh; dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph); if (dh == NULL) break; ad->u.net->sport = dh->dccph_sport; ad->u.net->dport = dh->dccph_dport; break; } case IPPROTO_SCTP: { struct sctphdr _sctph, *sh; sh = skb_header_pointer(skb, offset, sizeof(_sctph), &_sctph); if (sh == NULL) break; ad->u.net->sport = sh->source; ad->u.net->dport = sh->dest; break; } default: ret = -EINVAL; } return ret; } #endif static inline void print_ipv6_addr(struct audit_buffer *ab, const struct in6_addr *addr, __be16 port, const char *name1, const char *name2) { if (!ipv6_addr_any(addr)) audit_log_format(ab, " %s=%pI6c", name1, addr); if (port) audit_log_format(ab, " %s=%d", name2, ntohs(port)); } static inline void print_ipv4_addr(struct audit_buffer *ab, __be32 addr, __be16 port, const char *name1, const char *name2) { if (addr) audit_log_format(ab, " %s=%pI4", name1, &addr); if (port) audit_log_format(ab, " %s=%d", name2, ntohs(port)); } /** * dump_common_audit_data - helper to dump common audit data * @ab : the audit buffer * @a : common audit data * */ static void dump_common_audit_data(struct audit_buffer *ab, struct common_audit_data *a) { char comm[sizeof(current->comm)]; /* * To keep stack sizes in check force programmers to notice if they * start making this union too large! See struct lsm_network_audit * as an example of how to deal with large data. */ BUILD_BUG_ON(sizeof(a->u) > sizeof(void *)*2); audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current)); audit_log_untrustedstring(ab, get_task_comm(comm, current)); switch (a->type) { case LSM_AUDIT_DATA_NONE: return; case LSM_AUDIT_DATA_IPC: audit_log_format(ab, " ipc_key=%d ", a->u.ipc_id); break; case LSM_AUDIT_DATA_CAP: audit_log_format(ab, " capability=%d ", a->u.cap); break; case LSM_AUDIT_DATA_PATH: { struct inode *inode; audit_log_d_path(ab, " path=", &a->u.path); inode = d_backing_inode(a->u.path.dentry); if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } break; } case LSM_AUDIT_DATA_FILE: { struct inode *inode; audit_log_d_path(ab, " path=", &a->u.file->f_path); inode = file_inode(a->u.file); if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } break; } case LSM_AUDIT_DATA_IOCTL_OP: { struct inode *inode; audit_log_d_path(ab, " path=", &a->u.op->path); inode = a->u.op->path.dentry->d_inode; if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } audit_log_format(ab, " ioctlcmd=0x%hx", a->u.op->cmd); break; } case LSM_AUDIT_DATA_DENTRY: { struct inode *inode; audit_log_format(ab, " name="); spin_lock(&a->u.dentry->d_lock); audit_log_untrustedstring(ab, a->u.dentry->d_name.name); spin_unlock(&a->u.dentry->d_lock); inode = d_backing_inode(a->u.dentry); if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } break; } case LSM_AUDIT_DATA_INODE: { struct dentry *dentry; struct inode *inode; rcu_read_lock(); inode = a->u.inode; dentry = d_find_alias_rcu(inode); if (dentry) { audit_log_format(ab, " name="); spin_lock(&dentry->d_lock); audit_log_untrustedstring(ab, dentry->d_name.name); spin_unlock(&dentry->d_lock); } audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); rcu_read_unlock(); break; } case LSM_AUDIT_DATA_TASK: { struct task_struct *tsk = a->u.tsk; if (tsk) { pid_t pid = task_tgid_nr(tsk); if (pid) { char tskcomm[sizeof(tsk->comm)]; audit_log_format(ab, " opid=%d ocomm=", pid); audit_log_untrustedstring(ab, get_task_comm(tskcomm, tsk)); } } break; } case LSM_AUDIT_DATA_NET: if (a->u.net->sk) { const struct sock *sk = a->u.net->sk; const struct unix_sock *u; struct unix_address *addr; int len = 0; char *p = NULL; switch (sk->sk_family) { case AF_INET: { const struct inet_sock *inet = inet_sk(sk); print_ipv4_addr(ab, inet->inet_rcv_saddr, inet->inet_sport, "laddr", "lport"); print_ipv4_addr(ab, inet->inet_daddr, inet->inet_dport, "faddr", "fport"); break; } #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: { const struct inet_sock *inet = inet_sk(sk); print_ipv6_addr(ab, &sk->sk_v6_rcv_saddr, inet->inet_sport, "laddr", "lport"); print_ipv6_addr(ab, &sk->sk_v6_daddr, inet->inet_dport, "faddr", "fport"); break; } #endif case AF_UNIX: u = unix_sk(sk); addr = smp_load_acquire(&u->addr); if (!addr) break; if (u->path.dentry) { audit_log_d_path(ab, " path=", &u->path); break; } len = addr->len-sizeof(short); p = &addr->name->sun_path[0]; audit_log_format(ab, " path="); if (*p) audit_log_untrustedstring(ab, p); else audit_log_n_hex(ab, p, len); break; } } switch (a->u.net->family) { case AF_INET: print_ipv4_addr(ab, a->u.net->v4info.saddr, a->u.net->sport, "saddr", "src"); print_ipv4_addr(ab, a->u.net->v4info.daddr, a->u.net->dport, "daddr", "dest"); break; case AF_INET6: print_ipv6_addr(ab, &a->u.net->v6info.saddr, a->u.net->sport, "saddr", "src"); print_ipv6_addr(ab, &a->u.net->v6info.daddr, a->u.net->dport, "daddr", "dest"); break; } if (a->u.net->netif > 0) { struct net_device *dev; /* NOTE: we always use init's namespace */ dev = dev_get_by_index(&init_net, a->u.net->netif); if (dev) { audit_log_format(ab, " netif=%s", dev->name); dev_put(dev); } } break; #ifdef CONFIG_KEYS case LSM_AUDIT_DATA_KEY: audit_log_format(ab, " key_serial=%u", a->u.key_struct.key); if (a->u.key_struct.key_desc) { audit_log_format(ab, " key_desc="); audit_log_untrustedstring(ab, a->u.key_struct.key_desc); } break; #endif case LSM_AUDIT_DATA_KMOD: audit_log_format(ab, " kmod="); audit_log_untrustedstring(ab, a->u.kmod_name); break; case LSM_AUDIT_DATA_IBPKEY: { struct in6_addr sbn_pfx; memset(&sbn_pfx.s6_addr, 0, sizeof(sbn_pfx.s6_addr)); memcpy(&sbn_pfx.s6_addr, &a->u.ibpkey->subnet_prefix, sizeof(a->u.ibpkey->subnet_prefix)); audit_log_format(ab, " pkey=0x%x subnet_prefix=%pI6c", a->u.ibpkey->pkey, &sbn_pfx); break; } case LSM_AUDIT_DATA_IBENDPORT: audit_log_format(ab, " device=%s port_num=%u", a->u.ibendport->dev_name, a->u.ibendport->port); break; case LSM_AUDIT_DATA_LOCKDOWN: audit_log_format(ab, " lockdown_reason=\"%s\"", lockdown_reasons[a->u.reason]); break; case LSM_AUDIT_DATA_ANONINODE: audit_log_format(ab, " anonclass=%s", a->u.anonclass); break; case LSM_AUDIT_DATA_NLMSGTYPE: audit_log_format(ab, " nl-msgtype=%hu", a->u.nlmsg_type); break; } /* switch (a->type) */ } /** * common_lsm_audit - generic LSM auditing function * @a: auxiliary audit data * @pre_audit: lsm-specific pre-audit callback * @post_audit: lsm-specific post-audit callback * * setup the audit buffer for common security information * uses callback to print LSM specific information */ void common_lsm_audit(struct common_audit_data *a, void (*pre_audit)(struct audit_buffer *, void *), void (*post_audit)(struct audit_buffer *, void *)) { struct audit_buffer *ab; if (a == NULL) return; /* we use GFP_ATOMIC so we won't sleep */ ab = audit_log_start(audit_context(), GFP_ATOMIC | __GFP_NOWARN, AUDIT_AVC); if (ab == NULL) return; if (pre_audit) pre_audit(ab, a); dump_common_audit_data(ab, a); if (post_audit) post_audit(ab, a); audit_log_end(ab); }
1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET 802.1Q VLAN * Ethernet-type device handling. * * Authors: Ben Greear <greearb@candelatech.com> * Please send support related email to: netdev@vger.kernel.org * VLAN Home Page: http://www.candelatech.com/~greear/vlan.html * * Fixes: * Fix for packet capture - Nick Eggleston <nick@dccinc.com>; * Add HW acceleration hooks - David S. Miller <davem@redhat.com>; * Correct all the locking - David S. Miller <davem@redhat.com>; * Use hash table for VLAN groups - David S. Miller <davem@redhat.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/capability.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/rculist.h> #include <net/p8022.h> #include <net/arp.h> #include <linux/rtnetlink.h> #include <linux/notifier.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/uaccess.h> #include <linux/if_vlan.h> #include "vlan.h" #include "vlanproc.h" #define DRV_VERSION "1.8" /* Global VLAN variables */ unsigned int vlan_net_id __read_mostly; const char vlan_fullname[] = "802.1Q VLAN Support"; const char vlan_version[] = DRV_VERSION; /* End of global variables definitions. */ static int vlan_group_prealloc_vid(struct vlan_group *vg, __be16 vlan_proto, u16 vlan_id) { struct net_device **array; unsigned int vidx; unsigned int size; int pidx; ASSERT_RTNL(); pidx = vlan_proto_idx(vlan_proto); if (pidx < 0) return -EINVAL; vidx = vlan_id / VLAN_GROUP_ARRAY_PART_LEN; array = vg->vlan_devices_arrays[pidx][vidx]; if (array != NULL) return 0; size = sizeof(struct net_device *) * VLAN_GROUP_ARRAY_PART_LEN; array = kzalloc(size, GFP_KERNEL_ACCOUNT); if (array == NULL) return -ENOBUFS; /* paired with smp_rmb() in __vlan_group_get_device() */ smp_wmb(); vg->vlan_devices_arrays[pidx][vidx] = array; return 0; } static void vlan_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev, struct vlan_dev_priv *vlan) { if (!(vlan->flags & VLAN_FLAG_BRIDGE_BINDING)) netif_stacked_transfer_operstate(rootdev, dev); } void unregister_vlan_dev(struct net_device *dev, struct list_head *head) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; struct vlan_info *vlan_info; struct vlan_group *grp; u16 vlan_id = vlan->vlan_id; ASSERT_RTNL(); vlan_info = rtnl_dereference(real_dev->vlan_info); BUG_ON(!vlan_info); grp = &vlan_info->grp; grp->nr_vlan_devs--; if (vlan->flags & VLAN_FLAG_MVRP) vlan_mvrp_request_leave(dev); if (vlan->flags & VLAN_FLAG_GVRP) vlan_gvrp_request_leave(dev); vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, NULL); netdev_upper_dev_unlink(real_dev, dev); /* Because unregister_netdevice_queue() makes sure at least one rcu * grace period is respected before device freeing, * we dont need to call synchronize_net() here. */ unregister_netdevice_queue(dev, head); if (grp->nr_vlan_devs == 0) { vlan_mvrp_uninit_applicant(real_dev); vlan_gvrp_uninit_applicant(real_dev); } vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); } int vlan_check_real_dev(struct net_device *real_dev, __be16 protocol, u16 vlan_id, struct netlink_ext_ack *extack) { const char *name = real_dev->name; if (real_dev->features & NETIF_F_VLAN_CHALLENGED || real_dev->type != ARPHRD_ETHER) { pr_info("VLANs not supported on %s\n", name); NL_SET_ERR_MSG_MOD(extack, "VLANs not supported on device"); return -EOPNOTSUPP; } if (vlan_find_dev(real_dev, protocol, vlan_id) != NULL) { NL_SET_ERR_MSG_MOD(extack, "VLAN device already exists"); return -EEXIST; } return 0; } int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; u16 vlan_id = vlan->vlan_id; struct vlan_info *vlan_info; struct vlan_group *grp; int err; err = vlan_vid_add(real_dev, vlan->vlan_proto, vlan_id); if (err) return err; vlan_info = rtnl_dereference(real_dev->vlan_info); /* vlan_info should be there now. vlan_vid_add took care of it */ BUG_ON(!vlan_info); grp = &vlan_info->grp; if (grp->nr_vlan_devs == 0) { err = vlan_gvrp_init_applicant(real_dev); if (err < 0) goto out_vid_del; err = vlan_mvrp_init_applicant(real_dev); if (err < 0) goto out_uninit_gvrp; } err = vlan_group_prealloc_vid(grp, vlan->vlan_proto, vlan_id); if (err < 0) goto out_uninit_mvrp; err = register_netdevice(dev); if (err < 0) goto out_uninit_mvrp; err = netdev_upper_dev_link(real_dev, dev, extack); if (err) goto out_unregister_netdev; vlan_stacked_transfer_operstate(real_dev, dev, vlan); linkwatch_fire_event(dev); /* _MUST_ call rfc2863_policy() */ /* So, got the sucker initialized, now lets place * it into our local structure. */ vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, dev); grp->nr_vlan_devs++; return 0; out_unregister_netdev: unregister_netdevice(dev); out_uninit_mvrp: if (grp->nr_vlan_devs == 0) vlan_mvrp_uninit_applicant(real_dev); out_uninit_gvrp: if (grp->nr_vlan_devs == 0) vlan_gvrp_uninit_applicant(real_dev); out_vid_del: vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); return err; } /* Attach a VLAN device to a mac address (ie Ethernet Card). * Returns 0 if the device was created or a negative error code otherwise. */ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) { struct net_device *new_dev; struct vlan_dev_priv *vlan; struct net *net = dev_net(real_dev); struct vlan_net *vn = net_generic(net, vlan_net_id); char name[IFNAMSIZ]; int err; if (vlan_id >= VLAN_VID_MASK) return -ERANGE; err = vlan_check_real_dev(real_dev, htons(ETH_P_8021Q), vlan_id, NULL); if (err < 0) return err; /* Gotta set up the fields for the device. */ switch (vn->name_type) { case VLAN_NAME_TYPE_RAW_PLUS_VID: /* name will look like: eth1.0005 */ snprintf(name, IFNAMSIZ, "%s.%.4i", real_dev->name, vlan_id); break; case VLAN_NAME_TYPE_PLUS_VID_NO_PAD: /* Put our vlan.VID in the name. * Name will look like: vlan5 */ snprintf(name, IFNAMSIZ, "vlan%i", vlan_id); break; case VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD: /* Put our vlan.VID in the name. * Name will look like: eth0.5 */ snprintf(name, IFNAMSIZ, "%s.%i", real_dev->name, vlan_id); break; case VLAN_NAME_TYPE_PLUS_VID: /* Put our vlan.VID in the name. * Name will look like: vlan0005 */ default: snprintf(name, IFNAMSIZ, "vlan%.4i", vlan_id); } new_dev = alloc_netdev(sizeof(struct vlan_dev_priv), name, NET_NAME_UNKNOWN, vlan_setup); if (new_dev == NULL) return -ENOBUFS; dev_net_set(new_dev, net); /* need 4 bytes for extra VLAN header info, * hope the underlying device can handle it. */ new_dev->mtu = real_dev->mtu; vlan = vlan_dev_priv(new_dev); vlan->vlan_proto = htons(ETH_P_8021Q); vlan->vlan_id = vlan_id; vlan->real_dev = real_dev; vlan->dent = NULL; vlan->flags = VLAN_FLAG_REORDER_HDR; new_dev->rtnl_link_ops = &vlan_link_ops; err = register_vlan_dev(new_dev, NULL); if (err < 0) goto out_free_newdev; return 0; out_free_newdev: free_netdev(new_dev); return err; } static void vlan_sync_address(struct net_device *dev, struct net_device *vlandev) { struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); /* May be called without an actual change */ if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr)) return; /* vlan continues to inherit address of lower device */ if (vlan_dev_inherit_address(vlandev, dev)) goto out; /* vlan address was different from the old address and is equal to * the new address */ if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) dev_uc_del(dev, vlandev->dev_addr); /* vlan address was equal to the old address and is different from * the new address */ if (ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && !ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) dev_uc_add(dev, vlandev->dev_addr); out: ether_addr_copy(vlan->real_dev_addr, dev->dev_addr); } static void vlan_transfer_features(struct net_device *dev, struct net_device *vlandev) { struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); netif_inherit_tso_max(vlandev, dev); if (vlan_hw_offload_capable(dev->features, vlan->vlan_proto)) vlandev->hard_header_len = dev->hard_header_len; else vlandev->hard_header_len = dev->hard_header_len + VLAN_HLEN; #if IS_ENABLED(CONFIG_FCOE) vlandev->fcoe_ddp_xid = dev->fcoe_ddp_xid; #endif vlandev->priv_flags &= ~IFF_XMIT_DST_RELEASE; vlandev->priv_flags |= (vlan->real_dev->priv_flags & IFF_XMIT_DST_RELEASE); vlandev->hw_enc_features = vlan_tnl_features(vlan->real_dev); netdev_update_features(vlandev); } static int __vlan_device_event(struct net_device *dev, unsigned long event) { int err = 0; switch (event) { case NETDEV_CHANGENAME: vlan_proc_rem_dev(dev); err = vlan_proc_add_dev(dev); break; case NETDEV_REGISTER: err = vlan_proc_add_dev(dev); break; case NETDEV_UNREGISTER: vlan_proc_rem_dev(dev); break; } return err; } static int vlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct vlan_group *grp; struct vlan_info *vlan_info; int i, flgs; struct net_device *vlandev; struct vlan_dev_priv *vlan; bool last = false; LIST_HEAD(list); int err; if (is_vlan_dev(dev)) { int err = __vlan_device_event(dev, event); if (err) return notifier_from_errno(err); } if ((event == NETDEV_UP) && (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) { pr_info("adding VLAN 0 to HW filter on device %s\n", dev->name); vlan_vid_add(dev, htons(ETH_P_8021Q), 0); } if (event == NETDEV_DOWN && (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) vlan_vid_del(dev, htons(ETH_P_8021Q), 0); vlan_info = rtnl_dereference(dev->vlan_info); if (!vlan_info) goto out; grp = &vlan_info->grp; /* It is OK that we do not hold the group lock right now, * as we run under the RTNL lock. */ switch (event) { case NETDEV_CHANGE: /* Propagate real device state to vlan devices */ vlan_group_for_each_dev(grp, i, vlandev) vlan_stacked_transfer_operstate(dev, vlandev, vlan_dev_priv(vlandev)); break; case NETDEV_CHANGEADDR: /* Adjust unicast filters on underlying device */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = vlandev->flags; if (!(flgs & IFF_UP)) continue; vlan_sync_address(dev, vlandev); } break; case NETDEV_CHANGEMTU: vlan_group_for_each_dev(grp, i, vlandev) { if (vlandev->mtu <= dev->mtu) continue; dev_set_mtu(vlandev, dev->mtu); } break; case NETDEV_FEAT_CHANGE: /* Propagate device features to underlying device */ vlan_group_for_each_dev(grp, i, vlandev) vlan_transfer_features(dev, vlandev); break; case NETDEV_DOWN: { struct net_device *tmp; LIST_HEAD(close_list); /* Put all VLANs for this dev in the down state too. */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = vlandev->flags; if (!(flgs & IFF_UP)) continue; vlan = vlan_dev_priv(vlandev); if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) list_add(&vlandev->close_list, &close_list); } dev_close_many(&close_list, false); list_for_each_entry_safe(vlandev, tmp, &close_list, close_list) { vlan_stacked_transfer_operstate(dev, vlandev, vlan_dev_priv(vlandev)); list_del_init(&vlandev->close_list); } list_del(&close_list); break; } case NETDEV_UP: /* Put all VLANs for this dev in the up state too. */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = dev_get_flags(vlandev); if (flgs & IFF_UP) continue; vlan = vlan_dev_priv(vlandev); if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) dev_change_flags(vlandev, flgs | IFF_UP, extack); vlan_stacked_transfer_operstate(dev, vlandev, vlan); } break; case NETDEV_UNREGISTER: /* twiddle thumbs on netns device moves */ if (dev->reg_state != NETREG_UNREGISTERING) break; vlan_group_for_each_dev(grp, i, vlandev) { /* removal of last vid destroys vlan_info, abort * afterwards */ if (vlan_info->nr_vids == 1) last = true; unregister_vlan_dev(vlandev, &list); if (last) break; } unregister_netdevice_many(&list); break; case NETDEV_PRE_TYPE_CHANGE: /* Forbid underlaying device to change its type. */ if (vlan_uses_dev(dev)) return NOTIFY_BAD; break; case NETDEV_NOTIFY_PEERS: case NETDEV_BONDING_FAILOVER: case NETDEV_RESEND_IGMP: /* Propagate to vlan devices */ vlan_group_for_each_dev(grp, i, vlandev) call_netdevice_notifiers(event, vlandev); break; case NETDEV_CVLAN_FILTER_PUSH_INFO: err = vlan_filter_push_vids(vlan_info, htons(ETH_P_8021Q)); if (err) return notifier_from_errno(err); break; case NETDEV_CVLAN_FILTER_DROP_INFO: vlan_filter_drop_vids(vlan_info, htons(ETH_P_8021Q)); break; case NETDEV_SVLAN_FILTER_PUSH_INFO: err = vlan_filter_push_vids(vlan_info, htons(ETH_P_8021AD)); if (err) return notifier_from_errno(err); break; case NETDEV_SVLAN_FILTER_DROP_INFO: vlan_filter_drop_vids(vlan_info, htons(ETH_P_8021AD)); break; } out: return NOTIFY_DONE; } static struct notifier_block vlan_notifier_block __read_mostly = { .notifier_call = vlan_device_event, }; /* * VLAN IOCTL handler. * o execute requested action or pass command to the device driver * arg is really a struct vlan_ioctl_args __user *. */ static int vlan_ioctl_handler(struct net *net, void __user *arg) { int err; struct vlan_ioctl_args args; struct net_device *dev = NULL; if (copy_from_user(&args, arg, sizeof(struct vlan_ioctl_args))) return -EFAULT; /* Null terminate this sucker, just in case. */ args.device1[sizeof(args.device1) - 1] = 0; args.u.device2[sizeof(args.u.device2) - 1] = 0; rtnl_lock(); switch (args.cmd) { case SET_VLAN_INGRESS_PRIORITY_CMD: case SET_VLAN_EGRESS_PRIORITY_CMD: case SET_VLAN_FLAG_CMD: case ADD_VLAN_CMD: case DEL_VLAN_CMD: case GET_VLAN_REALDEV_NAME_CMD: case GET_VLAN_VID_CMD: err = -ENODEV; dev = __dev_get_by_name(net, args.device1); if (!dev) goto out; err = -EINVAL; if (args.cmd != ADD_VLAN_CMD && !is_vlan_dev(dev)) goto out; } switch (args.cmd) { case SET_VLAN_INGRESS_PRIORITY_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; vlan_dev_set_ingress_priority(dev, args.u.skb_priority, args.vlan_qos); err = 0; break; case SET_VLAN_EGRESS_PRIORITY_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = vlan_dev_set_egress_priority(dev, args.u.skb_priority, args.vlan_qos); break; case SET_VLAN_FLAG_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = vlan_dev_change_flags(dev, args.vlan_qos ? args.u.flag : 0, args.u.flag); break; case SET_VLAN_NAME_TYPE_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; if (args.u.name_type < VLAN_NAME_TYPE_HIGHEST) { struct vlan_net *vn; vn = net_generic(net, vlan_net_id); vn->name_type = args.u.name_type; err = 0; } else { err = -EINVAL; } break; case ADD_VLAN_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = register_vlan_device(dev, args.u.VID); break; case DEL_VLAN_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; unregister_vlan_dev(dev, NULL); err = 0; break; case GET_VLAN_REALDEV_NAME_CMD: err = 0; vlan_dev_get_realdev_name(dev, args.u.device2, sizeof(args.u.device2)); if (copy_to_user(arg, &args, sizeof(struct vlan_ioctl_args))) err = -EFAULT; break; case GET_VLAN_VID_CMD: err = 0; args.u.VID = vlan_dev_vlan_id(dev); if (copy_to_user(arg, &args, sizeof(struct vlan_ioctl_args))) err = -EFAULT; break; default: err = -EOPNOTSUPP; break; } out: rtnl_unlock(); return err; } static int __net_init vlan_init_net(struct net *net) { struct vlan_net *vn = net_generic(net, vlan_net_id); int err; vn->name_type = VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD; err = vlan_proc_init(net); return err; } static void __net_exit vlan_exit_net(struct net *net) { vlan_proc_cleanup(net); } static struct pernet_operations vlan_net_ops = { .init = vlan_init_net, .exit = vlan_exit_net, .id = &vlan_net_id, .size = sizeof(struct vlan_net), }; static int __init vlan_proto_init(void) { int err; pr_info("%s v%s\n", vlan_fullname, vlan_version); err = register_pernet_subsys(&vlan_net_ops); if (err < 0) goto err0; err = register_netdevice_notifier(&vlan_notifier_block); if (err < 0) goto err2; err = vlan_gvrp_init(); if (err < 0) goto err3; err = vlan_mvrp_init(); if (err < 0) goto err4; err = vlan_netlink_init(); if (err < 0) goto err5; vlan_ioctl_set(vlan_ioctl_handler); return 0; err5: vlan_mvrp_uninit(); err4: vlan_gvrp_uninit(); err3: unregister_netdevice_notifier(&vlan_notifier_block); err2: unregister_pernet_subsys(&vlan_net_ops); err0: return err; } static void __exit vlan_cleanup_module(void) { vlan_ioctl_set(NULL); vlan_netlink_fini(); unregister_netdevice_notifier(&vlan_notifier_block); unregister_pernet_subsys(&vlan_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ vlan_mvrp_uninit(); vlan_gvrp_uninit(); } module_init(vlan_proto_init); module_exit(vlan_cleanup_module); MODULE_DESCRIPTION("802.1Q/802.1ad VLAN Protocol"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_VERSION);
2 3 1 10 3 2 9 9 4 8 9 1 1 3 6 1 6 2 6 47 48 7 11 7 31 22 21 10 32 1 38 4 21 8 23 43 44 25 22 16 2 25 2 23 4 23 3 15 9 16 9 9 16 2 1 1 6 2 1 7 7 7 1 2 6 8 30 1 2 10 8 9 19 3 1 15 13 11 4 11 46 2 24 21 31 11 1 12 12 29 3 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. */ #include <linux/slab.h> #include <linux/compat.h> #include <linux/cred.h> #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/msdos_fs.h> #include <linux/writeback.h> #include "exfat_raw.h" #include "exfat_fs.h" static int exfat_cont_expand(struct inode *inode, loff_t size) { int ret; unsigned int num_clusters, new_num_clusters, last_clu; struct exfat_inode_info *ei = EXFAT_I(inode); struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_chain clu; ret = inode_newsize_ok(inode, size); if (ret) return ret; num_clusters = EXFAT_B_TO_CLU(exfat_ondisk_size(inode), sbi); new_num_clusters = EXFAT_B_TO_CLU_ROUND_UP(size, sbi); if (new_num_clusters == num_clusters) goto out; if (num_clusters) { exfat_chain_set(&clu, ei->start_clu, num_clusters, ei->flags); ret = exfat_find_last_cluster(sb, &clu, &last_clu); if (ret) return ret; clu.dir = last_clu + 1; } else { last_clu = EXFAT_EOF_CLUSTER; clu.dir = EXFAT_EOF_CLUSTER; } clu.size = 0; clu.flags = ei->flags; ret = exfat_alloc_cluster(inode, new_num_clusters - num_clusters, &clu, inode_needs_sync(inode)); if (ret) return ret; /* Append new clusters to chain */ if (num_clusters) { if (clu.flags != ei->flags) if (exfat_chain_cont_cluster(sb, ei->start_clu, num_clusters)) goto free_clu; if (clu.flags == ALLOC_FAT_CHAIN) if (exfat_ent_set(sb, last_clu, clu.dir)) goto free_clu; } else ei->start_clu = clu.dir; ei->flags = clu.flags; out: inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); /* Expanded range not zeroed, do not update valid_size */ i_size_write(inode, size); inode->i_blocks = round_up(size, sbi->cluster_size) >> 9; mark_inode_dirty(inode); if (IS_SYNC(inode)) return write_inode_now(inode, 1); return 0; free_clu: exfat_free_cluster(inode, &clu); return -EIO; } static bool exfat_allow_set_time(struct mnt_idmap *idmap, struct exfat_sb_info *sbi, struct inode *inode) { mode_t allow_utime = sbi->options.allow_utime; if (!vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid())) { if (vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode))) allow_utime >>= 3; if (allow_utime & MAY_WRITE) return true; } /* use a default check */ return false; } static int exfat_sanitize_mode(const struct exfat_sb_info *sbi, struct inode *inode, umode_t *mode_ptr) { mode_t i_mode, mask, perm; i_mode = inode->i_mode; mask = (S_ISREG(i_mode) || S_ISLNK(i_mode)) ? sbi->options.fs_fmask : sbi->options.fs_dmask; perm = *mode_ptr & ~(S_IFMT | mask); /* Of the r and x bits, all (subject to umask) must be present.*/ if ((perm & 0555) != (i_mode & 0555)) return -EPERM; if (exfat_mode_can_hold_ro(inode)) { /* * Of the w bits, either all (subject to umask) or none must * be present. */ if ((perm & 0222) && ((perm & 0222) != (0222 & ~mask))) return -EPERM; } else { /* * If exfat_mode_can_hold_ro(inode) is false, can't change * w bits. */ if ((perm & 0222) != (0222 & ~mask)) return -EPERM; } *mode_ptr &= S_IFMT | perm; return 0; } /* resize the file length */ int __exfat_truncate(struct inode *inode) { unsigned int num_clusters_new, num_clusters_phys; unsigned int last_clu = EXFAT_FREE_CLUSTER; struct exfat_chain clu; struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(inode); /* check if the given file ID is opened */ if (ei->type != TYPE_FILE && ei->type != TYPE_DIR) return -EPERM; exfat_set_volume_dirty(sb); num_clusters_new = EXFAT_B_TO_CLU_ROUND_UP(i_size_read(inode), sbi); num_clusters_phys = EXFAT_B_TO_CLU(exfat_ondisk_size(inode), sbi); exfat_chain_set(&clu, ei->start_clu, num_clusters_phys, ei->flags); if (i_size_read(inode) > 0) { /* * Truncate FAT chain num_clusters after the first cluster * num_clusters = min(new, phys); */ unsigned int num_clusters = min(num_clusters_new, num_clusters_phys); /* * Follow FAT chain * (defensive coding - works fine even with corrupted FAT table */ if (clu.flags == ALLOC_NO_FAT_CHAIN) { clu.dir += num_clusters; clu.size -= num_clusters; } else { while (num_clusters > 0) { last_clu = clu.dir; if (exfat_get_next_cluster(sb, &(clu.dir))) return -EIO; num_clusters--; clu.size--; } } } else { ei->flags = ALLOC_NO_FAT_CHAIN; ei->start_clu = EXFAT_EOF_CLUSTER; } if (i_size_read(inode) < ei->valid_size) ei->valid_size = i_size_read(inode); if (ei->type == TYPE_FILE) ei->attr |= EXFAT_ATTR_ARCHIVE; /* * update the directory entry * * If the directory entry is updated by mark_inode_dirty(), the * directory entry will be written after a writeback cycle of * updating the bitmap/FAT, which may result in clusters being * freed but referenced by the directory entry in the event of a * sudden power failure. * __exfat_write_inode() is called for directory entry, bitmap * and FAT to be written in a same writeback. */ if (__exfat_write_inode(inode, inode_needs_sync(inode))) return -EIO; /* cut off from the FAT chain */ if (ei->flags == ALLOC_FAT_CHAIN && last_clu != EXFAT_FREE_CLUSTER && last_clu != EXFAT_EOF_CLUSTER) { if (exfat_ent_set(sb, last_clu, EXFAT_EOF_CLUSTER)) return -EIO; } /* invalidate cache and free the clusters */ /* clear exfat cache */ exfat_cache_inval_inode(inode); /* hint information */ ei->hint_bmap.off = EXFAT_EOF_CLUSTER; ei->hint_bmap.clu = EXFAT_EOF_CLUSTER; /* hint_stat will be used if this is directory. */ ei->hint_stat.eidx = 0; ei->hint_stat.clu = ei->start_clu; ei->hint_femp.eidx = EXFAT_HINT_NONE; /* free the clusters */ if (exfat_free_cluster(inode, &clu)) return -EIO; return 0; } void exfat_truncate(struct inode *inode) { struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(inode); int err; mutex_lock(&sbi->s_lock); if (ei->start_clu == 0) { /* * Empty start_clu != ~0 (not allocated) */ exfat_fs_error(sb, "tried to truncate zeroed cluster."); goto write_size; } err = __exfat_truncate(inode); if (err) goto write_size; inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> 9; write_size: mutex_unlock(&sbi->s_lock); } int exfat_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, unsigned int request_mask, unsigned int query_flags) { struct inode *inode = d_backing_inode(path->dentry); struct exfat_inode_info *ei = EXFAT_I(inode); generic_fillattr(idmap, request_mask, inode, stat); exfat_truncate_atime(&stat->atime); stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = ei->i_crtime.tv_sec; stat->btime.tv_nsec = ei->i_crtime.tv_nsec; stat->blksize = EXFAT_SB(inode->i_sb)->cluster_size; return 0; } int exfat_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct exfat_sb_info *sbi = EXFAT_SB(dentry->d_sb); struct inode *inode = dentry->d_inode; unsigned int ia_valid; int error; if (unlikely(exfat_forced_shutdown(inode->i_sb))) return -EIO; if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > i_size_read(inode)) { error = exfat_cont_expand(inode, attr->ia_size); if (error || attr->ia_valid == ATTR_SIZE) return error; attr->ia_valid &= ~ATTR_SIZE; } /* Check for setting the inode time. */ ia_valid = attr->ia_valid; if ((ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) && exfat_allow_set_time(idmap, sbi, inode)) { attr->ia_valid &= ~(ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET); } error = setattr_prepare(idmap, dentry, attr); attr->ia_valid = ia_valid; if (error) goto out; if (((attr->ia_valid & ATTR_UID) && (!uid_eq(from_vfsuid(idmap, i_user_ns(inode), attr->ia_vfsuid), sbi->options.fs_uid))) || ((attr->ia_valid & ATTR_GID) && (!gid_eq(from_vfsgid(idmap, i_user_ns(inode), attr->ia_vfsgid), sbi->options.fs_gid))) || ((attr->ia_valid & ATTR_MODE) && (attr->ia_mode & ~(S_IFREG | S_IFLNK | S_IFDIR | 0777)))) { error = -EPERM; goto out; } /* * We don't return -EPERM here. Yes, strange, but this is too * old behavior. */ if (attr->ia_valid & ATTR_MODE) { if (exfat_sanitize_mode(sbi, inode, &attr->ia_mode) < 0) attr->ia_valid &= ~ATTR_MODE; } if (attr->ia_valid & ATTR_SIZE) inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); setattr_copy(idmap, inode, attr); exfat_truncate_inode_atime(inode); if (attr->ia_valid & ATTR_SIZE) { error = exfat_block_truncate_page(inode, attr->ia_size); if (error) goto out; down_write(&EXFAT_I(inode)->truncate_lock); truncate_setsize(inode, attr->ia_size); /* * __exfat_write_inode() is called from exfat_truncate(), inode * is already written by it, so mark_inode_dirty() is unneeded. */ exfat_truncate(inode); up_write(&EXFAT_I(inode)->truncate_lock); } else mark_inode_dirty(inode); out: return error; } /* * modified ioctls from fat/file.c by Welmer Almesberger */ static int exfat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr) { u32 attr; inode_lock_shared(inode); attr = exfat_make_attr(inode); inode_unlock_shared(inode); return put_user(attr, user_attr); } static int exfat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) { struct inode *inode = file_inode(file); struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb); int is_dir = S_ISDIR(inode->i_mode); u32 attr, oldattr; struct iattr ia; int err; err = get_user(attr, user_attr); if (err) goto out; err = mnt_want_write_file(file); if (err) goto out; inode_lock(inode); oldattr = exfat_make_attr(inode); /* * Mask attributes so we don't set reserved fields. */ attr &= (EXFAT_ATTR_READONLY | EXFAT_ATTR_HIDDEN | EXFAT_ATTR_SYSTEM | EXFAT_ATTR_ARCHIVE); attr |= (is_dir ? EXFAT_ATTR_SUBDIR : 0); /* Equivalent to a chmod() */ ia.ia_valid = ATTR_MODE | ATTR_CTIME; ia.ia_ctime = current_time(inode); if (is_dir) ia.ia_mode = exfat_make_mode(sbi, attr, 0777); else ia.ia_mode = exfat_make_mode(sbi, attr, 0666 | (inode->i_mode & 0111)); /* The root directory has no attributes */ if (inode->i_ino == EXFAT_ROOT_INO && attr != EXFAT_ATTR_SUBDIR) { err = -EINVAL; goto out_unlock_inode; } if (((attr | oldattr) & EXFAT_ATTR_SYSTEM) && !capable(CAP_LINUX_IMMUTABLE)) { err = -EPERM; goto out_unlock_inode; } /* * The security check is questionable... We single * out the RO attribute for checking by the security * module, just because it maps to a file mode. */ err = security_inode_setattr(file_mnt_idmap(file), file->f_path.dentry, &ia); if (err) goto out_unlock_inode; /* This MUST be done before doing anything irreversible... */ err = exfat_setattr(file_mnt_idmap(file), file->f_path.dentry, &ia); if (err) goto out_unlock_inode; fsnotify_change(file->f_path.dentry, ia.ia_valid); exfat_save_attr(inode, attr); mark_inode_dirty(inode); out_unlock_inode: inode_unlock(inode); mnt_drop_write_file(file); out: return err; } static int exfat_ioctl_fitrim(struct inode *inode, unsigned long arg) { struct fstrim_range range; int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!bdev_max_discard_sectors(inode->i_sb->s_bdev)) return -EOPNOTSUPP; if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) return -EFAULT; range.minlen = max_t(unsigned int, range.minlen, bdev_discard_granularity(inode->i_sb->s_bdev)); ret = exfat_trim_fs(inode, &range); if (ret < 0) return ret; if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) return -EFAULT; return 0; } static int exfat_ioctl_shutdown(struct super_block *sb, unsigned long arg) { u32 flags; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (get_user(flags, (__u32 __user *)arg)) return -EFAULT; return exfat_force_shutdown(sb, flags); } long exfat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); u32 __user *user_attr = (u32 __user *)arg; switch (cmd) { case FAT_IOCTL_GET_ATTRIBUTES: return exfat_ioctl_get_attributes(inode, user_attr); case FAT_IOCTL_SET_ATTRIBUTES: return exfat_ioctl_set_attributes(filp, user_attr); case EXFAT_IOC_SHUTDOWN: return exfat_ioctl_shutdown(inode->i_sb, arg); case FITRIM: return exfat_ioctl_fitrim(inode, arg); default: return -ENOTTY; } } #ifdef CONFIG_COMPAT long exfat_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { return exfat_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); } #endif int exfat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct inode *inode = filp->f_mapping->host; int err; if (unlikely(exfat_forced_shutdown(inode->i_sb))) return -EIO; err = __generic_file_fsync(filp, start, end, datasync); if (err) return err; err = sync_blockdev(inode->i_sb->s_bdev); if (err) return err; return blkdev_issue_flush(inode->i_sb->s_bdev); } static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size) { int err; loff_t pos; struct inode *inode = file_inode(file); struct exfat_inode_info *ei = EXFAT_I(inode); struct address_space *mapping = inode->i_mapping; const struct address_space_operations *ops = mapping->a_ops; pos = ei->valid_size; while (pos < new_valid_size) { u32 len; struct folio *folio; unsigned long off; len = PAGE_SIZE - (pos & (PAGE_SIZE - 1)); if (pos + len > new_valid_size) len = new_valid_size - pos; err = ops->write_begin(file, mapping, pos, len, &folio, NULL); if (err) goto out; off = offset_in_folio(folio, pos); folio_zero_new_buffers(folio, off, off + len); err = ops->write_end(file, mapping, pos, len, len, folio, NULL); if (err < 0) goto out; pos += len; balance_dirty_pages_ratelimited(mapping); cond_resched(); } return 0; out: return err; } static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) { ssize_t ret; struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct exfat_inode_info *ei = EXFAT_I(inode); loff_t pos = iocb->ki_pos; loff_t valid_size; inode_lock(inode); valid_size = ei->valid_size; ret = generic_write_checks(iocb, iter); if (ret <= 0) goto unlock; if (iocb->ki_flags & IOCB_DIRECT) { unsigned long align = pos | iov_iter_alignment(iter); if (!IS_ALIGNED(align, i_blocksize(inode)) && !IS_ALIGNED(align, bdev_logical_block_size(inode->i_sb->s_bdev))) { ret = -EINVAL; goto unlock; } } if (pos > valid_size) { ret = exfat_extend_valid_size(file, pos); if (ret < 0 && ret != -ENOSPC) { exfat_err(inode->i_sb, "write: fail to zero from %llu to %llu(%zd)", valid_size, pos, ret); } if (ret < 0) goto unlock; } ret = __generic_file_write_iter(iocb, iter); if (ret < 0) goto unlock; inode_unlock(inode); if (pos > valid_size) pos = valid_size; if (iocb_is_dsync(iocb) && iocb->ki_pos > pos) { ssize_t err = vfs_fsync_range(file, pos, iocb->ki_pos - 1, iocb->ki_flags & IOCB_SYNC); if (err < 0) return err; } return ret; unlock: inode_unlock(inode); return ret; } static vm_fault_t exfat_page_mkwrite(struct vm_fault *vmf) { int err; struct vm_area_struct *vma = vmf->vma; struct file *file = vma->vm_file; struct inode *inode = file_inode(file); struct exfat_inode_info *ei = EXFAT_I(inode); loff_t start, end; if (!inode_trylock(inode)) return VM_FAULT_RETRY; start = ((loff_t)vma->vm_pgoff << PAGE_SHIFT); end = min_t(loff_t, i_size_read(inode), start + vma->vm_end - vma->vm_start); if (ei->valid_size < end) { err = exfat_extend_valid_size(file, end); if (err < 0) { inode_unlock(inode); return vmf_fs_error(err); } } inode_unlock(inode); return filemap_page_mkwrite(vmf); } static const struct vm_operations_struct exfat_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = exfat_page_mkwrite, }; static int exfat_file_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); vma->vm_ops = &exfat_file_vm_ops; return 0; } const struct file_operations exfat_file_operations = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .write_iter = exfat_file_write_iter, .unlocked_ioctl = exfat_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = exfat_compat_ioctl, #endif .mmap = exfat_file_mmap, .fsync = exfat_file_fsync, .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, }; const struct inode_operations exfat_file_inode_operations = { .setattr = exfat_setattr, .getattr = exfat_getattr, };
39 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_GENERIC_BITOPS_LE_H_ #define _ASM_GENERIC_BITOPS_LE_H_ #include <asm/types.h> #include <asm/byteorder.h> #if defined(__LITTLE_ENDIAN) #define BITOP_LE_SWIZZLE 0 #elif defined(__BIG_ENDIAN) #define BITOP_LE_SWIZZLE ((BITS_PER_LONG-1) & ~0x7) #endif static inline int test_bit_le(int nr, const void *addr) { return test_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline void set_bit_le(int nr, void *addr) { set_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline void clear_bit_le(int nr, void *addr) { clear_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline void __set_bit_le(int nr, void *addr) { __set_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline void __clear_bit_le(int nr, void *addr) { __clear_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline int test_and_set_bit_le(int nr, void *addr) { return test_and_set_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline int test_and_clear_bit_le(int nr, void *addr) { return test_and_clear_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline int __test_and_set_bit_le(int nr, void *addr) { return __test_and_set_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline int __test_and_clear_bit_le(int nr, void *addr) { return __test_and_clear_bit(nr ^ BITOP_LE_SWIZZLE, addr); } #endif /* _ASM_GENERIC_BITOPS_LE_H_ */
3 2 1 3 1 1 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 // SPDX-License-Identifier: GPL-2.0 /* * virtio-fs: Virtio Filesystem * Copyright (C) 2018 Red Hat, Inc. */ #include <linux/fs.h> #include <linux/dax.h> #include <linux/pci.h> #include <linux/interrupt.h> #include <linux/group_cpus.h> #include <linux/pfn_t.h> #include <linux/memremap.h> #include <linux/module.h> #include <linux/virtio.h> #include <linux/virtio_fs.h> #include <linux/delay.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/highmem.h> #include <linux/cleanup.h> #include <linux/uio.h> #include "fuse_i.h" /* Used to help calculate the FUSE connection's max_pages limit for a request's * size. Parts of the struct fuse_req are sliced into scattergather lists in * addition to the pages used, so this can help account for that overhead. */ #define FUSE_HEADER_OVERHEAD 4 /* List of virtio-fs device instances and a lock for the list. Also provides * mutual exclusion in device removal and mounting path */ static DEFINE_MUTEX(virtio_fs_mutex); static LIST_HEAD(virtio_fs_instances); /* The /sys/fs/virtio_fs/ kset */ static struct kset *virtio_fs_kset; enum { VQ_HIPRIO, VQ_REQUEST }; #define VQ_NAME_LEN 24 /* Per-virtqueue state */ struct virtio_fs_vq { spinlock_t lock; struct virtqueue *vq; /* protected by ->lock */ struct work_struct done_work; struct list_head queued_reqs; struct list_head end_reqs; /* End these requests */ struct work_struct dispatch_work; struct fuse_dev *fud; bool connected; long in_flight; struct completion in_flight_zero; /* No inflight requests */ struct kobject *kobj; char name[VQ_NAME_LEN]; } ____cacheline_aligned_in_smp; /* A virtio-fs device instance */ struct virtio_fs { struct kobject kobj; struct kobject *mqs_kobj; struct list_head list; /* on virtio_fs_instances */ char *tag; struct virtio_fs_vq *vqs; unsigned int nvqs; /* number of virtqueues */ unsigned int num_request_queues; /* number of request queues */ struct dax_device *dax_dev; unsigned int *mq_map; /* index = cpu id, value = request vq id */ /* DAX memory window where file contents are mapped */ void *window_kaddr; phys_addr_t window_phys_addr; size_t window_len; }; struct virtio_fs_forget_req { struct fuse_in_header ih; struct fuse_forget_in arg; }; struct virtio_fs_forget { /* This request can be temporarily queued on virt queue */ struct list_head list; struct virtio_fs_forget_req req; }; struct virtio_fs_req_work { struct fuse_req *req; struct virtio_fs_vq *fsvq; struct work_struct done_work; }; static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, struct fuse_req *req, bool in_flight, gfp_t gfp); static const struct constant_table dax_param_enums[] = { {"always", FUSE_DAX_ALWAYS }, {"never", FUSE_DAX_NEVER }, {"inode", FUSE_DAX_INODE_USER }, {} }; enum { OPT_DAX, OPT_DAX_ENUM, }; static const struct fs_parameter_spec virtio_fs_parameters[] = { fsparam_flag("dax", OPT_DAX), fsparam_enum("dax", OPT_DAX_ENUM, dax_param_enums), {} }; static int virtio_fs_parse_param(struct fs_context *fsc, struct fs_parameter *param) { struct fs_parse_result result; struct fuse_fs_context *ctx = fsc->fs_private; int opt; opt = fs_parse(fsc, virtio_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case OPT_DAX: ctx->dax_mode = FUSE_DAX_ALWAYS; break; case OPT_DAX_ENUM: ctx->dax_mode = result.uint_32; break; default: return -EINVAL; } return 0; } static void virtio_fs_free_fsc(struct fs_context *fsc) { struct fuse_fs_context *ctx = fsc->fs_private; kfree(ctx); } static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq) { struct virtio_fs *fs = vq->vdev->priv; return &fs->vqs[vq->index]; } /* Should be called with fsvq->lock held. */ static inline void inc_in_flight_req(struct virtio_fs_vq *fsvq) { fsvq->in_flight++; } /* Should be called with fsvq->lock held. */ static inline void dec_in_flight_req(struct virtio_fs_vq *fsvq) { WARN_ON(fsvq->in_flight <= 0); fsvq->in_flight--; if (!fsvq->in_flight) complete(&fsvq->in_flight_zero); } static ssize_t tag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct virtio_fs *fs = container_of(kobj, struct virtio_fs, kobj); return sysfs_emit(buf, "%s\n", fs->tag); } static struct kobj_attribute virtio_fs_tag_attr = __ATTR_RO(tag); static struct attribute *virtio_fs_attrs[] = { &virtio_fs_tag_attr.attr, NULL }; ATTRIBUTE_GROUPS(virtio_fs); static void virtio_fs_ktype_release(struct kobject *kobj) { struct virtio_fs *vfs = container_of(kobj, struct virtio_fs, kobj); kfree(vfs->mq_map); kfree(vfs->vqs); kfree(vfs); } static const struct kobj_type virtio_fs_ktype = { .release = virtio_fs_ktype_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = virtio_fs_groups, }; static struct virtio_fs_vq *virtio_fs_kobj_to_vq(struct virtio_fs *fs, struct kobject *kobj) { int i; for (i = 0; i < fs->nvqs; i++) { if (kobj == fs->vqs[i].kobj) return &fs->vqs[i]; } return NULL; } static ssize_t name_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct virtio_fs *fs = container_of(kobj->parent->parent, struct virtio_fs, kobj); struct virtio_fs_vq *fsvq = virtio_fs_kobj_to_vq(fs, kobj); if (!fsvq) return -EINVAL; return sysfs_emit(buf, "%s\n", fsvq->name); } static struct kobj_attribute virtio_fs_vq_name_attr = __ATTR_RO(name); static ssize_t cpu_list_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct virtio_fs *fs = container_of(kobj->parent->parent, struct virtio_fs, kobj); struct virtio_fs_vq *fsvq = virtio_fs_kobj_to_vq(fs, kobj); unsigned int cpu, qid; const size_t size = PAGE_SIZE - 1; bool first = true; int ret = 0, pos = 0; if (!fsvq) return -EINVAL; qid = fsvq->vq->index; for (cpu = 0; cpu < nr_cpu_ids; cpu++) { if (qid < VQ_REQUEST || (fs->mq_map[cpu] == qid)) { if (first) ret = snprintf(buf + pos, size - pos, "%u", cpu); else ret = snprintf(buf + pos, size - pos, ", %u", cpu); if (ret >= size - pos) break; first = false; pos += ret; } } ret = snprintf(buf + pos, size + 1 - pos, "\n"); return pos + ret; } static struct kobj_attribute virtio_fs_vq_cpu_list_attr = __ATTR_RO(cpu_list); static struct attribute *virtio_fs_vq_attrs[] = { &virtio_fs_vq_name_attr.attr, &virtio_fs_vq_cpu_list_attr.attr, NULL }; static struct attribute_group virtio_fs_vq_attr_group = { .attrs = virtio_fs_vq_attrs, }; /* Make sure virtiofs_mutex is held */ static void virtio_fs_put_locked(struct virtio_fs *fs) { lockdep_assert_held(&virtio_fs_mutex); kobject_put(&fs->kobj); } static void virtio_fs_put(struct virtio_fs *fs) { mutex_lock(&virtio_fs_mutex); virtio_fs_put_locked(fs); mutex_unlock(&virtio_fs_mutex); } static void virtio_fs_fiq_release(struct fuse_iqueue *fiq) { struct virtio_fs *vfs = fiq->priv; virtio_fs_put(vfs); } static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq) { WARN_ON(fsvq->in_flight < 0); /* Wait for in flight requests to finish.*/ spin_lock(&fsvq->lock); if (fsvq->in_flight) { /* We are holding virtio_fs_mutex. There should not be any * waiters waiting for completion. */ reinit_completion(&fsvq->in_flight_zero); spin_unlock(&fsvq->lock); wait_for_completion(&fsvq->in_flight_zero); } else { spin_unlock(&fsvq->lock); } flush_work(&fsvq->done_work); flush_work(&fsvq->dispatch_work); } static void virtio_fs_drain_all_queues_locked(struct virtio_fs *fs) { struct virtio_fs_vq *fsvq; int i; for (i = 0; i < fs->nvqs; i++) { fsvq = &fs->vqs[i]; virtio_fs_drain_queue(fsvq); } } static void virtio_fs_drain_all_queues(struct virtio_fs *fs) { /* Provides mutual exclusion between ->remove and ->kill_sb * paths. We don't want both of these draining queue at the * same time. Current completion logic reinits completion * and that means there should not be any other thread * doing reinit or waiting for completion already. */ mutex_lock(&virtio_fs_mutex); virtio_fs_drain_all_queues_locked(fs); mutex_unlock(&virtio_fs_mutex); } static void virtio_fs_start_all_queues(struct virtio_fs *fs) { struct virtio_fs_vq *fsvq; int i; for (i = 0; i < fs->nvqs; i++) { fsvq = &fs->vqs[i]; spin_lock(&fsvq->lock); fsvq->connected = true; spin_unlock(&fsvq->lock); } } static void virtio_fs_delete_queues_sysfs(struct virtio_fs *fs) { struct virtio_fs_vq *fsvq; int i; for (i = 0; i < fs->nvqs; i++) { fsvq = &fs->vqs[i]; kobject_put(fsvq->kobj); } } static int virtio_fs_add_queues_sysfs(struct virtio_fs *fs) { struct virtio_fs_vq *fsvq; char buff[12]; int i, j, ret; for (i = 0; i < fs->nvqs; i++) { fsvq = &fs->vqs[i]; sprintf(buff, "%d", i); fsvq->kobj = kobject_create_and_add(buff, fs->mqs_kobj); if (!fs->mqs_kobj) { ret = -ENOMEM; goto out_del; } ret = sysfs_create_group(fsvq->kobj, &virtio_fs_vq_attr_group); if (ret) { kobject_put(fsvq->kobj); goto out_del; } } return 0; out_del: for (j = 0; j < i; j++) { fsvq = &fs->vqs[j]; kobject_put(fsvq->kobj); } return ret; } /* Add a new instance to the list or return -EEXIST if tag name exists*/ static int virtio_fs_add_instance(struct virtio_device *vdev, struct virtio_fs *fs) { struct virtio_fs *fs2; int ret; mutex_lock(&virtio_fs_mutex); list_for_each_entry(fs2, &virtio_fs_instances, list) { if (strcmp(fs->tag, fs2->tag) == 0) { mutex_unlock(&virtio_fs_mutex); return -EEXIST; } } /* Use the virtio_device's index as a unique identifier, there is no * need to allocate our own identifiers because the virtio_fs instance * is only visible to userspace as long as the underlying virtio_device * exists. */ fs->kobj.kset = virtio_fs_kset; ret = kobject_add(&fs->kobj, NULL, "%d", vdev->index); if (ret < 0) goto out_unlock; fs->mqs_kobj = kobject_create_and_add("mqs", &fs->kobj); if (!fs->mqs_kobj) { ret = -ENOMEM; goto out_del; } ret = sysfs_create_link(&fs->kobj, &vdev->dev.kobj, "device"); if (ret < 0) goto out_put; ret = virtio_fs_add_queues_sysfs(fs); if (ret) goto out_remove; list_add_tail(&fs->list, &virtio_fs_instances); mutex_unlock(&virtio_fs_mutex); kobject_uevent(&fs->kobj, KOBJ_ADD); return 0; out_remove: sysfs_remove_link(&fs->kobj, "device"); out_put: kobject_put(fs->mqs_kobj); out_del: kobject_del(&fs->kobj); out_unlock: mutex_unlock(&virtio_fs_mutex); return ret; } /* Return the virtio_fs with a given tag, or NULL */ static struct virtio_fs *virtio_fs_find_instance(const char *tag) { struct virtio_fs *fs; mutex_lock(&virtio_fs_mutex); list_for_each_entry(fs, &virtio_fs_instances, list) { if (strcmp(fs->tag, tag) == 0) { kobject_get(&fs->kobj); goto found; } } fs = NULL; /* not found */ found: mutex_unlock(&virtio_fs_mutex); return fs; } static void virtio_fs_free_devs(struct virtio_fs *fs) { unsigned int i; for (i = 0; i < fs->nvqs; i++) { struct virtio_fs_vq *fsvq = &fs->vqs[i]; if (!fsvq->fud) continue; fuse_dev_free(fsvq->fud); fsvq->fud = NULL; } } /* Read filesystem name from virtio config into fs->tag (must kfree()). */ static int virtio_fs_read_tag(struct virtio_device *vdev, struct virtio_fs *fs) { char tag_buf[sizeof_field(struct virtio_fs_config, tag)]; char *end; size_t len; virtio_cread_bytes(vdev, offsetof(struct virtio_fs_config, tag), &tag_buf, sizeof(tag_buf)); end = memchr(tag_buf, '\0', sizeof(tag_buf)); if (end == tag_buf) return -EINVAL; /* empty tag */ if (!end) end = &tag_buf[sizeof(tag_buf)]; len = end - tag_buf; fs->tag = devm_kmalloc(&vdev->dev, len + 1, GFP_KERNEL); if (!fs->tag) return -ENOMEM; memcpy(fs->tag, tag_buf, len); fs->tag[len] = '\0'; /* While the VIRTIO specification allows any character, newlines are * awkward on mount(8) command-lines and cause problems in the sysfs * "tag" attr and uevent TAG= properties. Forbid them. */ if (strchr(fs->tag, '\n')) { dev_dbg(&vdev->dev, "refusing virtiofs tag with newline character\n"); return -EINVAL; } dev_info(&vdev->dev, "discovered new tag: %s\n", fs->tag); return 0; } /* Work function for hiprio completion */ static void virtio_fs_hiprio_done_work(struct work_struct *work) { struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, done_work); struct virtqueue *vq = fsvq->vq; /* Free completed FUSE_FORGET requests */ spin_lock(&fsvq->lock); do { unsigned int len; void *req; virtqueue_disable_cb(vq); while ((req = virtqueue_get_buf(vq, &len)) != NULL) { kfree(req); dec_in_flight_req(fsvq); } } while (!virtqueue_enable_cb(vq)); if (!list_empty(&fsvq->queued_reqs)) schedule_work(&fsvq->dispatch_work); spin_unlock(&fsvq->lock); } static void virtio_fs_request_dispatch_work(struct work_struct *work) { struct fuse_req *req; struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, dispatch_work); int ret; pr_debug("virtio-fs: worker %s called.\n", __func__); while (1) { spin_lock(&fsvq->lock); req = list_first_entry_or_null(&fsvq->end_reqs, struct fuse_req, list); if (!req) { spin_unlock(&fsvq->lock); break; } list_del_init(&req->list); spin_unlock(&fsvq->lock); fuse_request_end(req); } /* Dispatch pending requests */ while (1) { unsigned int flags; spin_lock(&fsvq->lock); req = list_first_entry_or_null(&fsvq->queued_reqs, struct fuse_req, list); if (!req) { spin_unlock(&fsvq->lock); return; } list_del_init(&req->list); spin_unlock(&fsvq->lock); flags = memalloc_nofs_save(); ret = virtio_fs_enqueue_req(fsvq, req, true, GFP_KERNEL); memalloc_nofs_restore(flags); if (ret < 0) { if (ret == -ENOSPC) { spin_lock(&fsvq->lock); list_add_tail(&req->list, &fsvq->queued_reqs); spin_unlock(&fsvq->lock); return; } req->out.h.error = ret; spin_lock(&fsvq->lock); dec_in_flight_req(fsvq); spin_unlock(&fsvq->lock); pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n", ret); fuse_request_end(req); } } } /* * Returns 1 if queue is full and sender should wait a bit before sending * next request, 0 otherwise. */ static int send_forget_request(struct virtio_fs_vq *fsvq, struct virtio_fs_forget *forget, bool in_flight) { struct scatterlist sg; struct virtqueue *vq; int ret = 0; bool notify; struct virtio_fs_forget_req *req = &forget->req; spin_lock(&fsvq->lock); if (!fsvq->connected) { if (in_flight) dec_in_flight_req(fsvq); kfree(forget); goto out; } sg_init_one(&sg, req, sizeof(*req)); vq = fsvq->vq; dev_dbg(&vq->vdev->dev, "%s\n", __func__); ret = virtqueue_add_outbuf(vq, &sg, 1, forget, GFP_ATOMIC); if (ret < 0) { if (ret == -ENOSPC) { pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later\n", ret); list_add_tail(&forget->list, &fsvq->queued_reqs); if (!in_flight) inc_in_flight_req(fsvq); /* Queue is full */ ret = 1; } else { pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n", ret); kfree(forget); if (in_flight) dec_in_flight_req(fsvq); } goto out; } if (!in_flight) inc_in_flight_req(fsvq); notify = virtqueue_kick_prepare(vq); spin_unlock(&fsvq->lock); if (notify) virtqueue_notify(vq); return ret; out: spin_unlock(&fsvq->lock); return ret; } static void virtio_fs_hiprio_dispatch_work(struct work_struct *work) { struct virtio_fs_forget *forget; struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, dispatch_work); pr_debug("virtio-fs: worker %s called.\n", __func__); while (1) { spin_lock(&fsvq->lock); forget = list_first_entry_or_null(&fsvq->queued_reqs, struct virtio_fs_forget, list); if (!forget) { spin_unlock(&fsvq->lock); return; } list_del(&forget->list); spin_unlock(&fsvq->lock); if (send_forget_request(fsvq, forget, true)) return; } } /* Allocate and copy args into req->argbuf */ static int copy_args_to_argbuf(struct fuse_req *req, gfp_t gfp) { struct fuse_args *args = req->args; unsigned int offset = 0; unsigned int num_in; unsigned int num_out; unsigned int len; unsigned int i; num_in = args->in_numargs - args->in_pages; num_out = args->out_numargs - args->out_pages; len = fuse_len_args(num_in, (struct fuse_arg *) args->in_args) + fuse_len_args(num_out, args->out_args); req->argbuf = kmalloc(len, gfp); if (!req->argbuf) return -ENOMEM; for (i = 0; i < num_in; i++) { memcpy(req->argbuf + offset, args->in_args[i].value, args->in_args[i].size); offset += args->in_args[i].size; } return 0; } /* Copy args out of and free req->argbuf */ static void copy_args_from_argbuf(struct fuse_args *args, struct fuse_req *req) { unsigned int remaining; unsigned int offset; unsigned int num_in; unsigned int num_out; unsigned int i; remaining = req->out.h.len - sizeof(req->out.h); num_in = args->in_numargs - args->in_pages; num_out = args->out_numargs - args->out_pages; offset = fuse_len_args(num_in, (struct fuse_arg *)args->in_args); for (i = 0; i < num_out; i++) { unsigned int argsize = args->out_args[i].size; if (args->out_argvar && i == args->out_numargs - 1 && argsize > remaining) { argsize = remaining; } memcpy(args->out_args[i].value, req->argbuf + offset, argsize); offset += argsize; if (i != args->out_numargs - 1) remaining -= argsize; } /* Store the actual size of the variable-length arg */ if (args->out_argvar) args->out_args[args->out_numargs - 1].size = remaining; kfree(req->argbuf); req->argbuf = NULL; } /* Work function for request completion */ static void virtio_fs_request_complete(struct fuse_req *req, struct virtio_fs_vq *fsvq) { struct fuse_pqueue *fpq = &fsvq->fud->pq; struct fuse_args *args; struct fuse_args_pages *ap; unsigned int len, i, thislen; struct folio *folio; /* * TODO verify that server properly follows FUSE protocol * (oh.uniq, oh.len) */ args = req->args; copy_args_from_argbuf(args, req); if (args->out_pages && args->page_zeroing) { len = args->out_args[args->out_numargs - 1].size; ap = container_of(args, typeof(*ap), args); for (i = 0; i < ap->num_folios; i++) { thislen = ap->descs[i].length; if (len < thislen) { WARN_ON(ap->descs[i].offset); folio = ap->folios[i]; folio_zero_segment(folio, len, thislen); len = 0; } else { len -= thislen; } } } spin_lock(&fpq->lock); clear_bit(FR_SENT, &req->flags); spin_unlock(&fpq->lock); fuse_request_end(req); spin_lock(&fsvq->lock); dec_in_flight_req(fsvq); spin_unlock(&fsvq->lock); } static void virtio_fs_complete_req_work(struct work_struct *work) { struct virtio_fs_req_work *w = container_of(work, typeof(*w), done_work); virtio_fs_request_complete(w->req, w->fsvq); kfree(w); } static void virtio_fs_requests_done_work(struct work_struct *work) { struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, done_work); struct fuse_pqueue *fpq = &fsvq->fud->pq; struct virtqueue *vq = fsvq->vq; struct fuse_req *req; struct fuse_req *next; unsigned int len; LIST_HEAD(reqs); /* Collect completed requests off the virtqueue */ spin_lock(&fsvq->lock); do { virtqueue_disable_cb(vq); while ((req = virtqueue_get_buf(vq, &len)) != NULL) { spin_lock(&fpq->lock); list_move_tail(&req->list, &reqs); spin_unlock(&fpq->lock); } } while (!virtqueue_enable_cb(vq)); spin_unlock(&fsvq->lock); /* End requests */ list_for_each_entry_safe(req, next, &reqs, list) { list_del_init(&req->list); /* blocking async request completes in a worker context */ if (req->args->may_block) { struct virtio_fs_req_work *w; w = kzalloc(sizeof(*w), GFP_NOFS | __GFP_NOFAIL); INIT_WORK(&w->done_work, virtio_fs_complete_req_work); w->fsvq = fsvq; w->req = req; schedule_work(&w->done_work); } else { virtio_fs_request_complete(req, fsvq); } } /* Try to push previously queued requests, as the queue might no longer be full */ spin_lock(&fsvq->lock); if (!list_empty(&fsvq->queued_reqs)) schedule_work(&fsvq->dispatch_work); spin_unlock(&fsvq->lock); } static void virtio_fs_map_queues(struct virtio_device *vdev, struct virtio_fs *fs) { const struct cpumask *mask, *masks; unsigned int q, cpu; /* First attempt to map using existing transport layer affinities * e.g. PCIe MSI-X */ if (!vdev->config->get_vq_affinity) goto fallback; for (q = 0; q < fs->num_request_queues; q++) { mask = vdev->config->get_vq_affinity(vdev, VQ_REQUEST + q); if (!mask) goto fallback; for_each_cpu(cpu, mask) fs->mq_map[cpu] = q + VQ_REQUEST; } return; fallback: /* Attempt to map evenly in groups over the CPUs */ masks = group_cpus_evenly(fs->num_request_queues); /* If even this fails we default to all CPUs use first request queue */ if (!masks) { for_each_possible_cpu(cpu) fs->mq_map[cpu] = VQ_REQUEST; return; } for (q = 0; q < fs->num_request_queues; q++) { for_each_cpu(cpu, &masks[q]) fs->mq_map[cpu] = q + VQ_REQUEST; } kfree(masks); } /* Virtqueue interrupt handler */ static void virtio_fs_vq_done(struct virtqueue *vq) { struct virtio_fs_vq *fsvq = vq_to_fsvq(vq); dev_dbg(&vq->vdev->dev, "%s %s\n", __func__, fsvq->name); schedule_work(&fsvq->done_work); } static void virtio_fs_init_vq(struct virtio_fs_vq *fsvq, char *name, int vq_type) { strscpy(fsvq->name, name, VQ_NAME_LEN); spin_lock_init(&fsvq->lock); INIT_LIST_HEAD(&fsvq->queued_reqs); INIT_LIST_HEAD(&fsvq->end_reqs); init_completion(&fsvq->in_flight_zero); if (vq_type == VQ_REQUEST) { INIT_WORK(&fsvq->done_work, virtio_fs_requests_done_work); INIT_WORK(&fsvq->dispatch_work, virtio_fs_request_dispatch_work); } else { INIT_WORK(&fsvq->done_work, virtio_fs_hiprio_done_work); INIT_WORK(&fsvq->dispatch_work, virtio_fs_hiprio_dispatch_work); } } /* Initialize virtqueues */ static int virtio_fs_setup_vqs(struct virtio_device *vdev, struct virtio_fs *fs) { struct virtqueue_info *vqs_info; struct virtqueue **vqs; /* Specify pre_vectors to ensure that the queues before the * request queues (e.g. hiprio) don't claim any of the CPUs in * the multi-queue mapping and interrupt affinities */ struct irq_affinity desc = { .pre_vectors = VQ_REQUEST }; unsigned int i; int ret = 0; virtio_cread_le(vdev, struct virtio_fs_config, num_request_queues, &fs->num_request_queues); if (fs->num_request_queues == 0) return -EINVAL; /* Truncate nr of request queues to nr_cpu_id */ fs->num_request_queues = min_t(unsigned int, fs->num_request_queues, nr_cpu_ids); fs->nvqs = VQ_REQUEST + fs->num_request_queues; fs->vqs = kcalloc(fs->nvqs, sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL); if (!fs->vqs) return -ENOMEM; vqs = kmalloc_array(fs->nvqs, sizeof(vqs[VQ_HIPRIO]), GFP_KERNEL); fs->mq_map = kcalloc_node(nr_cpu_ids, sizeof(*fs->mq_map), GFP_KERNEL, dev_to_node(&vdev->dev)); vqs_info = kcalloc(fs->nvqs, sizeof(*vqs_info), GFP_KERNEL); if (!vqs || !vqs_info || !fs->mq_map) { ret = -ENOMEM; goto out; } /* Initialize the hiprio/forget request virtqueue */ vqs_info[VQ_HIPRIO].callback = virtio_fs_vq_done; virtio_fs_init_vq(&fs->vqs[VQ_HIPRIO], "hiprio", VQ_HIPRIO); vqs_info[VQ_HIPRIO].name = fs->vqs[VQ_HIPRIO].name; /* Initialize the requests virtqueues */ for (i = VQ_REQUEST; i < fs->nvqs; i++) { char vq_name[VQ_NAME_LEN]; snprintf(vq_name, VQ_NAME_LEN, "requests.%u", i - VQ_REQUEST); virtio_fs_init_vq(&fs->vqs[i], vq_name, VQ_REQUEST); vqs_info[i].callback = virtio_fs_vq_done; vqs_info[i].name = fs->vqs[i].name; } ret = virtio_find_vqs(vdev, fs->nvqs, vqs, vqs_info, &desc); if (ret < 0) goto out; for (i = 0; i < fs->nvqs; i++) fs->vqs[i].vq = vqs[i]; virtio_fs_start_all_queues(fs); out: kfree(vqs_info); kfree(vqs); if (ret) { kfree(fs->vqs); kfree(fs->mq_map); } return ret; } /* Free virtqueues (device must already be reset) */ static void virtio_fs_cleanup_vqs(struct virtio_device *vdev) { vdev->config->del_vqs(vdev); } /* Map a window offset to a page frame number. The window offset will have * been produced by .iomap_begin(), which maps a file offset to a window * offset. */ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, void **kaddr, pfn_t *pfn) { struct virtio_fs *fs = dax_get_private(dax_dev); phys_addr_t offset = PFN_PHYS(pgoff); size_t max_nr_pages = fs->window_len / PAGE_SIZE - pgoff; if (kaddr) *kaddr = fs->window_kaddr + offset; if (pfn) *pfn = phys_to_pfn_t(fs->window_phys_addr + offset, PFN_DEV | PFN_MAP); return nr_pages > max_nr_pages ? max_nr_pages : nr_pages; } static int virtio_fs_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, size_t nr_pages) { long rc; void *kaddr; rc = dax_direct_access(dax_dev, pgoff, nr_pages, DAX_ACCESS, &kaddr, NULL); if (rc < 0) return dax_mem2blk_err(rc); memset(kaddr, 0, nr_pages << PAGE_SHIFT); dax_flush(dax_dev, kaddr, nr_pages << PAGE_SHIFT); return 0; } static const struct dax_operations virtio_fs_dax_ops = { .direct_access = virtio_fs_direct_access, .zero_page_range = virtio_fs_zero_page_range, }; static void virtio_fs_cleanup_dax(void *data) { struct dax_device *dax_dev = data; kill_dax(dax_dev); put_dax(dax_dev); } DEFINE_FREE(cleanup_dax, struct dax_dev *, if (!IS_ERR_OR_NULL(_T)) virtio_fs_cleanup_dax(_T)) static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs) { struct dax_device *dax_dev __free(cleanup_dax) = NULL; struct virtio_shm_region cache_reg; struct dev_pagemap *pgmap; bool have_cache; if (!IS_ENABLED(CONFIG_FUSE_DAX)) return 0; dax_dev = alloc_dax(fs, &virtio_fs_dax_ops); if (IS_ERR(dax_dev)) { int rc = PTR_ERR(dax_dev); return rc == -EOPNOTSUPP ? 0 : rc; } /* Get cache region */ have_cache = virtio_get_shm_region(vdev, &cache_reg, (u8)VIRTIO_FS_SHMCAP_ID_CACHE); if (!have_cache) { dev_notice(&vdev->dev, "%s: No cache capability\n", __func__); return 0; } if (!devm_request_mem_region(&vdev->dev, cache_reg.addr, cache_reg.len, dev_name(&vdev->dev))) { dev_warn(&vdev->dev, "could not reserve region addr=0x%llx len=0x%llx\n", cache_reg.addr, cache_reg.len); return -EBUSY; } dev_notice(&vdev->dev, "Cache len: 0x%llx @ 0x%llx\n", cache_reg.len, cache_reg.addr); pgmap = devm_kzalloc(&vdev->dev, sizeof(*pgmap), GFP_KERNEL); if (!pgmap) return -ENOMEM; pgmap->type = MEMORY_DEVICE_FS_DAX; /* Ideally we would directly use the PCI BAR resource but * devm_memremap_pages() wants its own copy in pgmap. So * initialize a struct resource from scratch (only the start * and end fields will be used). */ pgmap->range = (struct range) { .start = (phys_addr_t) cache_reg.addr, .end = (phys_addr_t) cache_reg.addr + cache_reg.len - 1, }; pgmap->nr_range = 1; fs->window_kaddr = devm_memremap_pages(&vdev->dev, pgmap); if (IS_ERR(fs->window_kaddr)) return PTR_ERR(fs->window_kaddr); fs->window_phys_addr = (phys_addr_t) cache_reg.addr; fs->window_len = (phys_addr_t) cache_reg.len; dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len 0x%llx\n", __func__, fs->window_kaddr, cache_reg.addr, cache_reg.len); fs->dax_dev = no_free_ptr(dax_dev); return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax, fs->dax_dev); } static int virtio_fs_probe(struct virtio_device *vdev) { struct virtio_fs *fs; int ret; fs = kzalloc(sizeof(*fs), GFP_KERNEL); if (!fs) return -ENOMEM; kobject_init(&fs->kobj, &virtio_fs_ktype); vdev->priv = fs; ret = virtio_fs_read_tag(vdev, fs); if (ret < 0) goto out; ret = virtio_fs_setup_vqs(vdev, fs); if (ret < 0) goto out; virtio_fs_map_queues(vdev, fs); ret = virtio_fs_setup_dax(vdev, fs); if (ret < 0) goto out_vqs; /* Bring the device online in case the filesystem is mounted and * requests need to be sent before we return. */ virtio_device_ready(vdev); ret = virtio_fs_add_instance(vdev, fs); if (ret < 0) goto out_vqs; return 0; out_vqs: virtio_reset_device(vdev); virtio_fs_cleanup_vqs(vdev); out: vdev->priv = NULL; kobject_put(&fs->kobj); return ret; } static void virtio_fs_stop_all_queues(struct virtio_fs *fs) { struct virtio_fs_vq *fsvq; int i; for (i = 0; i < fs->nvqs; i++) { fsvq = &fs->vqs[i]; spin_lock(&fsvq->lock); fsvq->connected = false; spin_unlock(&fsvq->lock); } } static void virtio_fs_remove(struct virtio_device *vdev) { struct virtio_fs *fs = vdev->priv; mutex_lock(&virtio_fs_mutex); /* This device is going away. No one should get new reference */ list_del_init(&fs->list); virtio_fs_delete_queues_sysfs(fs); sysfs_remove_link(&fs->kobj, "device"); kobject_put(fs->mqs_kobj); kobject_del(&fs->kobj); virtio_fs_stop_all_queues(fs); virtio_fs_drain_all_queues_locked(fs); virtio_reset_device(vdev); virtio_fs_cleanup_vqs(vdev); vdev->priv = NULL; /* Put device reference on virtio_fs object */ virtio_fs_put_locked(fs); mutex_unlock(&virtio_fs_mutex); } #ifdef CONFIG_PM_SLEEP static int virtio_fs_freeze(struct virtio_device *vdev) { /* TODO need to save state here */ pr_warn("virtio-fs: suspend/resume not yet supported\n"); return -EOPNOTSUPP; } static int virtio_fs_restore(struct virtio_device *vdev) { /* TODO need to restore state here */ return 0; } #endif /* CONFIG_PM_SLEEP */ static const struct virtio_device_id id_table[] = { { VIRTIO_ID_FS, VIRTIO_DEV_ANY_ID }, {}, }; static const unsigned int feature_table[] = {}; static struct virtio_driver virtio_fs_driver = { .driver.name = KBUILD_MODNAME, .id_table = id_table, .feature_table = feature_table, .feature_table_size = ARRAY_SIZE(feature_table), .probe = virtio_fs_probe, .remove = virtio_fs_remove, #ifdef CONFIG_PM_SLEEP .freeze = virtio_fs_freeze, .restore = virtio_fs_restore, #endif }; static void virtio_fs_send_forget(struct fuse_iqueue *fiq, struct fuse_forget_link *link) { struct virtio_fs_forget *forget; struct virtio_fs_forget_req *req; struct virtio_fs *fs = fiq->priv; struct virtio_fs_vq *fsvq = &fs->vqs[VQ_HIPRIO]; u64 unique = fuse_get_unique(fiq); /* Allocate a buffer for the request */ forget = kmalloc(sizeof(*forget), GFP_NOFS | __GFP_NOFAIL); req = &forget->req; req->ih = (struct fuse_in_header){ .opcode = FUSE_FORGET, .nodeid = link->forget_one.nodeid, .unique = unique, .len = sizeof(*req), }; req->arg = (struct fuse_forget_in){ .nlookup = link->forget_one.nlookup, }; send_forget_request(fsvq, forget, false); kfree(link); } static void virtio_fs_send_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) { /* * TODO interrupts. * * Normal fs operations on a local filesystems aren't interruptible. * Exceptions are blocking lock operations; for example fcntl(F_SETLKW) * with shared lock between host and guest. */ } /* Count number of scatter-gather elements required */ static unsigned int sg_count_fuse_folios(struct fuse_folio_desc *folio_descs, unsigned int num_folios, unsigned int total_len) { unsigned int i; unsigned int this_len; for (i = 0; i < num_folios && total_len; i++) { this_len = min(folio_descs[i].length, total_len); total_len -= this_len; } return i; } /* Return the number of scatter-gather list elements required */ static unsigned int sg_count_fuse_req(struct fuse_req *req) { struct fuse_args *args = req->args; struct fuse_args_pages *ap = container_of(args, typeof(*ap), args); unsigned int size, total_sgs = 1 /* fuse_in_header */; if (args->in_numargs - args->in_pages) total_sgs += 1; if (args->in_pages) { size = args->in_args[args->in_numargs - 1].size; total_sgs += sg_count_fuse_folios(ap->descs, ap->num_folios, size); } if (!test_bit(FR_ISREPLY, &req->flags)) return total_sgs; total_sgs += 1 /* fuse_out_header */; if (args->out_numargs - args->out_pages) total_sgs += 1; if (args->out_pages) { size = args->out_args[args->out_numargs - 1].size; total_sgs += sg_count_fuse_folios(ap->descs, ap->num_folios, size); } return total_sgs; } /* Add folios to scatter-gather list and return number of elements used */ static unsigned int sg_init_fuse_folios(struct scatterlist *sg, struct folio **folios, struct fuse_folio_desc *folio_descs, unsigned int num_folios, unsigned int total_len) { unsigned int i; unsigned int this_len; for (i = 0; i < num_folios && total_len; i++) { sg_init_table(&sg[i], 1); this_len = min(folio_descs[i].length, total_len); sg_set_folio(&sg[i], folios[i], this_len, folio_descs[i].offset); total_len -= this_len; } return i; } /* Add args to scatter-gather list and return number of elements used */ static unsigned int sg_init_fuse_args(struct scatterlist *sg, struct fuse_req *req, struct fuse_arg *args, unsigned int numargs, bool argpages, void *argbuf, unsigned int *len_used) { struct fuse_args_pages *ap = container_of(req->args, typeof(*ap), args); unsigned int total_sgs = 0; unsigned int len; len = fuse_len_args(numargs - argpages, args); if (len) sg_init_one(&sg[total_sgs++], argbuf, len); if (argpages) total_sgs += sg_init_fuse_folios(&sg[total_sgs], ap->folios, ap->descs, ap->num_folios, args[numargs - 1].size); if (len_used) *len_used = len; return total_sgs; } /* Add a request to a virtqueue and kick the device */ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, struct fuse_req *req, bool in_flight, gfp_t gfp) { /* requests need at least 4 elements */ struct scatterlist *stack_sgs[6]; struct scatterlist stack_sg[ARRAY_SIZE(stack_sgs)]; struct scatterlist **sgs = stack_sgs; struct scatterlist *sg = stack_sg; struct virtqueue *vq; struct fuse_args *args = req->args; unsigned int argbuf_used = 0; unsigned int out_sgs = 0; unsigned int in_sgs = 0; unsigned int total_sgs; unsigned int i; int ret; bool notify; struct fuse_pqueue *fpq; /* Does the sglist fit on the stack? */ total_sgs = sg_count_fuse_req(req); if (total_sgs > ARRAY_SIZE(stack_sgs)) { sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), gfp); sg = kmalloc_array(total_sgs, sizeof(sg[0]), gfp); if (!sgs || !sg) { ret = -ENOMEM; goto out; } } /* Use a bounce buffer since stack args cannot be mapped */ ret = copy_args_to_argbuf(req, gfp); if (ret < 0) goto out; /* Request elements */ sg_init_one(&sg[out_sgs++], &req->in.h, sizeof(req->in.h)); out_sgs += sg_init_fuse_args(&sg[out_sgs], req, (struct fuse_arg *)args->in_args, args->in_numargs, args->in_pages, req->argbuf, &argbuf_used); /* Reply elements */ if (test_bit(FR_ISREPLY, &req->flags)) { sg_init_one(&sg[out_sgs + in_sgs++], &req->out.h, sizeof(req->out.h)); in_sgs += sg_init_fuse_args(&sg[out_sgs + in_sgs], req, args->out_args, args->out_numargs, args->out_pages, req->argbuf + argbuf_used, NULL); } WARN_ON(out_sgs + in_sgs != total_sgs); for (i = 0; i < total_sgs; i++) sgs[i] = &sg[i]; spin_lock(&fsvq->lock); if (!fsvq->connected) { spin_unlock(&fsvq->lock); ret = -ENOTCONN; goto out; } vq = fsvq->vq; ret = virtqueue_add_sgs(vq, sgs, out_sgs, in_sgs, req, GFP_ATOMIC); if (ret < 0) { spin_unlock(&fsvq->lock); goto out; } /* Request successfully sent. */ fpq = &fsvq->fud->pq; spin_lock(&fpq->lock); list_add_tail(&req->list, fpq->processing); spin_unlock(&fpq->lock); set_bit(FR_SENT, &req->flags); /* matches barrier in request_wait_answer() */ smp_mb__after_atomic(); if (!in_flight) inc_in_flight_req(fsvq); notify = virtqueue_kick_prepare(vq); spin_unlock(&fsvq->lock); if (notify) virtqueue_notify(vq); out: if (ret < 0 && req->argbuf) { kfree(req->argbuf); req->argbuf = NULL; } if (sgs != stack_sgs) { kfree(sgs); kfree(sg); } return ret; } static void virtio_fs_send_req(struct fuse_iqueue *fiq, struct fuse_req *req) { unsigned int queue_id; struct virtio_fs *fs; struct virtio_fs_vq *fsvq; int ret; if (req->in.h.opcode != FUSE_NOTIFY_REPLY) req->in.h.unique = fuse_get_unique(fiq); clear_bit(FR_PENDING, &req->flags); fs = fiq->priv; queue_id = fs->mq_map[raw_smp_processor_id()]; pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u queue_id %u\n", __func__, req->in.h.opcode, req->in.h.unique, req->in.h.nodeid, req->in.h.len, fuse_len_args(req->args->out_numargs, req->args->out_args), queue_id); fsvq = &fs->vqs[queue_id]; ret = virtio_fs_enqueue_req(fsvq, req, false, GFP_ATOMIC); if (ret < 0) { if (ret == -ENOSPC) { /* * Virtqueue full. Retry submission from worker * context as we might be holding fc->bg_lock. */ spin_lock(&fsvq->lock); list_add_tail(&req->list, &fsvq->queued_reqs); inc_in_flight_req(fsvq); spin_unlock(&fsvq->lock); return; } req->out.h.error = ret; pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n", ret); /* Can't end request in submission context. Use a worker */ spin_lock(&fsvq->lock); list_add_tail(&req->list, &fsvq->end_reqs); schedule_work(&fsvq->dispatch_work); spin_unlock(&fsvq->lock); return; } } static const struct fuse_iqueue_ops virtio_fs_fiq_ops = { .send_forget = virtio_fs_send_forget, .send_interrupt = virtio_fs_send_interrupt, .send_req = virtio_fs_send_req, .release = virtio_fs_fiq_release, }; static inline void virtio_fs_ctx_set_defaults(struct fuse_fs_context *ctx) { ctx->rootmode = S_IFDIR; ctx->default_permissions = 1; ctx->allow_other = 1; ctx->max_read = UINT_MAX; ctx->blksize = 512; ctx->destroy = true; ctx->no_control = true; ctx->no_force_umount = true; } static int virtio_fs_fill_super(struct super_block *sb, struct fs_context *fsc) { struct fuse_mount *fm = get_fuse_mount_super(sb); struct fuse_conn *fc = fm->fc; struct virtio_fs *fs = fc->iq.priv; struct fuse_fs_context *ctx = fsc->fs_private; unsigned int i; int err; virtio_fs_ctx_set_defaults(ctx); mutex_lock(&virtio_fs_mutex); /* After holding mutex, make sure virtiofs device is still there. * Though we are holding a reference to it, drive ->remove might * still have cleaned up virtual queues. In that case bail out. */ err = -EINVAL; if (list_empty(&fs->list)) { pr_info("virtio-fs: tag <%s> not found\n", fs->tag); goto err; } err = -ENOMEM; /* Allocate fuse_dev for hiprio and notification queues */ for (i = 0; i < fs->nvqs; i++) { struct virtio_fs_vq *fsvq = &fs->vqs[i]; fsvq->fud = fuse_dev_alloc(); if (!fsvq->fud) goto err_free_fuse_devs; } /* virtiofs allocates and installs its own fuse devices */ ctx->fudptr = NULL; if (ctx->dax_mode != FUSE_DAX_NEVER) { if (ctx->dax_mode == FUSE_DAX_ALWAYS && !fs->dax_dev) { err = -EINVAL; pr_err("virtio-fs: dax can't be enabled as filesystem" " device does not support it.\n"); goto err_free_fuse_devs; } ctx->dax_dev = fs->dax_dev; } err = fuse_fill_super_common(sb, ctx); if (err < 0) goto err_free_fuse_devs; for (i = 0; i < fs->nvqs; i++) { struct virtio_fs_vq *fsvq = &fs->vqs[i]; fuse_dev_install(fsvq->fud, fc); } /* Previous unmount will stop all queues. Start these again */ virtio_fs_start_all_queues(fs); fuse_send_init(fm); mutex_unlock(&virtio_fs_mutex); return 0; err_free_fuse_devs: virtio_fs_free_devs(fs); err: mutex_unlock(&virtio_fs_mutex); return err; } static void virtio_fs_conn_destroy(struct fuse_mount *fm) { struct fuse_conn *fc = fm->fc; struct virtio_fs *vfs = fc->iq.priv; struct virtio_fs_vq *fsvq = &vfs->vqs[VQ_HIPRIO]; /* Stop dax worker. Soon evict_inodes() will be called which * will free all memory ranges belonging to all inodes. */ if (IS_ENABLED(CONFIG_FUSE_DAX)) fuse_dax_cancel_work(fc); /* Stop forget queue. Soon destroy will be sent */ spin_lock(&fsvq->lock); fsvq->connected = false; spin_unlock(&fsvq->lock); virtio_fs_drain_all_queues(vfs); fuse_conn_destroy(fm); /* fuse_conn_destroy() must have sent destroy. Stop all queues * and drain one more time and free fuse devices. Freeing fuse * devices will drop their reference on fuse_conn and that in * turn will drop its reference on virtio_fs object. */ virtio_fs_stop_all_queues(vfs); virtio_fs_drain_all_queues(vfs); virtio_fs_free_devs(vfs); } static void virtio_kill_sb(struct super_block *sb) { struct fuse_mount *fm = get_fuse_mount_super(sb); bool last; /* If mount failed, we can still be called without any fc */ if (sb->s_root) { last = fuse_mount_remove(fm); if (last) virtio_fs_conn_destroy(fm); } kill_anon_super(sb); fuse_mount_destroy(fm); } static int virtio_fs_test_super(struct super_block *sb, struct fs_context *fsc) { struct fuse_mount *fsc_fm = fsc->s_fs_info; struct fuse_mount *sb_fm = get_fuse_mount_super(sb); return fsc_fm->fc->iq.priv == sb_fm->fc->iq.priv; } static int virtio_fs_get_tree(struct fs_context *fsc) { struct virtio_fs *fs; struct super_block *sb; struct fuse_conn *fc = NULL; struct fuse_mount *fm; unsigned int virtqueue_size; int err = -EIO; /* This gets a reference on virtio_fs object. This ptr gets installed * in fc->iq->priv. Once fuse_conn is going away, it calls ->put() * to drop the reference to this object. */ fs = virtio_fs_find_instance(fsc->source); if (!fs) { pr_info("virtio-fs: tag <%s> not found\n", fsc->source); return -EINVAL; } virtqueue_size = virtqueue_get_vring_size(fs->vqs[VQ_REQUEST].vq); if (WARN_ON(virtqueue_size <= FUSE_HEADER_OVERHEAD)) goto out_err; err = -ENOMEM; fc = kzalloc(sizeof(struct fuse_conn), GFP_KERNEL); if (!fc) goto out_err; fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL); if (!fm) goto out_err; fuse_conn_init(fc, fm, fsc->user_ns, &virtio_fs_fiq_ops, fs); fc->release = fuse_free_conn; fc->delete_stale = true; fc->auto_submounts = true; fc->sync_fs = true; fc->use_pages_for_kvec_io = true; /* Tell FUSE to split requests that exceed the virtqueue's size */ fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit, virtqueue_size - FUSE_HEADER_OVERHEAD); fsc->s_fs_info = fm; sb = sget_fc(fsc, virtio_fs_test_super, set_anon_super_fc); if (fsc->s_fs_info) fuse_mount_destroy(fm); if (IS_ERR(sb)) return PTR_ERR(sb); if (!sb->s_root) { err = virtio_fs_fill_super(sb, fsc); if (err) { deactivate_locked_super(sb); return err; } sb->s_flags |= SB_ACTIVE; } WARN_ON(fsc->root); fsc->root = dget(sb->s_root); return 0; out_err: kfree(fc); virtio_fs_put(fs); return err; } static const struct fs_context_operations virtio_fs_context_ops = { .free = virtio_fs_free_fsc, .parse_param = virtio_fs_parse_param, .get_tree = virtio_fs_get_tree, }; static int virtio_fs_init_fs_context(struct fs_context *fsc) { struct fuse_fs_context *ctx; if (fsc->purpose == FS_CONTEXT_FOR_SUBMOUNT) return fuse_init_fs_context_submount(fsc); ctx = kzalloc(sizeof(struct fuse_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; fsc->fs_private = ctx; fsc->ops = &virtio_fs_context_ops; return 0; } static struct file_system_type virtio_fs_type = { .owner = THIS_MODULE, .name = "virtiofs", .init_fs_context = virtio_fs_init_fs_context, .kill_sb = virtio_kill_sb, .fs_flags = FS_ALLOW_IDMAP, }; static int virtio_fs_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) { const struct virtio_fs *fs = container_of(kobj, struct virtio_fs, kobj); add_uevent_var(env, "TAG=%s", fs->tag); return 0; } static const struct kset_uevent_ops virtio_fs_uevent_ops = { .uevent = virtio_fs_uevent, }; static int __init virtio_fs_sysfs_init(void) { virtio_fs_kset = kset_create_and_add("virtiofs", &virtio_fs_uevent_ops, fs_kobj); if (!virtio_fs_kset) return -ENOMEM; return 0; } static void virtio_fs_sysfs_exit(void) { kset_unregister(virtio_fs_kset); virtio_fs_kset = NULL; } static int __init virtio_fs_init(void) { int ret; ret = virtio_fs_sysfs_init(); if (ret < 0) return ret; ret = register_virtio_driver(&virtio_fs_driver); if (ret < 0) goto sysfs_exit; ret = register_filesystem(&virtio_fs_type); if (ret < 0) goto unregister_virtio_driver; return 0; unregister_virtio_driver: unregister_virtio_driver(&virtio_fs_driver); sysfs_exit: virtio_fs_sysfs_exit(); return ret; } module_init(virtio_fs_init); static void __exit virtio_fs_exit(void) { unregister_filesystem(&virtio_fs_type); unregister_virtio_driver(&virtio_fs_driver); virtio_fs_sysfs_exit(); } module_exit(virtio_fs_exit); MODULE_AUTHOR("Stefan Hajnoczi <stefanha@redhat.com>"); MODULE_DESCRIPTION("Virtio Filesystem"); MODULE_LICENSE("GPL"); MODULE_ALIAS_FS(KBUILD_MODNAME); MODULE_DEVICE_TABLE(virtio, id_table);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 /* SPDX-License-Identifier: GPL-2.0-only */ /* * mac80211 <-> driver interface * * Copyright 2002-2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH * Copyright (C) 2018 - 2024 Intel Corporation */ #ifndef MAC80211_H #define MAC80211_H #include <linux/bug.h> #include <linux/kernel.h> #include <linux/if_ether.h> #include <linux/skbuff.h> #include <linux/ieee80211.h> #include <linux/lockdep.h> #include <net/cfg80211.h> #include <net/codel.h> #include <net/ieee80211_radiotap.h> #include <linux/unaligned.h> /** * DOC: Introduction * * mac80211 is the Linux stack for 802.11 hardware that implements * only partial functionality in hard- or firmware. This document * defines the interface between mac80211 and low-level hardware * drivers. */ /** * DOC: Calling mac80211 from interrupts * * Only ieee80211_tx_status_irqsafe() and ieee80211_rx_irqsafe() can be * called in hardware interrupt context. The low-level driver must not call any * other functions in hardware interrupt context. If there is a need for such * call, the low-level driver should first ACK the interrupt and perform the * IEEE 802.11 code call after this, e.g. from a scheduled workqueue or even * tasklet function. * * NOTE: If the driver opts to use the _irqsafe() functions, it may not also * use the non-IRQ-safe functions! */ /** * DOC: Warning * * If you're reading this document and not the header file itself, it will * be incomplete because not all documentation has been converted yet. */ /** * DOC: Frame format * * As a general rule, when frames are passed between mac80211 and the driver, * they start with the IEEE 802.11 header and include the same octets that are * sent over the air except for the FCS which should be calculated by the * hardware. * * There are, however, various exceptions to this rule for advanced features: * * The first exception is for hardware encryption and decryption offload * where the IV/ICV may or may not be generated in hardware. * * Secondly, when the hardware handles fragmentation, the frame handed to * the driver from mac80211 is the MSDU, not the MPDU. */ /** * DOC: mac80211 workqueue * * mac80211 provides its own workqueue for drivers and internal mac80211 use. * The workqueue is a single threaded workqueue and can only be accessed by * helpers for sanity checking. Drivers must ensure all work added onto the * mac80211 workqueue should be cancelled on the driver stop() callback. * * mac80211 will flush the workqueue upon interface removal and during * suspend. * * All work performed on the mac80211 workqueue must not acquire the RTNL lock. * */ /** * DOC: mac80211 software tx queueing * * mac80211 uses an intermediate queueing implementation, designed to allow the * driver to keep hardware queues short and to provide some fairness between * different stations/interfaces. * * Drivers must provide the .wake_tx_queue driver operation by either * linking it to ieee80211_handle_wake_tx_queue() or implementing a custom * handler. * * Intermediate queues (struct ieee80211_txq) are kept per-sta per-tid, with * another per-sta for non-data/non-mgmt and bufferable management frames, and * a single per-vif queue for multicast data frames. * * The driver is expected to initialize its private per-queue data for stations * and interfaces in the .add_interface and .sta_add ops. * * The driver can't access the internal TX queues (iTXQs) directly. * Whenever mac80211 adds a new frame to a queue, it calls the .wake_tx_queue * driver op. * Drivers implementing a custom .wake_tx_queue op can get them by calling * ieee80211_tx_dequeue(). Drivers using ieee80211_handle_wake_tx_queue() will * simply get the individual frames pushed via the .tx driver operation. * * Drivers can optionally delegate responsibility for scheduling queues to * mac80211, to take advantage of airtime fairness accounting. In this case, to * obtain the next queue to pull frames from, the driver calls * ieee80211_next_txq(). The driver is then expected to return the txq using * ieee80211_return_txq(). * * For AP powersave TIM handling, the driver only needs to indicate if it has * buffered packets in the driver specific data structures by calling * ieee80211_sta_set_buffered(). For frames buffered in the ieee80211_txq * struct, mac80211 sets the appropriate TIM PVB bits and calls * .release_buffered_frames(). * In that callback the driver is therefore expected to release its own * buffered frames and afterwards also frames from the ieee80211_txq (obtained * via the usual ieee80211_tx_dequeue). */ /** * DOC: HW timestamping * * Timing Measurement and Fine Timing Measurement require accurate timestamps * of the action frames TX/RX and their respective acks. * * To report hardware timestamps for Timing Measurement or Fine Timing * Measurement frame RX, the low level driver should set the SKB's hwtstamp * field to the frame RX timestamp and report the ack TX timestamp in the * ieee80211_rx_status struct. * * Similarly, to report hardware timestamps for Timing Measurement or Fine * Timing Measurement frame TX, the driver should set the SKB's hwtstamp field * to the frame TX timestamp and report the ack RX timestamp in the * ieee80211_tx_status struct. */ struct device; /** * enum ieee80211_max_queues - maximum number of queues * * @IEEE80211_MAX_QUEUES: Maximum number of regular device queues. * @IEEE80211_MAX_QUEUE_MAP: bitmap with maximum queues set */ enum ieee80211_max_queues { IEEE80211_MAX_QUEUES = 16, IEEE80211_MAX_QUEUE_MAP = BIT(IEEE80211_MAX_QUEUES) - 1, }; #define IEEE80211_INVAL_HW_QUEUE 0xff /** * enum ieee80211_ac_numbers - AC numbers as used in mac80211 * @IEEE80211_AC_VO: voice * @IEEE80211_AC_VI: video * @IEEE80211_AC_BE: best effort * @IEEE80211_AC_BK: background */ enum ieee80211_ac_numbers { IEEE80211_AC_VO = 0, IEEE80211_AC_VI = 1, IEEE80211_AC_BE = 2, IEEE80211_AC_BK = 3, }; /** * struct ieee80211_tx_queue_params - transmit queue configuration * * The information provided in this structure is required for QoS * transmit queue configuration. Cf. IEEE 802.11 7.3.2.29. * * @aifs: arbitration interframe space [0..255] * @cw_min: minimum contention window [a value of the form * 2^n-1 in the range 1..32767] * @cw_max: maximum contention window [like @cw_min] * @txop: maximum burst time in units of 32 usecs, 0 meaning disabled * @acm: is mandatory admission control required for the access category * @uapsd: is U-APSD mode enabled for the queue * @mu_edca: is the MU EDCA configured * @mu_edca_param_rec: MU EDCA Parameter Record for HE */ struct ieee80211_tx_queue_params { u16 txop; u16 cw_min; u16 cw_max; u8 aifs; bool acm; bool uapsd; bool mu_edca; struct ieee80211_he_mu_edca_param_ac_rec mu_edca_param_rec; }; struct ieee80211_low_level_stats { unsigned int dot11ACKFailureCount; unsigned int dot11RTSFailureCount; unsigned int dot11FCSErrorCount; unsigned int dot11RTSSuccessCount; }; /** * enum ieee80211_chanctx_change - change flag for channel context * @IEEE80211_CHANCTX_CHANGE_WIDTH: The channel width changed * @IEEE80211_CHANCTX_CHANGE_RX_CHAINS: The number of RX chains changed * @IEEE80211_CHANCTX_CHANGE_RADAR: radar detection flag changed * @IEEE80211_CHANCTX_CHANGE_CHANNEL: switched to another operating channel, * this is used only with channel switching with CSA * @IEEE80211_CHANCTX_CHANGE_MIN_DEF: The min chandef changed * @IEEE80211_CHANCTX_CHANGE_AP: The AP channel definition changed, so (wider * bandwidth) OFDMA settings need to be changed * @IEEE80211_CHANCTX_CHANGE_PUNCTURING: The punctured channel(s) bitmap * was changed. */ enum ieee80211_chanctx_change { IEEE80211_CHANCTX_CHANGE_WIDTH = BIT(0), IEEE80211_CHANCTX_CHANGE_RX_CHAINS = BIT(1), IEEE80211_CHANCTX_CHANGE_RADAR = BIT(2), IEEE80211_CHANCTX_CHANGE_CHANNEL = BIT(3), IEEE80211_CHANCTX_CHANGE_MIN_DEF = BIT(4), IEEE80211_CHANCTX_CHANGE_AP = BIT(5), IEEE80211_CHANCTX_CHANGE_PUNCTURING = BIT(6), }; /** * struct ieee80211_chan_req - A channel "request" * @oper: channel definition to use for operation * @ap: the channel definition of the AP, if any * (otherwise the chan member is %NULL) */ struct ieee80211_chan_req { struct cfg80211_chan_def oper; struct cfg80211_chan_def ap; }; /** * struct ieee80211_chanctx_conf - channel context that vifs may be tuned to * * This is the driver-visible part. The ieee80211_chanctx * that contains it is visible in mac80211 only. * * @def: the channel definition * @min_def: the minimum channel definition currently required. * @ap: the channel definition the AP actually is operating as, * for use with (wider bandwidth) OFDMA * @radio_idx: index of the wiphy radio used used for this channel * @rx_chains_static: The number of RX chains that must always be * active on the channel to receive MIMO transmissions * @rx_chains_dynamic: The number of RX chains that must be enabled * after RTS/CTS handshake to receive SMPS MIMO transmissions; * this will always be >= @rx_chains_static. * @radar_enabled: whether radar detection is enabled on this channel. * @drv_priv: data area for driver use, will always be aligned to * sizeof(void *), size is determined in hw information. */ struct ieee80211_chanctx_conf { struct cfg80211_chan_def def; struct cfg80211_chan_def min_def; struct cfg80211_chan_def ap; int radio_idx; u8 rx_chains_static, rx_chains_dynamic; bool radar_enabled; u8 drv_priv[] __aligned(sizeof(void *)); }; /** * enum ieee80211_chanctx_switch_mode - channel context switch mode * @CHANCTX_SWMODE_REASSIGN_VIF: Both old and new contexts already * exist (and will continue to exist), but the virtual interface * needs to be switched from one to the other. * @CHANCTX_SWMODE_SWAP_CONTEXTS: The old context exists but will stop * to exist with this call, the new context doesn't exist but * will be active after this call, the virtual interface switches * from the old to the new (note that the driver may of course * implement this as an on-the-fly chandef switch of the existing * hardware context, but the mac80211 pointer for the old context * will cease to exist and only the new one will later be used * for changes/removal.) */ enum ieee80211_chanctx_switch_mode { CHANCTX_SWMODE_REASSIGN_VIF, CHANCTX_SWMODE_SWAP_CONTEXTS, }; /** * struct ieee80211_vif_chanctx_switch - vif chanctx switch information * * This is structure is used to pass information about a vif that * needs to switch from one chanctx to another. The * &ieee80211_chanctx_switch_mode defines how the switch should be * done. * * @vif: the vif that should be switched from old_ctx to new_ctx * @link_conf: the link conf that's switching * @old_ctx: the old context to which the vif was assigned * @new_ctx: the new context to which the vif must be assigned */ struct ieee80211_vif_chanctx_switch { struct ieee80211_vif *vif; struct ieee80211_bss_conf *link_conf; struct ieee80211_chanctx_conf *old_ctx; struct ieee80211_chanctx_conf *new_ctx; }; /** * enum ieee80211_bss_change - BSS change notification flags * * These flags are used with the bss_info_changed(), link_info_changed() * and vif_cfg_changed() callbacks to indicate which parameter(s) changed. * * @BSS_CHANGED_ASSOC: association status changed (associated/disassociated), * also implies a change in the AID. * @BSS_CHANGED_ERP_CTS_PROT: CTS protection changed * @BSS_CHANGED_ERP_PREAMBLE: preamble changed * @BSS_CHANGED_ERP_SLOT: slot timing changed * @BSS_CHANGED_HT: 802.11n parameters changed * @BSS_CHANGED_BASIC_RATES: Basic rateset changed * @BSS_CHANGED_BEACON_INT: Beacon interval changed * @BSS_CHANGED_BSSID: BSSID changed, for whatever * reason (IBSS and managed mode) * @BSS_CHANGED_BEACON: Beacon data changed, retrieve * new beacon (beaconing modes) * @BSS_CHANGED_BEACON_ENABLED: Beaconing should be * enabled/disabled (beaconing modes) * @BSS_CHANGED_CQM: Connection quality monitor config changed * @BSS_CHANGED_IBSS: IBSS join status changed * @BSS_CHANGED_ARP_FILTER: Hardware ARP filter address list or state changed. * @BSS_CHANGED_QOS: QoS for this association was enabled/disabled. Note * that it is only ever disabled for station mode. * @BSS_CHANGED_IDLE: Idle changed for this BSS/interface. * @BSS_CHANGED_SSID: SSID changed for this BSS (AP and IBSS mode) * @BSS_CHANGED_AP_PROBE_RESP: Probe Response changed for this BSS (AP mode) * @BSS_CHANGED_PS: PS changed for this BSS (STA mode) * @BSS_CHANGED_TXPOWER: TX power setting changed for this interface * @BSS_CHANGED_P2P_PS: P2P powersave settings (CTWindow, opportunistic PS) * changed * @BSS_CHANGED_BEACON_INFO: Data from the AP's beacon became available: * currently dtim_period only is under consideration. * @BSS_CHANGED_BANDWIDTH: The bandwidth used by this interface changed, * note that this is only called when it changes after the channel * context had been assigned. * @BSS_CHANGED_OCB: OCB join status changed * @BSS_CHANGED_MU_GROUPS: VHT MU-MIMO group id or user position changed * @BSS_CHANGED_KEEP_ALIVE: keep alive options (idle period or protected * keep alive) changed. * @BSS_CHANGED_MCAST_RATE: Multicast Rate setting changed for this interface * @BSS_CHANGED_FTM_RESPONDER: fine timing measurement request responder * functionality changed for this BSS (AP mode). * @BSS_CHANGED_TWT: TWT status changed * @BSS_CHANGED_HE_OBSS_PD: OBSS Packet Detection status changed. * @BSS_CHANGED_HE_BSS_COLOR: BSS Color has changed * @BSS_CHANGED_FILS_DISCOVERY: FILS discovery status changed. * @BSS_CHANGED_UNSOL_BCAST_PROBE_RESP: Unsolicited broadcast probe response * status changed. * @BSS_CHANGED_MLD_VALID_LINKS: MLD valid links status changed. * @BSS_CHANGED_MLD_TTLM: negotiated TID to link mapping was changed * @BSS_CHANGED_TPE: transmit power envelope changed */ enum ieee80211_bss_change { BSS_CHANGED_ASSOC = 1<<0, BSS_CHANGED_ERP_CTS_PROT = 1<<1, BSS_CHANGED_ERP_PREAMBLE = 1<<2, BSS_CHANGED_ERP_SLOT = 1<<3, BSS_CHANGED_HT = 1<<4, BSS_CHANGED_BASIC_RATES = 1<<5, BSS_CHANGED_BEACON_INT = 1<<6, BSS_CHANGED_BSSID = 1<<7, BSS_CHANGED_BEACON = 1<<8, BSS_CHANGED_BEACON_ENABLED = 1<<9, BSS_CHANGED_CQM = 1<<10, BSS_CHANGED_IBSS = 1<<11, BSS_CHANGED_ARP_FILTER = 1<<12, BSS_CHANGED_QOS = 1<<13, BSS_CHANGED_IDLE = 1<<14, BSS_CHANGED_SSID = 1<<15, BSS_CHANGED_AP_PROBE_RESP = 1<<16, BSS_CHANGED_PS = 1<<17, BSS_CHANGED_TXPOWER = 1<<18, BSS_CHANGED_P2P_PS = 1<<19, BSS_CHANGED_BEACON_INFO = 1<<20, BSS_CHANGED_BANDWIDTH = 1<<21, BSS_CHANGED_OCB = 1<<22, BSS_CHANGED_MU_GROUPS = 1<<23, BSS_CHANGED_KEEP_ALIVE = 1<<24, BSS_CHANGED_MCAST_RATE = 1<<25, BSS_CHANGED_FTM_RESPONDER = 1<<26, BSS_CHANGED_TWT = 1<<27, BSS_CHANGED_HE_OBSS_PD = 1<<28, BSS_CHANGED_HE_BSS_COLOR = 1<<29, BSS_CHANGED_FILS_DISCOVERY = 1<<30, BSS_CHANGED_UNSOL_BCAST_PROBE_RESP = BIT_ULL(31), BSS_CHANGED_MLD_VALID_LINKS = BIT_ULL(33), BSS_CHANGED_MLD_TTLM = BIT_ULL(34), BSS_CHANGED_TPE = BIT_ULL(35), /* when adding here, make sure to change ieee80211_reconfig */ }; /* * The maximum number of IPv4 addresses listed for ARP filtering. If the number * of addresses for an interface increase beyond this value, hardware ARP * filtering will be disabled. */ #define IEEE80211_BSS_ARP_ADDR_LIST_LEN 4 /** * enum ieee80211_event_type - event to be notified to the low level driver * @RSSI_EVENT: AP's rssi crossed the a threshold set by the driver. * @MLME_EVENT: event related to MLME * @BAR_RX_EVENT: a BAR was received * @BA_FRAME_TIMEOUT: Frames were released from the reordering buffer because * they timed out. This won't be called for each frame released, but only * once each time the timeout triggers. */ enum ieee80211_event_type { RSSI_EVENT, MLME_EVENT, BAR_RX_EVENT, BA_FRAME_TIMEOUT, }; /** * enum ieee80211_rssi_event_data - relevant when event type is %RSSI_EVENT * @RSSI_EVENT_HIGH: AP's rssi went below the threshold set by the driver. * @RSSI_EVENT_LOW: AP's rssi went above the threshold set by the driver. */ enum ieee80211_rssi_event_data { RSSI_EVENT_HIGH, RSSI_EVENT_LOW, }; /** * struct ieee80211_rssi_event - data attached to an %RSSI_EVENT * @data: See &enum ieee80211_rssi_event_data */ struct ieee80211_rssi_event { enum ieee80211_rssi_event_data data; }; /** * enum ieee80211_mlme_event_data - relevant when event type is %MLME_EVENT * @AUTH_EVENT: the MLME operation is authentication * @ASSOC_EVENT: the MLME operation is association * @DEAUTH_RX_EVENT: deauth received.. * @DEAUTH_TX_EVENT: deauth sent. */ enum ieee80211_mlme_event_data { AUTH_EVENT, ASSOC_EVENT, DEAUTH_RX_EVENT, DEAUTH_TX_EVENT, }; /** * enum ieee80211_mlme_event_status - relevant when event type is %MLME_EVENT * @MLME_SUCCESS: the MLME operation completed successfully. * @MLME_DENIED: the MLME operation was denied by the peer. * @MLME_TIMEOUT: the MLME operation timed out. */ enum ieee80211_mlme_event_status { MLME_SUCCESS, MLME_DENIED, MLME_TIMEOUT, }; /** * struct ieee80211_mlme_event - data attached to an %MLME_EVENT * @data: See &enum ieee80211_mlme_event_data * @status: See &enum ieee80211_mlme_event_status * @reason: the reason code if applicable */ struct ieee80211_mlme_event { enum ieee80211_mlme_event_data data; enum ieee80211_mlme_event_status status; u16 reason; }; /** * struct ieee80211_ba_event - data attached for BlockAck related events * @sta: pointer to the &ieee80211_sta to which this event relates * @tid: the tid * @ssn: the starting sequence number (for %BAR_RX_EVENT) */ struct ieee80211_ba_event { struct ieee80211_sta *sta; u16 tid; u16 ssn; }; /** * struct ieee80211_event - event to be sent to the driver * @type: The event itself. See &enum ieee80211_event_type. * @u.rssi: relevant if &type is %RSSI_EVENT * @u.mlme: relevant if &type is %AUTH_EVENT * @u.ba: relevant if &type is %BAR_RX_EVENT or %BA_FRAME_TIMEOUT * @u:union holding the fields above */ struct ieee80211_event { enum ieee80211_event_type type; union { struct ieee80211_rssi_event rssi; struct ieee80211_mlme_event mlme; struct ieee80211_ba_event ba; } u; }; /** * struct ieee80211_mu_group_data - STA's VHT MU-MIMO group data * * This structure describes the group id data of VHT MU-MIMO * * @membership: 64 bits array - a bit is set if station is member of the group * @position: 2 bits per group id indicating the position in the group */ struct ieee80211_mu_group_data { u8 membership[WLAN_MEMBERSHIP_LEN]; u8 position[WLAN_USER_POSITION_LEN]; }; /** * struct ieee80211_ftm_responder_params - FTM responder parameters * * @lci: LCI subelement content * @civicloc: CIVIC location subelement content * @lci_len: LCI data length * @civicloc_len: Civic data length */ struct ieee80211_ftm_responder_params { const u8 *lci; const u8 *civicloc; size_t lci_len; size_t civicloc_len; }; /** * struct ieee80211_fils_discovery - FILS discovery parameters from * IEEE Std 802.11ai-2016, Annex C.3 MIB detail. * * @min_interval: Minimum packet interval in TUs (0 - 10000) * @max_interval: Maximum packet interval in TUs (0 - 10000) */ struct ieee80211_fils_discovery { u32 min_interval; u32 max_interval; }; #define IEEE80211_TPE_EIRP_ENTRIES_320MHZ 5 struct ieee80211_parsed_tpe_eirp { bool valid; s8 power[IEEE80211_TPE_EIRP_ENTRIES_320MHZ]; u8 count; }; #define IEEE80211_TPE_PSD_ENTRIES_320MHZ 16 struct ieee80211_parsed_tpe_psd { bool valid; s8 power[IEEE80211_TPE_PSD_ENTRIES_320MHZ]; u8 count, n; }; /** * struct ieee80211_parsed_tpe - parsed transmit power envelope information * @max_local: maximum local EIRP, one value for 20, 40, 80, 160, 320 MHz each * (indexed by TX power category) * @max_reg_client: maximum regulatory client EIRP, one value for 20, 40, 80, * 160, 320 MHz each * (indexed by TX power category) * @psd_local: maximum local power spectral density, one value for each 20 MHz * subchannel per bss_conf's chanreq.oper * (indexed by TX power category) * @psd_reg_client: maximum regulatory power spectral density, one value for * each 20 MHz subchannel per bss_conf's chanreq.oper * (indexed by TX power category) */ struct ieee80211_parsed_tpe { struct ieee80211_parsed_tpe_eirp max_local[2], max_reg_client[2]; struct ieee80211_parsed_tpe_psd psd_local[2], psd_reg_client[2]; }; /** * struct ieee80211_bss_conf - holds the BSS's changing parameters * * This structure keeps information about a BSS (and an association * to that BSS) that can change during the lifetime of the BSS. * * @vif: reference to owning VIF * @bss: the cfg80211 bss descriptor. Valid only for a station, and only * when associated. Note: This contains information which is not * necessarily authenticated. For example, information coming from probe * responses. * @addr: (link) address used locally * @link_id: link ID, or 0 for non-MLO * @htc_trig_based_pkt_ext: default PE in 4us units, if BSS supports HE * @uora_exists: is the UORA element advertised by AP * @uora_ocw_range: UORA element's OCW Range field * @frame_time_rts_th: HE duration RTS threshold, in units of 32us * @he_support: does this BSS support HE * @twt_requester: does this BSS support TWT requester (relevant for managed * mode only, set if the AP advertises TWT responder role) * @twt_responder: does this BSS support TWT requester (relevant for managed * mode only, set if the AP advertises TWT responder role) * @twt_protected: does this BSS support protected TWT frames * @twt_broadcast: does this BSS support broadcast TWT * @use_cts_prot: use CTS protection * @use_short_preamble: use 802.11b short preamble * @use_short_slot: use short slot time (only relevant for ERP) * @dtim_period: num of beacons before the next DTIM, for beaconing, * valid in station mode only if after the driver was notified * with the %BSS_CHANGED_BEACON_INFO flag, will be non-zero then. * @sync_tsf: last beacon's/probe response's TSF timestamp (could be old * as it may have been received during scanning long ago). If the * HW flag %IEEE80211_HW_TIMING_BEACON_ONLY is set, then this can * only come from a beacon, but might not become valid until after * association when a beacon is received (which is notified with the * %BSS_CHANGED_DTIM flag.). See also sync_dtim_count important notice. * @sync_device_ts: the device timestamp corresponding to the sync_tsf, * the driver/device can use this to calculate synchronisation * (see @sync_tsf). See also sync_dtim_count important notice. * @sync_dtim_count: Only valid when %IEEE80211_HW_TIMING_BEACON_ONLY * is requested, see @sync_tsf/@sync_device_ts. * IMPORTANT: These three sync_* parameters would possibly be out of sync * by the time the driver will use them. The synchronized view is currently * guaranteed only in certain callbacks. * Note also that this is not used with MLD associations, mac80211 doesn't * know how to track beacons for all of the links for this. * @beacon_int: beacon interval * @assoc_capability: capabilities taken from assoc resp * @basic_rates: bitmap of basic rates, each bit stands for an * index into the rate table configured by the driver in * the current band. * @beacon_rate: associated AP's beacon TX rate * @mcast_rate: per-band multicast rate index + 1 (0: disabled) * @bssid: The BSSID for this BSS * @enable_beacon: whether beaconing should be enabled or not * @chanreq: Channel request for this BSS -- the hardware might be * configured a higher bandwidth than this BSS uses, for example. * @mu_group: VHT MU-MIMO group membership data * @ht_operation_mode: HT operation mode like in &struct ieee80211_ht_operation. * This field is only valid when the channel is a wide HT/VHT channel. * Note that with TDLS this can be the case (channel is HT, protection must * be used from this field) even when the BSS association isn't using HT. * @cqm_rssi_thold: Connection quality monitor RSSI threshold, a zero value * implies disabled. As with the cfg80211 callback, a change here should * cause an event to be sent indicating where the current value is in * relation to the newly configured threshold. * @cqm_rssi_low: Connection quality monitor RSSI lower threshold, a zero value * implies disabled. This is an alternative mechanism to the single * threshold event and can't be enabled simultaneously with it. * @cqm_rssi_high: Connection quality monitor RSSI upper threshold. * @cqm_rssi_hyst: Connection quality monitor RSSI hysteresis * @qos: This is a QoS-enabled BSS. * @hidden_ssid: The SSID of the current vif is hidden. Only valid in AP-mode. * @txpower: TX power in dBm. INT_MIN means not configured. * @txpower_type: TX power adjustment used to control per packet Transmit * Power Control (TPC) in lower driver for the current vif. In particular * TPC is enabled if value passed in %txpower_type is * NL80211_TX_POWER_LIMITED (allow using less than specified from * userspace), whereas TPC is disabled if %txpower_type is set to * NL80211_TX_POWER_FIXED (use value configured from userspace) * @p2p_noa_attr: P2P NoA attribute for P2P powersave * @allow_p2p_go_ps: indication for AP or P2P GO interface, whether it's allowed * to use P2P PS mechanism or not. AP/P2P GO is not allowed to use P2P PS * if it has associated clients without P2P PS support. * @max_idle_period: the time period during which the station can refrain from * transmitting frames to its associated AP without being disassociated. * In units of 1000 TUs. Zero value indicates that the AP did not include * a (valid) BSS Max Idle Period Element. * @protected_keep_alive: if set, indicates that the station should send an RSN * protected frame to the AP to reset the idle timer at the AP for the * station. * @ftm_responder: whether to enable or disable fine timing measurement FTM * responder functionality. * @ftmr_params: configurable lci/civic parameter when enabling FTM responder. * @nontransmitted: this BSS is a nontransmitted BSS profile * @transmitter_bssid: the address of transmitter AP * @bssid_index: index inside the multiple BSSID set * @bssid_indicator: 2^bssid_indicator is the maximum number of APs in set * @ema_ap: AP supports enhancements of discovery and advertisement of * nontransmitted BSSIDs * @profile_periodicity: the least number of beacon frames need to be received * in order to discover all the nontransmitted BSSIDs in the set. * @he_oper: HE operation information of the BSS (AP/Mesh) or of the AP we are * connected to (STA) * @he_obss_pd: OBSS Packet Detection parameters. * @he_bss_color: BSS coloring settings, if BSS supports HE * @fils_discovery: FILS discovery configuration * @unsol_bcast_probe_resp_interval: Unsolicited broadcast probe response * interval. * @beacon_tx_rate: The configured beacon transmit rate that needs to be passed * to driver when rate control is offloaded to firmware. * @power_type: power type of BSS for 6 GHz * @tpe: transmit power envelope information * @pwr_reduction: power constraint of BSS. * @eht_support: does this BSS support EHT * @csa_active: marks whether a channel switch is going on. * @mu_mimo_owner: indicates interface owns MU-MIMO capability * @chanctx_conf: The channel context this interface is assigned to, or %NULL * when it is not assigned. This pointer is RCU-protected due to the TX * path needing to access it; even though the netdev carrier will always * be off when it is %NULL there can still be races and packets could be * processed after it switches back to %NULL. * @color_change_active: marks whether a color change is ongoing. * @color_change_color: the bss color that will be used after the change. * @ht_ldpc: in AP mode, indicates interface has HT LDPC capability. * @vht_ldpc: in AP mode, indicates interface has VHT LDPC capability. * @he_ldpc: in AP mode, indicates interface has HE LDPC capability. * @vht_su_beamformer: in AP mode, does this BSS support operation as an VHT SU * beamformer * @vht_su_beamformee: in AP mode, does this BSS support operation as an VHT SU * beamformee * @vht_mu_beamformer: in AP mode, does this BSS support operation as an VHT MU * beamformer * @vht_mu_beamformee: in AP mode, does this BSS support operation as an VHT MU * beamformee * @he_su_beamformer: in AP-mode, does this BSS support operation as an HE SU * beamformer * @he_su_beamformee: in AP-mode, does this BSS support operation as an HE SU * beamformee * @he_mu_beamformer: in AP-mode, does this BSS support operation as an HE MU * beamformer * @he_full_ul_mumimo: does this BSS support the reception (AP) or transmission * (non-AP STA) of an HE TB PPDU on an RU that spans the entire PPDU * bandwidth * @eht_su_beamformer: in AP-mode, does this BSS enable operation as an EHT SU * beamformer * @eht_su_beamformee: in AP-mode, does this BSS enable operation as an EHT SU * beamformee * @eht_mu_beamformer: in AP-mode, does this BSS enable operation as an EHT MU * beamformer * @eht_80mhz_full_bw_ul_mumimo: in AP-mode, does this BSS support the * reception of an EHT TB PPDU on an RU that spans the entire PPDU * bandwidth * @bss_param_ch_cnt: in BSS-mode, the BSS params change count. This * information is the latest known value. It can come from this link's * beacon or from a beacon sent by another link. * @bss_param_ch_cnt_link_id: in BSS-mode, the link_id to which the beacon * that updated &bss_param_ch_cnt belongs. E.g. if link 1 doesn't hear * its beacons, and link 2 sent a beacon with an RNR element that updated * link 1's BSS params change count, then, link 1's * bss_param_ch_cnt_link_id will be 2. That means that link 1 knows that * link 2 was the link that updated its bss_param_ch_cnt value. * In case link 1 hears its beacon again, bss_param_ch_cnt_link_id will * be updated to 1, even if bss_param_ch_cnt didn't change. This allows * the link to know that it heard the latest value from its own beacon * (as opposed to hearing its value from another link's beacon). */ struct ieee80211_bss_conf { struct ieee80211_vif *vif; struct cfg80211_bss *bss; const u8 *bssid; unsigned int link_id; u8 addr[ETH_ALEN] __aligned(2); u8 htc_trig_based_pkt_ext; bool uora_exists; u8 uora_ocw_range; u16 frame_time_rts_th; bool he_support; bool twt_requester; bool twt_responder; bool twt_protected; bool twt_broadcast; /* erp related data */ bool use_cts_prot; bool use_short_preamble; bool use_short_slot; bool enable_beacon; u8 dtim_period; u16 beacon_int; u16 assoc_capability; u64 sync_tsf; u32 sync_device_ts; u8 sync_dtim_count; u32 basic_rates; struct ieee80211_rate *beacon_rate; int mcast_rate[NUM_NL80211_BANDS]; u16 ht_operation_mode; s32 cqm_rssi_thold; u32 cqm_rssi_hyst; s32 cqm_rssi_low; s32 cqm_rssi_high; struct ieee80211_chan_req chanreq; struct ieee80211_mu_group_data mu_group; bool qos; bool hidden_ssid; int txpower; enum nl80211_tx_power_setting txpower_type; struct ieee80211_p2p_noa_attr p2p_noa_attr; bool allow_p2p_go_ps; u16 max_idle_period; bool protected_keep_alive; bool ftm_responder; struct ieee80211_ftm_responder_params *ftmr_params; /* Multiple BSSID data */ bool nontransmitted; u8 transmitter_bssid[ETH_ALEN]; u8 bssid_index; u8 bssid_indicator; bool ema_ap; u8 profile_periodicity; struct { u32 params; u16 nss_set; } he_oper; struct ieee80211_he_obss_pd he_obss_pd; struct cfg80211_he_bss_color he_bss_color; struct ieee80211_fils_discovery fils_discovery; u32 unsol_bcast_probe_resp_interval; struct cfg80211_bitrate_mask beacon_tx_rate; enum ieee80211_ap_reg_power power_type; struct ieee80211_parsed_tpe tpe; u8 pwr_reduction; bool eht_support; bool csa_active; bool mu_mimo_owner; struct ieee80211_chanctx_conf __rcu *chanctx_conf; bool color_change_active; u8 color_change_color; bool ht_ldpc; bool vht_ldpc; bool he_ldpc; bool vht_su_beamformer; bool vht_su_beamformee; bool vht_mu_beamformer; bool vht_mu_beamformee; bool he_su_beamformer; bool he_su_beamformee; bool he_mu_beamformer; bool he_full_ul_mumimo; bool eht_su_beamformer; bool eht_su_beamformee; bool eht_mu_beamformer; bool eht_80mhz_full_bw_ul_mumimo; u8 bss_param_ch_cnt; u8 bss_param_ch_cnt_link_id; }; /** * enum mac80211_tx_info_flags - flags to describe transmission information/status * * These flags are used with the @flags member of &ieee80211_tx_info. * * @IEEE80211_TX_CTL_REQ_TX_STATUS: require TX status callback for this frame. * @IEEE80211_TX_CTL_ASSIGN_SEQ: The driver has to assign a sequence * number to this frame, taking care of not overwriting the fragment * number and increasing the sequence number only when the * IEEE80211_TX_CTL_FIRST_FRAGMENT flag is set. mac80211 will properly * assign sequence numbers to QoS-data frames but cannot do so correctly * for non-QoS-data and management frames because beacons need them from * that counter as well and mac80211 cannot guarantee proper sequencing. * If this flag is set, the driver should instruct the hardware to * assign a sequence number to the frame or assign one itself. Cf. IEEE * 802.11-2007 7.1.3.4.1 paragraph 3. This flag will always be set for * beacons and always be clear for frames without a sequence number field. * @IEEE80211_TX_CTL_NO_ACK: tell the low level not to wait for an ack * @IEEE80211_TX_CTL_CLEAR_PS_FILT: clear powersave filter for destination * station * @IEEE80211_TX_CTL_FIRST_FRAGMENT: this is a first fragment of the frame * @IEEE80211_TX_CTL_SEND_AFTER_DTIM: send this frame after DTIM beacon * @IEEE80211_TX_CTL_AMPDU: this frame should be sent as part of an A-MPDU * @IEEE80211_TX_CTL_INJECTED: Frame was injected, internal to mac80211. * @IEEE80211_TX_STAT_TX_FILTERED: The frame was not transmitted * because the destination STA was in powersave mode. Note that to * avoid race conditions, the filter must be set by the hardware or * firmware upon receiving a frame that indicates that the station * went to sleep (must be done on device to filter frames already on * the queue) and may only be unset after mac80211 gives the OK for * that by setting the IEEE80211_TX_CTL_CLEAR_PS_FILT (see above), * since only then is it guaranteed that no more frames are in the * hardware queue. * @IEEE80211_TX_STAT_ACK: Frame was acknowledged * @IEEE80211_TX_STAT_AMPDU: The frame was aggregated, so status * is for the whole aggregation. * @IEEE80211_TX_STAT_AMPDU_NO_BACK: no block ack was returned, * so consider using block ack request (BAR). * @IEEE80211_TX_CTL_RATE_CTRL_PROBE: internal to mac80211, can be * set by rate control algorithms to indicate probe rate, will * be cleared for fragmented frames (except on the last fragment) * @IEEE80211_TX_INTFL_OFFCHAN_TX_OK: Internal to mac80211. Used to indicate * that a frame can be transmitted while the queues are stopped for * off-channel operation. * @IEEE80211_TX_CTL_HW_80211_ENCAP: This frame uses hardware encapsulation * (header conversion) * @IEEE80211_TX_INTFL_RETRIED: completely internal to mac80211, * used to indicate that a frame was already retried due to PS * @IEEE80211_TX_INTFL_DONT_ENCRYPT: completely internal to mac80211, * used to indicate frame should not be encrypted * @IEEE80211_TX_CTL_NO_PS_BUFFER: This frame is a response to a poll * frame (PS-Poll or uAPSD) or a non-bufferable MMPDU and must * be sent although the station is in powersave mode. * @IEEE80211_TX_CTL_MORE_FRAMES: More frames will be passed to the * transmit function after the current frame, this can be used * by drivers to kick the DMA queue only if unset or when the * queue gets full. * @IEEE80211_TX_INTFL_RETRANSMISSION: This frame is being retransmitted * after TX status because the destination was asleep, it must not * be modified again (no seqno assignment, crypto, etc.) * @IEEE80211_TX_INTFL_MLME_CONN_TX: This frame was transmitted by the MLME * code for connection establishment, this indicates that its status * should kick the MLME state machine. * @IEEE80211_TX_INTFL_NL80211_FRAME_TX: Frame was requested through nl80211 * MLME command (internal to mac80211 to figure out whether to send TX * status to user space) * @IEEE80211_TX_CTL_LDPC: tells the driver to use LDPC for this frame * @IEEE80211_TX_CTL_STBC: Enables Space-Time Block Coding (STBC) for this * frame and selects the maximum number of streams that it can use. * @IEEE80211_TX_CTL_TX_OFFCHAN: Marks this packet to be transmitted on * the off-channel channel when a remain-on-channel offload is done * in hardware -- normal packets still flow and are expected to be * handled properly by the device. * @IEEE80211_TX_INTFL_TKIP_MIC_FAILURE: Marks this packet to be used for TKIP * testing. It will be sent out with incorrect Michael MIC key to allow * TKIP countermeasures to be tested. * @IEEE80211_TX_CTL_NO_CCK_RATE: This frame will be sent at non CCK rate. * This flag is actually used for management frame especially for P2P * frames not being sent at CCK rate in 2GHz band. * @IEEE80211_TX_STATUS_EOSP: This packet marks the end of service period, * when its status is reported the service period ends. For frames in * an SP that mac80211 transmits, it is already set; for driver frames * the driver may set this flag. It is also used to do the same for * PS-Poll responses. * @IEEE80211_TX_CTL_USE_MINRATE: This frame will be sent at lowest rate. * This flag is used to send nullfunc frame at minimum rate when * the nullfunc is used for connection monitoring purpose. * @IEEE80211_TX_CTL_DONTFRAG: Don't fragment this packet even if it * would be fragmented by size (this is optional, only used for * monitor injection). * @IEEE80211_TX_STAT_NOACK_TRANSMITTED: A frame that was marked with * IEEE80211_TX_CTL_NO_ACK has been successfully transmitted without * any errors (like issues specific to the driver/HW). * This flag must not be set for frames that don't request no-ack * behaviour with IEEE80211_TX_CTL_NO_ACK. * * Note: If you have to add new flags to the enumeration, then don't * forget to update %IEEE80211_TX_TEMPORARY_FLAGS when necessary. */ enum mac80211_tx_info_flags { IEEE80211_TX_CTL_REQ_TX_STATUS = BIT(0), IEEE80211_TX_CTL_ASSIGN_SEQ = BIT(1), IEEE80211_TX_CTL_NO_ACK = BIT(2), IEEE80211_TX_CTL_CLEAR_PS_FILT = BIT(3), IEEE80211_TX_CTL_FIRST_FRAGMENT = BIT(4), IEEE80211_TX_CTL_SEND_AFTER_DTIM = BIT(5), IEEE80211_TX_CTL_AMPDU = BIT(6), IEEE80211_TX_CTL_INJECTED = BIT(7), IEEE80211_TX_STAT_TX_FILTERED = BIT(8), IEEE80211_TX_STAT_ACK = BIT(9), IEEE80211_TX_STAT_AMPDU = BIT(10), IEEE80211_TX_STAT_AMPDU_NO_BACK = BIT(11), IEEE80211_TX_CTL_RATE_CTRL_PROBE = BIT(12), IEEE80211_TX_INTFL_OFFCHAN_TX_OK = BIT(13), IEEE80211_TX_CTL_HW_80211_ENCAP = BIT(14), IEEE80211_TX_INTFL_RETRIED = BIT(15), IEEE80211_TX_INTFL_DONT_ENCRYPT = BIT(16), IEEE80211_TX_CTL_NO_PS_BUFFER = BIT(17), IEEE80211_TX_CTL_MORE_FRAMES = BIT(18), IEEE80211_TX_INTFL_RETRANSMISSION = BIT(19), IEEE80211_TX_INTFL_MLME_CONN_TX = BIT(20), IEEE80211_TX_INTFL_NL80211_FRAME_TX = BIT(21), IEEE80211_TX_CTL_LDPC = BIT(22), IEEE80211_TX_CTL_STBC = BIT(23) | BIT(24), IEEE80211_TX_CTL_TX_OFFCHAN = BIT(25), IEEE80211_TX_INTFL_TKIP_MIC_FAILURE = BIT(26), IEEE80211_TX_CTL_NO_CCK_RATE = BIT(27), IEEE80211_TX_STATUS_EOSP = BIT(28), IEEE80211_TX_CTL_USE_MINRATE = BIT(29), IEEE80211_TX_CTL_DONTFRAG = BIT(30), IEEE80211_TX_STAT_NOACK_TRANSMITTED = BIT(31), }; #define IEEE80211_TX_CTL_STBC_SHIFT 23 #define IEEE80211_TX_RC_S1G_MCS IEEE80211_TX_RC_VHT_MCS /** * enum mac80211_tx_control_flags - flags to describe transmit control * * @IEEE80211_TX_CTRL_PORT_CTRL_PROTO: this frame is a port control * protocol frame (e.g. EAP) * @IEEE80211_TX_CTRL_PS_RESPONSE: This frame is a response to a poll * frame (PS-Poll or uAPSD). * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xmit path * @IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP: This frame skips mesh path lookup * @IEEE80211_TX_INTCFL_NEED_TXPROCESSING: completely internal to mac80211, * used to indicate that a pending frame requires TX processing before * it can be sent out. * @IEEE80211_TX_CTRL_NO_SEQNO: Do not overwrite the sequence number that * has already been assigned to this frame. * @IEEE80211_TX_CTRL_DONT_REORDER: This frame should not be reordered * relative to other frames that have this flag set, independent * of their QoS TID or other priority field values. * @IEEE80211_TX_CTRL_MCAST_MLO_FIRST_TX: first MLO TX, used mostly internally * for sequence number assignment * @IEEE80211_TX_CTRL_DONT_USE_RATE_MASK: Don't use rate mask for this frame * which is transmitted due to scanning or offchannel TX, not in normal * operation on the interface. * @IEEE80211_TX_CTRL_MLO_LINK: If not @IEEE80211_LINK_UNSPECIFIED, this * frame should be transmitted on the specific link. This really is * only relevant for frames that do not have data present, and is * also not used for 802.3 format frames. Note that even if the frame * is on a specific link, address translation might still apply if * it's intended for an MLD. * * These flags are used in tx_info->control.flags. */ enum mac80211_tx_control_flags { IEEE80211_TX_CTRL_PORT_CTRL_PROTO = BIT(0), IEEE80211_TX_CTRL_PS_RESPONSE = BIT(1), IEEE80211_TX_CTRL_RATE_INJECT = BIT(2), IEEE80211_TX_CTRL_AMSDU = BIT(3), IEEE80211_TX_CTRL_FAST_XMIT = BIT(4), IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP = BIT(5), IEEE80211_TX_INTCFL_NEED_TXPROCESSING = BIT(6), IEEE80211_TX_CTRL_NO_SEQNO = BIT(7), IEEE80211_TX_CTRL_DONT_REORDER = BIT(8), IEEE80211_TX_CTRL_MCAST_MLO_FIRST_TX = BIT(9), IEEE80211_TX_CTRL_DONT_USE_RATE_MASK = BIT(10), IEEE80211_TX_CTRL_MLO_LINK = 0xf0000000, }; #define IEEE80211_LINK_UNSPECIFIED 0xf #define IEEE80211_TX_CTRL_MLO_LINK_UNSPEC \ u32_encode_bits(IEEE80211_LINK_UNSPECIFIED, \ IEEE80211_TX_CTRL_MLO_LINK) /** * enum mac80211_tx_status_flags - flags to describe transmit status * * @IEEE80211_TX_STATUS_ACK_SIGNAL_VALID: ACK signal is valid * * These flags are used in tx_info->status.flags. */ enum mac80211_tx_status_flags { IEEE80211_TX_STATUS_ACK_SIGNAL_VALID = BIT(0), }; /* * This definition is used as a mask to clear all temporary flags, which are * set by the tx handlers for each transmission attempt by the mac80211 stack. */ #define IEEE80211_TX_TEMPORARY_FLAGS (IEEE80211_TX_CTL_NO_ACK | \ IEEE80211_TX_CTL_CLEAR_PS_FILT | IEEE80211_TX_CTL_FIRST_FRAGMENT | \ IEEE80211_TX_CTL_SEND_AFTER_DTIM | IEEE80211_TX_CTL_AMPDU | \ IEEE80211_TX_STAT_TX_FILTERED | IEEE80211_TX_STAT_ACK | \ IEEE80211_TX_STAT_AMPDU | IEEE80211_TX_STAT_AMPDU_NO_BACK | \ IEEE80211_TX_CTL_RATE_CTRL_PROBE | IEEE80211_TX_CTL_NO_PS_BUFFER | \ IEEE80211_TX_CTL_MORE_FRAMES | IEEE80211_TX_CTL_LDPC | \ IEEE80211_TX_CTL_STBC | IEEE80211_TX_STATUS_EOSP) /** * enum mac80211_rate_control_flags - per-rate flags set by the * Rate Control algorithm. * * These flags are set by the Rate control algorithm for each rate during tx, * in the @flags member of struct ieee80211_tx_rate. * * @IEEE80211_TX_RC_USE_RTS_CTS: Use RTS/CTS exchange for this rate. * @IEEE80211_TX_RC_USE_CTS_PROTECT: CTS-to-self protection is required. * This is set if the current BSS requires ERP protection. * @IEEE80211_TX_RC_USE_SHORT_PREAMBLE: Use short preamble. * @IEEE80211_TX_RC_MCS: HT rate. * @IEEE80211_TX_RC_VHT_MCS: VHT MCS rate, in this case the idx field is split * into a higher 4 bits (Nss) and lower 4 bits (MCS number) * @IEEE80211_TX_RC_GREEN_FIELD: Indicates whether this rate should be used in * Greenfield mode. * @IEEE80211_TX_RC_40_MHZ_WIDTH: Indicates if the Channel Width should be 40 MHz. * @IEEE80211_TX_RC_80_MHZ_WIDTH: Indicates 80 MHz transmission * @IEEE80211_TX_RC_160_MHZ_WIDTH: Indicates 160 MHz transmission * (80+80 isn't supported yet) * @IEEE80211_TX_RC_DUP_DATA: The frame should be transmitted on both of the * adjacent 20 MHz channels, if the current channel type is * NL80211_CHAN_HT40MINUS or NL80211_CHAN_HT40PLUS. * @IEEE80211_TX_RC_SHORT_GI: Short Guard interval should be used for this rate. */ enum mac80211_rate_control_flags { IEEE80211_TX_RC_USE_RTS_CTS = BIT(0), IEEE80211_TX_RC_USE_CTS_PROTECT = BIT(1), IEEE80211_TX_RC_USE_SHORT_PREAMBLE = BIT(2), /* rate index is an HT/VHT MCS instead of an index */ IEEE80211_TX_RC_MCS = BIT(3), IEEE80211_TX_RC_GREEN_FIELD = BIT(4), IEEE80211_TX_RC_40_MHZ_WIDTH = BIT(5), IEEE80211_TX_RC_DUP_DATA = BIT(6), IEEE80211_TX_RC_SHORT_GI = BIT(7), IEEE80211_TX_RC_VHT_MCS = BIT(8), IEEE80211_TX_RC_80_MHZ_WIDTH = BIT(9), IEEE80211_TX_RC_160_MHZ_WIDTH = BIT(10), }; /* there are 40 bytes if you don't need the rateset to be kept */ #define IEEE80211_TX_INFO_DRIVER_DATA_SIZE 40 /* if you do need the rateset, then you have less space */ #define IEEE80211_TX_INFO_RATE_DRIVER_DATA_SIZE 24 /* maximum number of rate stages */ #define IEEE80211_TX_MAX_RATES 4 /* maximum number of rate table entries */ #define IEEE80211_TX_RATE_TABLE_SIZE 4 /** * struct ieee80211_tx_rate - rate selection/status * * @idx: rate index to attempt to send with * @flags: rate control flags (&enum mac80211_rate_control_flags) * @count: number of tries in this rate before going to the next rate * * A value of -1 for @idx indicates an invalid rate and, if used * in an array of retry rates, that no more rates should be tried. * * When used for transmit status reporting, the driver should * always report the rate along with the flags it used. * * &struct ieee80211_tx_info contains an array of these structs * in the control information, and it will be filled by the rate * control algorithm according to what should be sent. For example, * if this array contains, in the format { <idx>, <count> } the * information:: * * { 3, 2 }, { 2, 2 }, { 1, 4 }, { -1, 0 }, { -1, 0 } * * then this means that the frame should be transmitted * up to twice at rate 3, up to twice at rate 2, and up to four * times at rate 1 if it doesn't get acknowledged. Say it gets * acknowledged by the peer after the fifth attempt, the status * information should then contain:: * * { 3, 2 }, { 2, 2 }, { 1, 1 }, { -1, 0 } ... * * since it was transmitted twice at rate 3, twice at rate 2 * and once at rate 1 after which we received an acknowledgement. */ struct ieee80211_tx_rate { s8 idx; u16 count:5, flags:11; } __packed; #define IEEE80211_MAX_TX_RETRY 31 static inline bool ieee80211_rate_valid(struct ieee80211_tx_rate *rate) { return rate->idx >= 0 && rate->count > 0; } static inline void ieee80211_rate_set_vht(struct ieee80211_tx_rate *rate, u8 mcs, u8 nss) { WARN_ON(mcs & ~0xF); WARN_ON((nss - 1) & ~0x7); rate->idx = ((nss - 1) << 4) | mcs; } static inline u8 ieee80211_rate_get_vht_mcs(const struct ieee80211_tx_rate *rate) { return rate->idx & 0xF; } static inline u8 ieee80211_rate_get_vht_nss(const struct ieee80211_tx_rate *rate) { return (rate->idx >> 4) + 1; } /** * struct ieee80211_tx_info - skb transmit information * * This structure is placed in skb->cb for three uses: * (1) mac80211 TX control - mac80211 tells the driver what to do * (2) driver internal use (if applicable) * (3) TX status information - driver tells mac80211 what happened * * @flags: transmit info flags, defined above * @band: the band to transmit on (use e.g. for checking for races), * not valid if the interface is an MLD since we won't know which * link the frame will be transmitted on * @hw_queue: HW queue to put the frame on, skb_get_queue_mapping() gives the AC * @status_data: internal data for TX status handling, assigned privately, * see also &enum ieee80211_status_data for the internal documentation * @status_data_idr: indicates status data is IDR allocated ID for ack frame * @tx_time_est: TX time estimate in units of 4us, used internally * @control: union part for control data * @control.rates: TX rates array to try * @control.rts_cts_rate_idx: rate for RTS or CTS * @control.use_rts: use RTS * @control.use_cts_prot: use RTS/CTS * @control.short_preamble: use short preamble (CCK only) * @control.skip_table: skip externally configured rate table * @control.jiffies: timestamp for expiry on powersave clients * @control.vif: virtual interface (may be NULL) * @control.hw_key: key to encrypt with (may be NULL) * @control.flags: control flags, see &enum mac80211_tx_control_flags * @control.enqueue_time: enqueue time (for iTXQs) * @driver_rates: alias to @control.rates to reserve space * @pad: padding * @rate_driver_data: driver use area if driver needs @control.rates * @status: union part for status data * @status.rates: attempted rates * @status.ack_signal: ACK signal * @status.ampdu_ack_len: AMPDU ack length * @status.ampdu_len: AMPDU length * @status.antenna: (legacy, kept only for iwlegacy) * @status.tx_time: airtime consumed for transmission; note this is only * used for WMM AC, not for airtime fairness * @status.flags: status flags, see &enum mac80211_tx_status_flags * @status.status_driver_data: driver use area * @ack: union part for pure ACK data * @ack.cookie: cookie for the ACK * @driver_data: array of driver_data pointers */ struct ieee80211_tx_info { /* common information */ u32 flags; u32 band:3, status_data_idr:1, status_data:13, hw_queue:4, tx_time_est:10; /* 1 free bit */ union { struct { union { /* rate control */ struct { struct ieee80211_tx_rate rates[ IEEE80211_TX_MAX_RATES]; s8 rts_cts_rate_idx; u8 use_rts:1; u8 use_cts_prot:1; u8 short_preamble:1; u8 skip_table:1; /* for injection only (bitmap) */ u8 antennas:2; /* 14 bits free */ }; /* only needed before rate control */ unsigned long jiffies; }; /* NB: vif can be NULL for injected frames */ struct ieee80211_vif *vif; struct ieee80211_key_conf *hw_key; u32 flags; codel_time_t enqueue_time; } control; struct { u64 cookie; } ack; struct { struct ieee80211_tx_rate rates[IEEE80211_TX_MAX_RATES]; s32 ack_signal; u8 ampdu_ack_len; u8 ampdu_len; u8 antenna; u8 pad; u16 tx_time; u8 flags; u8 pad2; void *status_driver_data[16 / sizeof(void *)]; } status; struct { struct ieee80211_tx_rate driver_rates[ IEEE80211_TX_MAX_RATES]; u8 pad[4]; void *rate_driver_data[ IEEE80211_TX_INFO_RATE_DRIVER_DATA_SIZE / sizeof(void *)]; }; void *driver_data[ IEEE80211_TX_INFO_DRIVER_DATA_SIZE / sizeof(void *)]; }; }; static inline u16 ieee80211_info_set_tx_time_est(struct ieee80211_tx_info *info, u16 tx_time_est) { /* We only have 10 bits in tx_time_est, so store airtime * in increments of 4us and clamp the maximum to 2**12-1 */ info->tx_time_est = min_t(u16, tx_time_est, 4095) >> 2; return info->tx_time_est << 2; } static inline u16 ieee80211_info_get_tx_time_est(struct ieee80211_tx_info *info) { return info->tx_time_est << 2; } /*** * struct ieee80211_rate_status - mrr stage for status path * * This struct is used in struct ieee80211_tx_status to provide drivers a * dynamic way to report about used rates and power levels per packet. * * @rate_idx The actual used rate. * @try_count How often the rate was tried. * @tx_power_idx An idx into the ieee80211_hw->tx_power_levels list of the * corresponding wifi hardware. The idx shall point to the power level * that was used when sending the packet. */ struct ieee80211_rate_status { struct rate_info rate_idx; u8 try_count; u8 tx_power_idx; }; /** * struct ieee80211_tx_status - extended tx status info for rate control * * @sta: Station that the packet was transmitted for * @info: Basic tx status information * @skb: Packet skb (can be NULL if not provided by the driver) * @rates: Mrr stages that were used when sending the packet * @n_rates: Number of mrr stages (count of instances for @rates) * @free_list: list where processed skbs are stored to be free'd by the driver * @ack_hwtstamp: Hardware timestamp of the received ack in nanoseconds * Only needed for Timing measurement and Fine timing measurement action * frames. Only reported by devices that have timestamping enabled. */ struct ieee80211_tx_status { struct ieee80211_sta *sta; struct ieee80211_tx_info *info; struct sk_buff *skb; struct ieee80211_rate_status *rates; ktime_t ack_hwtstamp; u8 n_rates; struct list_head *free_list; }; /** * struct ieee80211_scan_ies - descriptors for different blocks of IEs * * This structure is used to point to different blocks of IEs in HW scan * and scheduled scan. These blocks contain the IEs passed by userspace * and the ones generated by mac80211. * * @ies: pointers to band specific IEs. * @len: lengths of band_specific IEs. * @common_ies: IEs for all bands (especially vendor specific ones) * @common_ie_len: length of the common_ies */ struct ieee80211_scan_ies { const u8 *ies[NUM_NL80211_BANDS]; size_t len[NUM_NL80211_BANDS]; const u8 *common_ies; size_t common_ie_len; }; static inline struct ieee80211_tx_info *IEEE80211_SKB_CB(struct sk_buff *skb) { return (struct ieee80211_tx_info *)skb->cb; } static inline struct ieee80211_rx_status *IEEE80211_SKB_RXCB(struct sk_buff *skb) { return (struct ieee80211_rx_status *)skb->cb; } /** * ieee80211_tx_info_clear_status - clear TX status * * @info: The &struct ieee80211_tx_info to be cleared. * * When the driver passes an skb back to mac80211, it must report * a number of things in TX status. This function clears everything * in the TX status but the rate control information (it does clear * the count since you need to fill that in anyway). * * NOTE: While the rates array is kept intact, this will wipe all of the * driver_data fields in info, so it's up to the driver to restore * any fields it needs after calling this helper. */ static inline void ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) { int i; BUILD_BUG_ON(offsetof(struct ieee80211_tx_info, status.rates) != offsetof(struct ieee80211_tx_info, control.rates)); BUILD_BUG_ON(offsetof(struct ieee80211_tx_info, status.rates) != offsetof(struct ieee80211_tx_info, driver_rates)); BUILD_BUG_ON(offsetof(struct ieee80211_tx_info, status.rates) != 8); /* clear the rate counts */ for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) info->status.rates[i].count = 0; memset_after(&info->status, 0, rates); } /** * enum mac80211_rx_flags - receive flags * * These flags are used with the @flag member of &struct ieee80211_rx_status. * @RX_FLAG_MMIC_ERROR: Michael MIC error was reported on this frame. * Use together with %RX_FLAG_MMIC_STRIPPED. * @RX_FLAG_DECRYPTED: This frame was decrypted in hardware. * @RX_FLAG_MMIC_STRIPPED: the Michael MIC is stripped off this frame, * verification has been done by the hardware. * @RX_FLAG_IV_STRIPPED: The IV and ICV are stripped from this frame. * If this flag is set, the stack cannot do any replay detection * hence the driver or hardware will have to do that. * @RX_FLAG_PN_VALIDATED: Currently only valid for CCMP/GCMP frames, this * flag indicates that the PN was verified for replay protection. * Note that this flag is also currently only supported when a frame * is also decrypted (ie. @RX_FLAG_DECRYPTED must be set) * @RX_FLAG_DUP_VALIDATED: The driver should set this flag if it did * de-duplication by itself. * @RX_FLAG_FAILED_FCS_CRC: Set this flag if the FCS check failed on * the frame. * @RX_FLAG_FAILED_PLCP_CRC: Set this flag if the PCLP check failed on * the frame. * @RX_FLAG_MACTIME: The timestamp passed in the RX status (@mactime * field) is valid if this field is non-zero, and the position * where the timestamp was sampled depends on the value. * @RX_FLAG_MACTIME_START: The timestamp passed in the RX status (@mactime * field) is valid and contains the time the first symbol of the MPDU * was received. This is useful in monitor mode and for proper IBSS * merging. * @RX_FLAG_MACTIME_END: The timestamp passed in the RX status (@mactime * field) is valid and contains the time the last symbol of the MPDU * (including FCS) was received. * @RX_FLAG_MACTIME_PLCP_START: The timestamp passed in the RX status (@mactime * field) is valid and contains the time the SYNC preamble was received. * @RX_FLAG_MACTIME_IS_RTAP_TS64: The timestamp passed in the RX status @mactime * is only for use in the radiotap timestamp header, not otherwise a valid * @mactime value. Note this is a separate flag so that we continue to see * %RX_FLAG_MACTIME as unset. Also note that in this case the timestamp is * reported to be 64 bits wide, not just 32. * @RX_FLAG_NO_SIGNAL_VAL: The signal strength value is not present. * Valid only for data frames (mainly A-MPDU) * @RX_FLAG_AMPDU_DETAILS: A-MPDU details are known, in particular the reference * number (@ampdu_reference) must be populated and be a distinct number for * each A-MPDU * @RX_FLAG_AMPDU_LAST_KNOWN: last subframe is known, should be set on all * subframes of a single A-MPDU * @RX_FLAG_AMPDU_IS_LAST: this subframe is the last subframe of the A-MPDU * @RX_FLAG_AMPDU_DELIM_CRC_ERROR: A delimiter CRC error has been detected * on this subframe * @RX_FLAG_MIC_STRIPPED: The mic was stripped of this packet. Decryption was * done by the hardware * @RX_FLAG_ONLY_MONITOR: Report frame only to monitor interfaces without * processing it in any regular way. * This is useful if drivers offload some frames but still want to report * them for sniffing purposes. * @RX_FLAG_SKIP_MONITOR: Process and report frame to all interfaces except * monitor interfaces. * This is useful if drivers offload some frames but still want to report * them for sniffing purposes. * @RX_FLAG_AMSDU_MORE: Some drivers may prefer to report separate A-MSDU * subframes instead of a one huge frame for performance reasons. * All, but the last MSDU from an A-MSDU should have this flag set. E.g. * if an A-MSDU has 3 frames, the first 2 must have the flag set, while * the 3rd (last) one must not have this flag set. The flag is used to * deal with retransmission/duplication recovery properly since A-MSDU * subframes share the same sequence number. Reported subframes can be * either regular MSDU or singly A-MSDUs. Subframes must not be * interleaved with other frames. * @RX_FLAG_RADIOTAP_TLV_AT_END: This frame contains radiotap TLVs in the * skb->data (before the 802.11 header). * If used, the SKB's mac_header pointer must be set to point * to the 802.11 header after the TLVs, and any padding added after TLV * data to align to 4 must be cleared by the driver putting the TLVs * in the skb. * @RX_FLAG_ALLOW_SAME_PN: Allow the same PN as same packet before. * This is used for AMSDU subframes which can have the same PN as * the first subframe. * @RX_FLAG_ICV_STRIPPED: The ICV is stripped from this frame. CRC checking must * be done in the hardware. * @RX_FLAG_AMPDU_EOF_BIT: Value of the EOF bit in the A-MPDU delimiter for this * frame * @RX_FLAG_AMPDU_EOF_BIT_KNOWN: The EOF value is known * @RX_FLAG_RADIOTAP_HE: HE radiotap data is present * (&struct ieee80211_radiotap_he, mac80211 will fill in * * - DATA3_DATA_MCS * - DATA3_DATA_DCM * - DATA3_CODING * - DATA5_GI * - DATA5_DATA_BW_RU_ALLOC * - DATA6_NSTS * - DATA3_STBC * * from the RX info data, so leave those zeroed when building this data) * @RX_FLAG_RADIOTAP_HE_MU: HE MU radiotap data is present * (&struct ieee80211_radiotap_he_mu) * @RX_FLAG_RADIOTAP_LSIG: L-SIG radiotap data is present * @RX_FLAG_NO_PSDU: use the frame only for radiotap reporting, with * the "0-length PSDU" field included there. The value for it is * in &struct ieee80211_rx_status. Note that if this value isn't * known the frame shouldn't be reported. * @RX_FLAG_8023: the frame has an 802.3 header (decap offload performed by * hardware or driver) */ enum mac80211_rx_flags { RX_FLAG_MMIC_ERROR = BIT(0), RX_FLAG_DECRYPTED = BIT(1), RX_FLAG_ONLY_MONITOR = BIT(2), RX_FLAG_MMIC_STRIPPED = BIT(3), RX_FLAG_IV_STRIPPED = BIT(4), RX_FLAG_FAILED_FCS_CRC = BIT(5), RX_FLAG_FAILED_PLCP_CRC = BIT(6), RX_FLAG_MACTIME_IS_RTAP_TS64 = BIT(7), RX_FLAG_NO_SIGNAL_VAL = BIT(8), RX_FLAG_AMPDU_DETAILS = BIT(9), RX_FLAG_PN_VALIDATED = BIT(10), RX_FLAG_DUP_VALIDATED = BIT(11), RX_FLAG_AMPDU_LAST_KNOWN = BIT(12), RX_FLAG_AMPDU_IS_LAST = BIT(13), RX_FLAG_AMPDU_DELIM_CRC_ERROR = BIT(14), /* one free bit at 15 */ RX_FLAG_MACTIME = BIT(16) | BIT(17), RX_FLAG_MACTIME_PLCP_START = 1 << 16, RX_FLAG_MACTIME_START = 2 << 16, RX_FLAG_MACTIME_END = 3 << 16, RX_FLAG_SKIP_MONITOR = BIT(18), RX_FLAG_AMSDU_MORE = BIT(19), RX_FLAG_RADIOTAP_TLV_AT_END = BIT(20), RX_FLAG_MIC_STRIPPED = BIT(21), RX_FLAG_ALLOW_SAME_PN = BIT(22), RX_FLAG_ICV_STRIPPED = BIT(23), RX_FLAG_AMPDU_EOF_BIT = BIT(24), RX_FLAG_AMPDU_EOF_BIT_KNOWN = BIT(25), RX_FLAG_RADIOTAP_HE = BIT(26), RX_FLAG_RADIOTAP_HE_MU = BIT(27), RX_FLAG_RADIOTAP_LSIG = BIT(28), RX_FLAG_NO_PSDU = BIT(29), RX_FLAG_8023 = BIT(30), }; /** * enum mac80211_rx_encoding_flags - MCS & bandwidth flags * * @RX_ENC_FLAG_SHORTPRE: Short preamble was used for this frame * @RX_ENC_FLAG_SHORT_GI: Short guard interval was used * @RX_ENC_FLAG_HT_GF: This frame was received in a HT-greenfield transmission, * if the driver fills this value it should add * %IEEE80211_RADIOTAP_MCS_HAVE_FMT * to @hw.radiotap_mcs_details to advertise that fact. * @RX_ENC_FLAG_LDPC: LDPC was used * @RX_ENC_FLAG_STBC_MASK: STBC 2 bit bitmask. 1 - Nss=1, 2 - Nss=2, 3 - Nss=3 * @RX_ENC_FLAG_BF: packet was beamformed */ enum mac80211_rx_encoding_flags { RX_ENC_FLAG_SHORTPRE = BIT(0), RX_ENC_FLAG_SHORT_GI = BIT(2), RX_ENC_FLAG_HT_GF = BIT(3), RX_ENC_FLAG_STBC_MASK = BIT(4) | BIT(5), RX_ENC_FLAG_LDPC = BIT(6), RX_ENC_FLAG_BF = BIT(7), }; #define RX_ENC_FLAG_STBC_SHIFT 4 enum mac80211_rx_encoding { RX_ENC_LEGACY = 0, RX_ENC_HT, RX_ENC_VHT, RX_ENC_HE, RX_ENC_EHT, }; /** * struct ieee80211_rx_status - receive status * * The low-level driver should provide this information (the subset * supported by hardware) to the 802.11 code with each received * frame, in the skb's control buffer (cb). * * @mactime: value in microseconds of the 64-bit Time Synchronization Function * (TSF) timer when the first data symbol (MPDU) arrived at the hardware. * @boottime_ns: CLOCK_BOOTTIME timestamp the frame was received at, this is * needed only for beacons and probe responses that update the scan cache. * @ack_tx_hwtstamp: Hardware timestamp for the ack TX in nanoseconds. Only * needed for Timing measurement and Fine timing measurement action frames. * Only reported by devices that have timestamping enabled. * @device_timestamp: arbitrary timestamp for the device, mac80211 doesn't use * it but can store it and pass it back to the driver for synchronisation * @band: the active band when this frame was received * @freq: frequency the radio was tuned to when receiving this frame, in MHz * This field must be set for management frames, but isn't strictly needed * for data (other) frames - for those it only affects radiotap reporting. * @freq_offset: @freq has a positive offset of 500Khz. * @signal: signal strength when receiving this frame, either in dBm, in dB or * unspecified depending on the hardware capabilities flags * @IEEE80211_HW_SIGNAL_* * @chains: bitmask of receive chains for which separate signal strength * values were filled. * @chain_signal: per-chain signal strength, in dBm (unlike @signal, doesn't * support dB or unspecified units) * @antenna: antenna used * @rate_idx: index of data rate into band's supported rates or MCS index if * HT or VHT is used (%RX_FLAG_HT/%RX_FLAG_VHT) * @nss: number of streams (VHT, HE and EHT only) * @flag: %RX_FLAG_\* * @encoding: &enum mac80211_rx_encoding * @bw: &enum rate_info_bw * @enc_flags: uses bits from &enum mac80211_rx_encoding_flags * @he_ru: HE RU, from &enum nl80211_he_ru_alloc * @he_gi: HE GI, from &enum nl80211_he_gi * @he_dcm: HE DCM value * @eht: EHT specific rate information * @eht.ru: EHT RU, from &enum nl80211_eht_ru_alloc * @eht.gi: EHT GI, from &enum nl80211_eht_gi * @rx_flags: internal RX flags for mac80211 * @ampdu_reference: A-MPDU reference number, must be a different value for * each A-MPDU but the same for each subframe within one A-MPDU * @zero_length_psdu_type: radiotap type of the 0-length PSDU * @link_valid: if the link which is identified by @link_id is valid. This flag * is set only when connection is MLO. * @link_id: id of the link used to receive the packet. This is used along with * @link_valid. */ struct ieee80211_rx_status { u64 mactime; union { u64 boottime_ns; ktime_t ack_tx_hwtstamp; }; u32 device_timestamp; u32 ampdu_reference; u32 flag; u16 freq: 13, freq_offset: 1; u8 enc_flags; u8 encoding:3, bw:4; union { struct { u8 he_ru:3; u8 he_gi:2; u8 he_dcm:1; }; struct { u8 ru:4; u8 gi:2; } eht; }; u8 rate_idx; u8 nss; u8 rx_flags; u8 band; u8 antenna; s8 signal; u8 chains; s8 chain_signal[IEEE80211_MAX_CHAINS]; u8 zero_length_psdu_type; u8 link_valid:1, link_id:4; }; static inline u32 ieee80211_rx_status_to_khz(struct ieee80211_rx_status *rx_status) { return MHZ_TO_KHZ(rx_status->freq) + (rx_status->freq_offset ? 500 : 0); } /** * enum ieee80211_conf_flags - configuration flags * * Flags to define PHY configuration options * * @IEEE80211_CONF_MONITOR: there's a monitor interface present -- use this * to determine for example whether to calculate timestamps for packets * or not, do not use instead of filter flags! * @IEEE80211_CONF_PS: Enable 802.11 power save mode (managed mode only). * This is the power save mode defined by IEEE 802.11-2007 section 11.2, * meaning that the hardware still wakes up for beacons, is able to * transmit frames and receive the possible acknowledgment frames. * Not to be confused with hardware specific wakeup/sleep states, * driver is responsible for that. See the section "Powersave support" * for more. * @IEEE80211_CONF_IDLE: The device is running, but idle; if the flag is set * the driver should be prepared to handle configuration requests but * may turn the device off as much as possible. Typically, this flag will * be set when an interface is set UP but not associated or scanning, but * it can also be unset in that case when monitor interfaces are active. * @IEEE80211_CONF_OFFCHANNEL: The device is currently not on its main * operating channel. */ enum ieee80211_conf_flags { IEEE80211_CONF_MONITOR = (1<<0), IEEE80211_CONF_PS = (1<<1), IEEE80211_CONF_IDLE = (1<<2), IEEE80211_CONF_OFFCHANNEL = (1<<3), }; /** * enum ieee80211_conf_changed - denotes which configuration changed * * @IEEE80211_CONF_CHANGE_LISTEN_INTERVAL: the listen interval changed * @IEEE80211_CONF_CHANGE_MONITOR: the monitor flag changed * @IEEE80211_CONF_CHANGE_PS: the PS flag or dynamic PS timeout changed * @IEEE80211_CONF_CHANGE_POWER: the TX power changed * @IEEE80211_CONF_CHANGE_CHANNEL: the channel/channel_type changed * @IEEE80211_CONF_CHANGE_RETRY_LIMITS: retry limits changed * @IEEE80211_CONF_CHANGE_IDLE: Idle flag changed * @IEEE80211_CONF_CHANGE_SMPS: Spatial multiplexing powersave mode changed * Note that this is only valid if channel contexts are not used, * otherwise each channel context has the number of chains listed. */ enum ieee80211_conf_changed { IEEE80211_CONF_CHANGE_SMPS = BIT(1), IEEE80211_CONF_CHANGE_LISTEN_INTERVAL = BIT(2), IEEE80211_CONF_CHANGE_MONITOR = BIT(3), IEEE80211_CONF_CHANGE_PS = BIT(4), IEEE80211_CONF_CHANGE_POWER = BIT(5), IEEE80211_CONF_CHANGE_CHANNEL = BIT(6), IEEE80211_CONF_CHANGE_RETRY_LIMITS = BIT(7), IEEE80211_CONF_CHANGE_IDLE = BIT(8), }; /** * enum ieee80211_smps_mode - spatial multiplexing power save mode * * @IEEE80211_SMPS_AUTOMATIC: automatic * @IEEE80211_SMPS_OFF: off * @IEEE80211_SMPS_STATIC: static * @IEEE80211_SMPS_DYNAMIC: dynamic * @IEEE80211_SMPS_NUM_MODES: internal, don't use */ enum ieee80211_smps_mode { IEEE80211_SMPS_AUTOMATIC, IEEE80211_SMPS_OFF, IEEE80211_SMPS_STATIC, IEEE80211_SMPS_DYNAMIC, /* keep last */ IEEE80211_SMPS_NUM_MODES, }; /** * struct ieee80211_conf - configuration of the device * * This struct indicates how the driver shall configure the hardware. * * @flags: configuration flags defined above * * @listen_interval: listen interval in units of beacon interval * @ps_dtim_period: The DTIM period of the AP we're connected to, for use * in power saving. Power saving will not be enabled until a beacon * has been received and the DTIM period is known. * @dynamic_ps_timeout: The dynamic powersave timeout (in ms), see the * powersave documentation below. This variable is valid only when * the CONF_PS flag is set. * * @power_level: requested transmit power (in dBm), backward compatibility * value only that is set to the minimum of all interfaces * * @chandef: the channel definition to tune to * @radar_enabled: whether radar detection is enabled * * @long_frame_max_tx_count: Maximum number of transmissions for a "long" frame * (a frame not RTS protected), called "dot11LongRetryLimit" in 802.11, * but actually means the number of transmissions not the number of retries * @short_frame_max_tx_count: Maximum number of transmissions for a "short" * frame, called "dot11ShortRetryLimit" in 802.11, but actually means the * number of transmissions not the number of retries * * @smps_mode: spatial multiplexing powersave mode; note that * %IEEE80211_SMPS_STATIC is used when the device is not * configured for an HT channel. * Note that this is only valid if channel contexts are not used, * otherwise each channel context has the number of chains listed. */ struct ieee80211_conf { u32 flags; int power_level, dynamic_ps_timeout; u16 listen_interval; u8 ps_dtim_period; u8 long_frame_max_tx_count, short_frame_max_tx_count; struct cfg80211_chan_def chandef; bool radar_enabled; enum ieee80211_smps_mode smps_mode; }; /** * struct ieee80211_channel_switch - holds the channel switch data * * The information provided in this structure is required for channel switch * operation. * * @timestamp: value in microseconds of the 64-bit Time Synchronization * Function (TSF) timer when the frame containing the channel switch * announcement was received. This is simply the rx.mactime parameter * the driver passed into mac80211. * @device_timestamp: arbitrary timestamp for the device, this is the * rx.device_timestamp parameter the driver passed to mac80211. * @block_tx: Indicates whether transmission must be blocked before the * scheduled channel switch, as indicated by the AP. * @chandef: the new channel to switch to * @count: the number of TBTT's until the channel switch event * @delay: maximum delay between the time the AP transmitted the last beacon in * current channel and the expected time of the first beacon in the new * channel, expressed in TU. * @link_id: the link ID of the link doing the channel switch, 0 for non-MLO */ struct ieee80211_channel_switch { u64 timestamp; u32 device_timestamp; bool block_tx; struct cfg80211_chan_def chandef; u8 count; u8 link_id; u32 delay; }; /** * enum ieee80211_vif_flags - virtual interface flags * * @IEEE80211_VIF_BEACON_FILTER: the device performs beacon filtering * on this virtual interface to avoid unnecessary CPU wakeups * @IEEE80211_VIF_SUPPORTS_CQM_RSSI: the device can do connection quality * monitoring on this virtual interface -- i.e. it can monitor * connection quality related parameters, such as the RSSI level and * provide notifications if configured trigger levels are reached. * @IEEE80211_VIF_SUPPORTS_UAPSD: The device can do U-APSD for this * interface. This flag should be set during interface addition, * but may be set/cleared as late as authentication to an AP. It is * only valid for managed/station mode interfaces. * @IEEE80211_VIF_GET_NOA_UPDATE: request to handle NOA attributes * and send P2P_PS notification to the driver if NOA changed, even * this is not pure P2P vif. * @IEEE80211_VIF_EML_ACTIVE: The driver indicates that EML operation is * enabled for the interface. * @IEEE80211_VIF_IGNORE_OFDMA_WIDER_BW: Ignore wider bandwidth OFDMA * operation on this interface and request a channel context without * the AP definition. Use this e.g. because the device is able to * handle OFDMA (downlink and trigger for uplink) on a per-AP basis. * @IEEE80211_VIF_REMOVE_AP_AFTER_DISASSOC: indicates that the AP sta should * be removed only after setting the vif as unassociated, and not the * opposite. Only relevant for STA vifs. */ enum ieee80211_vif_flags { IEEE80211_VIF_BEACON_FILTER = BIT(0), IEEE80211_VIF_SUPPORTS_CQM_RSSI = BIT(1), IEEE80211_VIF_SUPPORTS_UAPSD = BIT(2), IEEE80211_VIF_GET_NOA_UPDATE = BIT(3), IEEE80211_VIF_EML_ACTIVE = BIT(4), IEEE80211_VIF_IGNORE_OFDMA_WIDER_BW = BIT(5), IEEE80211_VIF_REMOVE_AP_AFTER_DISASSOC = BIT(6), }; /** * enum ieee80211_offload_flags - virtual interface offload flags * * @IEEE80211_OFFLOAD_ENCAP_ENABLED: tx encapsulation offload is enabled * The driver supports sending frames passed as 802.3 frames by mac80211. * It must also support sending 802.11 packets for the same interface. * @IEEE80211_OFFLOAD_ENCAP_4ADDR: support 4-address mode encapsulation offload * @IEEE80211_OFFLOAD_DECAP_ENABLED: rx encapsulation offload is enabled * The driver supports passing received 802.11 frames as 802.3 frames to * mac80211. */ enum ieee80211_offload_flags { IEEE80211_OFFLOAD_ENCAP_ENABLED = BIT(0), IEEE80211_OFFLOAD_ENCAP_4ADDR = BIT(1), IEEE80211_OFFLOAD_DECAP_ENABLED = BIT(2), }; /** * struct ieee80211_vif_cfg - interface configuration * @assoc: association status * @ibss_joined: indicates whether this station is part of an IBSS or not * @ibss_creator: indicates if a new IBSS network is being created * @ps: power-save mode (STA only). This flag is NOT affected by * offchannel/dynamic_ps operations. * @aid: association ID number, valid only when @assoc is true * @eml_cap: EML capabilities as described in P802.11be_D4.1 Figure 9-1001j. * @eml_med_sync_delay: Medium Synchronization delay as described in * P802.11be_D4.1 Figure 9-1001i. * @mld_capa_op: MLD Capabilities and Operations per P802.11be_D4.1 * Figure 9-1001k * @arp_addr_list: List of IPv4 addresses for hardware ARP filtering. The * may filter ARP queries targeted for other addresses than listed here. * The driver must allow ARP queries targeted for all address listed here * to pass through. An empty list implies no ARP queries need to pass. * @arp_addr_cnt: Number of addresses currently on the list. Note that this * may be larger than %IEEE80211_BSS_ARP_ADDR_LIST_LEN (the arp_addr_list * array size), it's up to the driver what to do in that case. * @ssid: The SSID of the current vif. Valid in AP and IBSS mode. * @ssid_len: Length of SSID given in @ssid. * @s1g: BSS is S1G BSS (affects Association Request format). * @idle: This interface is idle. There's also a global idle flag in the * hardware config which may be more appropriate depending on what * your driver/device needs to do. * @ap_addr: AP MLD address, or BSSID for non-MLO connections * (station mode only) */ struct ieee80211_vif_cfg { /* association related data */ bool assoc, ibss_joined; bool ibss_creator; bool ps; u16 aid; u16 eml_cap; u16 eml_med_sync_delay; u16 mld_capa_op; __be32 arp_addr_list[IEEE80211_BSS_ARP_ADDR_LIST_LEN]; int arp_addr_cnt; u8 ssid[IEEE80211_MAX_SSID_LEN]; size_t ssid_len; bool s1g; bool idle; u8 ap_addr[ETH_ALEN] __aligned(2); }; #define IEEE80211_TTLM_NUM_TIDS 8 /** * struct ieee80211_neg_ttlm - negotiated TID to link map info * * @downlink: bitmap of active links per TID for downlink, or 0 if mapping for * this TID is not included. * @uplink: bitmap of active links per TID for uplink, or 0 if mapping for this * TID is not included. * @valid: info is valid or not. */ struct ieee80211_neg_ttlm { u16 downlink[IEEE80211_TTLM_NUM_TIDS]; u16 uplink[IEEE80211_TTLM_NUM_TIDS]; bool valid; }; /** * enum ieee80211_neg_ttlm_res - return value for negotiated TTLM handling * @NEG_TTLM_RES_ACCEPT: accept the request * @NEG_TTLM_RES_REJECT: reject the request * @NEG_TTLM_RES_SUGGEST_PREFERRED: reject and suggest a new mapping */ enum ieee80211_neg_ttlm_res { NEG_TTLM_RES_ACCEPT, NEG_TTLM_RES_REJECT, NEG_TTLM_RES_SUGGEST_PREFERRED }; /** * struct ieee80211_vif - per-interface data * * Data in this structure is continually present for driver * use during the life of a virtual interface. * * @type: type of this virtual interface * @cfg: vif configuration, see &struct ieee80211_vif_cfg * @bss_conf: BSS configuration for this interface, either our own * or the BSS we're associated to * @link_conf: in case of MLD, the per-link BSS configuration, * indexed by link ID * @valid_links: bitmap of valid links, or 0 for non-MLO. * @active_links: The bitmap of active links, or 0 for non-MLO. * The driver shouldn't change this directly, but use the * API calls meant for that purpose. * @dormant_links: subset of the valid links that are disabled/suspended * due to advertised or negotiated TTLM respectively. * 0 for non-MLO. * @suspended_links: subset of dormant_links representing links that are * suspended due to negotiated TTLM, and could be activated in the * future by tearing down the TTLM negotiation. * 0 for non-MLO. * @neg_ttlm: negotiated TID to link mapping info. * see &struct ieee80211_neg_ttlm. * @addr: address of this interface * @addr_valid: indicates if the address is actively used. Set to false for * passive monitor interfaces, true in all other cases. * @p2p: indicates whether this AP or STA interface is a p2p * interface, i.e. a GO or p2p-sta respectively * @netdev_features: tx netdev features supported by the hardware for this * vif. mac80211 initializes this to hw->netdev_features, and the driver * can mask out specific tx features. mac80211 will handle software fixup * for masked offloads (GSO, CSUM) * @driver_flags: flags/capabilities the driver has for this interface, * these need to be set (or cleared) when the interface is added * or, if supported by the driver, the interface type is changed * at runtime, mac80211 will never touch this field * @offload_flags: hardware offload capabilities/flags for this interface. * These are initialized by mac80211 before calling .add_interface, * .change_interface or .update_vif_offload and updated by the driver * within these ops, based on supported features or runtime change * restrictions. * @hw_queue: hardware queue for each AC * @cab_queue: content-after-beacon (DTIM beacon really) queue, AP mode only * @debugfs_dir: debugfs dentry, can be used by drivers to create own per * interface debug files. Note that it will be NULL for the virtual * monitor interface (if that is requested.) * @probe_req_reg: probe requests should be reported to mac80211 for this * interface. * @rx_mcast_action_reg: multicast Action frames should be reported to mac80211 * for this interface. * @drv_priv: data area for driver use, will always be aligned to * sizeof(void \*). * @txq: the multicast data TX queue * @offload_flags: 802.3 -> 802.11 enapsulation offload flags, see * &enum ieee80211_offload_flags. * @mbssid_tx_vif: Pointer to the transmitting interface if MBSSID is enabled. */ struct ieee80211_vif { enum nl80211_iftype type; struct ieee80211_vif_cfg cfg; struct ieee80211_bss_conf bss_conf; struct ieee80211_bss_conf __rcu *link_conf[IEEE80211_MLD_MAX_NUM_LINKS]; u16 valid_links, active_links, dormant_links, suspended_links; struct ieee80211_neg_ttlm neg_ttlm; u8 addr[ETH_ALEN] __aligned(2); bool addr_valid; bool p2p; u8 cab_queue; u8 hw_queue[IEEE80211_NUM_ACS]; struct ieee80211_txq *txq; netdev_features_t netdev_features; u32 driver_flags; u32 offload_flags; #ifdef CONFIG_MAC80211_DEBUGFS struct dentry *debugfs_dir; #endif bool probe_req_reg; bool rx_mcast_action_reg; struct ieee80211_vif *mbssid_tx_vif; /* must be last */ u8 drv_priv[] __aligned(sizeof(void *)); }; /** * ieee80211_vif_usable_links - Return the usable links for the vif * @vif: the vif for which the usable links are requested * Return: the usable link bitmap */ static inline u16 ieee80211_vif_usable_links(const struct ieee80211_vif *vif) { return vif->valid_links & ~vif->dormant_links; } /** * ieee80211_vif_is_mld - Returns true iff the vif is an MLD one * @vif: the vif * Return: %true if the vif is an MLD, %false otherwise. */ static inline bool ieee80211_vif_is_mld(const struct ieee80211_vif *vif) { /* valid_links != 0 indicates this vif is an MLD */ return vif->valid_links != 0; } /** * ieee80211_vif_link_active - check if a given link is active * @vif: the vif * @link_id: the link ID to check * Return: %true if the vif is an MLD and the link is active, or if * the vif is not an MLD and the link ID is 0; %false otherwise. */ static inline bool ieee80211_vif_link_active(const struct ieee80211_vif *vif, unsigned int link_id) { if (!ieee80211_vif_is_mld(vif)) return link_id == 0; return vif->active_links & BIT(link_id); } #define for_each_vif_active_link(vif, link, link_id) \ for (link_id = 0; link_id < ARRAY_SIZE((vif)->link_conf); link_id++) \ if ((!(vif)->active_links || \ (vif)->active_links & BIT(link_id)) && \ (link = link_conf_dereference_check(vif, link_id))) static inline bool ieee80211_vif_is_mesh(struct ieee80211_vif *vif) { #ifdef CONFIG_MAC80211_MESH return vif->type == NL80211_IFTYPE_MESH_POINT; #endif return false; } /** * wdev_to_ieee80211_vif - return a vif struct from a wdev * @wdev: the wdev to get the vif for * * This can be used by mac80211 drivers with direct cfg80211 APIs * (like the vendor commands) that get a wdev. * * Return: pointer to the wdev, or %NULL if the given wdev isn't * associated with a vif that the driver knows about (e.g. monitor * or AP_VLAN interfaces.) */ struct ieee80211_vif *wdev_to_ieee80211_vif(struct wireless_dev *wdev); /** * ieee80211_vif_to_wdev - return a wdev struct from a vif * @vif: the vif to get the wdev for * * This can be used by mac80211 drivers with direct cfg80211 APIs * (like the vendor commands) that needs to get the wdev for a vif. * This can also be useful to get the netdev associated to a vif. * * Return: pointer to the wdev */ struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif); static inline bool lockdep_vif_wiphy_mutex_held(struct ieee80211_vif *vif) { return lockdep_is_held(&ieee80211_vif_to_wdev(vif)->wiphy->mtx); } #define link_conf_dereference_protected(vif, link_id) \ rcu_dereference_protected((vif)->link_conf[link_id], \ lockdep_vif_wiphy_mutex_held(vif)) #define link_conf_dereference_check(vif, link_id) \ rcu_dereference_check((vif)->link_conf[link_id], \ lockdep_vif_wiphy_mutex_held(vif)) /** * enum ieee80211_key_flags - key flags * * These flags are used for communication about keys between the driver * and mac80211, with the @flags parameter of &struct ieee80211_key_conf. * * @IEEE80211_KEY_FLAG_GENERATE_IV: This flag should be set by the * driver to indicate that it requires IV generation for this * particular key. Setting this flag does not necessarily mean that SKBs * will have sufficient tailroom for ICV or MIC. * @IEEE80211_KEY_FLAG_GENERATE_MMIC: This flag should be set by * the driver for a TKIP key if it requires Michael MIC * generation in software. * @IEEE80211_KEY_FLAG_PAIRWISE: Set by mac80211, this flag indicates * that the key is pairwise rather then a shared key. * @IEEE80211_KEY_FLAG_SW_MGMT_TX: This flag should be set by the driver for a * CCMP/GCMP key if it requires CCMP/GCMP encryption of management frames * (MFP) to be done in software. * @IEEE80211_KEY_FLAG_PUT_IV_SPACE: This flag should be set by the driver * if space should be prepared for the IV, but the IV * itself should not be generated. Do not set together with * @IEEE80211_KEY_FLAG_GENERATE_IV on the same key. Setting this flag does * not necessarily mean that SKBs will have sufficient tailroom for ICV or * MIC. * @IEEE80211_KEY_FLAG_RX_MGMT: This key will be used to decrypt received * management frames. The flag can help drivers that have a hardware * crypto implementation that doesn't deal with management frames * properly by allowing them to not upload the keys to hardware and * fall back to software crypto. Note that this flag deals only with * RX, if your crypto engine can't deal with TX you can also set the * %IEEE80211_KEY_FLAG_SW_MGMT_TX flag to encrypt such frames in SW. * @IEEE80211_KEY_FLAG_GENERATE_IV_MGMT: This flag should be set by the * driver for a CCMP/GCMP key to indicate that is requires IV generation * only for management frames (MFP). * @IEEE80211_KEY_FLAG_RESERVE_TAILROOM: This flag should be set by the * driver for a key to indicate that sufficient tailroom must always * be reserved for ICV or MIC, even when HW encryption is enabled. * @IEEE80211_KEY_FLAG_PUT_MIC_SPACE: This flag should be set by the driver for * a TKIP key if it only requires MIC space. Do not set together with * @IEEE80211_KEY_FLAG_GENERATE_MMIC on the same key. * @IEEE80211_KEY_FLAG_NO_AUTO_TX: Key needs explicit Tx activation. * @IEEE80211_KEY_FLAG_GENERATE_MMIE: This flag should be set by the driver * for a AES_CMAC or a AES_GMAC key to indicate that it requires sequence * number generation only * @IEEE80211_KEY_FLAG_SPP_AMSDU: SPP A-MSDUs can be used with this key * (set by mac80211 from the sta->spp_amsdu flag) */ enum ieee80211_key_flags { IEEE80211_KEY_FLAG_GENERATE_IV_MGMT = BIT(0), IEEE80211_KEY_FLAG_GENERATE_IV = BIT(1), IEEE80211_KEY_FLAG_GENERATE_MMIC = BIT(2), IEEE80211_KEY_FLAG_PAIRWISE = BIT(3), IEEE80211_KEY_FLAG_SW_MGMT_TX = BIT(4), IEEE80211_KEY_FLAG_PUT_IV_SPACE = BIT(5), IEEE80211_KEY_FLAG_RX_MGMT = BIT(6), IEEE80211_KEY_FLAG_RESERVE_TAILROOM = BIT(7), IEEE80211_KEY_FLAG_PUT_MIC_SPACE = BIT(8), IEEE80211_KEY_FLAG_NO_AUTO_TX = BIT(9), IEEE80211_KEY_FLAG_GENERATE_MMIE = BIT(10), IEEE80211_KEY_FLAG_SPP_AMSDU = BIT(11), }; /** * struct ieee80211_key_conf - key information * * This key information is given by mac80211 to the driver by * the set_key() callback in &struct ieee80211_ops. * * @hw_key_idx: To be set by the driver, this is the key index the driver * wants to be given when a frame is transmitted and needs to be * encrypted in hardware. * @cipher: The key's cipher suite selector. * @tx_pn: PN used for TX keys, may be used by the driver as well if it * needs to do software PN assignment by itself (e.g. due to TSO) * @flags: key flags, see &enum ieee80211_key_flags. * @keyidx: the key index (0-7) * @keylen: key material length * @key: key material. For ALG_TKIP the key is encoded as a 256-bit (32 byte) * data block: * - Temporal Encryption Key (128 bits) * - Temporal Authenticator Tx MIC Key (64 bits) * - Temporal Authenticator Rx MIC Key (64 bits) * @icv_len: The ICV length for this key type * @iv_len: The IV length for this key type * @link_id: the link ID, 0 for non-MLO, or -1 for pairwise keys */ struct ieee80211_key_conf { atomic64_t tx_pn; u32 cipher; u8 icv_len; u8 iv_len; u8 hw_key_idx; s8 keyidx; u16 flags; s8 link_id; u8 keylen; u8 key[]; }; #define IEEE80211_MAX_PN_LEN 16 #define TKIP_PN_TO_IV16(pn) ((u16)(pn & 0xffff)) #define TKIP_PN_TO_IV32(pn) ((u32)((pn >> 16) & 0xffffffff)) /** * struct ieee80211_key_seq - key sequence counter * * @tkip: TKIP data, containing IV32 and IV16 in host byte order * @ccmp: PN data, most significant byte first (big endian, * reverse order than in packet) * @aes_cmac: PN data, most significant byte first (big endian, * reverse order than in packet) * @aes_gmac: PN data, most significant byte first (big endian, * reverse order than in packet) * @gcmp: PN data, most significant byte first (big endian, * reverse order than in packet) * @hw: data for HW-only (e.g. cipher scheme) keys */ struct ieee80211_key_seq { union { struct { u32 iv32; u16 iv16; } tkip; struct { u8 pn[6]; } ccmp; struct { u8 pn[6]; } aes_cmac; struct { u8 pn[6]; } aes_gmac; struct { u8 pn[6]; } gcmp; struct { u8 seq[IEEE80211_MAX_PN_LEN]; u8 seq_len; } hw; }; }; /** * enum set_key_cmd - key command * * Used with the set_key() callback in &struct ieee80211_ops, this * indicates whether a key is being removed or added. * * @SET_KEY: a key is set * @DISABLE_KEY: a key must be disabled */ enum set_key_cmd { SET_KEY, DISABLE_KEY, }; /** * enum ieee80211_sta_state - station state * * @IEEE80211_STA_NOTEXIST: station doesn't exist at all, * this is a special state for add/remove transitions * @IEEE80211_STA_NONE: station exists without special state * @IEEE80211_STA_AUTH: station is authenticated * @IEEE80211_STA_ASSOC: station is associated * @IEEE80211_STA_AUTHORIZED: station is authorized (802.1X) */ enum ieee80211_sta_state { /* NOTE: These need to be ordered correctly! */ IEEE80211_STA_NOTEXIST, IEEE80211_STA_NONE, IEEE80211_STA_AUTH, IEEE80211_STA_ASSOC, IEEE80211_STA_AUTHORIZED, }; /** * enum ieee80211_sta_rx_bandwidth - station RX bandwidth * @IEEE80211_STA_RX_BW_20: station can only receive 20 MHz * @IEEE80211_STA_RX_BW_40: station can receive up to 40 MHz * @IEEE80211_STA_RX_BW_80: station can receive up to 80 MHz * @IEEE80211_STA_RX_BW_160: station can receive up to 160 MHz * (including 80+80 MHz) * @IEEE80211_STA_RX_BW_320: station can receive up to 320 MHz * * Implementation note: 20 must be zero to be initialized * correctly, the values must be sorted. */ enum ieee80211_sta_rx_bandwidth { IEEE80211_STA_RX_BW_20 = 0, IEEE80211_STA_RX_BW_40, IEEE80211_STA_RX_BW_80, IEEE80211_STA_RX_BW_160, IEEE80211_STA_RX_BW_320, }; #define IEEE80211_STA_RX_BW_MAX IEEE80211_STA_RX_BW_320 /** * struct ieee80211_sta_rates - station rate selection table * * @rcu_head: RCU head used for freeing the table on update * @rate: transmit rates/flags to be used by default. * Overriding entries per-packet is possible by using cb tx control. */ struct ieee80211_sta_rates { struct rcu_head rcu_head; struct { s8 idx; u8 count; u8 count_cts; u8 count_rts; u16 flags; } rate[IEEE80211_TX_RATE_TABLE_SIZE]; }; /** * struct ieee80211_sta_txpwr - station txpower configuration * * Used to configure txpower for station. * * @power: indicates the tx power, in dBm, to be used when sending data frames * to the STA. * @type: In particular if TPC %type is NL80211_TX_POWER_LIMITED then tx power * will be less than or equal to specified from userspace, whereas if TPC * %type is NL80211_TX_POWER_AUTOMATIC then it indicates default tx power. * NL80211_TX_POWER_FIXED is not a valid configuration option for * per peer TPC. */ struct ieee80211_sta_txpwr { s16 power; enum nl80211_tx_power_setting type; }; /** * struct ieee80211_sta_aggregates - info that is aggregated from active links * * Used for any per-link data that needs to be aggregated and updated in the * main &struct ieee80211_sta when updated or the active links change. * * @max_amsdu_len: indicates the maximal length of an A-MSDU in bytes. * This field is always valid for packets with a VHT preamble. * For packets with a HT preamble, additional limits apply: * * * If the skb is transmitted as part of a BA agreement, the * A-MSDU maximal size is min(max_amsdu_len, 4065) bytes. * * If the skb is not part of a BA agreement, the A-MSDU maximal * size is min(max_amsdu_len, 7935) bytes. * * Both additional HT limits must be enforced by the low level * driver. This is defined by the spec (IEEE 802.11-2012 section * 8.3.2.2 NOTE 2). * @max_rc_amsdu_len: Maximum A-MSDU size in bytes recommended by rate control. * @max_tid_amsdu_len: Maximum A-MSDU size in bytes for this TID */ struct ieee80211_sta_aggregates { u16 max_amsdu_len; u16 max_rc_amsdu_len; u16 max_tid_amsdu_len[IEEE80211_NUM_TIDS]; }; /** * struct ieee80211_link_sta - station Link specific info * All link specific info for a STA link for a non MLD STA(single) * or a MLD STA(multiple entries) are stored here. * * @sta: reference to owning STA * @addr: MAC address of the Link STA. For non-MLO STA this is same as the addr * in ieee80211_sta. For MLO Link STA this addr can be same or different * from addr in ieee80211_sta (representing MLD STA addr) * @link_id: the link ID for this link STA (0 for deflink) * @smps_mode: current SMPS mode (off, static or dynamic) * @supp_rates: Bitmap of supported rates * @ht_cap: HT capabilities of this STA; restricted to our own capabilities * @vht_cap: VHT capabilities of this STA; restricted to our own capabilities * @he_cap: HE capabilities of this STA * @he_6ghz_capa: on 6 GHz, holds the HE 6 GHz band capabilities * @eht_cap: EHT capabilities of this STA * @agg: per-link data for multi-link aggregation * @bandwidth: current bandwidth the station can receive with * @rx_nss: in HT/VHT, the maximum number of spatial streams the * station can receive at the moment, changed by operating mode * notifications and capabilities. The value is only valid after * the station moves to associated state. * @txpwr: the station tx power configuration * */ struct ieee80211_link_sta { struct ieee80211_sta *sta; u8 addr[ETH_ALEN]; u8 link_id; enum ieee80211_smps_mode smps_mode; u32 supp_rates[NUM_NL80211_BANDS]; struct ieee80211_sta_ht_cap ht_cap; struct ieee80211_sta_vht_cap vht_cap; struct ieee80211_sta_he_cap he_cap; struct ieee80211_he_6ghz_capa he_6ghz_capa; struct ieee80211_sta_eht_cap eht_cap; struct ieee80211_sta_aggregates agg; u8 rx_nss; enum ieee80211_sta_rx_bandwidth bandwidth; struct ieee80211_sta_txpwr txpwr; }; /** * struct ieee80211_sta - station table entry * * A station table entry represents a station we are possibly * communicating with. Since stations are RCU-managed in * mac80211, any ieee80211_sta pointer you get access to must * either be protected by rcu_read_lock() explicitly or implicitly, * or you must take good care to not use such a pointer after a * call to your sta_remove callback that removed it. * This also represents the MLD STA in case of MLO association * and holds pointers to various link STA's * * @addr: MAC address * @aid: AID we assigned to the station if we're an AP * @max_rx_aggregation_subframes: maximal amount of frames in a single AMPDU * that this station is allowed to transmit to us. * Can be modified by driver. * @wme: indicates whether the STA supports QoS/WME (if local devices does, * otherwise always false) * @drv_priv: data area for driver use, will always be aligned to * sizeof(void \*), size is determined in hw information. * @uapsd_queues: bitmap of queues configured for uapsd. Only valid * if wme is supported. The bits order is like in * IEEE80211_WMM_IE_STA_QOSINFO_AC_*. * @max_sp: max Service Period. Only valid if wme is supported. * @rates: rate control selection table * @tdls: indicates whether the STA is a TDLS peer * @tdls_initiator: indicates the STA is an initiator of the TDLS link. Only * valid if the STA is a TDLS peer in the first place. * @mfp: indicates whether the STA uses management frame protection or not. * @mlo: indicates whether the STA is MLO station. * @max_amsdu_subframes: indicates the maximal number of MSDUs in a single * A-MSDU. Taken from the Extended Capabilities element. 0 means * unlimited. * @cur: currently valid data as aggregated from the active links * For non MLO STA it will point to the deflink data. For MLO STA * ieee80211_sta_recalc_aggregates() must be called to update it. * @support_p2p_ps: indicates whether the STA supports P2P PS mechanism or not. * @txq: per-TID data TX queues; note that the last entry (%IEEE80211_NUM_TIDS) * is used for non-data frames * @deflink: This holds the default link STA information, for non MLO STA all link * specific STA information is accessed through @deflink or through * link[0] which points to address of @deflink. For MLO Link STA * the first added link STA will point to deflink. * @link: reference to Link Sta entries. For Non MLO STA, except 1st link, * i.e link[0] all links would be assigned to NULL by default and * would access link information via @deflink or link[0]. For MLO * STA, first link STA being added will point its link pointer to * @deflink address and remaining would be allocated and the address * would be assigned to link[link_id] where link_id is the id assigned * by the AP. * @valid_links: bitmap of valid links, or 0 for non-MLO * @spp_amsdu: indicates whether the STA uses SPP A-MSDU or not. */ struct ieee80211_sta { u8 addr[ETH_ALEN] __aligned(2); u16 aid; u16 max_rx_aggregation_subframes; bool wme; u8 uapsd_queues; u8 max_sp; struct ieee80211_sta_rates __rcu *rates; bool tdls; bool tdls_initiator; bool mfp; bool mlo; bool spp_amsdu; u8 max_amsdu_subframes; struct ieee80211_sta_aggregates *cur; bool support_p2p_ps; struct ieee80211_txq *txq[IEEE80211_NUM_TIDS + 1]; u16 valid_links; struct ieee80211_link_sta deflink; struct ieee80211_link_sta __rcu *link[IEEE80211_MLD_MAX_NUM_LINKS]; /* must be last */ u8 drv_priv[] __aligned(sizeof(void *)); }; #ifdef CONFIG_LOCKDEP bool lockdep_sta_mutex_held(struct ieee80211_sta *pubsta); #else static inline bool lockdep_sta_mutex_held(struct ieee80211_sta *pubsta) { return true; } #endif #define link_sta_dereference_protected(sta, link_id) \ rcu_dereference_protected((sta)->link[link_id], \ lockdep_sta_mutex_held(sta)) #define link_sta_dereference_check(sta, link_id) \ rcu_dereference_check((sta)->link[link_id], \ lockdep_sta_mutex_held(sta)) #define for_each_sta_active_link(vif, sta, link_sta, link_id) \ for (link_id = 0; link_id < ARRAY_SIZE((sta)->link); link_id++) \ if ((!(vif)->active_links || \ (vif)->active_links & BIT(link_id)) && \ ((link_sta) = link_sta_dereference_check(sta, link_id))) /** * enum sta_notify_cmd - sta notify command * * Used with the sta_notify() callback in &struct ieee80211_ops, this * indicates if an associated station made a power state transition. * * @STA_NOTIFY_SLEEP: a station is now sleeping * @STA_NOTIFY_AWAKE: a sleeping station woke up */ enum sta_notify_cmd { STA_NOTIFY_SLEEP, STA_NOTIFY_AWAKE, }; /** * struct ieee80211_tx_control - TX control data * * @sta: station table entry, this sta pointer may be NULL and * it is not allowed to copy the pointer, due to RCU. */ struct ieee80211_tx_control { struct ieee80211_sta *sta; }; /** * struct ieee80211_txq - Software intermediate tx queue * * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @sta: station table entry, %NULL for per-vif queue * @tid: the TID for this queue (unused for per-vif queue), * %IEEE80211_NUM_TIDS for non-data (if enabled) * @ac: the AC for this queue * @drv_priv: driver private area, sized by hw->txq_data_size * * The driver can obtain packets from this queue by calling * ieee80211_tx_dequeue(). */ struct ieee80211_txq { struct ieee80211_vif *vif; struct ieee80211_sta *sta; u8 tid; u8 ac; /* must be last */ u8 drv_priv[] __aligned(sizeof(void *)); }; /** * enum ieee80211_hw_flags - hardware flags * * These flags are used to indicate hardware capabilities to * the stack. Generally, flags here should have their meaning * done in a way that the simplest hardware doesn't need setting * any particular flags. There are some exceptions to this rule, * however, so you are advised to review these flags carefully. * * @IEEE80211_HW_HAS_RATE_CONTROL: * The hardware or firmware includes rate control, and cannot be * controlled by the stack. As such, no rate control algorithm * should be instantiated, and the TX rate reported to userspace * will be taken from the TX status instead of the rate control * algorithm. * Note that this requires that the driver implement a number of * callbacks so it has the correct information, it needs to have * the @set_rts_threshold callback and must look at the BSS config * @use_cts_prot for G/N protection, @use_short_slot for slot * timing in 2.4 GHz and @use_short_preamble for preambles for * CCK frames. * * @IEEE80211_HW_RX_INCLUDES_FCS: * Indicates that received frames passed to the stack include * the FCS at the end. * * @IEEE80211_HW_HOST_BROADCAST_PS_BUFFERING: * Some wireless LAN chipsets buffer broadcast/multicast frames * for power saving stations in the hardware/firmware and others * rely on the host system for such buffering. This option is used * to configure the IEEE 802.11 upper layer to buffer broadcast and * multicast frames when there are power saving stations so that * the driver can fetch them with ieee80211_get_buffered_bc(). * * @IEEE80211_HW_SIGNAL_UNSPEC: * Hardware can provide signal values but we don't know its units. We * expect values between 0 and @max_signal. * If possible please provide dB or dBm instead. * * @IEEE80211_HW_SIGNAL_DBM: * Hardware gives signal values in dBm, decibel difference from * one milliwatt. This is the preferred method since it is standardized * between different devices. @max_signal does not need to be set. * * @IEEE80211_HW_SPECTRUM_MGMT: * Hardware supports spectrum management defined in 802.11h * Measurement, Channel Switch, Quieting, TPC * * @IEEE80211_HW_AMPDU_AGGREGATION: * Hardware supports 11n A-MPDU aggregation. * * @IEEE80211_HW_SUPPORTS_PS: * Hardware has power save support (i.e. can go to sleep). * * @IEEE80211_HW_PS_NULLFUNC_STACK: * Hardware requires nullfunc frame handling in stack, implies * stack support for dynamic PS. * * @IEEE80211_HW_SUPPORTS_DYNAMIC_PS: * Hardware has support for dynamic PS. * * @IEEE80211_HW_MFP_CAPABLE: * Hardware supports management frame protection (MFP, IEEE 802.11w). * * @IEEE80211_HW_REPORTS_TX_ACK_STATUS: * Hardware can provide ack status reports of Tx frames to * the stack. * * @IEEE80211_HW_CONNECTION_MONITOR: * The hardware performs its own connection monitoring, including * periodic keep-alives to the AP and probing the AP on beacon loss. * * @IEEE80211_HW_NEED_DTIM_BEFORE_ASSOC: * This device needs to get data from beacon before association (i.e. * dtim_period). * * @IEEE80211_HW_SUPPORTS_PER_STA_GTK: The device's crypto engine supports * per-station GTKs as used by IBSS RSN or during fast transition. If * the device doesn't support per-station GTKs, but can be asked not * to decrypt group addressed frames, then IBSS RSN support is still * possible but software crypto will be used. Advertise the wiphy flag * only in that case. * * @IEEE80211_HW_AP_LINK_PS: When operating in AP mode the device * autonomously manages the PS status of connected stations. When * this flag is set mac80211 will not trigger PS mode for connected * stations based on the PM bit of incoming frames. * Use ieee80211_start_ps()/ieee8021_end_ps() to manually configure * the PS mode of connected stations. * * @IEEE80211_HW_TX_AMPDU_SETUP_IN_HW: The device handles TX A-MPDU session * setup strictly in HW. mac80211 should not attempt to do this in * software. * * @IEEE80211_HW_WANT_MONITOR_VIF: The driver would like to be informed of * a virtual monitor interface when monitor interfaces are the only * active interfaces. * * @IEEE80211_HW_NO_VIRTUAL_MONITOR: The driver would like to be informed * of any monitor interface, as well as their configured channel. * This is useful for supporting multiple monitor interfaces on different * channels. * * @IEEE80211_HW_NO_AUTO_VIF: The driver would like for no wlanX to * be created. It is expected user-space will create vifs as * desired (and thus have them named as desired). * * @IEEE80211_HW_SW_CRYPTO_CONTROL: The driver wants to control which of the * crypto algorithms can be done in software - so don't automatically * try to fall back to it if hardware crypto fails, but do so only if * the driver returns 1. This also forces the driver to advertise its * supported cipher suites. * * @IEEE80211_HW_SUPPORT_FAST_XMIT: The driver/hardware supports fast-xmit, * this currently requires only the ability to calculate the duration * for frames. * * @IEEE80211_HW_QUEUE_CONTROL: The driver wants to control per-interface * queue mapping in order to use different queues (not just one per AC) * for different virtual interfaces. See the doc section on HW queue * control for more details. * * @IEEE80211_HW_SUPPORTS_RC_TABLE: The driver supports using a rate * selection table provided by the rate control algorithm. * * @IEEE80211_HW_P2P_DEV_ADDR_FOR_INTF: Use the P2P Device address for any * P2P Interface. This will be honoured even if more than one interface * is supported. * * @IEEE80211_HW_TIMING_BEACON_ONLY: Use sync timing from beacon frames * only, to allow getting TBTT of a DTIM beacon. * * @IEEE80211_HW_SUPPORTS_HT_CCK_RATES: Hardware supports mixing HT/CCK rates * and can cope with CCK rates in an aggregation session (e.g. by not * using aggregation for such frames.) * * @IEEE80211_HW_CHANCTX_STA_CSA: Support 802.11h based channel-switch (CSA) * for a single active channel while using channel contexts. When support * is not enabled the default action is to disconnect when getting the * CSA frame. * * @IEEE80211_HW_SUPPORTS_CLONED_SKBS: The driver will never modify the payload * or tailroom of TX skbs without copying them first. * * @IEEE80211_HW_SINGLE_SCAN_ON_ALL_BANDS: The HW supports scanning on all bands * in one command, mac80211 doesn't have to run separate scans per band. * * @IEEE80211_HW_TDLS_WIDER_BW: The device/driver supports wider bandwidth * than then BSS bandwidth for a TDLS link on the base channel. * * @IEEE80211_HW_SUPPORTS_AMSDU_IN_AMPDU: The driver supports receiving A-MSDUs * within A-MPDU. * * @IEEE80211_HW_BEACON_TX_STATUS: The device/driver provides TX status * for sent beacons. * * @IEEE80211_HW_NEEDS_UNIQUE_STA_ADDR: Hardware (or driver) requires that each * station has a unique address, i.e. each station entry can be identified * by just its MAC address; this prevents, for example, the same station * from connecting to two virtual AP interfaces at the same time. * * @IEEE80211_HW_SUPPORTS_REORDERING_BUFFER: Hardware (or driver) manages the * reordering buffer internally, guaranteeing mac80211 receives frames in * order and does not need to manage its own reorder buffer or BA session * timeout. * * @IEEE80211_HW_USES_RSS: The device uses RSS and thus requires parallel RX, * which implies using per-CPU station statistics. * * @IEEE80211_HW_TX_AMSDU: Hardware (or driver) supports software aggregated * A-MSDU frames. Requires software tx queueing and fast-xmit support. * When not using minstrel/minstrel_ht rate control, the driver must * limit the maximum A-MSDU size based on the current tx rate by setting * max_rc_amsdu_len in struct ieee80211_sta. * * @IEEE80211_HW_TX_FRAG_LIST: Hardware (or driver) supports sending frag_list * skbs, needed for zero-copy software A-MSDU. * * @IEEE80211_HW_REPORTS_LOW_ACK: The driver (or firmware) reports low ack event * by ieee80211_report_low_ack() based on its own algorithm. For such * drivers, mac80211 packet loss mechanism will not be triggered and driver * is completely depending on firmware event for station kickout. * * @IEEE80211_HW_SUPPORTS_TX_FRAG: Hardware does fragmentation by itself. * The stack will not do fragmentation. * The callback for @set_frag_threshold should be set as well. * * @IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA: Hardware supports buffer STA on * TDLS links. * * @IEEE80211_HW_DOESNT_SUPPORT_QOS_NDP: The driver (or firmware) doesn't * support QoS NDP for AP probing - that's most likely a driver bug. * * @IEEE80211_HW_BUFF_MMPDU_TXQ: use the TXQ for bufferable MMPDUs, this of * course requires the driver to use TXQs to start with. * * @IEEE80211_HW_SUPPORTS_VHT_EXT_NSS_BW: (Hardware) rate control supports VHT * extended NSS BW (dot11VHTExtendedNSSBWCapable). This flag will be set if * the selected rate control algorithm sets %RATE_CTRL_CAPA_VHT_EXT_NSS_BW * but if the rate control is built-in then it must be set by the driver. * See also the documentation for that flag. * * @IEEE80211_HW_STA_MMPDU_TXQ: use the extra non-TID per-station TXQ for all * MMPDUs on station interfaces. This of course requires the driver to use * TXQs to start with. * * @IEEE80211_HW_TX_STATUS_NO_AMPDU_LEN: Driver does not report accurate A-MPDU * length in tx status information * * @IEEE80211_HW_SUPPORTS_MULTI_BSSID: Hardware supports multi BSSID * * @IEEE80211_HW_SUPPORTS_ONLY_HE_MULTI_BSSID: Hardware supports multi BSSID * only for HE APs. Applies if @IEEE80211_HW_SUPPORTS_MULTI_BSSID is set. * * @IEEE80211_HW_AMPDU_KEYBORDER_SUPPORT: The card and driver is only * aggregating MPDUs with the same keyid, allowing mac80211 to keep Tx * A-MPDU sessions active while rekeying with Extended Key ID. * * @IEEE80211_HW_SUPPORTS_TX_ENCAP_OFFLOAD: Hardware supports tx encapsulation * offload * * @IEEE80211_HW_SUPPORTS_RX_DECAP_OFFLOAD: Hardware supports rx decapsulation * offload * * @IEEE80211_HW_SUPPORTS_CONC_MON_RX_DECAP: Hardware supports concurrent rx * decapsulation offload and passing raw 802.11 frames for monitor iface. * If this is supported, the driver must pass both 802.3 frames for real * usage and 802.11 frames with %RX_FLAG_ONLY_MONITOR set for monitor to * the stack. * * @IEEE80211_HW_DETECTS_COLOR_COLLISION: HW/driver has support for BSS color * collision detection and doesn't need it in software. * * @IEEE80211_HW_MLO_MCAST_MULTI_LINK_TX: Hardware/driver handles transmitting * multicast frames on all links, mac80211 should not do that. * * @IEEE80211_HW_DISALLOW_PUNCTURING: HW requires disabling puncturing in EHT * and connecting with a lower bandwidth instead * @IEEE80211_HW_DISALLOW_PUNCTURING_5GHZ: HW requires disabling puncturing in * EHT in 5 GHz and connecting with a lower bandwidth instead * * @IEEE80211_HW_HANDLES_QUIET_CSA: HW/driver handles quieting for CSA, so * no need to stop queues. This really should be set by a driver that * implements MLO, so operation can continue on other links when one * link is switching. * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { IEEE80211_HW_HAS_RATE_CONTROL, IEEE80211_HW_RX_INCLUDES_FCS, IEEE80211_HW_HOST_BROADCAST_PS_BUFFERING, IEEE80211_HW_SIGNAL_UNSPEC, IEEE80211_HW_SIGNAL_DBM, IEEE80211_HW_NEED_DTIM_BEFORE_ASSOC, IEEE80211_HW_SPECTRUM_MGMT, IEEE80211_HW_AMPDU_AGGREGATION, IEEE80211_HW_SUPPORTS_PS, IEEE80211_HW_PS_NULLFUNC_STACK, IEEE80211_HW_SUPPORTS_DYNAMIC_PS, IEEE80211_HW_MFP_CAPABLE, IEEE80211_HW_WANT_MONITOR_VIF, IEEE80211_HW_NO_VIRTUAL_MONITOR, IEEE80211_HW_NO_AUTO_VIF, IEEE80211_HW_SW_CRYPTO_CONTROL, IEEE80211_HW_SUPPORT_FAST_XMIT, IEEE80211_HW_REPORTS_TX_ACK_STATUS, IEEE80211_HW_CONNECTION_MONITOR, IEEE80211_HW_QUEUE_CONTROL, IEEE80211_HW_SUPPORTS_PER_STA_GTK, IEEE80211_HW_AP_LINK_PS, IEEE80211_HW_TX_AMPDU_SETUP_IN_HW, IEEE80211_HW_SUPPORTS_RC_TABLE, IEEE80211_HW_P2P_DEV_ADDR_FOR_INTF, IEEE80211_HW_TIMING_BEACON_ONLY, IEEE80211_HW_SUPPORTS_HT_CCK_RATES, IEEE80211_HW_CHANCTX_STA_CSA, IEEE80211_HW_SUPPORTS_CLONED_SKBS, IEEE80211_HW_SINGLE_SCAN_ON_ALL_BANDS, IEEE80211_HW_TDLS_WIDER_BW, IEEE80211_HW_SUPPORTS_AMSDU_IN_AMPDU, IEEE80211_HW_BEACON_TX_STATUS, IEEE80211_HW_NEEDS_UNIQUE_STA_ADDR, IEEE80211_HW_SUPPORTS_REORDERING_BUFFER, IEEE80211_HW_USES_RSS, IEEE80211_HW_TX_AMSDU, IEEE80211_HW_TX_FRAG_LIST, IEEE80211_HW_REPORTS_LOW_ACK, IEEE80211_HW_SUPPORTS_TX_FRAG, IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA, IEEE80211_HW_DOESNT_SUPPORT_QOS_NDP, IEEE80211_HW_BUFF_MMPDU_TXQ, IEEE80211_HW_SUPPORTS_VHT_EXT_NSS_BW, IEEE80211_HW_STA_MMPDU_TXQ, IEEE80211_HW_TX_STATUS_NO_AMPDU_LEN, IEEE80211_HW_SUPPORTS_MULTI_BSSID, IEEE80211_HW_SUPPORTS_ONLY_HE_MULTI_BSSID, IEEE80211_HW_AMPDU_KEYBORDER_SUPPORT, IEEE80211_HW_SUPPORTS_TX_ENCAP_OFFLOAD, IEEE80211_HW_SUPPORTS_RX_DECAP_OFFLOAD, IEEE80211_HW_SUPPORTS_CONC_MON_RX_DECAP, IEEE80211_HW_DETECTS_COLOR_COLLISION, IEEE80211_HW_MLO_MCAST_MULTI_LINK_TX, IEEE80211_HW_DISALLOW_PUNCTURING, IEEE80211_HW_DISALLOW_PUNCTURING_5GHZ, IEEE80211_HW_HANDLES_QUIET_CSA, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS }; /** * struct ieee80211_hw - hardware information and state * * This structure contains the configuration and hardware * information for an 802.11 PHY. * * @wiphy: This points to the &struct wiphy allocated for this * 802.11 PHY. You must fill in the @perm_addr and @dev * members of this structure using SET_IEEE80211_DEV() * and SET_IEEE80211_PERM_ADDR(). Additionally, all supported * bands (with channels, bitrates) are registered here. * * @conf: &struct ieee80211_conf, device configuration, don't use. * * @priv: pointer to private area that was allocated for driver use * along with this structure. * * @flags: hardware flags, see &enum ieee80211_hw_flags. * * @extra_tx_headroom: headroom to reserve in each transmit skb * for use by the driver (e.g. for transmit headers.) * * @extra_beacon_tailroom: tailroom to reserve in each beacon tx skb. * Can be used by drivers to add extra IEs. * * @max_signal: Maximum value for signal (rssi) in RX information, used * only when @IEEE80211_HW_SIGNAL_UNSPEC or @IEEE80211_HW_SIGNAL_DB * * @max_listen_interval: max listen interval in units of beacon interval * that HW supports * * @queues: number of available hardware transmit queues for * data packets. WMM/QoS requires at least four, these * queues need to have configurable access parameters. * * @rate_control_algorithm: rate control algorithm for this hardware. * If unset (NULL), the default algorithm will be used. Must be * set before calling ieee80211_register_hw(). * * @vif_data_size: size (in bytes) of the drv_priv data area * within &struct ieee80211_vif. * @sta_data_size: size (in bytes) of the drv_priv data area * within &struct ieee80211_sta. * @chanctx_data_size: size (in bytes) of the drv_priv data area * within &struct ieee80211_chanctx_conf. * @txq_data_size: size (in bytes) of the drv_priv data area * within @struct ieee80211_txq. * * @max_rates: maximum number of alternate rate retry stages the hw * can handle. * @max_report_rates: maximum number of alternate rate retry stages * the hw can report back. * @max_rate_tries: maximum number of tries for each stage * * @max_rx_aggregation_subframes: maximum buffer size (number of * sub-frames) to be used for A-MPDU block ack receiver * aggregation. * This is only relevant if the device has restrictions on the * number of subframes, if it relies on mac80211 to do reordering * it shouldn't be set. * * @max_tx_aggregation_subframes: maximum number of subframes in an * aggregate an HT/HE device will transmit. In HT AddBA we'll * advertise a constant value of 64 as some older APs crash if * the window size is smaller (an example is LinkSys WRT120N * with FW v1.0.07 build 002 Jun 18 2012). * For AddBA to HE capable peers this value will be used. * * @max_tx_fragments: maximum number of tx buffers per (A)-MSDU, sum * of 1 + skb_shinfo(skb)->nr_frags for each skb in the frag_list. * * @offchannel_tx_hw_queue: HW queue ID to use for offchannel TX * (if %IEEE80211_HW_QUEUE_CONTROL is set) * * @radiotap_mcs_details: lists which MCS information can the HW * reports, by default it is set to _MCS, _GI and _BW but doesn't * include _FMT. Use %IEEE80211_RADIOTAP_MCS_HAVE_\* values, only * adding _BW is supported today. * * @radiotap_vht_details: lists which VHT MCS information the HW reports, * the default is _GI | _BANDWIDTH. * Use the %IEEE80211_RADIOTAP_VHT_KNOWN_\* values. * * @radiotap_timestamp: Information for the radiotap timestamp field; if the * @units_pos member is set to a non-negative value then the timestamp * field will be added and populated from the &struct ieee80211_rx_status * device_timestamp. * @radiotap_timestamp.units_pos: Must be set to a combination of a * IEEE80211_RADIOTAP_TIMESTAMP_UNIT_* and a * IEEE80211_RADIOTAP_TIMESTAMP_SPOS_* value. * @radiotap_timestamp.accuracy: If non-negative, fills the accuracy in the * radiotap field and the accuracy known flag will be set. * * @netdev_features: netdev features to be set in each netdev created * from this HW. Note that not all features are usable with mac80211, * other features will be rejected during HW registration. * * @uapsd_queues: This bitmap is included in (re)association frame to indicate * for each access category if it is uAPSD trigger-enabled and delivery- * enabled. Use IEEE80211_WMM_IE_STA_QOSINFO_AC_* to set this bitmap. * Each bit corresponds to different AC. Value '1' in specific bit means * that corresponding AC is both trigger- and delivery-enabled. '0' means * neither enabled. * * @uapsd_max_sp_len: maximum number of total buffered frames the WMM AP may * deliver to a WMM STA during any Service Period triggered by the WMM STA. * Use IEEE80211_WMM_IE_STA_QOSINFO_SP_* for correct values. * * @max_nan_de_entries: maximum number of NAN DE functions supported by the * device. * * @tx_sk_pacing_shift: Pacing shift to set on TCP sockets when frames from * them are encountered. The default should typically not be changed, * unless the driver has good reasons for needing more buffers. * * @weight_multiplier: Driver specific airtime weight multiplier used while * refilling deficit of each TXQ. * * @max_mtu: the max mtu could be set. * * @tx_power_levels: a list of power levels supported by the wifi hardware. * The power levels can be specified either as integer or fractions. * The power level at idx 0 shall be the maximum positive power level. * * @max_txpwr_levels_idx: the maximum valid idx of 'tx_power_levels' list. */ struct ieee80211_hw { struct ieee80211_conf conf; struct wiphy *wiphy; const char *rate_control_algorithm; void *priv; unsigned long flags[BITS_TO_LONGS(NUM_IEEE80211_HW_FLAGS)]; unsigned int extra_tx_headroom; unsigned int extra_beacon_tailroom; int vif_data_size; int sta_data_size; int chanctx_data_size; int txq_data_size; u16 queues; u16 max_listen_interval; s8 max_signal; u8 max_rates; u8 max_report_rates; u8 max_rate_tries; u16 max_rx_aggregation_subframes; u16 max_tx_aggregation_subframes; u8 max_tx_fragments; u8 offchannel_tx_hw_queue; u8 radiotap_mcs_details; u16 radiotap_vht_details; struct { int units_pos; s16 accuracy; } radiotap_timestamp; netdev_features_t netdev_features; u8 uapsd_queues; u8 uapsd_max_sp_len; u8 max_nan_de_entries; u8 tx_sk_pacing_shift; u8 weight_multiplier; u32 max_mtu; const s8 *tx_power_levels; u8 max_txpwr_levels_idx; }; static inline bool _ieee80211_hw_check(struct ieee80211_hw *hw, enum ieee80211_hw_flags flg) { return test_bit(flg, hw->flags); } #define ieee80211_hw_check(hw, flg) _ieee80211_hw_check(hw, IEEE80211_HW_##flg) static inline void _ieee80211_hw_set(struct ieee80211_hw *hw, enum ieee80211_hw_flags flg) { return __set_bit(flg, hw->flags); } #define ieee80211_hw_set(hw, flg) _ieee80211_hw_set(hw, IEEE80211_HW_##flg) /** * struct ieee80211_scan_request - hw scan request * * @ies: pointers different parts of IEs (in req.ie) * @req: cfg80211 request. */ struct ieee80211_scan_request { struct ieee80211_scan_ies ies; /* Keep last */ struct cfg80211_scan_request req; }; /** * struct ieee80211_tdls_ch_sw_params - TDLS channel switch parameters * * @sta: peer this TDLS channel-switch request/response came from * @chandef: channel referenced in a TDLS channel-switch request * @action_code: see &enum ieee80211_tdls_actioncode * @status: channel-switch response status * @timestamp: time at which the frame was received * @switch_time: switch-timing parameter received in the frame * @switch_timeout: switch-timing parameter received in the frame * @tmpl_skb: TDLS switch-channel response template * @ch_sw_tm_ie: offset of the channel-switch timing IE inside @tmpl_skb */ struct ieee80211_tdls_ch_sw_params { struct ieee80211_sta *sta; struct cfg80211_chan_def *chandef; u8 action_code; u32 status; u32 timestamp; u16 switch_time; u16 switch_timeout; struct sk_buff *tmpl_skb; u32 ch_sw_tm_ie; }; /** * wiphy_to_ieee80211_hw - return a mac80211 driver hw struct from a wiphy * * @wiphy: the &struct wiphy which we want to query * * mac80211 drivers can use this to get to their respective * &struct ieee80211_hw. Drivers wishing to get to their own private * structure can then access it via hw->priv. Note that mac802111 drivers should * not use wiphy_priv() to try to get their private driver structure as this * is already used internally by mac80211. * * Return: The mac80211 driver hw struct of @wiphy. */ struct ieee80211_hw *wiphy_to_ieee80211_hw(struct wiphy *wiphy); /** * SET_IEEE80211_DEV - set device for 802.11 hardware * * @hw: the &struct ieee80211_hw to set the device for * @dev: the &struct device of this 802.11 device */ static inline void SET_IEEE80211_DEV(struct ieee80211_hw *hw, struct device *dev) { set_wiphy_dev(hw->wiphy, dev); } /** * SET_IEEE80211_PERM_ADDR - set the permanent MAC address for 802.11 hardware * * @hw: the &struct ieee80211_hw to set the MAC address for * @addr: the address to set */ static inline void SET_IEEE80211_PERM_ADDR(struct ieee80211_hw *hw, const u8 *addr) { memcpy(hw->wiphy->perm_addr, addr, ETH_ALEN); } static inline struct ieee80211_rate * ieee80211_get_tx_rate(const struct ieee80211_hw *hw, const struct ieee80211_tx_info *c) { if (WARN_ON_ONCE(c->control.rates[0].idx < 0)) return NULL; return &hw->wiphy->bands[c->band]->bitrates[c->control.rates[0].idx]; } static inline struct ieee80211_rate * ieee80211_get_rts_cts_rate(const struct ieee80211_hw *hw, const struct ieee80211_tx_info *c) { if (c->control.rts_cts_rate_idx < 0) return NULL; return &hw->wiphy->bands[c->band]->bitrates[c->control.rts_cts_rate_idx]; } static inline struct ieee80211_rate * ieee80211_get_alt_retry_rate(const struct ieee80211_hw *hw, const struct ieee80211_tx_info *c, int idx) { if (c->control.rates[idx + 1].idx < 0) return NULL; return &hw->wiphy->bands[c->band]->bitrates[c->control.rates[idx + 1].idx]; } /** * ieee80211_free_txskb - free TX skb * @hw: the hardware * @skb: the skb * * Free a transmit skb. Use this function when some failure * to transmit happened and thus status cannot be reported. */ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); /** * ieee80211_purge_tx_queue - purge TX skb queue * @hw: the hardware * @skbs: the skbs * * Free a set of transmit skbs. Use this function when device is going to stop * but some transmit skbs without TX status are still queued. * This function does not take the list lock and the caller must hold the * relevant locks to use it. */ void ieee80211_purge_tx_queue(struct ieee80211_hw *hw, struct sk_buff_head *skbs); /** * DOC: Hardware crypto acceleration * * mac80211 is capable of taking advantage of many hardware * acceleration designs for encryption and decryption operations. * * The set_key() callback in the &struct ieee80211_ops for a given * device is called to enable hardware acceleration of encryption and * decryption. The callback takes a @sta parameter that will be NULL * for default keys or keys used for transmission only, or point to * the station information for the peer for individual keys. * Multiple transmission keys with the same key index may be used when * VLANs are configured for an access point. * * When transmitting, the TX control data will use the @hw_key_idx * selected by the driver by modifying the &struct ieee80211_key_conf * pointed to by the @key parameter to the set_key() function. * * The set_key() call for the %SET_KEY command should return 0 if * the key is now in use, -%EOPNOTSUPP or -%ENOSPC if it couldn't be * added; if you return 0 then hw_key_idx must be assigned to the * hardware key index. You are free to use the full u8 range. * * Note that in the case that the @IEEE80211_HW_SW_CRYPTO_CONTROL flag is * set, mac80211 will not automatically fall back to software crypto if * enabling hardware crypto failed. The set_key() call may also return the * value 1 to permit this specific key/algorithm to be done in software. * * When the cmd is %DISABLE_KEY then it must succeed. * * Note that it is permissible to not decrypt a frame even if a key * for it has been uploaded to hardware. The stack will not make any * decision based on whether a key has been uploaded or not but rather * based on the receive flags. * * The &struct ieee80211_key_conf structure pointed to by the @key * parameter is guaranteed to be valid until another call to set_key() * removes it, but it can only be used as a cookie to differentiate * keys. * * In TKIP some HW need to be provided a phase 1 key, for RX decryption * acceleration (i.e. iwlwifi). Those drivers should provide update_tkip_key * handler. * The update_tkip_key() call updates the driver with the new phase 1 key. * This happens every time the iv16 wraps around (every 65536 packets). The * set_key() call will happen only once for each key (unless the AP did * rekeying); it will not include a valid phase 1 key. The valid phase 1 key is * provided by update_tkip_key only. The trigger that makes mac80211 call this * handler is software decryption with wrap around of iv16. * * The set_default_unicast_key() call updates the default WEP key index * configured to the hardware for WEP encryption type. This is required * for devices that support offload of data packets (e.g. ARP responses). * * Mac80211 drivers should set the @NL80211_EXT_FEATURE_CAN_REPLACE_PTK0 flag * when they are able to replace in-use PTK keys according to the following * requirements: * 1) They do not hand over frames decrypted with the old key to mac80211 once the call to set_key() with command %DISABLE_KEY has been completed, 2) either drop or continue to use the old key for any outgoing frames queued at the time of the key deletion (including re-transmits), 3) never send out a frame queued prior to the set_key() %SET_KEY command encrypted with the new key when also needing @IEEE80211_KEY_FLAG_GENERATE_IV and 4) never send out a frame unencrypted when it should be encrypted. Mac80211 will not queue any new frames for a deleted key to the driver. */ /** * DOC: Powersave support * * mac80211 has support for various powersave implementations. * * First, it can support hardware that handles all powersaving by itself; * such hardware should simply set the %IEEE80211_HW_SUPPORTS_PS hardware * flag. In that case, it will be told about the desired powersave mode * with the %IEEE80211_CONF_PS flag depending on the association status. * The hardware must take care of sending nullfunc frames when necessary, * i.e. when entering and leaving powersave mode. The hardware is required * to look at the AID in beacons and signal to the AP that it woke up when * it finds traffic directed to it. * * %IEEE80211_CONF_PS flag enabled means that the powersave mode defined in * IEEE 802.11-2007 section 11.2 is enabled. This is not to be confused * with hardware wakeup and sleep states. Driver is responsible for waking * up the hardware before issuing commands to the hardware and putting it * back to sleep at appropriate times. * * When PS is enabled, hardware needs to wakeup for beacons and receive the * buffered multicast/broadcast frames after the beacon. Also it must be * possible to send frames and receive the acknowledment frame. * * Other hardware designs cannot send nullfunc frames by themselves and also * need software support for parsing the TIM bitmap. This is also supported * by mac80211 by combining the %IEEE80211_HW_SUPPORTS_PS and * %IEEE80211_HW_PS_NULLFUNC_STACK flags. The hardware is of course still * required to pass up beacons. The hardware is still required to handle * waking up for multicast traffic; if it cannot the driver must handle that * as best as it can; mac80211 is too slow to do that. * * Dynamic powersave is an extension to normal powersave in which the * hardware stays awake for a user-specified period of time after sending a * frame so that reply frames need not be buffered and therefore delayed to * the next wakeup. It's a compromise of getting good enough latency when * there's data traffic and still saving significantly power in idle * periods. * * Dynamic powersave is simply supported by mac80211 enabling and disabling * PS based on traffic. Driver needs to only set %IEEE80211_HW_SUPPORTS_PS * flag and mac80211 will handle everything automatically. Additionally, * hardware having support for the dynamic PS feature may set the * %IEEE80211_HW_SUPPORTS_DYNAMIC_PS flag to indicate that it can support * dynamic PS mode itself. The driver needs to look at the * @dynamic_ps_timeout hardware configuration value and use it that value * whenever %IEEE80211_CONF_PS is set. In this case mac80211 will disable * dynamic PS feature in stack and will just keep %IEEE80211_CONF_PS * enabled whenever user has enabled powersave. * * Driver informs U-APSD client support by enabling * %IEEE80211_VIF_SUPPORTS_UAPSD flag. The mode is configured through the * uapsd parameter in conf_tx() operation. Hardware needs to send the QoS * Nullfunc frames and stay awake until the service period has ended. To * utilize U-APSD, dynamic powersave is disabled for voip AC and all frames * from that AC are transmitted with powersave enabled. * * Note: U-APSD client mode is not yet supported with * %IEEE80211_HW_PS_NULLFUNC_STACK. */ /** * DOC: Beacon filter support * * Some hardware have beacon filter support to reduce host cpu wakeups * which will reduce system power consumption. It usually works so that * the firmware creates a checksum of the beacon but omits all constantly * changing elements (TSF, TIM etc). Whenever the checksum changes the * beacon is forwarded to the host, otherwise it will be just dropped. That * way the host will only receive beacons where some relevant information * (for example ERP protection or WMM settings) have changed. * * Beacon filter support is advertised with the %IEEE80211_VIF_BEACON_FILTER * interface capability. The driver needs to enable beacon filter support * whenever power save is enabled, that is %IEEE80211_CONF_PS is set. When * power save is enabled, the stack will not check for beacon loss and the * driver needs to notify about loss of beacons with ieee80211_beacon_loss(). * * The time (or number of beacons missed) until the firmware notifies the * driver of a beacon loss event (which in turn causes the driver to call * ieee80211_beacon_loss()) should be configurable and will be controlled * by mac80211 and the roaming algorithm in the future. * * Since there may be constantly changing information elements that nothing * in the software stack cares about, we will, in the future, have mac80211 * tell the driver which information elements are interesting in the sense * that we want to see changes in them. This will include * * - a list of information element IDs * - a list of OUIs for the vendor information element * * Ideally, the hardware would filter out any beacons without changes in the * requested elements, but if it cannot support that it may, at the expense * of some efficiency, filter out only a subset. For example, if the device * doesn't support checking for OUIs it should pass up all changes in all * vendor information elements. * * Note that change, for the sake of simplification, also includes information * elements appearing or disappearing from the beacon. * * Some hardware supports an "ignore list" instead. Just make sure nothing * that was requested is on the ignore list, and include commonly changing * information element IDs in the ignore list, for example 11 (BSS load) and * the various vendor-assigned IEs with unknown contents (128, 129, 133-136, * 149, 150, 155, 156, 173, 176, 178, 179, 219); for forward compatibility * it could also include some currently unused IDs. * * * In addition to these capabilities, hardware should support notifying the * host of changes in the beacon RSSI. This is relevant to implement roaming * when no traffic is flowing (when traffic is flowing we see the RSSI of * the received data packets). This can consist of notifying the host when * the RSSI changes significantly or when it drops below or rises above * configurable thresholds. In the future these thresholds will also be * configured by mac80211 (which gets them from userspace) to implement * them as the roaming algorithm requires. * * If the hardware cannot implement this, the driver should ask it to * periodically pass beacon frames to the host so that software can do the * signal strength threshold checking. */ /** * DOC: Spatial multiplexing power save * * SMPS (Spatial multiplexing power save) is a mechanism to conserve * power in an 802.11n implementation. For details on the mechanism * and rationale, please refer to 802.11 (as amended by 802.11n-2009) * "11.2.3 SM power save". * * The mac80211 implementation is capable of sending action frames * to update the AP about the station's SMPS mode, and will instruct * the driver to enter the specific mode. It will also announce the * requested SMPS mode during the association handshake. Hardware * support for this feature is required, and can be indicated by * hardware flags. * * The default mode will be "automatic", which nl80211/cfg80211 * defines to be dynamic SMPS in (regular) powersave, and SMPS * turned off otherwise. * * To support this feature, the driver must set the appropriate * hardware support flags, and handle the SMPS flag to the config() * operation. It will then with this mechanism be instructed to * enter the requested SMPS mode while associated to an HT AP. */ /** * DOC: Frame filtering * * mac80211 requires to see many management frames for proper * operation, and users may want to see many more frames when * in monitor mode. However, for best CPU usage and power consumption, * having as few frames as possible percolate through the stack is * desirable. Hence, the hardware should filter as much as possible. * * To achieve this, mac80211 uses filter flags (see below) to tell * the driver's configure_filter() function which frames should be * passed to mac80211 and which should be filtered out. * * Before configure_filter() is invoked, the prepare_multicast() * callback is invoked with the parameters @mc_count and @mc_list * for the combined multicast address list of all virtual interfaces. * It's use is optional, and it returns a u64 that is passed to * configure_filter(). Additionally, configure_filter() has the * arguments @changed_flags telling which flags were changed and * @total_flags with the new flag states. * * If your device has no multicast address filters your driver will * need to check both the %FIF_ALLMULTI flag and the @mc_count * parameter to see whether multicast frames should be accepted * or dropped. * * All unsupported flags in @total_flags must be cleared. * Hardware does not support a flag if it is incapable of _passing_ * the frame to the stack. Otherwise the driver must ignore * the flag, but not clear it. * You must _only_ clear the flag (announce no support for the * flag to mac80211) if you are not able to pass the packet type * to the stack (so the hardware always filters it). * So for example, you should clear @FIF_CONTROL, if your hardware * always filters control frames. If your hardware always passes * control frames to the kernel and is incapable of filtering them, * you do _not_ clear the @FIF_CONTROL flag. * This rule applies to all other FIF flags as well. */ /** * DOC: AP support for powersaving clients * * In order to implement AP and P2P GO modes, mac80211 has support for * client powersaving, both "legacy" PS (PS-Poll/null data) and uAPSD. * There currently is no support for sAPSD. * * There is one assumption that mac80211 makes, namely that a client * will not poll with PS-Poll and trigger with uAPSD at the same time. * Both are supported, and both can be used by the same client, but * they can't be used concurrently by the same client. This simplifies * the driver code. * * The first thing to keep in mind is that there is a flag for complete * driver implementation: %IEEE80211_HW_AP_LINK_PS. If this flag is set, * mac80211 expects the driver to handle most of the state machine for * powersaving clients and will ignore the PM bit in incoming frames. * Drivers then use ieee80211_sta_ps_transition() to inform mac80211 of * stations' powersave transitions. In this mode, mac80211 also doesn't * handle PS-Poll/uAPSD. * * In the mode without %IEEE80211_HW_AP_LINK_PS, mac80211 will check the * PM bit in incoming frames for client powersave transitions. When a * station goes to sleep, we will stop transmitting to it. There is, * however, a race condition: a station might go to sleep while there is * data buffered on hardware queues. If the device has support for this * it will reject frames, and the driver should give the frames back to * mac80211 with the %IEEE80211_TX_STAT_TX_FILTERED flag set which will * cause mac80211 to retry the frame when the station wakes up. The * driver is also notified of powersave transitions by calling its * @sta_notify callback. * * When the station is asleep, it has three choices: it can wake up, * it can PS-Poll, or it can possibly start a uAPSD service period. * Waking up is implemented by simply transmitting all buffered (and * filtered) frames to the station. This is the easiest case. When * the station sends a PS-Poll or a uAPSD trigger frame, mac80211 * will inform the driver of this with the @allow_buffered_frames * callback; this callback is optional. mac80211 will then transmit * the frames as usual and set the %IEEE80211_TX_CTL_NO_PS_BUFFER * on each frame. The last frame in the service period (or the only * response to a PS-Poll) also has %IEEE80211_TX_STATUS_EOSP set to * indicate that it ends the service period; as this frame must have * TX status report it also sets %IEEE80211_TX_CTL_REQ_TX_STATUS. * When TX status is reported for this frame, the service period is * marked has having ended and a new one can be started by the peer. * * Additionally, non-bufferable MMPDUs can also be transmitted by * mac80211 with the %IEEE80211_TX_CTL_NO_PS_BUFFER set in them. * * Another race condition can happen on some devices like iwlwifi * when there are frames queued for the station and it wakes up * or polls; the frames that are already queued could end up being * transmitted first instead, causing reordering and/or wrong * processing of the EOSP. The cause is that allowing frames to be * transmitted to a certain station is out-of-band communication to * the device. To allow this problem to be solved, the driver can * call ieee80211_sta_block_awake() if frames are buffered when it * is notified that the station went to sleep. When all these frames * have been filtered (see above), it must call the function again * to indicate that the station is no longer blocked. * * If the driver buffers frames in the driver for aggregation in any * way, it must use the ieee80211_sta_set_buffered() call when it is * notified of the station going to sleep to inform mac80211 of any * TIDs that have frames buffered. Note that when a station wakes up * this information is reset (hence the requirement to call it when * informed of the station going to sleep). Then, when a service * period starts for any reason, @release_buffered_frames is called * with the number of frames to be released and which TIDs they are * to come from. In this case, the driver is responsible for setting * the EOSP (for uAPSD) and MORE_DATA bits in the released frames. * To help the @more_data parameter is passed to tell the driver if * there is more data on other TIDs -- the TIDs to release frames * from are ignored since mac80211 doesn't know how many frames the * buffers for those TIDs contain. * * If the driver also implement GO mode, where absence periods may * shorten service periods (or abort PS-Poll responses), it must * filter those response frames except in the case of frames that * are buffered in the driver -- those must remain buffered to avoid * reordering. Because it is possible that no frames are released * in this case, the driver must call ieee80211_sta_eosp() * to indicate to mac80211 that the service period ended anyway. * * Finally, if frames from multiple TIDs are released from mac80211 * but the driver might reorder them, it must clear & set the flags * appropriately (only the last frame may have %IEEE80211_TX_STATUS_EOSP) * and also take care of the EOSP and MORE_DATA bits in the frame. * The driver may also use ieee80211_sta_eosp() in this case. * * Note that if the driver ever buffers frames other than QoS-data * frames, it must take care to never send a non-QoS-data frame as * the last frame in a service period, adding a QoS-nulldata frame * after a non-QoS-data frame if needed. */ /** * DOC: HW queue control * * Before HW queue control was introduced, mac80211 only had a single static * assignment of per-interface AC software queues to hardware queues. This * was problematic for a few reasons: * 1) off-channel transmissions might get stuck behind other frames * 2) multiple virtual interfaces couldn't be handled correctly * 3) after-DTIM frames could get stuck behind other frames * * To solve this, hardware typically uses multiple different queues for all * the different usages, and this needs to be propagated into mac80211 so it * won't have the same problem with the software queues. * * Therefore, mac80211 now offers the %IEEE80211_HW_QUEUE_CONTROL capability * flag that tells it that the driver implements its own queue control. To do * so, the driver will set up the various queues in each &struct ieee80211_vif * and the offchannel queue in &struct ieee80211_hw. In response, mac80211 will * use those queue IDs in the hw_queue field of &struct ieee80211_tx_info and * if necessary will queue the frame on the right software queue that mirrors * the hardware queue. * Additionally, the driver has to then use these HW queue IDs for the queue * management functions (ieee80211_stop_queue() et al.) * * The driver is free to set up the queue mappings as needed; multiple virtual * interfaces may map to the same hardware queues if needed. The setup has to * happen during add_interface or change_interface callbacks. For example, a * driver supporting station+station and station+AP modes might decide to have * 10 hardware queues to handle different scenarios: * * 4 AC HW queues for 1st vif: 0, 1, 2, 3 * 4 AC HW queues for 2nd vif: 4, 5, 6, 7 * after-DTIM queue for AP: 8 * off-channel queue: 9 * * It would then set up the hardware like this: * hw.offchannel_tx_hw_queue = 9 * * and the first virtual interface that is added as follows: * vif.hw_queue[IEEE80211_AC_VO] = 0 * vif.hw_queue[IEEE80211_AC_VI] = 1 * vif.hw_queue[IEEE80211_AC_BE] = 2 * vif.hw_queue[IEEE80211_AC_BK] = 3 * vif.cab_queue = 8 // if AP mode, otherwise %IEEE80211_INVAL_HW_QUEUE * and the second virtual interface with 4-7. * * If queue 6 gets full, for example, mac80211 would only stop the second * virtual interface's BE queue since virtual interface queues are per AC. * * Note that the vif.cab_queue value should be set to %IEEE80211_INVAL_HW_QUEUE * whenever the queue is not used (i.e. the interface is not in AP mode) if the * queue could potentially be shared since mac80211 will look at cab_queue when * a queue is stopped/woken even if the interface is not in AP mode. */ /** * enum ieee80211_filter_flags - hardware filter flags * * These flags determine what the filter in hardware should be * programmed to let through and what should not be passed to the * stack. It is always safe to pass more frames than requested, * but this has negative impact on power consumption. * * @FIF_ALLMULTI: pass all multicast frames, this is used if requested * by the user or if the hardware is not capable of filtering by * multicast address. * * @FIF_FCSFAIL: pass frames with failed FCS (but you need to set the * %RX_FLAG_FAILED_FCS_CRC for them) * * @FIF_PLCPFAIL: pass frames with failed PLCP CRC (but you need to set * the %RX_FLAG_FAILED_PLCP_CRC for them * * @FIF_BCN_PRBRESP_PROMISC: This flag is set during scanning to indicate * to the hardware that it should not filter beacons or probe responses * by BSSID. Filtering them can greatly reduce the amount of processing * mac80211 needs to do and the amount of CPU wakeups, so you should * honour this flag if possible. * * @FIF_CONTROL: pass control frames (except for PS Poll) addressed to this * station * * @FIF_OTHER_BSS: pass frames destined to other BSSes * * @FIF_PSPOLL: pass PS Poll frames * * @FIF_PROBE_REQ: pass probe request frames * * @FIF_MCAST_ACTION: pass multicast Action frames */ enum ieee80211_filter_flags { FIF_ALLMULTI = 1<<1, FIF_FCSFAIL = 1<<2, FIF_PLCPFAIL = 1<<3, FIF_BCN_PRBRESP_PROMISC = 1<<4, FIF_CONTROL = 1<<5, FIF_OTHER_BSS = 1<<6, FIF_PSPOLL = 1<<7, FIF_PROBE_REQ = 1<<8, FIF_MCAST_ACTION = 1<<9, }; /** * enum ieee80211_ampdu_mlme_action - A-MPDU actions * * These flags are used with the ampdu_action() callback in * &struct ieee80211_ops to indicate which action is needed. * * Note that drivers MUST be able to deal with a TX aggregation * session being stopped even before they OK'ed starting it by * calling ieee80211_start_tx_ba_cb_irqsafe, because the peer * might receive the addBA frame and send a delBA right away! * * @IEEE80211_AMPDU_RX_START: start RX aggregation * @IEEE80211_AMPDU_RX_STOP: stop RX aggregation * @IEEE80211_AMPDU_TX_START: start TX aggregation, the driver must either * call ieee80211_start_tx_ba_cb_irqsafe() or * call ieee80211_start_tx_ba_cb_irqsafe() with status * %IEEE80211_AMPDU_TX_START_DELAY_ADDBA to delay addba after * ieee80211_start_tx_ba_cb_irqsafe is called, or just return the special * status %IEEE80211_AMPDU_TX_START_IMMEDIATE. * @IEEE80211_AMPDU_TX_OPERATIONAL: TX aggregation has become operational * @IEEE80211_AMPDU_TX_STOP_CONT: stop TX aggregation but continue transmitting * queued packets, now unaggregated. After all packets are transmitted the * driver has to call ieee80211_stop_tx_ba_cb_irqsafe(). * @IEEE80211_AMPDU_TX_STOP_FLUSH: stop TX aggregation and flush all packets, * called when the station is removed. There's no need or reason to call * ieee80211_stop_tx_ba_cb_irqsafe() in this case as mac80211 assumes the * session is gone and removes the station. * @IEEE80211_AMPDU_TX_STOP_FLUSH_CONT: called when TX aggregation is stopped * but the driver hasn't called ieee80211_stop_tx_ba_cb_irqsafe() yet and * now the connection is dropped and the station will be removed. Drivers * should clean up and drop remaining packets when this is called. */ enum ieee80211_ampdu_mlme_action { IEEE80211_AMPDU_RX_START, IEEE80211_AMPDU_RX_STOP, IEEE80211_AMPDU_TX_START, IEEE80211_AMPDU_TX_STOP_CONT, IEEE80211_AMPDU_TX_STOP_FLUSH, IEEE80211_AMPDU_TX_STOP_FLUSH_CONT, IEEE80211_AMPDU_TX_OPERATIONAL, }; #define IEEE80211_AMPDU_TX_START_IMMEDIATE 1 #define IEEE80211_AMPDU_TX_START_DELAY_ADDBA 2 /** * struct ieee80211_ampdu_params - AMPDU action parameters * * @action: the ampdu action, value from %ieee80211_ampdu_mlme_action. * @sta: peer of this AMPDU session * @tid: tid of the BA session * @ssn: start sequence number of the session. TX/RX_STOP can pass 0. When * action is set to %IEEE80211_AMPDU_RX_START the driver passes back the * actual ssn value used to start the session and writes the value here. * @buf_size: reorder buffer size (number of subframes). Valid only when the * action is set to %IEEE80211_AMPDU_RX_START or * %IEEE80211_AMPDU_TX_OPERATIONAL * @amsdu: indicates the peer's ability to receive A-MSDU within A-MPDU. * valid when the action is set to %IEEE80211_AMPDU_TX_OPERATIONAL * @timeout: BA session timeout. Valid only when the action is set to * %IEEE80211_AMPDU_RX_START */ struct ieee80211_ampdu_params { enum ieee80211_ampdu_mlme_action action; struct ieee80211_sta *sta; u16 tid; u16 ssn; u16 buf_size; bool amsdu; u16 timeout; }; /** * enum ieee80211_frame_release_type - frame release reason * @IEEE80211_FRAME_RELEASE_PSPOLL: frame released for PS-Poll * @IEEE80211_FRAME_RELEASE_UAPSD: frame(s) released due to * frame received on trigger-enabled AC */ enum ieee80211_frame_release_type { IEEE80211_FRAME_RELEASE_PSPOLL, IEEE80211_FRAME_RELEASE_UAPSD, }; /** * enum ieee80211_rate_control_changed - flags to indicate what changed * * @IEEE80211_RC_BW_CHANGED: The bandwidth that can be used to transmit * to this station changed. The actual bandwidth is in the station * information -- for HT20/40 the IEEE80211_HT_CAP_SUP_WIDTH_20_40 * flag changes, for HT and VHT the bandwidth field changes. * @IEEE80211_RC_SMPS_CHANGED: The SMPS state of the station changed. * @IEEE80211_RC_SUPP_RATES_CHANGED: The supported rate set of this peer * changed (in IBSS mode) due to discovering more information about * the peer. * @IEEE80211_RC_NSS_CHANGED: N_SS (number of spatial streams) was changed * by the peer */ enum ieee80211_rate_control_changed { IEEE80211_RC_BW_CHANGED = BIT(0), IEEE80211_RC_SMPS_CHANGED = BIT(1), IEEE80211_RC_SUPP_RATES_CHANGED = BIT(2), IEEE80211_RC_NSS_CHANGED = BIT(3), }; /** * enum ieee80211_roc_type - remain on channel type * * With the support for multi channel contexts and multi channel operations, * remain on channel operations might be limited/deferred/aborted by other * flows/operations which have higher priority (and vice versa). * Specifying the ROC type can be used by devices to prioritize the ROC * operations compared to other operations/flows. * * @IEEE80211_ROC_TYPE_NORMAL: There are no special requirements for this ROC. * @IEEE80211_ROC_TYPE_MGMT_TX: The remain on channel request is required * for sending management frames offchannel. */ enum ieee80211_roc_type { IEEE80211_ROC_TYPE_NORMAL = 0, IEEE80211_ROC_TYPE_MGMT_TX, }; /** * enum ieee80211_reconfig_type - reconfig type * * This enum is used by the reconfig_complete() callback to indicate what * reconfiguration type was completed. * * @IEEE80211_RECONFIG_TYPE_RESTART: hw restart type * (also due to resume() callback returning 1) * @IEEE80211_RECONFIG_TYPE_SUSPEND: suspend type (regardless * of wowlan configuration) */ enum ieee80211_reconfig_type { IEEE80211_RECONFIG_TYPE_RESTART, IEEE80211_RECONFIG_TYPE_SUSPEND, }; /** * struct ieee80211_prep_tx_info - prepare TX information * @duration: if non-zero, hint about the required duration, * only used with the mgd_prepare_tx() method. * @subtype: frame subtype (auth, (re)assoc, deauth, disassoc) * @success: whether the frame exchange was successful, only * used with the mgd_complete_tx() method, and then only * valid for auth and (re)assoc. * @was_assoc: set if this call is due to deauth/disassoc * while just having been associated * @link_id: the link id on which the frame will be TX'ed. * Only used with the mgd_prepare_tx() method. */ struct ieee80211_prep_tx_info { u16 duration; u16 subtype; u8 success:1, was_assoc:1; int link_id; }; /** * struct ieee80211_ops - callbacks from mac80211 to the driver * * This structure contains various callbacks that the driver may * handle or, in some cases, must handle, for example to configure * the hardware to a new channel or to transmit a frame. * * @tx: Handler that 802.11 module calls for each transmitted frame. * skb contains the buffer starting from the IEEE 802.11 header. * The low-level driver should send the frame out based on * configuration in the TX control data. This handler should, * preferably, never fail and stop queues appropriately. * Must be atomic. * * @start: Called before the first netdevice attached to the hardware * is enabled. This should turn on the hardware and must turn on * frame reception (for possibly enabled monitor interfaces.) * Returns negative error codes, these may be seen in userspace, * or zero. * When the device is started it should not have a MAC address * to avoid acknowledging frames before a non-monitor device * is added. * Must be implemented and can sleep. * * @stop: Called after last netdevice attached to the hardware * is disabled. This should turn off the hardware (at least * it must turn off frame reception.) * May be called right after add_interface if that rejects * an interface. If you added any work onto the mac80211 workqueue * you should ensure to cancel it on this callback. * Must be implemented and can sleep. * * @suspend: Suspend the device; mac80211 itself will quiesce before and * stop transmitting and doing any other configuration, and then * ask the device to suspend. This is only invoked when WoWLAN is * configured, otherwise the device is deconfigured completely and * reconfigured at resume time. * The driver may also impose special conditions under which it * wants to use the "normal" suspend (deconfigure), say if it only * supports WoWLAN when the device is associated. In this case, it * must return 1 from this function. * * @resume: If WoWLAN was configured, this indicates that mac80211 is * now resuming its operation, after this the device must be fully * functional again. If this returns an error, the only way out is * to also unregister the device. If it returns 1, then mac80211 * will also go through the regular complete restart on resume. * * @set_wakeup: Enable or disable wakeup when WoWLAN configuration is * modified. The reason is that device_set_wakeup_enable() is * supposed to be called when the configuration changes, not only * in suspend(). * * @add_interface: Called when a netdevice attached to the hardware is * enabled. Because it is not called for monitor mode devices, @start * and @stop must be implemented. * The driver should perform any initialization it needs before * the device can be enabled. The initial configuration for the * interface is given in the conf parameter. * The callback may refuse to add an interface by returning a * negative error code (which will be seen in userspace.) * Must be implemented and can sleep. * * @change_interface: Called when a netdevice changes type. This callback * is optional, but only if it is supported can interface types be * switched while the interface is UP. The callback may sleep. * Note that while an interface is being switched, it will not be * found by the interface iteration callbacks. * * @remove_interface: Notifies a driver that an interface is going down. * The @stop callback is called after this if it is the last interface * and no monitor interfaces are present. * When all interfaces are removed, the MAC address in the hardware * must be cleared so the device no longer acknowledges packets, * the mac_addr member of the conf structure is, however, set to the * MAC address of the device going away. * Hence, this callback must be implemented. It can sleep. * * @config: Handler for configuration requests. IEEE 802.11 code calls this * function to change hardware configuration, e.g., channel. * This function should never fail but returns a negative error code * if it does. The callback can sleep. * * @bss_info_changed: Handler for configuration requests related to BSS * parameters that may vary during BSS's lifespan, and may affect low * level driver (e.g. assoc/disassoc status, erp parameters). * This function should not be used if no BSS has been set, unless * for association indication. The @changed parameter indicates which * of the bss parameters has changed when a call is made. The callback * can sleep. * Note: this callback is called if @vif_cfg_changed or @link_info_changed * are not implemented. * * @vif_cfg_changed: Handler for configuration requests related to interface * (MLD) parameters from &struct ieee80211_vif_cfg that vary during the * lifetime of the interface (e.g. assoc status, IP addresses, etc.) * The @changed parameter indicates which value changed. * The callback can sleep. * * @link_info_changed: Handler for configuration requests related to link * parameters from &struct ieee80211_bss_conf that are related to an * individual link. e.g. legacy/HT/VHT/... rate information. * The @changed parameter indicates which value changed, and the @link_id * parameter indicates the link ID. Note that the @link_id will be 0 for * non-MLO connections. * The callback can sleep. * * @prepare_multicast: Prepare for multicast filter configuration. * This callback is optional, and its return value is passed * to configure_filter(). This callback must be atomic. * * @configure_filter: Configure the device's RX filter. * See the section "Frame filtering" for more information. * This callback must be implemented and can sleep. * * @config_iface_filter: Configure the interface's RX filter. * This callback is optional and is used to configure which frames * should be passed to mac80211. The filter_flags is the combination * of FIF_* flags. The changed_flags is a bit mask that indicates * which flags are changed. * This callback can sleep. * * @set_tim: Set TIM bit. mac80211 calls this function when a TIM bit * must be set or cleared for a given STA. Must be atomic. * * @set_key: See the section "Hardware crypto acceleration" * This callback is only called between add_interface and * remove_interface calls, i.e. while the given virtual interface * is enabled. * Returns a negative error code if the key can't be added. * The callback can sleep. * * @update_tkip_key: See the section "Hardware crypto acceleration" * This callback will be called in the context of Rx. Called for drivers * which set IEEE80211_KEY_FLAG_TKIP_REQ_RX_P1_KEY. * The callback must be atomic. * * @set_rekey_data: If the device supports GTK rekeying, for example while the * host is suspended, it can assign this callback to retrieve the data * necessary to do GTK rekeying, this is the KEK, KCK and replay counter. * After rekeying was done it should (for example during resume) notify * userspace of the new replay counter using ieee80211_gtk_rekey_notify(). * * @set_default_unicast_key: Set the default (unicast) key index, useful for * WEP when the device sends data packets autonomously, e.g. for ARP * offloading. The index can be 0-3, or -1 for unsetting it. * * @hw_scan: Ask the hardware to service the scan request, no need to start * the scan state machine in stack. The scan must honour the channel * configuration done by the regulatory agent in the wiphy's * registered bands. The hardware (or the driver) needs to make sure * that power save is disabled. * The @req ie/ie_len members are rewritten by mac80211 to contain the * entire IEs after the SSID, so that drivers need not look at these * at all but just send them after the SSID -- mac80211 includes the * (extended) supported rates and HT information (where applicable). * When the scan finishes, ieee80211_scan_completed() must be called; * note that it also must be called when the scan cannot finish due to * any error unless this callback returned a negative error code. * This callback is also allowed to return the special return value 1, * this indicates that hardware scan isn't desirable right now and a * software scan should be done instead. A driver wishing to use this * capability must ensure its (hardware) scan capabilities aren't * advertised as more capable than mac80211's software scan is. * The callback can sleep. * * @cancel_hw_scan: Ask the low-level tp cancel the active hw scan. * The driver should ask the hardware to cancel the scan (if possible), * but the scan will be completed only after the driver will call * ieee80211_scan_completed(). * This callback is needed for wowlan, to prevent enqueueing a new * scan_work after the low-level driver was already suspended. * The callback can sleep. * * @sched_scan_start: Ask the hardware to start scanning repeatedly at * specific intervals. The driver must call the * ieee80211_sched_scan_results() function whenever it finds results. * This process will continue until sched_scan_stop is called. * * @sched_scan_stop: Tell the hardware to stop an ongoing scheduled scan. * In this case, ieee80211_sched_scan_stopped() must not be called. * * @sw_scan_start: Notifier function that is called just before a software scan * is started. Can be NULL, if the driver doesn't need this notification. * The mac_addr parameter allows supporting NL80211_SCAN_FLAG_RANDOM_ADDR, * the driver may set the NL80211_FEATURE_SCAN_RANDOM_MAC_ADDR flag if it * can use this parameter. The callback can sleep. * * @sw_scan_complete: Notifier function that is called just after a * software scan finished. Can be NULL, if the driver doesn't need * this notification. * The callback can sleep. * * @get_stats: Return low-level statistics. * Returns zero if statistics are available. * The callback can sleep. * * @get_key_seq: If your device implements encryption in hardware and does * IV/PN assignment then this callback should be provided to read the * IV/PN for the given key from hardware. * The callback must be atomic. * * @set_frag_threshold: Configuration of fragmentation threshold. Assign this * if the device does fragmentation by itself. Note that to prevent the * stack from doing fragmentation IEEE80211_HW_SUPPORTS_TX_FRAG * should be set as well. * The callback can sleep. * * @set_rts_threshold: Configuration of RTS threshold (if device needs it) * The callback can sleep. * * @sta_add: Notifies low level driver about addition of an associated station, * AP, IBSS/WDS/mesh peer etc. This callback can sleep. * * @sta_remove: Notifies low level driver about removal of an associated * station, AP, IBSS/WDS/mesh peer etc. Note that after the callback * returns it isn't safe to use the pointer, not even RCU protected; * no RCU grace period is guaranteed between returning here and freeing * the station. See @sta_pre_rcu_remove if needed. * This callback can sleep. * * @vif_add_debugfs: Drivers can use this callback to add a debugfs vif * directory with its files. This callback should be within a * CONFIG_MAC80211_DEBUGFS conditional. This callback can sleep. * * @link_add_debugfs: Drivers can use this callback to add debugfs files * when a link is added to a mac80211 vif. This callback should be within * a CONFIG_MAC80211_DEBUGFS conditional. This callback can sleep. * For non-MLO the callback will be called once for the default bss_conf * with the vif's directory rather than a separate subdirectory. * * @sta_add_debugfs: Drivers can use this callback to add debugfs files * when a station is added to mac80211's station list. This callback * should be within a CONFIG_MAC80211_DEBUGFS conditional. This * callback can sleep. * * @link_sta_add_debugfs: Drivers can use this callback to add debugfs files * when a link is added to a mac80211 station. This callback * should be within a CONFIG_MAC80211_DEBUGFS conditional. This * callback can sleep. * For non-MLO the callback will be called once for the deflink with the * station's directory rather than a separate subdirectory. * * @sta_notify: Notifies low level driver about power state transition of an * associated station, AP, IBSS/WDS/mesh peer etc. For a VIF operating * in AP mode, this callback will not be called when the flag * %IEEE80211_HW_AP_LINK_PS is set. Must be atomic. * * @sta_set_txpwr: Configure the station tx power. This callback set the tx * power for the station. * This callback can sleep. * * @sta_state: Notifies low level driver about state transition of a * station (which can be the AP, a client, IBSS/WDS/mesh peer etc.) * This callback is mutually exclusive with @sta_add/@sta_remove. * It must not fail for down transitions but may fail for transitions * up the list of states. Also note that after the callback returns it * isn't safe to use the pointer, not even RCU protected - no RCU grace * period is guaranteed between returning here and freeing the station. * See @sta_pre_rcu_remove if needed. * The callback can sleep. * * @sta_pre_rcu_remove: Notify driver about station removal before RCU * synchronisation. This is useful if a driver needs to have station * pointers protected using RCU, it can then use this call to clear * the pointers instead of waiting for an RCU grace period to elapse * in @sta_state. * The callback can sleep. * * @link_sta_rc_update: Notifies the driver of changes to the bitrates that can * be used to transmit to the station. The changes are advertised with bits * from &enum ieee80211_rate_control_changed and the values are reflected * in the station data. This callback should only be used when the driver * uses hardware rate control (%IEEE80211_HW_HAS_RATE_CONTROL) since * otherwise the rate control algorithm is notified directly. * Must be atomic. * @sta_rate_tbl_update: Notifies the driver that the rate table changed. This * is only used if the configured rate control algorithm actually uses * the new rate table API, and is therefore optional. Must be atomic. * * @sta_statistics: Get statistics for this station. For example with beacon * filtering, the statistics kept by mac80211 might not be accurate, so * let the driver pre-fill the statistics. The driver can fill most of * the values (indicating which by setting the filled bitmap), but not * all of them make sense - see the source for which ones are possible. * Statistics that the driver doesn't fill will be filled by mac80211. * The callback can sleep. * * @conf_tx: Configure TX queue parameters (EDCF (aifs, cw_min, cw_max), * bursting) for a hardware TX queue. * Returns a negative error code on failure. * The callback can sleep. * * @get_tsf: Get the current TSF timer value from firmware/hardware. Currently, * this is only used for IBSS mode BSSID merging and debugging. Is not a * required function. * The callback can sleep. * * @set_tsf: Set the TSF timer to the specified value in the firmware/hardware. * Currently, this is only used for IBSS mode debugging. Is not a * required function. * The callback can sleep. * * @offset_tsf: Offset the TSF timer by the specified value in the * firmware/hardware. Preferred to set_tsf as it avoids delay between * calling set_tsf() and hardware getting programmed, which will show up * as TSF delay. Is not a required function. * The callback can sleep. * * @reset_tsf: Reset the TSF timer and allow firmware/hardware to synchronize * with other STAs in the IBSS. This is only used in IBSS mode. This * function is optional if the firmware/hardware takes full care of * TSF synchronization. * The callback can sleep. * * @tx_last_beacon: Determine whether the last IBSS beacon was sent by us. * This is needed only for IBSS mode and the result of this function is * used to determine whether to reply to Probe Requests. * Returns non-zero if this device sent the last beacon. * The callback can sleep. * * @get_survey: Return per-channel survey information * * @rfkill_poll: Poll rfkill hardware state. If you need this, you also * need to set wiphy->rfkill_poll to %true before registration, * and need to call wiphy_rfkill_set_hw_state() in the callback. * The callback can sleep. * * @set_coverage_class: Set slot time for given coverage class as specified * in IEEE 802.11-2007 section 17.3.8.6 and modify ACK timeout * accordingly; coverage class equals to -1 to enable ACK timeout * estimation algorithm (dynack). To disable dynack set valid value for * coverage class. This callback is not required and may sleep. * * @testmode_cmd: Implement a cfg80211 test mode command. The passed @vif may * be %NULL. The callback can sleep. * @testmode_dump: Implement a cfg80211 test mode dump. The callback can sleep. * * @flush: Flush all pending frames from the hardware queue, making sure * that the hardware queues are empty. The @queues parameter is a bitmap * of queues to flush, which is useful if different virtual interfaces * use different hardware queues; it may also indicate all queues. * If the parameter @drop is set to %true, pending frames may be dropped. * Note that vif can be NULL. * The callback can sleep. * * @flush_sta: Flush or drop all pending frames from the hardware queue(s) for * the given station, as it's about to be removed. * The callback can sleep. * * @channel_switch: Drivers that need (or want) to offload the channel * switch operation for CSAs received from the AP may implement this * callback. They must then call ieee80211_chswitch_done() to indicate * completion of the channel switch. * * @set_antenna: Set antenna configuration (tx_ant, rx_ant) on the device. * Parameters are bitmaps of allowed antennas to use for TX/RX. Drivers may * reject TX/RX mask combinations they cannot support by returning -EINVAL * (also see nl80211.h @NL80211_ATTR_WIPHY_ANTENNA_TX). * * @get_antenna: Get current antenna configuration from device (tx_ant, rx_ant). * * @remain_on_channel: Starts an off-channel period on the given channel, must * call back to ieee80211_ready_on_channel() when on that channel. Note * that normal channel traffic is not stopped as this is intended for hw * offload. Frames to transmit on the off-channel channel are transmitted * normally except for the %IEEE80211_TX_CTL_TX_OFFCHAN flag. When the * duration (which will always be non-zero) expires, the driver must call * ieee80211_remain_on_channel_expired(). * Note that this callback may be called while the device is in IDLE and * must be accepted in this case. * This callback may sleep. * @cancel_remain_on_channel: Requests that an ongoing off-channel period is * aborted before it expires. This callback may sleep. * * @set_ringparam: Set tx and rx ring sizes. * * @get_ringparam: Get tx and rx ring current and maximum sizes. * * @tx_frames_pending: Check if there is any pending frame in the hardware * queues before entering power save. * * @set_bitrate_mask: Set a mask of rates to be used for rate control selection * when transmitting a frame. Currently only legacy rates are handled. * The callback can sleep. * @event_callback: Notify driver about any event in mac80211. See * &enum ieee80211_event_type for the different types. * The callback must be atomic. * * @release_buffered_frames: Release buffered frames according to the given * parameters. In the case where the driver buffers some frames for * sleeping stations mac80211 will use this callback to tell the driver * to release some frames, either for PS-poll or uAPSD. * Note that if the @more_data parameter is %false the driver must check * if there are more frames on the given TIDs, and if there are more than * the frames being released then it must still set the more-data bit in * the frame. If the @more_data parameter is %true, then of course the * more-data bit must always be set. * The @tids parameter tells the driver which TIDs to release frames * from, for PS-poll it will always have only a single bit set. * In the case this is used for a PS-poll initiated release, the * @num_frames parameter will always be 1 so code can be shared. In * this case the driver must also set %IEEE80211_TX_STATUS_EOSP flag * on the TX status (and must report TX status) so that the PS-poll * period is properly ended. This is used to avoid sending multiple * responses for a retried PS-poll frame. * In the case this is used for uAPSD, the @num_frames parameter may be * bigger than one, but the driver may send fewer frames (it must send * at least one, however). In this case it is also responsible for * setting the EOSP flag in the QoS header of the frames. Also, when the * service period ends, the driver must set %IEEE80211_TX_STATUS_EOSP * on the last frame in the SP. Alternatively, it may call the function * ieee80211_sta_eosp() to inform mac80211 of the end of the SP. * This callback must be atomic. * @allow_buffered_frames: Prepare device to allow the given number of frames * to go out to the given station. The frames will be sent by mac80211 * via the usual TX path after this call. The TX information for frames * released will also have the %IEEE80211_TX_CTL_NO_PS_BUFFER flag set * and the last one will also have %IEEE80211_TX_STATUS_EOSP set. In case * frames from multiple TIDs are released and the driver might reorder * them between the TIDs, it must set the %IEEE80211_TX_STATUS_EOSP flag * on the last frame and clear it on all others and also handle the EOSP * bit in the QoS header correctly. Alternatively, it can also call the * ieee80211_sta_eosp() function. * The @tids parameter is a bitmap and tells the driver which TIDs the * frames will be on; it will at most have two bits set. * This callback must be atomic. * * @get_et_sset_count: Ethtool API to get string-set count. * Note that the wiphy mutex is not held for this callback since it's * expected to return a static value. * * @get_et_stats: Ethtool API to get a set of u64 stats. * * @get_et_strings: Ethtool API to get a set of strings to describe stats * and perhaps other supported types of ethtool data-sets. * Note that the wiphy mutex is not held for this callback since it's * expected to return a static value. * * @mgd_prepare_tx: Prepare for transmitting a management frame for association * before associated. In multi-channel scenarios, a virtual interface is * bound to a channel before it is associated, but as it isn't associated * yet it need not necessarily be given airtime, in particular since any * transmission to a P2P GO needs to be synchronized against the GO's * powersave state. mac80211 will call this function before transmitting a * management frame prior to transmitting that frame to allow the driver * to give it channel time for the transmission, to get a response and be * able to synchronize with the GO. * The callback will be called before each transmission and upon return * mac80211 will transmit the frame right away. * Additional information is passed in the &struct ieee80211_prep_tx_info * data. If duration there is greater than zero, mac80211 hints to the * driver the duration for which the operation is requested. * The callback is optional and can (should!) sleep. * @mgd_complete_tx: Notify the driver that the response frame for a previously * transmitted frame announced with @mgd_prepare_tx was received, the data * is filled similarly to @mgd_prepare_tx though the duration is not used. * * @mgd_protect_tdls_discover: Protect a TDLS discovery session. After sending * a TDLS discovery-request, we expect a reply to arrive on the AP's * channel. We must stay on the channel (no PSM, scan, etc.), since a TDLS * setup-response is a direct packet not buffered by the AP. * mac80211 will call this function just before the transmission of a TDLS * discovery-request. The recommended period of protection is at least * 2 * (DTIM period). * The callback is optional and can sleep. * * @add_chanctx: Notifies device driver about new channel context creation. * This callback may sleep. * @remove_chanctx: Notifies device driver about channel context destruction. * This callback may sleep. * @change_chanctx: Notifies device driver about channel context changes that * may happen when combining different virtual interfaces on the same * channel context with different settings * This callback may sleep. * @assign_vif_chanctx: Notifies device driver about channel context being bound * to vif. Possible use is for hw queue remapping. * This callback may sleep. * @unassign_vif_chanctx: Notifies device driver about channel context being * unbound from vif. * This callback may sleep. * @switch_vif_chanctx: switch a number of vifs from one chanctx to * another, as specified in the list of * @ieee80211_vif_chanctx_switch passed to the driver, according * to the mode defined in &ieee80211_chanctx_switch_mode. * This callback may sleep. * * @start_ap: Start operation on the AP interface, this is called after all the * information in bss_conf is set and beacon can be retrieved. A channel * context is bound before this is called. Note that if the driver uses * software scan or ROC, this (and @stop_ap) isn't called when the AP is * just "paused" for scanning/ROC, which is indicated by the beacon being * disabled/enabled via @bss_info_changed. * @stop_ap: Stop operation on the AP interface. * * @reconfig_complete: Called after a call to ieee80211_restart_hw() and * during resume, when the reconfiguration has completed. * This can help the driver implement the reconfiguration step (and * indicate mac80211 is ready to receive frames). * This callback may sleep. * * @ipv6_addr_change: IPv6 address assignment on the given interface changed. * Currently, this is only called for managed or P2P client interfaces. * This callback is optional; it must not sleep. * * @channel_switch_beacon: Starts a channel switch to a new channel. * Beacons are modified to include CSA or ECSA IEs before calling this * function. The corresponding count fields in these IEs must be * decremented, and when they reach 1 the driver must call * ieee80211_csa_finish(). Drivers which use ieee80211_beacon_get() * get the csa counter decremented by mac80211, but must check if it is * 1 using ieee80211_beacon_counter_is_complete() after the beacon has been * transmitted and then call ieee80211_csa_finish(). * If the CSA count starts as zero or 1, this function will not be called, * since there won't be any time to beacon before the switch anyway. * @pre_channel_switch: This is an optional callback that is called * before a channel switch procedure is started (ie. when a STA * gets a CSA or a userspace initiated channel-switch), allowing * the driver to prepare for the channel switch. * @post_channel_switch: This is an optional callback that is called * after a channel switch procedure is completed, allowing the * driver to go back to a normal configuration. * @abort_channel_switch: This is an optional callback that is called * when channel switch procedure was aborted, allowing the * driver to go back to a normal configuration. * @channel_switch_rx_beacon: This is an optional callback that is called * when channel switch procedure is in progress and additional beacon with * CSA IE was received, allowing driver to track changes in count. * @join_ibss: Join an IBSS (on an IBSS interface); this is called after all * information in bss_conf is set up and the beacon can be retrieved. A * channel context is bound before this is called. * @leave_ibss: Leave the IBSS again. * * @get_expected_throughput: extract the expected throughput towards the * specified station. The returned value is expressed in Kbps. It returns 0 * if the RC algorithm does not have proper data to provide. * * @get_txpower: get current maximum tx power (in dBm) based on configuration * and hardware limits. * * @tdls_channel_switch: Start channel-switching with a TDLS peer. The driver * is responsible for continually initiating channel-switching operations * and returning to the base channel for communication with the AP. The * driver receives a channel-switch request template and the location of * the switch-timing IE within the template as part of the invocation. * The template is valid only within the call, and the driver can * optionally copy the skb for further re-use. * @tdls_cancel_channel_switch: Stop channel-switching with a TDLS peer. Both * peers must be on the base channel when the call completes. * @tdls_recv_channel_switch: a TDLS channel-switch related frame (request or * response) has been received from a remote peer. The driver gets * parameters parsed from the incoming frame and may use them to continue * an ongoing channel-switch operation. In addition, a channel-switch * response template is provided, together with the location of the * switch-timing IE within the template. The skb can only be used within * the function call. * * @wake_tx_queue: Called when new packets have been added to the queue. * @sync_rx_queues: Process all pending frames in RSS queues. This is a * synchronization which is needed in case driver has in its RSS queues * pending frames that were received prior to the control path action * currently taken (e.g. disassociation) but are not processed yet. * * @start_nan: join an existing NAN cluster, or create a new one. * @stop_nan: leave the NAN cluster. * @nan_change_conf: change NAN configuration. The data in cfg80211_nan_conf * contains full new configuration and changes specify which parameters * are changed with respect to the last NAN config. * The driver gets both full configuration and the changed parameters since * some devices may need the full configuration while others need only the * changed parameters. * @add_nan_func: Add a NAN function. Returns 0 on success. The data in * cfg80211_nan_func must not be referenced outside the scope of * this call. * @del_nan_func: Remove a NAN function. The driver must call * ieee80211_nan_func_terminated() with * NL80211_NAN_FUNC_TERM_REASON_USER_REQUEST reason code upon removal. * @can_aggregate_in_amsdu: Called in order to determine if HW supports * aggregating two specific frames in the same A-MSDU. The relation * between the skbs should be symmetric and transitive. Note that while * skb is always a real frame, head may or may not be an A-MSDU. * @get_ftm_responder_stats: Retrieve FTM responder statistics, if available. * Statistics should be cumulative, currently no way to reset is provided. * * @start_pmsr: start peer measurement (e.g. FTM) (this call can sleep) * @abort_pmsr: abort peer measurement (this call can sleep) * @set_tid_config: Apply TID specific configurations. This callback may sleep. * @reset_tid_config: Reset TID specific configuration for the peer. * This callback may sleep. * @update_vif_offload: Update virtual interface offload flags * This callback may sleep. * @sta_set_4addr: Called to notify the driver when a station starts/stops using * 4-address mode * @set_sar_specs: Update the SAR (TX power) settings. * @sta_set_decap_offload: Called to notify the driver when a station is allowed * to use rx decapsulation offload * @add_twt_setup: Update hw with TWT agreement parameters received from the peer. * This callback allows the hw to check if requested parameters * are supported and if there is enough room for a new agreement. * The hw is expected to set agreement result in the req_type field of * twt structure. * @twt_teardown_request: Update the hw with TWT teardown request received * from the peer. * @set_radar_background: Configure dedicated offchannel chain available for * radar/CAC detection on some hw. This chain can't be used to transmit * or receive frames and it is bounded to a running wdev. * Background radar/CAC detection allows to avoid the CAC downtime * switching to a different channel during CAC detection on the selected * radar channel. * The caller is expected to set chandef pointer to NULL in order to * disable background CAC/radar detection. * @net_fill_forward_path: Called from .ndo_fill_forward_path in order to * resolve a path for hardware flow offloading * @can_activate_links: Checks if a specific active_links bitmap is * supported by the driver. * @change_vif_links: Change the valid links on an interface, note that while * removing the old link information is still valid (link_conf pointer), * but may immediately disappear after the function returns. The old or * new links bitmaps may be 0 if going from/to a non-MLO situation. * The @old array contains pointers to the old bss_conf structures * that were already removed, in case they're needed. * This callback can sleep. * @change_sta_links: Change the valid links of a station, similar to * @change_vif_links. This callback can sleep. * Note that a sta can also be inserted or removed with valid links, * i.e. passed to @sta_add/@sta_state with sta->valid_links not zero. * In fact, cannot change from having valid_links and not having them. * @set_hw_timestamp: Enable/disable HW timestamping of TM/FTM frames. This is * not restored at HW reset by mac80211 so drivers need to take care of * that. * @net_setup_tc: Called from .ndo_setup_tc in order to prepare hardware * flow offloading for flows originating from the vif. * Note that the driver must not assume that the vif driver_data is valid * at this point, since the callback can be called during netdev teardown. * @can_neg_ttlm: for managed interface, requests the driver to determine * if the requested TID-To-Link mapping can be accepted or not. * If it's not accepted the driver may suggest a preferred mapping and * modify @ttlm parameter with the suggested TID-to-Link mapping. * @prep_add_interface: prepare for interface addition. This can be used by * drivers to prepare for the addition of a new interface, e.g., allocate * the needed resources etc. This callback doesn't guarantee that an * interface with the specified type would be added, and thus drivers that * implement this callback need to handle such cases. The type is the full * &enum nl80211_iftype. */ struct ieee80211_ops { void (*tx)(struct ieee80211_hw *hw, struct ieee80211_tx_control *control, struct sk_buff *skb); int (*start)(struct ieee80211_hw *hw); void (*stop)(struct ieee80211_hw *hw, bool suspend); #ifdef CONFIG_PM int (*suspend)(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan); int (*resume)(struct ieee80211_hw *hw); void (*set_wakeup)(struct ieee80211_hw *hw, bool enabled); #endif int (*add_interface)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); int (*change_interface)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, enum nl80211_iftype new_type, bool p2p); void (*remove_interface)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); int (*config)(struct ieee80211_hw *hw, u32 changed); void (*bss_info_changed)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *info, u64 changed); void (*vif_cfg_changed)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u64 changed); void (*link_info_changed)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *info, u64 changed); int (*start_ap)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf); void (*stop_ap)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf); u64 (*prepare_multicast)(struct ieee80211_hw *hw, struct netdev_hw_addr_list *mc_list); void (*configure_filter)(struct ieee80211_hw *hw, unsigned int changed_flags, unsigned int *total_flags, u64 multicast); void (*config_iface_filter)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, unsigned int filter_flags, unsigned int changed_flags); int (*set_tim)(struct ieee80211_hw *hw, struct ieee80211_sta *sta, bool set); int (*set_key)(struct ieee80211_hw *hw, enum set_key_cmd cmd, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct ieee80211_key_conf *key); void (*update_tkip_key)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_key_conf *conf, struct ieee80211_sta *sta, u32 iv32, u16 *phase1key); void (*set_rekey_data)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_gtk_rekey_data *data); void (*set_default_unicast_key)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, int idx); int (*hw_scan)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_scan_request *req); void (*cancel_hw_scan)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); int (*sched_scan_start)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_sched_scan_request *req, struct ieee80211_scan_ies *ies); int (*sched_scan_stop)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); void (*sw_scan_start)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, const u8 *mac_addr); void (*sw_scan_complete)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); int (*get_stats)(struct ieee80211_hw *hw, struct ieee80211_low_level_stats *stats); void (*get_key_seq)(struct ieee80211_hw *hw, struct ieee80211_key_conf *key, struct ieee80211_key_seq *seq); int (*set_frag_threshold)(struct ieee80211_hw *hw, u32 value); int (*set_rts_threshold)(struct ieee80211_hw *hw, u32 value); int (*sta_add)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); int (*sta_remove)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); #ifdef CONFIG_MAC80211_DEBUGFS void (*vif_add_debugfs)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); void (*link_add_debugfs)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf, struct dentry *dir); void (*sta_add_debugfs)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct dentry *dir); void (*link_sta_add_debugfs)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_link_sta *link_sta, struct dentry *dir); #endif void (*sta_notify)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, enum sta_notify_cmd, struct ieee80211_sta *sta); int (*sta_set_txpwr)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); int (*sta_state)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, enum ieee80211_sta_state old_state, enum ieee80211_sta_state new_state); void (*sta_pre_rcu_remove)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); void (*link_sta_rc_update)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_link_sta *link_sta, u32 changed); void (*sta_rate_tbl_update)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); void (*sta_statistics)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct station_info *sinfo); int (*conf_tx)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, unsigned int link_id, u16 ac, const struct ieee80211_tx_queue_params *params); u64 (*get_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); void (*set_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u64 tsf); void (*offset_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, s64 offset); void (*reset_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); int (*tx_last_beacon)(struct ieee80211_hw *hw); /** * @ampdu_action: * Perform a certain A-MPDU action. * The RA/TID combination determines the destination and TID we want * the ampdu action to be performed for. The action is defined through * ieee80211_ampdu_mlme_action. * When the action is set to %IEEE80211_AMPDU_TX_OPERATIONAL the driver * may neither send aggregates containing more subframes than @buf_size * nor send aggregates in a way that lost frames would exceed the * buffer size. If just limiting the aggregate size, this would be * possible with a buf_size of 8: * * - ``TX: 1.....7`` * - ``RX: 2....7`` (lost frame #1) * - ``TX: 8..1...`` * * which is invalid since #1 was now re-transmitted well past the * buffer size of 8. Correct ways to retransmit #1 would be: * * - ``TX: 1 or`` * - ``TX: 18 or`` * - ``TX: 81`` * * Even ``189`` would be wrong since 1 could be lost again. * * Returns a negative error code on failure. The driver may return * %IEEE80211_AMPDU_TX_START_IMMEDIATE for %IEEE80211_AMPDU_TX_START * if the session can start immediately. * * The callback can sleep. */ int (*ampdu_action)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_ampdu_params *params); int (*get_survey)(struct ieee80211_hw *hw, int idx, struct survey_info *survey); void (*rfkill_poll)(struct ieee80211_hw *hw); void (*set_coverage_class)(struct ieee80211_hw *hw, s16 coverage_class); #ifdef CONFIG_NL80211_TESTMODE int (*testmode_cmd)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, void *data, int len); int (*testmode_dump)(struct ieee80211_hw *hw, struct sk_buff *skb, struct netlink_callback *cb, void *data, int len); #endif void (*flush)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u32 queues, bool drop); void (*flush_sta)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); void (*channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_channel_switch *ch_switch); int (*set_antenna)(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant); int (*get_antenna)(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant); int (*remain_on_channel)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_channel *chan, int duration, enum ieee80211_roc_type type); int (*cancel_remain_on_channel)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); int (*set_ringparam)(struct ieee80211_hw *hw, u32 tx, u32 rx); void (*get_ringparam)(struct ieee80211_hw *hw, u32 *tx, u32 *tx_max, u32 *rx, u32 *rx_max); bool (*tx_frames_pending)(struct ieee80211_hw *hw); int (*set_bitrate_mask)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, const struct cfg80211_bitrate_mask *mask); void (*event_callback)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, const struct ieee80211_event *event); void (*allow_buffered_frames)(struct ieee80211_hw *hw, struct ieee80211_sta *sta, u16 tids, int num_frames, enum ieee80211_frame_release_type reason, bool more_data); void (*release_buffered_frames)(struct ieee80211_hw *hw, struct ieee80211_sta *sta, u16 tids, int num_frames, enum ieee80211_frame_release_type reason, bool more_data); int (*get_et_sset_count)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, int sset); void (*get_et_stats)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ethtool_stats *stats, u64 *data); void (*get_et_strings)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u32 sset, u8 *data); void (*mgd_prepare_tx)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_prep_tx_info *info); void (*mgd_complete_tx)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_prep_tx_info *info); void (*mgd_protect_tdls_discover)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, unsigned int link_id); int (*add_chanctx)(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *ctx); void (*remove_chanctx)(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *ctx); void (*change_chanctx)(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *ctx, u32 changed); int (*assign_vif_chanctx)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf, struct ieee80211_chanctx_conf *ctx); void (*unassign_vif_chanctx)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf, struct ieee80211_chanctx_conf *ctx); int (*switch_vif_chanctx)(struct ieee80211_hw *hw, struct ieee80211_vif_chanctx_switch *vifs, int n_vifs, enum ieee80211_chanctx_switch_mode mode); void (*reconfig_complete)(struct ieee80211_hw *hw, enum ieee80211_reconfig_type reconfig_type); #if IS_ENABLED(CONFIG_IPV6) void (*ipv6_addr_change)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct inet6_dev *idev); #endif void (*channel_switch_beacon)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_chan_def *chandef); int (*pre_channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_channel_switch *ch_switch); int (*post_channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf); void (*abort_channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf); void (*channel_switch_rx_beacon)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_channel_switch *ch_switch); int (*join_ibss)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); void (*leave_ibss)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); u32 (*get_expected_throughput)(struct ieee80211_hw *hw, struct ieee80211_sta *sta); int (*get_txpower)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, unsigned int link_id, int *dbm); int (*tdls_channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, u8 oper_class, struct cfg80211_chan_def *chandef, struct sk_buff *tmpl_skb, u32 ch_sw_tm_ie); void (*tdls_cancel_channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); void (*tdls_recv_channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_tdls_ch_sw_params *params); void (*wake_tx_queue)(struct ieee80211_hw *hw, struct ieee80211_txq *txq); void (*sync_rx_queues)(struct ieee80211_hw *hw); int (*start_nan)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_nan_conf *conf); int (*stop_nan)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); int (*nan_change_conf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_nan_conf *conf, u32 changes); int (*add_nan_func)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, const struct cfg80211_nan_func *nan_func); void (*del_nan_func)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u8 instance_id); bool (*can_aggregate_in_amsdu)(struct ieee80211_hw *hw, struct sk_buff *head, struct sk_buff *skb); int (*get_ftm_responder_stats)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_ftm_responder_stats *ftm_stats); int (*start_pmsr)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_pmsr_request *request); void (*abort_pmsr)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_pmsr_request *request); int (*set_tid_config)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct cfg80211_tid_config *tid_conf); int (*reset_tid_config)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, u8 tids); void (*update_vif_offload)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); void (*sta_set_4addr)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, bool enabled); int (*set_sar_specs)(struct ieee80211_hw *hw, const struct cfg80211_sar_specs *sar); void (*sta_set_decap_offload)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, bool enabled); void (*add_twt_setup)(struct ieee80211_hw *hw, struct ieee80211_sta *sta, struct ieee80211_twt_setup *twt); void (*twt_teardown_request)(struct ieee80211_hw *hw, struct ieee80211_sta *sta, u8 flowid); int (*set_radar_background)(struct ieee80211_hw *hw, struct cfg80211_chan_def *chandef); int (*net_fill_forward_path)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct net_device_path_ctx *ctx, struct net_device_path *path); bool (*can_activate_links)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u16 active_links); int (*change_vif_links)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u16 old_links, u16 new_links, struct ieee80211_bss_conf *old[IEEE80211_MLD_MAX_NUM_LINKS]); int (*change_sta_links)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, u16 old_links, u16 new_links); int (*set_hw_timestamp)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_set_hw_timestamp *hwts); int (*net_setup_tc)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct net_device *dev, enum tc_setup_type type, void *type_data); enum ieee80211_neg_ttlm_res (*can_neg_ttlm)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_neg_ttlm *ttlm); void (*prep_add_interface)(struct ieee80211_hw *hw, enum nl80211_iftype type); }; /** * ieee80211_alloc_hw_nm - Allocate a new hardware device * * This must be called once for each hardware device. The returned pointer * must be used to refer to this device when calling other functions. * mac80211 allocates a private data area for the driver pointed to by * @priv in &struct ieee80211_hw, the size of this area is given as * @priv_data_len. * * @priv_data_len: length of private data * @ops: callbacks for this device * @requested_name: Requested name for this device. * NULL is valid value, and means use the default naming (phy%d) * * Return: A pointer to the new hardware device, or %NULL on error. */ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, const struct ieee80211_ops *ops, const char *requested_name); /** * ieee80211_alloc_hw - Allocate a new hardware device * * This must be called once for each hardware device. The returned pointer * must be used to refer to this device when calling other functions. * mac80211 allocates a private data area for the driver pointed to by * @priv in &struct ieee80211_hw, the size of this area is given as * @priv_data_len. * * @priv_data_len: length of private data * @ops: callbacks for this device * * Return: A pointer to the new hardware device, or %NULL on error. */ static inline struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, const struct ieee80211_ops *ops) { return ieee80211_alloc_hw_nm(priv_data_len, ops, NULL); } /** * ieee80211_register_hw - Register hardware device * * You must call this function before any other functions in * mac80211. Note that before a hardware can be registered, you * need to fill the contained wiphy's information. * * @hw: the device to register as returned by ieee80211_alloc_hw() * * Return: 0 on success. An error code otherwise. */ int ieee80211_register_hw(struct ieee80211_hw *hw); /** * struct ieee80211_tpt_blink - throughput blink description * @throughput: throughput in Kbit/sec * @blink_time: blink time in milliseconds * (full cycle, ie. one off + one on period) */ struct ieee80211_tpt_blink { int throughput; int blink_time; }; /** * enum ieee80211_tpt_led_trigger_flags - throughput trigger flags * @IEEE80211_TPT_LEDTRIG_FL_RADIO: enable blinking with radio * @IEEE80211_TPT_LEDTRIG_FL_WORK: enable blinking when working * @IEEE80211_TPT_LEDTRIG_FL_CONNECTED: enable blinking when at least one * interface is connected in some way, including being an AP */ enum ieee80211_tpt_led_trigger_flags { IEEE80211_TPT_LEDTRIG_FL_RADIO = BIT(0), IEEE80211_TPT_LEDTRIG_FL_WORK = BIT(1), IEEE80211_TPT_LEDTRIG_FL_CONNECTED = BIT(2), }; #ifdef CONFIG_MAC80211_LEDS const char *__ieee80211_get_tx_led_name(struct ieee80211_hw *hw); const char *__ieee80211_get_rx_led_name(struct ieee80211_hw *hw); const char *__ieee80211_get_assoc_led_name(struct ieee80211_hw *hw); const char *__ieee80211_get_radio_led_name(struct ieee80211_hw *hw); const char * __ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw, unsigned int flags, const struct ieee80211_tpt_blink *blink_table, unsigned int blink_table_len); #endif /** * ieee80211_get_tx_led_name - get name of TX LED * * mac80211 creates a transmit LED trigger for each wireless hardware * that can be used to drive LEDs if your driver registers a LED device. * This function returns the name (or %NULL if not configured for LEDs) * of the trigger so you can automatically link the LED device. * * @hw: the hardware to get the LED trigger name for * * Return: The name of the LED trigger. %NULL if not configured for LEDs. */ static inline const char *ieee80211_get_tx_led_name(struct ieee80211_hw *hw) { #ifdef CONFIG_MAC80211_LEDS return __ieee80211_get_tx_led_name(hw); #else return NULL; #endif } /** * ieee80211_get_rx_led_name - get name of RX LED * * mac80211 creates a receive LED trigger for each wireless hardware * that can be used to drive LEDs if your driver registers a LED device. * This function returns the name (or %NULL if not configured for LEDs) * of the trigger so you can automatically link the LED device. * * @hw: the hardware to get the LED trigger name for * * Return: The name of the LED trigger. %NULL if not configured for LEDs. */ static inline const char *ieee80211_get_rx_led_name(struct ieee80211_hw *hw) { #ifdef CONFIG_MAC80211_LEDS return __ieee80211_get_rx_led_name(hw); #else return NULL; #endif } /** * ieee80211_get_assoc_led_name - get name of association LED * * mac80211 creates a association LED trigger for each wireless hardware * that can be used to drive LEDs if your driver registers a LED device. * This function returns the name (or %NULL if not configured for LEDs) * of the trigger so you can automatically link the LED device. * * @hw: the hardware to get the LED trigger name for * * Return: The name of the LED trigger. %NULL if not configured for LEDs. */ static inline const char *ieee80211_get_assoc_led_name(struct ieee80211_hw *hw) { #ifdef CONFIG_MAC80211_LEDS return __ieee80211_get_assoc_led_name(hw); #else return NULL; #endif } /** * ieee80211_get_radio_led_name - get name of radio LED * * mac80211 creates a radio change LED trigger for each wireless hardware * that can be used to drive LEDs if your driver registers a LED device. * This function returns the name (or %NULL if not configured for LEDs) * of the trigger so you can automatically link the LED device. * * @hw: the hardware to get the LED trigger name for * * Return: The name of the LED trigger. %NULL if not configured for LEDs. */ static inline const char *ieee80211_get_radio_led_name(struct ieee80211_hw *hw) { #ifdef CONFIG_MAC80211_LEDS return __ieee80211_get_radio_led_name(hw); #else return NULL; #endif } /** * ieee80211_create_tpt_led_trigger - create throughput LED trigger * @hw: the hardware to create the trigger for * @flags: trigger flags, see &enum ieee80211_tpt_led_trigger_flags * @blink_table: the blink table -- needs to be ordered by throughput * @blink_table_len: size of the blink table * * Return: %NULL (in case of error, or if no LED triggers are * configured) or the name of the new trigger. * * Note: This function must be called before ieee80211_register_hw(). */ static inline const char * ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw, unsigned int flags, const struct ieee80211_tpt_blink *blink_table, unsigned int blink_table_len) { #ifdef CONFIG_MAC80211_LEDS return __ieee80211_create_tpt_led_trigger(hw, flags, blink_table, blink_table_len); #else return NULL; #endif } /** * ieee80211_unregister_hw - Unregister a hardware device * * This function instructs mac80211 to free allocated resources * and unregister netdevices from the networking subsystem. * * @hw: the hardware to unregister */ void ieee80211_unregister_hw(struct ieee80211_hw *hw); /** * ieee80211_free_hw - free hardware descriptor * * This function frees everything that was allocated, including the * private data for the driver. You must call ieee80211_unregister_hw() * before calling this function. * * @hw: the hardware to free */ void ieee80211_free_hw(struct ieee80211_hw *hw); /** * ieee80211_restart_hw - restart hardware completely * * Call this function when the hardware was restarted for some reason * (hardware error, ...) and the driver is unable to restore its state * by itself. mac80211 assumes that at this point the driver/hardware * is completely uninitialised and stopped, it starts the process by * calling the ->start() operation. The driver will need to reset all * internal state that it has prior to calling this function. * * @hw: the hardware to restart */ void ieee80211_restart_hw(struct ieee80211_hw *hw); /** * ieee80211_rx_list - receive frame and store processed skbs in a list * * Use this function to hand received frames to mac80211. The receive * buffer in @skb must start with an IEEE 802.11 header. In case of a * paged @skb is used, the driver is recommended to put the ieee80211 * header of the frame on the linear part of the @skb to avoid memory * allocation and/or memcpy by the stack. * * This function may not be called in IRQ context. Calls to this function * for a single hardware must be synchronized against each other. Calls to * this function, ieee80211_rx_ni() and ieee80211_rx_irqsafe() may not be * mixed for a single hardware. Must not run concurrently with * ieee80211_tx_status_skb() or ieee80211_tx_status_ni(). * * This function must be called with BHs disabled and RCU read lock * * @hw: the hardware this frame came in on * @sta: the station the frame was received from, or %NULL * @skb: the buffer to receive, owned by mac80211 after this call * @list: the destination list */ void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *sta, struct sk_buff *skb, struct list_head *list); /** * ieee80211_rx_napi - receive frame from NAPI context * * Use this function to hand received frames to mac80211. The receive * buffer in @skb must start with an IEEE 802.11 header. In case of a * paged @skb is used, the driver is recommended to put the ieee80211 * header of the frame on the linear part of the @skb to avoid memory * allocation and/or memcpy by the stack. * * This function may not be called in IRQ context. Calls to this function * for a single hardware must be synchronized against each other. Calls to * this function, ieee80211_rx_ni() and ieee80211_rx_irqsafe() may not be * mixed for a single hardware. Must not run concurrently with * ieee80211_tx_status_skb() or ieee80211_tx_status_ni(). * * This function must be called with BHs disabled. * * @hw: the hardware this frame came in on * @sta: the station the frame was received from, or %NULL * @skb: the buffer to receive, owned by mac80211 after this call * @napi: the NAPI context */ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *sta, struct sk_buff *skb, struct napi_struct *napi); /** * ieee80211_rx - receive frame * * Use this function to hand received frames to mac80211. The receive * buffer in @skb must start with an IEEE 802.11 header. In case of a * paged @skb is used, the driver is recommended to put the ieee80211 * header of the frame on the linear part of the @skb to avoid memory * allocation and/or memcpy by the stack. * * This function may not be called in IRQ context. Calls to this function * for a single hardware must be synchronized against each other. Calls to * this function, ieee80211_rx_ni() and ieee80211_rx_irqsafe() may not be * mixed for a single hardware. Must not run concurrently with * ieee80211_tx_status_skb() or ieee80211_tx_status_ni(). * * In process context use instead ieee80211_rx_ni(). * * @hw: the hardware this frame came in on * @skb: the buffer to receive, owned by mac80211 after this call */ static inline void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb) { ieee80211_rx_napi(hw, NULL, skb, NULL); } /** * ieee80211_rx_irqsafe - receive frame * * Like ieee80211_rx() but can be called in IRQ context * (internally defers to a tasklet.) * * Calls to this function, ieee80211_rx() or ieee80211_rx_ni() may not * be mixed for a single hardware.Must not run concurrently with * ieee80211_tx_status_skb() or ieee80211_tx_status_ni(). * * @hw: the hardware this frame came in on * @skb: the buffer to receive, owned by mac80211 after this call */ void ieee80211_rx_irqsafe(struct ieee80211_hw *hw, struct sk_buff *skb); /** * ieee80211_rx_ni - receive frame (in process context) * * Like ieee80211_rx() but can be called in process context * (internally disables bottom halves). * * Calls to this function, ieee80211_rx() and ieee80211_rx_irqsafe() may * not be mixed for a single hardware. Must not run concurrently with * ieee80211_tx_status_skb() or ieee80211_tx_status_ni(). * * @hw: the hardware this frame came in on * @skb: the buffer to receive, owned by mac80211 after this call */ static inline void ieee80211_rx_ni(struct ieee80211_hw *hw, struct sk_buff *skb) { local_bh_disable(); ieee80211_rx(hw, skb); local_bh_enable(); } /** * ieee80211_sta_ps_transition - PS transition for connected sta * * When operating in AP mode with the %IEEE80211_HW_AP_LINK_PS * flag set, use this function to inform mac80211 about a connected station * entering/leaving PS mode. * * This function may not be called in IRQ context or with softirqs enabled. * * Calls to this function for a single hardware must be synchronized against * each other. * * @sta: currently connected sta * @start: start or stop PS * * Return: 0 on success. -EINVAL when the requested PS mode is already set. */ int ieee80211_sta_ps_transition(struct ieee80211_sta *sta, bool start); /** * ieee80211_sta_ps_transition_ni - PS transition for connected sta * (in process context) * * Like ieee80211_sta_ps_transition() but can be called in process context * (internally disables bottom halves). Concurrent call restriction still * applies. * * @sta: currently connected sta * @start: start or stop PS * * Return: Like ieee80211_sta_ps_transition(). */ static inline int ieee80211_sta_ps_transition_ni(struct ieee80211_sta *sta, bool start) { int ret; local_bh_disable(); ret = ieee80211_sta_ps_transition(sta, start); local_bh_enable(); return ret; } /** * ieee80211_sta_pspoll - PS-Poll frame received * @sta: currently connected station * * When operating in AP mode with the %IEEE80211_HW_AP_LINK_PS flag set, * use this function to inform mac80211 that a PS-Poll frame from a * connected station was received. * This must be used in conjunction with ieee80211_sta_ps_transition() * and possibly ieee80211_sta_uapsd_trigger(); calls to all three must * be serialized. */ void ieee80211_sta_pspoll(struct ieee80211_sta *sta); /** * ieee80211_sta_uapsd_trigger - (potential) U-APSD trigger frame received * @sta: currently connected station * @tid: TID of the received (potential) trigger frame * * When operating in AP mode with the %IEEE80211_HW_AP_LINK_PS flag set, * use this function to inform mac80211 that a (potential) trigger frame * from a connected station was received. * This must be used in conjunction with ieee80211_sta_ps_transition() * and possibly ieee80211_sta_pspoll(); calls to all three must be * serialized. * %IEEE80211_NUM_TIDS can be passed as the tid if the tid is unknown. * In this case, mac80211 will not check that this tid maps to an AC * that is trigger enabled and assume that the caller did the proper * checks. */ void ieee80211_sta_uapsd_trigger(struct ieee80211_sta *sta, u8 tid); /* * The TX headroom reserved by mac80211 for its own tx_status functions. * This is enough for the radiotap header. */ #define IEEE80211_TX_STATUS_HEADROOM ALIGN(14, 4) /** * ieee80211_sta_set_buffered - inform mac80211 about driver-buffered frames * @sta: &struct ieee80211_sta pointer for the sleeping station * @tid: the TID that has buffered frames * @buffered: indicates whether or not frames are buffered for this TID * * If a driver buffers frames for a powersave station instead of passing * them back to mac80211 for retransmission, the station may still need * to be told that there are buffered frames via the TIM bit. * * This function informs mac80211 whether or not there are frames that are * buffered in the driver for a given TID; mac80211 can then use this data * to set the TIM bit (NOTE: This may call back into the driver's set_tim * call! Beware of the locking!) * * If all frames are released to the station (due to PS-poll or uAPSD) * then the driver needs to inform mac80211 that there no longer are * frames buffered. However, when the station wakes up mac80211 assumes * that all buffered frames will be transmitted and clears this data, * drivers need to make sure they inform mac80211 about all buffered * frames on the sleep transition (sta_notify() with %STA_NOTIFY_SLEEP). * * Note that technically mac80211 only needs to know this per AC, not per * TID, but since driver buffering will inevitably happen per TID (since * it is related to aggregation) it is easier to make mac80211 map the * TID to the AC as required instead of keeping track in all drivers that * use this API. */ void ieee80211_sta_set_buffered(struct ieee80211_sta *sta, u8 tid, bool buffered); /** * ieee80211_get_tx_rates - get the selected transmit rates for a packet * * Call this function in a driver with per-packet rate selection support * to combine the rate info in the packet tx info with the most recent * rate selection table for the station entry. * * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @sta: the receiver station to which this packet is sent. * @skb: the frame to be transmitted. * @dest: buffer for extracted rate/retry information * @max_rates: maximum number of rates to fetch */ void ieee80211_get_tx_rates(struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct sk_buff *skb, struct ieee80211_tx_rate *dest, int max_rates); /** * ieee80211_sta_set_expected_throughput - set the expected tpt for a station * * Call this function to notify mac80211 about a change in expected throughput * to a station. A driver for a device that does rate control in firmware can * call this function when the expected throughput estimate towards a station * changes. The information is used to tune the CoDel AQM applied to traffic * going towards that station (which can otherwise be too aggressive and cause * slow stations to starve). * * @pubsta: the station to set throughput for. * @thr: the current expected throughput in kbps. */ void ieee80211_sta_set_expected_throughput(struct ieee80211_sta *pubsta, u32 thr); /** * ieee80211_tx_rate_update - transmit rate update callback * * Drivers should call this functions with a non-NULL pub sta * This function can be used in drivers that does not have provision * in updating the tx rate in data path. * * @hw: the hardware the frame was transmitted by * @pubsta: the station to update the tx rate for. * @info: tx status information */ void ieee80211_tx_rate_update(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, struct ieee80211_tx_info *info); /** * ieee80211_tx_status_skb - transmit status callback * * Call this function for all transmitted frames after they have been * transmitted. It is permissible to not call this function for * multicast frames but this can affect statistics. * * This function may not be called in IRQ context. Calls to this function * for a single hardware must be synchronized against each other. Calls * to this function, ieee80211_tx_status_ni() and ieee80211_tx_status_irqsafe() * may not be mixed for a single hardware. Must not run concurrently with * ieee80211_rx() or ieee80211_rx_ni(). * * @hw: the hardware the frame was transmitted by * @skb: the frame that was transmitted, owned by mac80211 after this call */ void ieee80211_tx_status_skb(struct ieee80211_hw *hw, struct sk_buff *skb); /** * ieee80211_tx_status_ext - extended transmit status callback * * This function can be used as a replacement for ieee80211_tx_status_skb() * in drivers that may want to provide extra information that does not * fit into &struct ieee80211_tx_info. * * Calls to this function for a single hardware must be synchronized * against each other. Calls to this function, ieee80211_tx_status_ni() * and ieee80211_tx_status_irqsafe() may not be mixed for a single hardware. * * @hw: the hardware the frame was transmitted by * @status: tx status information */ void ieee80211_tx_status_ext(struct ieee80211_hw *hw, struct ieee80211_tx_status *status); /** * ieee80211_tx_status_noskb - transmit status callback without skb * * This function can be used as a replacement for ieee80211_tx_status_skb() * in drivers that cannot reliably map tx status information back to * specific skbs. * * Calls to this function for a single hardware must be synchronized * against each other. Calls to this function, ieee80211_tx_status_ni() * and ieee80211_tx_status_irqsafe() may not be mixed for a single hardware. * * @hw: the hardware the frame was transmitted by * @sta: the receiver station to which this packet is sent * (NULL for multicast packets) * @info: tx status information */ static inline void ieee80211_tx_status_noskb(struct ieee80211_hw *hw, struct ieee80211_sta *sta, struct ieee80211_tx_info *info) { struct ieee80211_tx_status status = { .sta = sta, .info = info, }; ieee80211_tx_status_ext(hw, &status); } /** * ieee80211_tx_status_ni - transmit status callback (in process context) * * Like ieee80211_tx_status_skb() but can be called in process context. * * Calls to this function, ieee80211_tx_status_skb() and * ieee80211_tx_status_irqsafe() may not be mixed * for a single hardware. * * @hw: the hardware the frame was transmitted by * @skb: the frame that was transmitted, owned by mac80211 after this call */ static inline void ieee80211_tx_status_ni(struct ieee80211_hw *hw, struct sk_buff *skb) { local_bh_disable(); ieee80211_tx_status_skb(hw, skb); local_bh_enable(); } /** * ieee80211_tx_status_irqsafe - IRQ-safe transmit status callback * * Like ieee80211_tx_status_skb() but can be called in IRQ context * (internally defers to a tasklet.) * * Calls to this function, ieee80211_tx_status_skb() and * ieee80211_tx_status_ni() may not be mixed for a single hardware. * * @hw: the hardware the frame was transmitted by * @skb: the frame that was transmitted, owned by mac80211 after this call */ void ieee80211_tx_status_irqsafe(struct ieee80211_hw *hw, struct sk_buff *skb); /** * ieee80211_report_low_ack - report non-responding station * * When operating in AP-mode, call this function to report a non-responding * connected STA. * * @sta: the non-responding connected sta * @num_packets: number of packets sent to @sta without a response */ void ieee80211_report_low_ack(struct ieee80211_sta *sta, u32 num_packets); #define IEEE80211_MAX_CNTDWN_COUNTERS_NUM 2 /** * struct ieee80211_mutable_offsets - mutable beacon offsets * @tim_offset: position of TIM element * @tim_length: size of TIM element * @cntdwn_counter_offs: array of IEEE80211_MAX_CNTDWN_COUNTERS_NUM offsets * to countdown counters. This array can contain zero values which * should be ignored. * @mbssid_off: position of the multiple bssid element */ struct ieee80211_mutable_offsets { u16 tim_offset; u16 tim_length; u16 cntdwn_counter_offs[IEEE80211_MAX_CNTDWN_COUNTERS_NUM]; u16 mbssid_off; }; /** * ieee80211_beacon_get_template - beacon template generation function * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @offs: &struct ieee80211_mutable_offsets pointer to struct that will * receive the offsets that may be updated by the driver. * @link_id: the link id to which the beacon belongs (or 0 for an AP STA * that is not associated with AP MLD). * * If the driver implements beaconing modes, it must use this function to * obtain the beacon template. * * This function should be used if the beacon frames are generated by the * device, and then the driver must use the returned beacon as the template * The driver or the device are responsible to update the DTIM and, when * applicable, the CSA count. * * The driver is responsible for freeing the returned skb. * * Return: The beacon template. %NULL on error. */ struct sk_buff * ieee80211_beacon_get_template(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_mutable_offsets *offs, unsigned int link_id); /** * ieee80211_beacon_get_template_ema_index - EMA beacon template generation * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @offs: &struct ieee80211_mutable_offsets pointer to struct that will * receive the offsets that may be updated by the driver. * @link_id: the link id to which the beacon belongs (or 0 for a non-MLD AP). * @ema_index: index of the beacon in the EMA set. * * This function follows the same rules as ieee80211_beacon_get_template() * but returns a beacon template which includes multiple BSSID element at the * requested index. * * Return: The beacon template. %NULL indicates the end of EMA templates. */ struct sk_buff * ieee80211_beacon_get_template_ema_index(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_mutable_offsets *offs, unsigned int link_id, u8 ema_index); /** * struct ieee80211_ema_beacons - List of EMA beacons * @cnt: count of EMA beacons. * * @bcn: array of EMA beacons. * @bcn.skb: the skb containing this specific beacon * @bcn.offs: &struct ieee80211_mutable_offsets pointer to struct that will * receive the offsets that may be updated by the driver. */ struct ieee80211_ema_beacons { u8 cnt; struct { struct sk_buff *skb; struct ieee80211_mutable_offsets offs; } bcn[]; }; /** * ieee80211_beacon_get_template_ema_list - EMA beacon template generation * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @link_id: the link id to which the beacon belongs (or 0 for a non-MLD AP) * * This function follows the same rules as ieee80211_beacon_get_template() * but allocates and returns a pointer to list of all beacon templates required * to cover all profiles in the multiple BSSID set. Each template includes only * one multiple BSSID element. * * Driver must call ieee80211_beacon_free_ema_list() to free the memory. * * Return: EMA beacon templates of type struct ieee80211_ema_beacons *. * %NULL on error. */ struct ieee80211_ema_beacons * ieee80211_beacon_get_template_ema_list(struct ieee80211_hw *hw, struct ieee80211_vif *vif, unsigned int link_id); /** * ieee80211_beacon_free_ema_list - free an EMA beacon template list * @ema_beacons: list of EMA beacons of type &struct ieee80211_ema_beacons pointers. * * This function will free a list previously acquired by calling * ieee80211_beacon_get_template_ema_list() */ void ieee80211_beacon_free_ema_list(struct ieee80211_ema_beacons *ema_beacons); /** * ieee80211_beacon_get_tim - beacon generation function * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @tim_offset: pointer to variable that will receive the TIM IE offset. * Set to 0 if invalid (in non-AP modes). * @tim_length: pointer to variable that will receive the TIM IE length, * (including the ID and length bytes!). * Set to 0 if invalid (in non-AP modes). * @link_id: the link id to which the beacon belongs (or 0 for an AP STA * that is not associated with AP MLD). * * If the driver implements beaconing modes, it must use this function to * obtain the beacon frame. * * If the beacon frames are generated by the host system (i.e., not in * hardware/firmware), the driver uses this function to get each beacon * frame from mac80211 -- it is responsible for calling this function exactly * once before the beacon is needed (e.g. based on hardware interrupt). * * The driver is responsible for freeing the returned skb. * * Return: The beacon template. %NULL on error. */ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u16 *tim_offset, u16 *tim_length, unsigned int link_id); /** * ieee80211_beacon_get - beacon generation function * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @link_id: the link id to which the beacon belongs (or 0 for an AP STA * that is not associated with AP MLD). * * See ieee80211_beacon_get_tim(). * * Return: See ieee80211_beacon_get_tim(). */ static inline struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, unsigned int link_id) { return ieee80211_beacon_get_tim(hw, vif, NULL, NULL, link_id); } /** * ieee80211_beacon_update_cntdwn - request mac80211 to decrement the beacon countdown * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @link_id: valid link_id during MLO or 0 for non-MLO * * The beacon counter should be updated after each beacon transmission. * This function is called implicitly when * ieee80211_beacon_get/ieee80211_beacon_get_tim are called, however if the * beacon frames are generated by the device, the driver should call this * function after each beacon transmission to sync mac80211's beacon countdown. * * Return: new countdown value */ u8 ieee80211_beacon_update_cntdwn(struct ieee80211_vif *vif, unsigned int link_id); /** * ieee80211_beacon_set_cntdwn - request mac80211 to set beacon countdown * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @counter: the new value for the counter * * The beacon countdown can be changed by the device, this API should be * used by the device driver to update csa counter in mac80211. * * It should never be used together with ieee80211_beacon_update_cntdwn(), * as it will cause a race condition around the counter value. */ void ieee80211_beacon_set_cntdwn(struct ieee80211_vif *vif, u8 counter); /** * ieee80211_csa_finish - notify mac80211 about channel switch * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @link_id: valid link_id during MLO or 0 for non-MLO * * After a channel switch announcement was scheduled and the counter in this * announcement hits 1, this function must be called by the driver to * notify mac80211 that the channel can be changed. */ void ieee80211_csa_finish(struct ieee80211_vif *vif, unsigned int link_id); /** * ieee80211_beacon_cntdwn_is_complete - find out if countdown reached 1 * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @link_id: valid link_id during MLO or 0 for non-MLO * * Return: %true if the countdown reached 1, %false otherwise */ bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif, unsigned int link_id); /** * ieee80211_color_change_finish - notify mac80211 about color change * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @link_id: valid link_id during MLO or 0 for non-MLO * * After a color change announcement was scheduled and the counter in this * announcement hits 1, this function must be called by the driver to * notify mac80211 that the color can be changed */ void ieee80211_color_change_finish(struct ieee80211_vif *vif, u8 link_id); /** * ieee80211_proberesp_get - retrieve a Probe Response template * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * * Creates a Probe Response template which can, for example, be uploaded to * hardware. The destination address should be set by the caller. * * Can only be called in AP mode. * * Return: The Probe Response template. %NULL on error. */ struct sk_buff *ieee80211_proberesp_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif); /** * ieee80211_pspoll_get - retrieve a PS Poll template * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * * Creates a PS Poll a template which can, for example, uploaded to * hardware. The template must be updated after association so that correct * AID, BSSID and MAC address is used. * * Note: Caller (or hardware) is responsible for setting the * &IEEE80211_FCTL_PM bit. * * Return: The PS Poll template. %NULL on error. */ struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif); /** * ieee80211_nullfunc_get - retrieve a nullfunc template * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @link_id: If the vif is an MLD, get a frame with the link addresses * for the given link ID. For a link_id < 0 you get a frame with * MLD addresses, however useful that might be. * @qos_ok: QoS NDP is acceptable to the caller, this should be set * if at all possible * * Creates a Nullfunc template which can, for example, uploaded to * hardware. The template must be updated after association so that correct * BSSID and address is used. * * If @qos_ndp is set and the association is to an AP with QoS/WMM, the * returned packet will be QoS NDP. * * Note: Caller (or hardware) is responsible for setting the * &IEEE80211_FCTL_PM bit as well as Duration and Sequence Control fields. * * Return: The nullfunc template. %NULL on error. */ struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, int link_id, bool qos_ok); /** * ieee80211_probereq_get - retrieve a Probe Request template * @hw: pointer obtained from ieee80211_alloc_hw(). * @src_addr: source MAC address * @ssid: SSID buffer * @ssid_len: length of SSID * @tailroom: tailroom to reserve at end of SKB for IEs * * Creates a Probe Request template which can, for example, be uploaded to * hardware. * * Return: The Probe Request template. %NULL on error. */ struct sk_buff *ieee80211_probereq_get(struct ieee80211_hw *hw, const u8 *src_addr, const u8 *ssid, size_t ssid_len, size_t tailroom); /** * ieee80211_rts_get - RTS frame generation function * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @frame: pointer to the frame that is going to be protected by the RTS. * @frame_len: the frame length (in octets). * @frame_txctl: &struct ieee80211_tx_info of the frame. * @rts: The buffer where to store the RTS frame. * * If the RTS frames are generated by the host system (i.e., not in * hardware/firmware), the low-level driver uses this function to receive * the next RTS frame from the 802.11 code. The low-level is responsible * for calling this function before and RTS frame is needed. */ void ieee80211_rts_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, const void *frame, size_t frame_len, const struct ieee80211_tx_info *frame_txctl, struct ieee80211_rts *rts); /** * ieee80211_rts_duration - Get the duration field for an RTS frame * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @frame_len: the length of the frame that is going to be protected by the RTS. * @frame_txctl: &struct ieee80211_tx_info of the frame. * * If the RTS is generated in firmware, but the host system must provide * the duration field, the low-level driver uses this function to receive * the duration field value in little-endian byteorder. * * Return: The duration. */ __le16 ieee80211_rts_duration(struct ieee80211_hw *hw, struct ieee80211_vif *vif, size_t frame_len, const struct ieee80211_tx_info *frame_txctl); /** * ieee80211_ctstoself_get - CTS-to-self frame generation function * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @frame: pointer to the frame that is going to be protected by the CTS-to-self. * @frame_len: the frame length (in octets). * @frame_txctl: &struct ieee80211_tx_info of the frame. * @cts: The buffer where to store the CTS-to-self frame. * * If the CTS-to-self frames are generated by the host system (i.e., not in * hardware/firmware), the low-level driver uses this function to receive * the next CTS-to-self frame from the 802.11 code. The low-level is responsible * for calling this function before and CTS-to-self frame is needed. */ void ieee80211_ctstoself_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, const void *frame, size_t frame_len, const struct ieee80211_tx_info *frame_txctl, struct ieee80211_cts *cts); /** * ieee80211_ctstoself_duration - Get the duration field for a CTS-to-self frame * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @frame_len: the length of the frame that is going to be protected by the CTS-to-self. * @frame_txctl: &struct ieee80211_tx_info of the frame. * * If the CTS-to-self is generated in firmware, but the host system must provide * the duration field, the low-level driver uses this function to receive * the duration field value in little-endian byteorder. * * Return: The duration. */ __le16 ieee80211_ctstoself_duration(struct ieee80211_hw *hw, struct ieee80211_vif *vif, size_t frame_len, const struct ieee80211_tx_info *frame_txctl); /** * ieee80211_generic_frame_duration - Calculate the duration field for a frame * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @band: the band to calculate the frame duration on * @frame_len: the length of the frame. * @rate: the rate at which the frame is going to be transmitted. * * Calculate the duration field of some generic frame, given its * length and transmission rate (in 100kbps). * * Return: The duration. */ __le16 ieee80211_generic_frame_duration(struct ieee80211_hw *hw, struct ieee80211_vif *vif, enum nl80211_band band, size_t frame_len, struct ieee80211_rate *rate); /** * ieee80211_get_buffered_bc - accessing buffered broadcast and multicast frames * @hw: pointer as obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * * Function for accessing buffered broadcast and multicast frames. If * hardware/firmware does not implement buffering of broadcast/multicast * frames when power saving is used, 802.11 code buffers them in the host * memory. The low-level driver uses this function to fetch next buffered * frame. In most cases, this is used when generating beacon frame. * * Return: A pointer to the next buffered skb or NULL if no more buffered * frames are available. * * Note: buffered frames are returned only after DTIM beacon frame was * generated with ieee80211_beacon_get() and the low-level driver must thus * call ieee80211_beacon_get() first. ieee80211_get_buffered_bc() returns * NULL if the previous generated beacon was not DTIM, so the low-level driver * does not need to check for DTIM beacons separately and should be able to * use common code for all beacons. */ struct sk_buff * ieee80211_get_buffered_bc(struct ieee80211_hw *hw, struct ieee80211_vif *vif); /** * ieee80211_get_tkip_p1k_iv - get a TKIP phase 1 key for IV32 * * This function returns the TKIP phase 1 key for the given IV32. * * @keyconf: the parameter passed with the set key * @iv32: IV32 to get the P1K for * @p1k: a buffer to which the key will be written, as 5 u16 values */ void ieee80211_get_tkip_p1k_iv(struct ieee80211_key_conf *keyconf, u32 iv32, u16 *p1k); /** * ieee80211_get_tkip_p1k - get a TKIP phase 1 key * * This function returns the TKIP phase 1 key for the IV32 taken * from the given packet. * * @keyconf: the parameter passed with the set key * @skb: the packet to take the IV32 value from that will be encrypted * with this P1K * @p1k: a buffer to which the key will be written, as 5 u16 values */ static inline void ieee80211_get_tkip_p1k(struct ieee80211_key_conf *keyconf, struct sk_buff *skb, u16 *p1k) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; const u8 *data = (u8 *)hdr + ieee80211_hdrlen(hdr->frame_control); u32 iv32 = get_unaligned_le32(&data[4]); ieee80211_get_tkip_p1k_iv(keyconf, iv32, p1k); } /** * ieee80211_get_tkip_rx_p1k - get a TKIP phase 1 key for RX * * This function returns the TKIP phase 1 key for the given IV32 * and transmitter address. * * @keyconf: the parameter passed with the set key * @ta: TA that will be used with the key * @iv32: IV32 to get the P1K for * @p1k: a buffer to which the key will be written, as 5 u16 values */ void ieee80211_get_tkip_rx_p1k(struct ieee80211_key_conf *keyconf, const u8 *ta, u32 iv32, u16 *p1k); /** * ieee80211_get_tkip_p2k - get a TKIP phase 2 key * * This function computes the TKIP RC4 key for the IV values * in the packet. * * @keyconf: the parameter passed with the set key * @skb: the packet to take the IV32/IV16 values from that will be * encrypted with this key * @p2k: a buffer to which the key will be written, 16 bytes */ void ieee80211_get_tkip_p2k(struct ieee80211_key_conf *keyconf, struct sk_buff *skb, u8 *p2k); /** * ieee80211_tkip_add_iv - write TKIP IV and Ext. IV to pos * * @pos: start of crypto header * @keyconf: the parameter passed with the set key * @pn: PN to add * * Returns: pointer to the octet following IVs (i.e. beginning of * the packet payload) * * This function writes the tkip IV value to pos (which should * point to the crypto header) */ u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key_conf *keyconf, u64 pn); /** * ieee80211_get_key_rx_seq - get key RX sequence counter * * @keyconf: the parameter passed with the set key * @tid: The TID, or -1 for the management frame value (CCMP/GCMP only); * the value on TID 0 is also used for non-QoS frames. For * CMAC, only TID 0 is valid. * @seq: buffer to receive the sequence data * * This function allows a driver to retrieve the current RX IV/PNs * for the given key. It must not be called if IV checking is done * by the device and not by mac80211. * * Note that this function may only be called when no RX processing * can be done concurrently. */ void ieee80211_get_key_rx_seq(struct ieee80211_key_conf *keyconf, int tid, struct ieee80211_key_seq *seq); /** * ieee80211_set_key_rx_seq - set key RX sequence counter * * @keyconf: the parameter passed with the set key * @tid: The TID, or -1 for the management frame value (CCMP/GCMP only); * the value on TID 0 is also used for non-QoS frames. For * CMAC, only TID 0 is valid. * @seq: new sequence data * * This function allows a driver to set the current RX IV/PNs for the * given key. This is useful when resuming from WoWLAN sleep and GTK * rekey may have been done while suspended. It should not be called * if IV checking is done by the device and not by mac80211. * * Note that this function may only be called when no RX processing * can be done concurrently. */ void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf, int tid, struct ieee80211_key_seq *seq); /** * ieee80211_remove_key - remove the given key * @keyconf: the parameter passed with the set key * * Context: Must be called with the wiphy mutex held. * * Remove the given key. If the key was uploaded to the hardware at the * time this function is called, it is not deleted in the hardware but * instead assumed to have been removed already. */ void ieee80211_remove_key(struct ieee80211_key_conf *keyconf); /** * ieee80211_gtk_rekey_add - add a GTK key from rekeying during WoWLAN * @vif: the virtual interface to add the key on * @keyconf: new key data * @link_id: the link id of the key or -1 for non-MLO * * When GTK rekeying was done while the system was suspended, (a) new * key(s) will be available. These will be needed by mac80211 for proper * RX processing, so this function allows setting them. * * Return: the newly allocated key structure, which will have * similar contents to the passed key configuration but point to * mac80211-owned memory. In case of errors, the function returns an * ERR_PTR(), use IS_ERR() etc. * * Note that this function assumes the key isn't added to hardware * acceleration, so no TX will be done with the key. Since it's a GTK * on managed (station) networks, this is true anyway. If the driver * calls this function from the resume callback and subsequently uses * the return code 1 to reconfigure the device, this key will be part * of the reconfiguration. * * Note that the driver should also call ieee80211_set_key_rx_seq() * for the new key for each TID to set up sequence counters properly. * * IMPORTANT: If this replaces a key that is present in the hardware, * then it will attempt to remove it during this call. In many cases * this isn't what you want, so call ieee80211_remove_key() first for * the key that's being replaced. */ struct ieee80211_key_conf * ieee80211_gtk_rekey_add(struct ieee80211_vif *vif, struct ieee80211_key_conf *keyconf, int link_id); /** * ieee80211_gtk_rekey_notify - notify userspace supplicant of rekeying * @vif: virtual interface the rekeying was done on * @bssid: The BSSID of the AP, for checking association * @replay_ctr: the new replay counter after GTK rekeying * @gfp: allocation flags */ void ieee80211_gtk_rekey_notify(struct ieee80211_vif *vif, const u8 *bssid, const u8 *replay_ctr, gfp_t gfp); /** * ieee80211_key_mic_failure - increment MIC failure counter for the key * * Note: this is really only safe if no other RX function is called * at the same time. * * @keyconf: the key in question */ void ieee80211_key_mic_failure(struct ieee80211_key_conf *keyconf); /** * ieee80211_key_replay - increment replay counter for the key * * Note: this is really only safe if no other RX function is called * at the same time. * * @keyconf: the key in question */ void ieee80211_key_replay(struct ieee80211_key_conf *keyconf); /** * ieee80211_wake_queue - wake specific queue * @hw: pointer as obtained from ieee80211_alloc_hw(). * @queue: queue number (counted from zero). * * Drivers must use this function instead of netif_wake_queue. */ void ieee80211_wake_queue(struct ieee80211_hw *hw, int queue); /** * ieee80211_stop_queue - stop specific queue * @hw: pointer as obtained from ieee80211_alloc_hw(). * @queue: queue number (counted from zero). * * Drivers must use this function instead of netif_stop_queue. */ void ieee80211_stop_queue(struct ieee80211_hw *hw, int queue); /** * ieee80211_queue_stopped - test status of the queue * @hw: pointer as obtained from ieee80211_alloc_hw(). * @queue: queue number (counted from zero). * * Drivers must use this function instead of netif_queue_stopped. * * Return: %true if the queue is stopped. %false otherwise. */ int ieee80211_queue_stopped(struct ieee80211_hw *hw, int queue); /** * ieee80211_stop_queues - stop all queues * @hw: pointer as obtained from ieee80211_alloc_hw(). * * Drivers must use this function instead of netif_tx_stop_all_queues. */ void ieee80211_stop_queues(struct ieee80211_hw *hw); /** * ieee80211_wake_queues - wake all queues * @hw: pointer as obtained from ieee80211_alloc_hw(). * * Drivers must use this function instead of netif_tx_wake_all_queues. */ void ieee80211_wake_queues(struct ieee80211_hw *hw); /** * ieee80211_scan_completed - completed hardware scan * * When hardware scan offload is used (i.e. the hw_scan() callback is * assigned) this function needs to be called by the driver to notify * mac80211 that the scan finished. This function can be called from * any context, including hardirq context. * * @hw: the hardware that finished the scan * @info: information about the completed scan */ void ieee80211_scan_completed(struct ieee80211_hw *hw, struct cfg80211_scan_info *info); /** * ieee80211_sched_scan_results - got results from scheduled scan * * When a scheduled scan is running, this function needs to be called by the * driver whenever there are new scan results available. * * @hw: the hardware that is performing scheduled scans */ void ieee80211_sched_scan_results(struct ieee80211_hw *hw); /** * ieee80211_sched_scan_stopped - inform that the scheduled scan has stopped * * When a scheduled scan is running, this function can be called by * the driver if it needs to stop the scan to perform another task. * Usual scenarios are drivers that cannot continue the scheduled scan * while associating, for instance. * * @hw: the hardware that is performing scheduled scans */ void ieee80211_sched_scan_stopped(struct ieee80211_hw *hw); /** * enum ieee80211_interface_iteration_flags - interface iteration flags * @IEEE80211_IFACE_ITER_NORMAL: Iterate over all interfaces that have * been added to the driver; However, note that during hardware * reconfiguration (after restart_hw) it will iterate over a new * interface and over all the existing interfaces even if they * haven't been re-added to the driver yet. * @IEEE80211_IFACE_ITER_RESUME_ALL: During resume, iterate over all * interfaces, even if they haven't been re-added to the driver yet. * @IEEE80211_IFACE_ITER_ACTIVE: Iterate only active interfaces (netdev is up). * @IEEE80211_IFACE_SKIP_SDATA_NOT_IN_DRIVER: Skip any interfaces where SDATA * is not in the driver. This may fix crashes during firmware recovery * for instance. */ enum ieee80211_interface_iteration_flags { IEEE80211_IFACE_ITER_NORMAL = 0, IEEE80211_IFACE_ITER_RESUME_ALL = BIT(0), IEEE80211_IFACE_ITER_ACTIVE = BIT(1), IEEE80211_IFACE_SKIP_SDATA_NOT_IN_DRIVER = BIT(2), }; /** * ieee80211_iterate_interfaces - iterate interfaces * * This function iterates over the interfaces associated with a given * hardware and calls the callback for them. This includes active as well as * inactive interfaces. This function allows the iterator function to sleep. * Will iterate over a new interface during add_interface(). * * @hw: the hardware struct of which the interfaces should be iterated over * @iter_flags: iteration flags, see &enum ieee80211_interface_iteration_flags * @iterator: the iterator function to call * @data: first argument of the iterator function */ void ieee80211_iterate_interfaces(struct ieee80211_hw *hw, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data); /** * ieee80211_iterate_active_interfaces - iterate active interfaces * * This function iterates over the interfaces associated with a given * hardware that are currently active and calls the callback for them. * This function allows the iterator function to sleep, when the iterator * function is atomic @ieee80211_iterate_active_interfaces_atomic can * be used. * Does not iterate over a new interface during add_interface(). * * @hw: the hardware struct of which the interfaces should be iterated over * @iter_flags: iteration flags, see &enum ieee80211_interface_iteration_flags * @iterator: the iterator function to call * @data: first argument of the iterator function */ static inline void ieee80211_iterate_active_interfaces(struct ieee80211_hw *hw, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data) { ieee80211_iterate_interfaces(hw, iter_flags | IEEE80211_IFACE_ITER_ACTIVE, iterator, data); } /** * ieee80211_iterate_active_interfaces_atomic - iterate active interfaces * * This function iterates over the interfaces associated with a given * hardware that are currently active and calls the callback for them. * This function requires the iterator callback function to be atomic, * if that is not desired, use @ieee80211_iterate_active_interfaces instead. * Does not iterate over a new interface during add_interface(). * * @hw: the hardware struct of which the interfaces should be iterated over * @iter_flags: iteration flags, see &enum ieee80211_interface_iteration_flags * @iterator: the iterator function to call, cannot sleep * @data: first argument of the iterator function */ void ieee80211_iterate_active_interfaces_atomic(struct ieee80211_hw *hw, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data); /** * ieee80211_iterate_active_interfaces_mtx - iterate active interfaces * * This function iterates over the interfaces associated with a given * hardware that are currently active and calls the callback for them. * This version can only be used while holding the wiphy mutex. * * @hw: the hardware struct of which the interfaces should be iterated over * @iter_flags: iteration flags, see &enum ieee80211_interface_iteration_flags * @iterator: the iterator function to call, cannot sleep * @data: first argument of the iterator function */ void ieee80211_iterate_active_interfaces_mtx(struct ieee80211_hw *hw, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data); /** * ieee80211_iterate_stations_atomic - iterate stations * * This function iterates over all stations associated with a given * hardware that are currently uploaded to the driver and calls the callback * function for them. * This function requires the iterator callback function to be atomic, * * @hw: the hardware struct of which the interfaces should be iterated over * @iterator: the iterator function to call, cannot sleep * @data: first argument of the iterator function */ void ieee80211_iterate_stations_atomic(struct ieee80211_hw *hw, void (*iterator)(void *data, struct ieee80211_sta *sta), void *data); /** * ieee80211_iterate_stations_mtx - iterate stations * * This function iterates over all stations associated with a given * hardware that are currently uploaded to the driver and calls the callback * function for them. This version can only be used while holding the wiphy * mutex. * * @hw: the hardware struct of which the interfaces should be iterated over * @iterator: the iterator function to call * @data: first argument of the iterator function */ void ieee80211_iterate_stations_mtx(struct ieee80211_hw *hw, void (*iterator)(void *data, struct ieee80211_sta *sta), void *data); /** * ieee80211_queue_work - add work onto the mac80211 workqueue * * Drivers and mac80211 use this to add work onto the mac80211 workqueue. * This helper ensures drivers are not queueing work when they should not be. * * @hw: the hardware struct for the interface we are adding work for * @work: the work we want to add onto the mac80211 workqueue */ void ieee80211_queue_work(struct ieee80211_hw *hw, struct work_struct *work); /** * ieee80211_queue_delayed_work - add work onto the mac80211 workqueue * * Drivers and mac80211 use this to queue delayed work onto the mac80211 * workqueue. * * @hw: the hardware struct for the interface we are adding work for * @dwork: delayable work to queue onto the mac80211 workqueue * @delay: number of jiffies to wait before queueing */ void ieee80211_queue_delayed_work(struct ieee80211_hw *hw, struct delayed_work *dwork, unsigned long delay); /** * ieee80211_refresh_tx_agg_session_timer - Refresh a tx agg session timer. * @sta: the station for which to start a BA session * @tid: the TID to BA on. * * This function allows low level driver to refresh tx agg session timer * to maintain BA session, the session level will still be managed by the * mac80211. * * Note: must be called in an RCU critical section. */ void ieee80211_refresh_tx_agg_session_timer(struct ieee80211_sta *sta, u16 tid); /** * ieee80211_start_tx_ba_session - Start a tx Block Ack session. * @sta: the station for which to start a BA session * @tid: the TID to BA on. * @timeout: session timeout value (in TUs) * * Return: success if addBA request was sent, failure otherwise * * Although mac80211/low level driver/user space application can estimate * the need to start aggregation on a certain RA/TID, the session level * will be managed by the mac80211. */ int ieee80211_start_tx_ba_session(struct ieee80211_sta *sta, u16 tid, u16 timeout); /** * ieee80211_start_tx_ba_cb_irqsafe - low level driver ready to aggregate. * @vif: &struct ieee80211_vif pointer from the add_interface callback * @ra: receiver address of the BA session recipient. * @tid: the TID to BA on. * * This function must be called by low level driver once it has * finished with preparations for the BA session. It can be called * from any context. */ void ieee80211_start_tx_ba_cb_irqsafe(struct ieee80211_vif *vif, const u8 *ra, u16 tid); /** * ieee80211_stop_tx_ba_session - Stop a Block Ack session. * @sta: the station whose BA session to stop * @tid: the TID to stop BA. * * Return: negative error if the TID is invalid, or no aggregation active * * Although mac80211/low level driver/user space application can estimate * the need to stop aggregation on a certain RA/TID, the session level * will be managed by the mac80211. */ int ieee80211_stop_tx_ba_session(struct ieee80211_sta *sta, u16 tid); /** * ieee80211_stop_tx_ba_cb_irqsafe - low level driver ready to stop aggregate. * @vif: &struct ieee80211_vif pointer from the add_interface callback * @ra: receiver address of the BA session recipient. * @tid: the desired TID to BA on. * * This function must be called by low level driver once it has * finished with preparations for the BA session tear down. It * can be called from any context. */ void ieee80211_stop_tx_ba_cb_irqsafe(struct ieee80211_vif *vif, const u8 *ra, u16 tid); /** * ieee80211_find_sta - find a station * * @vif: virtual interface to look for station on * @addr: station's address * * Return: The station, if found. %NULL otherwise. * * Note: This function must be called under RCU lock and the * resulting pointer is only valid under RCU lock as well. */ struct ieee80211_sta *ieee80211_find_sta(struct ieee80211_vif *vif, const u8 *addr); /** * ieee80211_find_sta_by_ifaddr - find a station on hardware * * @hw: pointer as obtained from ieee80211_alloc_hw() * @addr: remote station's address * @localaddr: local address (vif->sdata->vif.addr). Use NULL for 'any'. * * Return: The station, if found. %NULL otherwise. * * Note: This function must be called under RCU lock and the * resulting pointer is only valid under RCU lock as well. * * NOTE: You may pass NULL for localaddr, but then you will just get * the first STA that matches the remote address 'addr'. * We can have multiple STA associated with multiple * logical stations (e.g. consider a station connecting to another * BSSID on the same AP hardware without disconnecting first). * In this case, the result of this method with localaddr NULL * is not reliable. * * DO NOT USE THIS FUNCTION with localaddr NULL if at all possible. */ struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw, const u8 *addr, const u8 *localaddr); /** * ieee80211_find_sta_by_link_addrs - find STA by link addresses * @hw: pointer as obtained from ieee80211_alloc_hw() * @addr: remote station's link address * @localaddr: local link address, use %NULL for any (but avoid that) * @link_id: pointer to obtain the link ID if the STA is found, * may be %NULL if the link ID is not needed * * Obtain the STA by link address, must use RCU protection. * * Return: pointer to STA if found, otherwise %NULL. */ struct ieee80211_sta * ieee80211_find_sta_by_link_addrs(struct ieee80211_hw *hw, const u8 *addr, const u8 *localaddr, unsigned int *link_id); /** * ieee80211_sta_block_awake - block station from waking up * @hw: the hardware * @pubsta: the station * @block: whether to block or unblock * * Some devices require that all frames that are on the queues * for a specific station that went to sleep are flushed before * a poll response or frames after the station woke up can be * delivered to that it. Note that such frames must be rejected * by the driver as filtered, with the appropriate status flag. * * This function allows implementing this mode in a race-free * manner. * * To do this, a driver must keep track of the number of frames * still enqueued for a specific station. If this number is not * zero when the station goes to sleep, the driver must call * this function to force mac80211 to consider the station to * be asleep regardless of the station's actual state. Once the * number of outstanding frames reaches zero, the driver must * call this function again to unblock the station. That will * cause mac80211 to be able to send ps-poll responses, and if * the station queried in the meantime then frames will also * be sent out as a result of this. Additionally, the driver * will be notified that the station woke up some time after * it is unblocked, regardless of whether the station actually * woke up while blocked or not. */ void ieee80211_sta_block_awake(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, bool block); /** * ieee80211_sta_eosp - notify mac80211 about end of SP * @pubsta: the station * * When a device transmits frames in a way that it can't tell * mac80211 in the TX status about the EOSP, it must clear the * %IEEE80211_TX_STATUS_EOSP bit and call this function instead. * This applies for PS-Poll as well as uAPSD. * * Note that just like with _tx_status() and _rx() drivers must * not mix calls to irqsafe/non-irqsafe versions, this function * must not be mixed with those either. Use the all irqsafe, or * all non-irqsafe, don't mix! * * NB: the _irqsafe version of this function doesn't exist, no * driver needs it right now. Don't call this function if * you'd need the _irqsafe version, look at the git history * and restore the _irqsafe version! */ void ieee80211_sta_eosp(struct ieee80211_sta *pubsta); /** * ieee80211_send_eosp_nullfunc - ask mac80211 to send NDP with EOSP * @pubsta: the station * @tid: the tid of the NDP * * Sometimes the device understands that it needs to close * the Service Period unexpectedly. This can happen when * sending frames that are filling holes in the BA window. * In this case, the device can ask mac80211 to send a * Nullfunc frame with EOSP set. When that happens, the * driver must have called ieee80211_sta_set_buffered() to * let mac80211 know that there are no buffered frames any * more, otherwise mac80211 will get the more_data bit wrong. * The low level driver must have made sure that the frame * will be sent despite the station being in power-save. * Mac80211 won't call allow_buffered_frames(). * Note that calling this function, doesn't exempt the driver * from closing the EOSP properly, it will still have to call * ieee80211_sta_eosp when the NDP is sent. */ void ieee80211_send_eosp_nullfunc(struct ieee80211_sta *pubsta, int tid); /** * ieee80211_sta_recalc_aggregates - recalculate aggregate data after a change * @pubsta: the station * * Call this function after changing a per-link aggregate data as referenced in * &struct ieee80211_sta_aggregates by accessing the agg field of * &struct ieee80211_link_sta. * * With non MLO the data in deflink will be referenced directly. In that case * there is no need to call this function. */ void ieee80211_sta_recalc_aggregates(struct ieee80211_sta *pubsta); /** * ieee80211_sta_register_airtime - register airtime usage for a sta/tid * * Register airtime usage for a given sta on a given tid. The driver must call * this function to notify mac80211 that a station used a certain amount of * airtime. This information will be used by the TXQ scheduler to schedule * stations in a way that ensures airtime fairness. * * The reported airtime should as a minimum include all time that is spent * transmitting to the remote station, including overhead and padding, but not * including time spent waiting for a TXOP. If the time is not reported by the * hardware it can in some cases be calculated from the rate and known frame * composition. When possible, the time should include any failed transmission * attempts. * * The driver can either call this function synchronously for every packet or * aggregate, or asynchronously as airtime usage information becomes available. * TX and RX airtime can be reported together, or separately by setting one of * them to 0. * * @pubsta: the station * @tid: the TID to register airtime for * @tx_airtime: airtime used during TX (in usec) * @rx_airtime: airtime used during RX (in usec) */ void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid, u32 tx_airtime, u32 rx_airtime); /** * ieee80211_txq_airtime_check - check if a txq can send frame to device * * @hw: pointer obtained from ieee80211_alloc_hw() * @txq: pointer obtained from station or virtual interface * * Return: %true if the AQL's airtime limit has not been reached and the txq can * continue to send more packets to the device. Otherwise return %false. */ bool ieee80211_txq_airtime_check(struct ieee80211_hw *hw, struct ieee80211_txq *txq); /** * ieee80211_iter_keys - iterate keys programmed into the device * @hw: pointer obtained from ieee80211_alloc_hw() * @vif: virtual interface to iterate, may be %NULL for all * @iter: iterator function that will be called for each key * @iter_data: custom data to pass to the iterator function * * Context: Must be called with wiphy mutex held; can sleep. * * This function can be used to iterate all the keys known to * mac80211, even those that weren't previously programmed into * the device. This is intended for use in WoWLAN if the device * needs reprogramming of the keys during suspend. * * The order in which the keys are iterated matches the order * in which they were originally installed and handed to the * set_key callback. */ void ieee80211_iter_keys(struct ieee80211_hw *hw, struct ieee80211_vif *vif, void (*iter)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct ieee80211_key_conf *key, void *data), void *iter_data); /** * ieee80211_iter_keys_rcu - iterate keys programmed into the device * @hw: pointer obtained from ieee80211_alloc_hw() * @vif: virtual interface to iterate, may be %NULL for all * @iter: iterator function that will be called for each key * @iter_data: custom data to pass to the iterator function * * This function can be used to iterate all the keys known to * mac80211, even those that weren't previously programmed into * the device. Note that due to locking reasons, keys of station * in removal process will be skipped. * * This function requires being called in an RCU critical section, * and thus iter must be atomic. */ void ieee80211_iter_keys_rcu(struct ieee80211_hw *hw, struct ieee80211_vif *vif, void (*iter)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct ieee80211_key_conf *key, void *data), void *iter_data); /** * ieee80211_iter_chan_contexts_atomic - iterate channel contexts * @hw: pointer obtained from ieee80211_alloc_hw(). * @iter: iterator function * @iter_data: data passed to iterator function * * Iterate all active channel contexts. This function is atomic and * doesn't acquire any locks internally that might be held in other * places while calling into the driver. * * The iterator will not find a context that's being added (during * the driver callback to add it) but will find it while it's being * removed. * * Note that during hardware restart, all contexts that existed * before the restart are considered already present so will be * found while iterating, whether they've been re-added already * or not. */ void ieee80211_iter_chan_contexts_atomic( struct ieee80211_hw *hw, void (*iter)(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *chanctx_conf, void *data), void *iter_data); /** * ieee80211_ap_probereq_get - retrieve a Probe Request template * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * * Creates a Probe Request template which can, for example, be uploaded to * hardware. The template is filled with bssid, ssid and supported rate * information. This function must only be called from within the * .bss_info_changed callback function and only in managed mode. The function * is only useful when the interface is associated, otherwise it will return * %NULL. * * Return: The Probe Request template. %NULL on error. */ struct sk_buff *ieee80211_ap_probereq_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif); /** * ieee80211_beacon_loss - inform hardware does not receive beacons * * @vif: &struct ieee80211_vif pointer from the add_interface callback. * * When beacon filtering is enabled with %IEEE80211_VIF_BEACON_FILTER and * %IEEE80211_CONF_PS is set, the driver needs to inform whenever the * hardware is not receiving beacons with this function. */ void ieee80211_beacon_loss(struct ieee80211_vif *vif); /** * ieee80211_connection_loss - inform hardware has lost connection to the AP * * @vif: &struct ieee80211_vif pointer from the add_interface callback. * * When beacon filtering is enabled with %IEEE80211_VIF_BEACON_FILTER, and * %IEEE80211_CONF_PS and %IEEE80211_HW_CONNECTION_MONITOR are set, the driver * needs to inform if the connection to the AP has been lost. * The function may also be called if the connection needs to be terminated * for some other reason, even if %IEEE80211_HW_CONNECTION_MONITOR isn't set. * * This function will cause immediate change to disassociated state, * without connection recovery attempts. */ void ieee80211_connection_loss(struct ieee80211_vif *vif); /** * ieee80211_disconnect - request disconnection * * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @reconnect: immediate reconnect is desired * * Request disconnection from the current network and, if enabled, send a * hint to the higher layers that immediate reconnect is desired. */ void ieee80211_disconnect(struct ieee80211_vif *vif, bool reconnect); /** * ieee80211_resume_disconnect - disconnect from AP after resume * * @vif: &struct ieee80211_vif pointer from the add_interface callback. * * Instructs mac80211 to disconnect from the AP after resume. * Drivers can use this after WoWLAN if they know that the * connection cannot be kept up, for example because keys were * used while the device was asleep but the replay counters or * similar cannot be retrieved from the device during resume. * * Note that due to implementation issues, if the driver uses * the reconfiguration functionality during resume the interface * will still be added as associated first during resume and then * disconnect normally later. * * This function can only be called from the resume callback and * the driver must not be holding any of its own locks while it * calls this function, or at least not any locks it needs in the * key configuration paths (if it supports HW crypto). */ void ieee80211_resume_disconnect(struct ieee80211_vif *vif); /** * ieee80211_hw_restart_disconnect - disconnect from AP after * hardware restart * @vif: &struct ieee80211_vif pointer from the add_interface callback. * * Instructs mac80211 to disconnect from the AP after * hardware restart. */ void ieee80211_hw_restart_disconnect(struct ieee80211_vif *vif); /** * ieee80211_cqm_rssi_notify - inform a configured connection quality monitoring * rssi threshold triggered * * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @rssi_event: the RSSI trigger event type * @rssi_level: new RSSI level value or 0 if not available * @gfp: context flags * * When the %IEEE80211_VIF_SUPPORTS_CQM_RSSI is set, and a connection quality * monitoring is configured with an rssi threshold, the driver will inform * whenever the rssi level reaches the threshold. */ void ieee80211_cqm_rssi_notify(struct ieee80211_vif *vif, enum nl80211_cqm_rssi_threshold_event rssi_event, s32 rssi_level, gfp_t gfp); /** * ieee80211_cqm_beacon_loss_notify - inform CQM of beacon loss * * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @gfp: context flags */ void ieee80211_cqm_beacon_loss_notify(struct ieee80211_vif *vif, gfp_t gfp); /** * ieee80211_radar_detected - inform that a radar was detected * * @hw: pointer as obtained from ieee80211_alloc_hw() * @chanctx_conf: Channel context on which radar is detected. Mandatory to * pass a valid pointer during MLO. For non-MLO %NULL can be passed */ void ieee80211_radar_detected(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *chanctx_conf); /** * ieee80211_chswitch_done - Complete channel switch process * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @success: make the channel switch successful or not * @link_id: the link_id on which the switch was done. Ignored if success is * false. * * Complete the channel switch post-process: set the new operational channel * and wake up the suspended queues. */ void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success, unsigned int link_id); /** * ieee80211_channel_switch_disconnect - disconnect due to channel switch error * @vif: &struct ieee80211_vif pointer from the add_interface callback. * * Instruct mac80211 to disconnect due to a channel switch error. The channel * switch can request to block the tx and so, we need to make sure we do not send * a deauth frame in this case. */ void ieee80211_channel_switch_disconnect(struct ieee80211_vif *vif); /** * ieee80211_request_smps - request SM PS transition * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @link_id: link ID for MLO, or 0 * @smps_mode: new SM PS mode * * This allows the driver to request an SM PS transition in managed * mode. This is useful when the driver has more information than * the stack about possible interference, for example by bluetooth. */ void ieee80211_request_smps(struct ieee80211_vif *vif, unsigned int link_id, enum ieee80211_smps_mode smps_mode); /** * ieee80211_ready_on_channel - notification of remain-on-channel start * @hw: pointer as obtained from ieee80211_alloc_hw() */ void ieee80211_ready_on_channel(struct ieee80211_hw *hw); /** * ieee80211_remain_on_channel_expired - remain_on_channel duration expired * @hw: pointer as obtained from ieee80211_alloc_hw() */ void ieee80211_remain_on_channel_expired(struct ieee80211_hw *hw); /** * ieee80211_stop_rx_ba_session - callback to stop existing BA sessions * * in order not to harm the system performance and user experience, the device * may request not to allow any rx ba session and tear down existing rx ba * sessions based on system constraints such as periodic BT activity that needs * to limit wlan activity (eg.sco or a2dp)." * in such cases, the intention is to limit the duration of the rx ppdu and * therefore prevent the peer device to use a-mpdu aggregation. * * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @ba_rx_bitmap: Bit map of open rx ba per tid * @addr: & to bssid mac address */ void ieee80211_stop_rx_ba_session(struct ieee80211_vif *vif, u16 ba_rx_bitmap, const u8 *addr); /** * ieee80211_mark_rx_ba_filtered_frames - move RX BA window and mark filtered * @pubsta: station struct * @tid: the session's TID * @ssn: starting sequence number of the bitmap, all frames before this are * assumed to be out of the window after the call * @filtered: bitmap of filtered frames, BIT(0) is the @ssn entry etc. * @received_mpdus: number of received mpdus in firmware * * This function moves the BA window and releases all frames before @ssn, and * marks frames marked in the bitmap as having been filtered. Afterwards, it * checks if any frames in the window starting from @ssn can now be released * (in case they were only waiting for frames that were filtered.) * (Only work correctly if @max_rx_aggregation_subframes <= 64 frames) */ void ieee80211_mark_rx_ba_filtered_frames(struct ieee80211_sta *pubsta, u8 tid, u16 ssn, u64 filtered, u16 received_mpdus); /** * ieee80211_send_bar - send a BlockAckReq frame * * can be used to flush pending frames from the peer's aggregation reorder * buffer. * * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @ra: the peer's destination address * @tid: the TID of the aggregation session * @ssn: the new starting sequence number for the receiver */ void ieee80211_send_bar(struct ieee80211_vif *vif, u8 *ra, u16 tid, u16 ssn); /** * ieee80211_manage_rx_ba_offl - helper to queue an RX BA work * @vif: &struct ieee80211_vif pointer from the add_interface callback * @addr: station mac address * @tid: the rx tid */ void ieee80211_manage_rx_ba_offl(struct ieee80211_vif *vif, const u8 *addr, unsigned int tid); /** * ieee80211_start_rx_ba_session_offl - start a Rx BA session * * Some device drivers may offload part of the Rx aggregation flow including * AddBa/DelBa negotiation but may otherwise be incapable of full Rx * reordering. * * Create structures responsible for reordering so device drivers may call here * when they complete AddBa negotiation. * * @vif: &struct ieee80211_vif pointer from the add_interface callback * @addr: station mac address * @tid: the rx tid */ static inline void ieee80211_start_rx_ba_session_offl(struct ieee80211_vif *vif, const u8 *addr, u16 tid) { if (WARN_ON(tid >= IEEE80211_NUM_TIDS)) return; ieee80211_manage_rx_ba_offl(vif, addr, tid); } /** * ieee80211_stop_rx_ba_session_offl - stop a Rx BA session * * Some device drivers may offload part of the Rx aggregation flow including * AddBa/DelBa negotiation but may otherwise be incapable of full Rx * reordering. * * Destroy structures responsible for reordering so device drivers may call here * when they complete DelBa negotiation. * * @vif: &struct ieee80211_vif pointer from the add_interface callback * @addr: station mac address * @tid: the rx tid */ static inline void ieee80211_stop_rx_ba_session_offl(struct ieee80211_vif *vif, const u8 *addr, u16 tid) { if (WARN_ON(tid >= IEEE80211_NUM_TIDS)) return; ieee80211_manage_rx_ba_offl(vif, addr, tid + IEEE80211_NUM_TIDS); } /** * ieee80211_rx_ba_timer_expired - stop a Rx BA session due to timeout * * Some device drivers do not offload AddBa/DelBa negotiation, but handle rx * buffer reording internally, and therefore also handle the session timer. * * Trigger the timeout flow, which sends a DelBa. * * @vif: &struct ieee80211_vif pointer from the add_interface callback * @addr: station mac address * @tid: the rx tid */ void ieee80211_rx_ba_timer_expired(struct ieee80211_vif *vif, const u8 *addr, unsigned int tid); /* Rate control API */ /** * struct ieee80211_tx_rate_control - rate control information for/from RC algo * * @hw: The hardware the algorithm is invoked for. * @sband: The band this frame is being transmitted on. * @bss_conf: the current BSS configuration * @skb: the skb that will be transmitted, the control information in it needs * to be filled in * @reported_rate: The rate control algorithm can fill this in to indicate * which rate should be reported to userspace as the current rate and * used for rate calculations in the mesh network. * @rts: whether RTS will be used for this frame because it is longer than the * RTS threshold * @short_preamble: whether mac80211 will request short-preamble transmission * if the selected rate supports it * @rate_idx_mask: user-requested (legacy) rate mask * @rate_idx_mcs_mask: user-requested MCS rate mask (NULL if not in use) * @bss: whether this frame is sent out in AP or IBSS mode */ struct ieee80211_tx_rate_control { struct ieee80211_hw *hw; struct ieee80211_supported_band *sband; struct ieee80211_bss_conf *bss_conf; struct sk_buff *skb; struct ieee80211_tx_rate reported_rate; bool rts, short_preamble; u32 rate_idx_mask; u8 *rate_idx_mcs_mask; bool bss; }; /** * enum rate_control_capabilities - rate control capabilities */ enum rate_control_capabilities { /** * @RATE_CTRL_CAPA_VHT_EXT_NSS_BW: * Support for extended NSS BW support (dot11VHTExtendedNSSCapable) * Note that this is only looked at if the minimum number of chains * that the AP uses is < the number of TX chains the hardware has, * otherwise the NSS difference doesn't bother us. */ RATE_CTRL_CAPA_VHT_EXT_NSS_BW = BIT(0), /** * @RATE_CTRL_CAPA_AMPDU_TRIGGER: * mac80211 should start A-MPDU sessions on tx */ RATE_CTRL_CAPA_AMPDU_TRIGGER = BIT(1), }; struct rate_control_ops { unsigned long capa; const char *name; void *(*alloc)(struct ieee80211_hw *hw); void (*add_debugfs)(struct ieee80211_hw *hw, void *priv, struct dentry *debugfsdir); void (*free)(void *priv); void *(*alloc_sta)(void *priv, struct ieee80211_sta *sta, gfp_t gfp); void (*rate_init)(void *priv, struct ieee80211_supported_band *sband, struct cfg80211_chan_def *chandef, struct ieee80211_sta *sta, void *priv_sta); void (*rate_update)(void *priv, struct ieee80211_supported_band *sband, struct cfg80211_chan_def *chandef, struct ieee80211_sta *sta, void *priv_sta, u32 changed); void (*free_sta)(void *priv, struct ieee80211_sta *sta, void *priv_sta); void (*tx_status_ext)(void *priv, struct ieee80211_supported_band *sband, void *priv_sta, struct ieee80211_tx_status *st); void (*tx_status)(void *priv, struct ieee80211_supported_band *sband, struct ieee80211_sta *sta, void *priv_sta, struct sk_buff *skb); void (*get_rate)(void *priv, struct ieee80211_sta *sta, void *priv_sta, struct ieee80211_tx_rate_control *txrc); void (*add_sta_debugfs)(void *priv, void *priv_sta, struct dentry *dir); u32 (*get_expected_throughput)(void *priv_sta); }; static inline int rate_supported(struct ieee80211_sta *sta, enum nl80211_band band, int index) { return (sta == NULL || sta->deflink.supp_rates[band] & BIT(index)); } static inline s8 rate_lowest_index(struct ieee80211_supported_band *sband, struct ieee80211_sta *sta) { int i; for (i = 0; i < sband->n_bitrates; i++) if (rate_supported(sta, sband->band, i)) return i; /* warn when we cannot find a rate. */ WARN_ON_ONCE(1); /* and return 0 (the lowest index) */ return 0; } static inline bool rate_usable_index_exists(struct ieee80211_supported_band *sband, struct ieee80211_sta *sta) { unsigned int i; for (i = 0; i < sband->n_bitrates; i++) if (rate_supported(sta, sband->band, i)) return true; return false; } /** * rate_control_set_rates - pass the sta rate selection to mac80211/driver * * When not doing a rate control probe to test rates, rate control should pass * its rate selection to mac80211. If the driver supports receiving a station * rate table, it will use it to ensure that frames are always sent based on * the most recent rate control module decision. * * @hw: pointer as obtained from ieee80211_alloc_hw() * @pubsta: &struct ieee80211_sta pointer to the target destination. * @rates: new tx rate set to be used for this station. * * Return: 0 on success. An error code otherwise. */ int rate_control_set_rates(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, struct ieee80211_sta_rates *rates); int ieee80211_rate_control_register(const struct rate_control_ops *ops); void ieee80211_rate_control_unregister(const struct rate_control_ops *ops); static inline bool conf_is_ht20(struct ieee80211_conf *conf) { return conf->chandef.width == NL80211_CHAN_WIDTH_20; } static inline bool conf_is_ht40_minus(struct ieee80211_conf *conf) { return conf->chandef.width == NL80211_CHAN_WIDTH_40 && conf->chandef.center_freq1 < conf->chandef.chan->center_freq; } static inline bool conf_is_ht40_plus(struct ieee80211_conf *conf) { return conf->chandef.width == NL80211_CHAN_WIDTH_40 && conf->chandef.center_freq1 > conf->chandef.chan->center_freq; } static inline bool conf_is_ht40(struct ieee80211_conf *conf) { return conf->chandef.width == NL80211_CHAN_WIDTH_40; } static inline bool conf_is_ht(struct ieee80211_conf *conf) { return (conf->chandef.width != NL80211_CHAN_WIDTH_5) && (conf->chandef.width != NL80211_CHAN_WIDTH_10) && (conf->chandef.width != NL80211_CHAN_WIDTH_20_NOHT); } static inline enum nl80211_iftype ieee80211_iftype_p2p(enum nl80211_iftype type, bool p2p) { if (p2p) { switch (type) { case NL80211_IFTYPE_STATION: return NL80211_IFTYPE_P2P_CLIENT; case NL80211_IFTYPE_AP: return NL80211_IFTYPE_P2P_GO; default: break; } } return type; } static inline enum nl80211_iftype ieee80211_vif_type_p2p(struct ieee80211_vif *vif) { return ieee80211_iftype_p2p(vif->type, vif->p2p); } /** * ieee80211_get_he_iftype_cap_vif - return HE capabilities for sband/vif * @sband: the sband to search for the iftype on * @vif: the vif to get the iftype from * * Return: pointer to the struct ieee80211_sta_he_cap, or %NULL is none found */ static inline const struct ieee80211_sta_he_cap * ieee80211_get_he_iftype_cap_vif(const struct ieee80211_supported_band *sband, struct ieee80211_vif *vif) { return ieee80211_get_he_iftype_cap(sband, ieee80211_vif_type_p2p(vif)); } /** * ieee80211_get_he_6ghz_capa_vif - return HE 6 GHz capabilities * @sband: the sband to search for the STA on * @vif: the vif to get the iftype from * * Return: the 6GHz capabilities */ static inline __le16 ieee80211_get_he_6ghz_capa_vif(const struct ieee80211_supported_band *sband, struct ieee80211_vif *vif) { return ieee80211_get_he_6ghz_capa(sband, ieee80211_vif_type_p2p(vif)); } /** * ieee80211_get_eht_iftype_cap_vif - return ETH capabilities for sband/vif * @sband: the sband to search for the iftype on * @vif: the vif to get the iftype from * * Return: pointer to the struct ieee80211_sta_eht_cap, or %NULL is none found */ static inline const struct ieee80211_sta_eht_cap * ieee80211_get_eht_iftype_cap_vif(const struct ieee80211_supported_band *sband, struct ieee80211_vif *vif) { return ieee80211_get_eht_iftype_cap(sband, ieee80211_vif_type_p2p(vif)); } /** * ieee80211_update_mu_groups - set the VHT MU-MIMO groud data * * @vif: the specified virtual interface * @link_id: the link ID for MLO, otherwise 0 * @membership: 64 bits array - a bit is set if station is member of the group * @position: 2 bits per group id indicating the position in the group * * Note: This function assumes that the given vif is valid and the position and * membership data is of the correct size and are in the same byte order as the * matching GroupId management frame. * Calls to this function need to be serialized with RX path. */ void ieee80211_update_mu_groups(struct ieee80211_vif *vif, unsigned int link_id, const u8 *membership, const u8 *position); void ieee80211_enable_rssi_reports(struct ieee80211_vif *vif, int rssi_min_thold, int rssi_max_thold); void ieee80211_disable_rssi_reports(struct ieee80211_vif *vif); /** * ieee80211_ave_rssi - report the average RSSI for the specified interface * * @vif: the specified virtual interface * * Note: This function assumes that the given vif is valid. * * Return: The average RSSI value for the requested interface, or 0 if not * applicable. */ int ieee80211_ave_rssi(struct ieee80211_vif *vif); /** * ieee80211_report_wowlan_wakeup - report WoWLAN wakeup * @vif: virtual interface * @wakeup: wakeup reason(s) * @gfp: allocation flags * * See cfg80211_report_wowlan_wakeup(). */ void ieee80211_report_wowlan_wakeup(struct ieee80211_vif *vif, struct cfg80211_wowlan_wakeup *wakeup, gfp_t gfp); /** * ieee80211_tx_prepare_skb - prepare an 802.11 skb for transmission * @hw: pointer as obtained from ieee80211_alloc_hw() * @vif: virtual interface * @skb: frame to be sent from within the driver * @band: the band to transmit on * @sta: optional pointer to get the station to send the frame to * * Return: %true if the skb was prepared, %false otherwise * * Note: must be called under RCU lock */ bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct sk_buff *skb, int band, struct ieee80211_sta **sta); /** * ieee80211_parse_tx_radiotap - Sanity-check and parse the radiotap header * of injected frames. * * To accurately parse and take into account rate and retransmission fields, * you must initialize the chandef field in the ieee80211_tx_info structure * of the skb before calling this function. * * @skb: packet injected by userspace * @dev: the &struct device of this 802.11 device * * Return: %true if the radiotap header was parsed, %false otherwise */ bool ieee80211_parse_tx_radiotap(struct sk_buff *skb, struct net_device *dev); /** * struct ieee80211_noa_data - holds temporary data for tracking P2P NoA state * * @next_tsf: TSF timestamp of the next absent state change * @has_next_tsf: next absent state change event pending * * @absent: descriptor bitmask, set if GO is currently absent * * private: * * @count: count fields from the NoA descriptors * @desc: adjusted data from the NoA */ struct ieee80211_noa_data { u32 next_tsf; bool has_next_tsf; u8 absent; u8 count[IEEE80211_P2P_NOA_DESC_MAX]; struct { u32 start; u32 duration; u32 interval; } desc[IEEE80211_P2P_NOA_DESC_MAX]; }; /** * ieee80211_parse_p2p_noa - initialize NoA tracking data from P2P IE * * @attr: P2P NoA IE * @data: NoA tracking data * @tsf: current TSF timestamp * * Return: number of successfully parsed descriptors */ int ieee80211_parse_p2p_noa(const struct ieee80211_p2p_noa_attr *attr, struct ieee80211_noa_data *data, u32 tsf); /** * ieee80211_update_p2p_noa - get next pending P2P GO absent state change * * @data: NoA tracking data * @tsf: current TSF timestamp */ void ieee80211_update_p2p_noa(struct ieee80211_noa_data *data, u32 tsf); /** * ieee80211_tdls_oper_request - request userspace to perform a TDLS operation * @vif: virtual interface * @peer: the peer's destination address * @oper: the requested TDLS operation * @reason_code: reason code for the operation, valid for TDLS teardown * @gfp: allocation flags * * See cfg80211_tdls_oper_request(). */ void ieee80211_tdls_oper_request(struct ieee80211_vif *vif, const u8 *peer, enum nl80211_tdls_operation oper, u16 reason_code, gfp_t gfp); /** * ieee80211_reserve_tid - request to reserve a specific TID * * There is sometimes a need (such as in TDLS) for blocking the driver from * using a specific TID so that the FW can use it for certain operations such * as sending PTI requests. To make sure that the driver doesn't use that TID, * this function must be called as it flushes out packets on this TID and marks * it as blocked, so that any transmit for the station on this TID will be * redirected to the alternative TID in the same AC. * * Note that this function blocks and may call back into the driver, so it * should be called without driver locks held. Also note this function should * only be called from the driver's @sta_state callback. * * @sta: the station to reserve the TID for * @tid: the TID to reserve * * Returns: 0 on success, else on failure */ int ieee80211_reserve_tid(struct ieee80211_sta *sta, u8 tid); /** * ieee80211_unreserve_tid - request to unreserve a specific TID * * Once there is no longer any need for reserving a certain TID, this function * should be called, and no longer will packets have their TID modified for * preventing use of this TID in the driver. * * Note that this function blocks and acquires a lock, so it should be called * without driver locks held. Also note this function should only be called * from the driver's @sta_state callback. * * @sta: the station * @tid: the TID to unreserve */ void ieee80211_unreserve_tid(struct ieee80211_sta *sta, u8 tid); /** * ieee80211_tx_dequeue - dequeue a packet from a software tx queue * * @hw: pointer as obtained from ieee80211_alloc_hw() * @txq: pointer obtained from station or virtual interface, or from * ieee80211_next_txq() * * Return: the skb if successful, %NULL if no frame was available. * * Note that this must be called in an rcu_read_lock() critical section, * which can only be released after the SKB was handled. Some pointers in * skb->cb, e.g. the key pointer, are protected by RCU and thus the * critical section must persist not just for the duration of this call * but for the duration of the frame handling. * However, also note that while in the wake_tx_queue() method, * rcu_read_lock() is already held. * * softirqs must also be disabled when this function is called. * In process context, use ieee80211_tx_dequeue_ni() instead. */ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, struct ieee80211_txq *txq); /** * ieee80211_tx_dequeue_ni - dequeue a packet from a software tx queue * (in process context) * * Like ieee80211_tx_dequeue() but can be called in process context * (internally disables bottom halves). * * @hw: pointer as obtained from ieee80211_alloc_hw() * @txq: pointer obtained from station or virtual interface, or from * ieee80211_next_txq() * * Return: the skb if successful, %NULL if no frame was available. */ static inline struct sk_buff *ieee80211_tx_dequeue_ni(struct ieee80211_hw *hw, struct ieee80211_txq *txq) { struct sk_buff *skb; local_bh_disable(); skb = ieee80211_tx_dequeue(hw, txq); local_bh_enable(); return skb; } /** * ieee80211_handle_wake_tx_queue - mac80211 handler for wake_tx_queue callback * * @hw: pointer as obtained from wake_tx_queue() callback(). * @txq: pointer as obtained from wake_tx_queue() callback(). * * Drivers can use this function for the mandatory mac80211 wake_tx_queue * callback in struct ieee80211_ops. They should not call this function. */ void ieee80211_handle_wake_tx_queue(struct ieee80211_hw *hw, struct ieee80211_txq *txq); /** * ieee80211_next_txq - get next tx queue to pull packets from * * @hw: pointer as obtained from ieee80211_alloc_hw() * @ac: AC number to return packets from. * * Return: the next txq if successful, %NULL if no queue is eligible. If a txq * is returned, it should be returned with ieee80211_return_txq() after the * driver has finished scheduling it. */ struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw, u8 ac); /** * ieee80211_txq_schedule_start - start new scheduling round for TXQs * * @hw: pointer as obtained from ieee80211_alloc_hw() * @ac: AC number to acquire locks for * * Should be called before ieee80211_next_txq() or ieee80211_return_txq(). * The driver must not call multiple TXQ scheduling rounds concurrently. */ void ieee80211_txq_schedule_start(struct ieee80211_hw *hw, u8 ac); /* (deprecated) */ static inline void ieee80211_txq_schedule_end(struct ieee80211_hw *hw, u8 ac) { } void __ieee80211_schedule_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq, bool force); /** * ieee80211_schedule_txq - schedule a TXQ for transmission * * @hw: pointer as obtained from ieee80211_alloc_hw() * @txq: pointer obtained from station or virtual interface * * Schedules a TXQ for transmission if it is not already scheduled, * even if mac80211 does not have any packets buffered. * * The driver may call this function if it has buffered packets for * this TXQ internally. */ static inline void ieee80211_schedule_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq) { __ieee80211_schedule_txq(hw, txq, true); } /** * ieee80211_return_txq - return a TXQ previously acquired by ieee80211_next_txq() * * @hw: pointer as obtained from ieee80211_alloc_hw() * @txq: pointer obtained from station or virtual interface * @force: schedule txq even if mac80211 does not have any buffered packets. * * The driver may set force=true if it has buffered packets for this TXQ * internally. */ static inline void ieee80211_return_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq, bool force) { __ieee80211_schedule_txq(hw, txq, force); } /** * ieee80211_txq_may_transmit - check whether TXQ is allowed to transmit * * This function is used to check whether given txq is allowed to transmit by * the airtime scheduler, and can be used by drivers to access the airtime * fairness accounting without using the scheduling order enforced by * next_txq(). * * Returns %true if the airtime scheduler thinks the TXQ should be allowed to * transmit, and %false if it should be throttled. This function can also have * the side effect of rotating the TXQ in the scheduler rotation, which will * eventually bring the deficit to positive and allow the station to transmit * again. * * The API ieee80211_txq_may_transmit() also ensures that TXQ list will be * aligned against driver's own round-robin scheduler list. i.e it rotates * the TXQ list till it makes the requested node becomes the first entry * in TXQ list. Thus both the TXQ list and driver's list are in sync. If this * function returns %true, the driver is expected to schedule packets * for transmission, and then return the TXQ through ieee80211_return_txq(). * * @hw: pointer as obtained from ieee80211_alloc_hw() * @txq: pointer obtained from station or virtual interface * * Return: %true if transmission is allowed, %false otherwise */ bool ieee80211_txq_may_transmit(struct ieee80211_hw *hw, struct ieee80211_txq *txq); /** * ieee80211_txq_get_depth - get pending frame/byte count of given txq * * The values are not guaranteed to be coherent with regard to each other, i.e. * txq state can change half-way of this function and the caller may end up * with "new" frame_cnt and "old" byte_cnt or vice-versa. * * @txq: pointer obtained from station or virtual interface * @frame_cnt: pointer to store frame count * @byte_cnt: pointer to store byte count */ void ieee80211_txq_get_depth(struct ieee80211_txq *txq, unsigned long *frame_cnt, unsigned long *byte_cnt); /** * ieee80211_nan_func_terminated - notify about NAN function termination. * * This function is used to notify mac80211 about NAN function termination. * Note that this function can't be called from hard irq. * * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @inst_id: the local instance id * @reason: termination reason (one of the NL80211_NAN_FUNC_TERM_REASON_*) * @gfp: allocation flags */ void ieee80211_nan_func_terminated(struct ieee80211_vif *vif, u8 inst_id, enum nl80211_nan_func_term_reason reason, gfp_t gfp); /** * ieee80211_nan_func_match - notify about NAN function match event. * * This function is used to notify mac80211 about NAN function match. The * cookie inside the match struct will be assigned by mac80211. * Note that this function can't be called from hard irq. * * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @match: match event information * @gfp: allocation flags */ void ieee80211_nan_func_match(struct ieee80211_vif *vif, struct cfg80211_nan_match_params *match, gfp_t gfp); /** * ieee80211_calc_rx_airtime - calculate estimated transmission airtime for RX. * * This function calculates the estimated airtime usage of a frame based on the * rate information in the RX status struct and the frame length. * * @hw: pointer as obtained from ieee80211_alloc_hw() * @status: &struct ieee80211_rx_status containing the transmission rate * information. * @len: frame length in bytes * * Return: the airtime estimate */ u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw, struct ieee80211_rx_status *status, int len); /** * ieee80211_calc_tx_airtime - calculate estimated transmission airtime for TX. * * This function calculates the estimated airtime usage of a frame based on the * rate information in the TX info struct and the frame length. * * @hw: pointer as obtained from ieee80211_alloc_hw() * @info: &struct ieee80211_tx_info of the frame. * @len: frame length in bytes * * Return: the airtime estimate */ u32 ieee80211_calc_tx_airtime(struct ieee80211_hw *hw, struct ieee80211_tx_info *info, int len); /** * ieee80211_get_fils_discovery_tmpl - Get FILS discovery template. * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * * The driver is responsible for freeing the returned skb. * * Return: FILS discovery template. %NULL on error. */ struct sk_buff *ieee80211_get_fils_discovery_tmpl(struct ieee80211_hw *hw, struct ieee80211_vif *vif); /** * ieee80211_get_unsol_bcast_probe_resp_tmpl - Get unsolicited broadcast * probe response template. * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * * The driver is responsible for freeing the returned skb. * * Return: Unsolicited broadcast probe response template. %NULL on error. */ struct sk_buff * ieee80211_get_unsol_bcast_probe_resp_tmpl(struct ieee80211_hw *hw, struct ieee80211_vif *vif); /** * ieee80211_obss_color_collision_notify - notify userland about a BSS color * collision. * @link_id: valid link_id during MLO or 0 for non-MLO * * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @color_bitmap: a 64 bit bitmap representing the colors that the local BSS is * aware of. */ void ieee80211_obss_color_collision_notify(struct ieee80211_vif *vif, u64 color_bitmap, u8 link_id); /** * ieee80211_is_tx_data - check if frame is a data frame * * The function is used to check if a frame is a data frame. Frames with * hardware encapsulation enabled are data frames. * * @skb: the frame to be transmitted. * * Return: %true if @skb is a data frame, %false otherwise */ static inline bool ieee80211_is_tx_data(struct sk_buff *skb) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (void *) skb->data; return info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP || ieee80211_is_data(hdr->frame_control); } /** * ieee80211_set_active_links - set active links in client mode * @vif: interface to set active links on * @active_links: the new active links bitmap * * Context: Must be called with wiphy mutex held; may sleep; calls * back into the driver. * * This changes the active links on an interface. The interface * must be in client mode (in AP mode, all links are always active), * and @active_links must be a subset of the vif's valid_links. * * If a link is switched off and another is switched on at the same * time (e.g. active_links going from 0x1 to 0x10) then you will get * a sequence of calls like * * - change_vif_links(0x11) * - unassign_vif_chanctx(link_id=0) * - assign_vif_chanctx(link_id=4) * - change_sta_links(0x11) for each affected STA (the AP) * (TDLS connections on now inactive links should be torn down) * - remove group keys on the old link (link_id 0) * - add new group keys (GTK/IGTK/BIGTK) on the new link (link_id 4) * - change_sta_links(0x10) for each affected STA (the AP) * - change_vif_links(0x10) * * Return: 0 on success. An error code otherwise. */ int ieee80211_set_active_links(struct ieee80211_vif *vif, u16 active_links); /** * ieee80211_set_active_links_async - asynchronously set active links * @vif: interface to set active links on * @active_links: the new active links bitmap * * See ieee80211_set_active_links() for more information, the only * difference here is that the link change is triggered async and * can be called in any context, but the link switch will only be * completed after it returns. */ void ieee80211_set_active_links_async(struct ieee80211_vif *vif, u16 active_links); /** * ieee80211_send_teardown_neg_ttlm - tear down a negotiated TTLM request * @vif: the interface on which the tear down request should be sent. * * This function can be used to tear down a previously accepted negotiated * TTLM request. */ void ieee80211_send_teardown_neg_ttlm(struct ieee80211_vif *vif); /** * ieee80211_chan_width_to_rx_bw - convert channel width to STA RX bandwidth * @width: the channel width value to convert * Return: the STA RX bandwidth value for the channel width */ static inline enum ieee80211_sta_rx_bandwidth ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width) { switch (width) { default: WARN_ON_ONCE(1); fallthrough; case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_20: return IEEE80211_STA_RX_BW_20; case NL80211_CHAN_WIDTH_40: return IEEE80211_STA_RX_BW_40; case NL80211_CHAN_WIDTH_80: return IEEE80211_STA_RX_BW_80; case NL80211_CHAN_WIDTH_160: case NL80211_CHAN_WIDTH_80P80: return IEEE80211_STA_RX_BW_160; case NL80211_CHAN_WIDTH_320: return IEEE80211_STA_RX_BW_320; } } /** * ieee80211_prepare_rx_omi_bw - prepare for sending BW RX OMI * @link_sta: the link STA the OMI is going to be sent to * @bw: the bandwidth requested * * When the driver decides to do RX OMI to change bandwidth with a STA * it calls this function to prepare, then sends the OMI, and finally * calls ieee80211_finalize_rx_omi_bw(). * * Note that the (link) STA rate control is updated accordingly as well, * but the chanctx might not be updated if there are other users. * If the intention is to reduce the listen bandwidth, the driver must * ensure there are no TDLS stations nor other uses of the chanctx. * * Also note that in order to sequence correctly, narrowing bandwidth * will only happen in ieee80211_finalize_rx_omi_bw(), whereas widening * again (e.g. going back to normal) will happen here. * * Note that we treat this symmetrically, so if the driver calls this * and tells the peer to only send with a lower bandwidth, we assume * that the driver also wants to only send at that lower bandwidth, to * allow narrowing of the chanctx request for this station/interface. * * Finally, the driver must ensure that if the function returned %true, * ieee80211_finalize_rx_omi_bw() is also called, even for example in * case of HW restart. * * Context: Must be called with wiphy mutex held, and will call back * into the driver, so ensure no driver locks are held. * * Return: %true if changes are going to be made, %false otherwise */ bool ieee80211_prepare_rx_omi_bw(struct ieee80211_link_sta *link_sta, enum ieee80211_sta_rx_bandwidth bw); /** * ieee80211_finalize_rx_omi_bw - finalize BW RX OMI update * @link_sta: the link STA the OMI was sent to * * See ieee80211_client_prepare_rx_omi_bw(). Context is the same here * as well. */ void ieee80211_finalize_rx_omi_bw(struct ieee80211_link_sta *link_sta); /* for older drivers - let's not document these ... */ int ieee80211_emulate_add_chanctx(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *ctx); void ieee80211_emulate_remove_chanctx(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *ctx); void ieee80211_emulate_change_chanctx(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *ctx, u32 changed); int ieee80211_emulate_switch_vif_chanctx(struct ieee80211_hw *hw, struct ieee80211_vif_chanctx_switch *vifs, int n_vifs, enum ieee80211_chanctx_switch_mode mode); #endif /* MAC80211_H */
3 3 3 3 3 3 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #include "soft-interface.h" #include "main.h" #include <linux/atomic.h> #include <linux/byteorder/generic.h> #include <linux/cache.h> #include <linux/compiler.h> #include <linux/container_of.h> #include <linux/cpumask.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/jiffies.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/percpu.h> #include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/socket.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" #include "gateway_client.h" #include "hard-interface.h" #include "multicast.h" #include "network-coding.h" #include "send.h" #include "translation-table.h" /** * batadv_skb_head_push() - Increase header size and move (push) head pointer * @skb: packet buffer which should be modified * @len: number of bytes to add * * Return: 0 on success or negative error number in case of failure */ int batadv_skb_head_push(struct sk_buff *skb, unsigned int len) { int result; /* TODO: We must check if we can release all references to non-payload * data using __skb_header_release in our skbs to allow skb_cow_header * to work optimally. This means that those skbs are not allowed to read * or write any data which is before the current position of skb->data * after that call and thus allow other skbs with the same data buffer * to write freely in that area. */ result = skb_cow_head(skb, len); if (result < 0) return result; skb_push(skb, len); return 0; } static int batadv_interface_open(struct net_device *dev) { netif_start_queue(dev); return 0; } static int batadv_interface_release(struct net_device *dev) { netif_stop_queue(dev); return 0; } /** * batadv_sum_counter() - Sum the cpu-local counters for index 'idx' * @bat_priv: the bat priv with all the soft interface information * @idx: index of counter to sum up * * Return: sum of all cpu-local counters */ static u64 batadv_sum_counter(struct batadv_priv *bat_priv, size_t idx) { u64 *counters, sum = 0; int cpu; for_each_possible_cpu(cpu) { counters = per_cpu_ptr(bat_priv->bat_counters, cpu); sum += counters[idx]; } return sum; } static struct net_device_stats *batadv_interface_stats(struct net_device *dev) { struct batadv_priv *bat_priv = netdev_priv(dev); struct net_device_stats *stats = &dev->stats; stats->tx_packets = batadv_sum_counter(bat_priv, BATADV_CNT_TX); stats->tx_bytes = batadv_sum_counter(bat_priv, BATADV_CNT_TX_BYTES); stats->tx_dropped = batadv_sum_counter(bat_priv, BATADV_CNT_TX_DROPPED); stats->rx_packets = batadv_sum_counter(bat_priv, BATADV_CNT_RX); stats->rx_bytes = batadv_sum_counter(bat_priv, BATADV_CNT_RX_BYTES); return stats; } static int batadv_interface_set_mac_addr(struct net_device *dev, void *p) { struct batadv_priv *bat_priv = netdev_priv(dev); struct batadv_softif_vlan *vlan; struct sockaddr *addr = p; u8 old_addr[ETH_ALEN]; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; ether_addr_copy(old_addr, dev->dev_addr); eth_hw_addr_set(dev, addr->sa_data); /* only modify transtable if it has been initialized before */ if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) return 0; rcu_read_lock(); hlist_for_each_entry_rcu(vlan, &bat_priv->softif_vlan_list, list) { batadv_tt_local_remove(bat_priv, old_addr, vlan->vid, "mac address changed", false); batadv_tt_local_add(dev, addr->sa_data, vlan->vid, BATADV_NULL_IFINDEX, BATADV_NO_MARK); } rcu_read_unlock(); return 0; } static int batadv_interface_change_mtu(struct net_device *dev, int new_mtu) { struct batadv_priv *bat_priv = netdev_priv(dev); /* check ranges */ if (new_mtu < ETH_MIN_MTU || new_mtu > batadv_hardif_min_mtu(dev)) return -EINVAL; WRITE_ONCE(dev->mtu, new_mtu); bat_priv->mtu_set_by_user = new_mtu; return 0; } /** * batadv_interface_set_rx_mode() - set the rx mode of a device * @dev: registered network device to modify * * We do not actually need to set any rx filters for the virtual batman * soft interface. However a dummy handler enables a user to set static * multicast listeners for instance. */ static void batadv_interface_set_rx_mode(struct net_device *dev) { } static netdev_tx_t batadv_interface_tx(struct sk_buff *skb, struct net_device *soft_iface) { struct ethhdr *ethhdr; struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct batadv_hard_iface *primary_if = NULL; struct batadv_bcast_packet *bcast_packet; static const u8 stp_addr[ETH_ALEN] = {0x01, 0x80, 0xC2, 0x00, 0x00, 0x00}; static const u8 ectp_addr[ETH_ALEN] = {0xCF, 0x00, 0x00, 0x00, 0x00, 0x00}; enum batadv_dhcp_recipient dhcp_rcp = BATADV_DHCP_NO; u8 *dst_hint = NULL, chaddr[ETH_ALEN]; struct vlan_ethhdr *vhdr; unsigned int header_len = 0; int data_len = skb->len, ret; unsigned long brd_delay = 0; bool do_bcast = false, client_added; unsigned short vid; u32 seqno; int gw_mode; enum batadv_forw_mode forw_mode = BATADV_FORW_BCAST; int mcast_is_routable = 0; int network_offset = ETH_HLEN; __be16 proto; if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) goto dropped; /* reset control block to avoid left overs from previous users */ memset(skb->cb, 0, sizeof(struct batadv_skb_cb)); netif_trans_update(soft_iface); vid = batadv_get_vid(skb, 0); skb_reset_mac_header(skb); ethhdr = eth_hdr(skb); proto = ethhdr->h_proto; switch (ntohs(proto)) { case ETH_P_8021Q: if (!pskb_may_pull(skb, sizeof(*vhdr))) goto dropped; vhdr = vlan_eth_hdr(skb); proto = vhdr->h_vlan_encapsulated_proto; /* drop batman-in-batman packets to prevent loops */ if (proto != htons(ETH_P_BATMAN)) { network_offset += VLAN_HLEN; break; } fallthrough; case ETH_P_BATMAN: goto dropped; } skb_set_network_header(skb, network_offset); if (batadv_bla_tx(bat_priv, skb, vid)) goto dropped; /* skb->data might have been reallocated by batadv_bla_tx() */ ethhdr = eth_hdr(skb); /* Register the client MAC in the transtable */ if (!is_multicast_ether_addr(ethhdr->h_source) && !batadv_bla_is_loopdetect_mac(ethhdr->h_source)) { client_added = batadv_tt_local_add(soft_iface, ethhdr->h_source, vid, skb->skb_iif, skb->mark); if (!client_added) goto dropped; } /* Snoop address candidates from DHCPACKs for early DAT filling */ batadv_dat_snoop_outgoing_dhcp_ack(bat_priv, skb, proto, vid); /* don't accept stp packets. STP does not help in meshes. * better use the bridge loop avoidance ... * * The same goes for ECTP sent at least by some Cisco Switches, * it might confuse the mesh when used with bridge loop avoidance. */ if (batadv_compare_eth(ethhdr->h_dest, stp_addr)) goto dropped; if (batadv_compare_eth(ethhdr->h_dest, ectp_addr)) goto dropped; gw_mode = atomic_read(&bat_priv->gw.mode); if (is_multicast_ether_addr(ethhdr->h_dest)) { /* if gw mode is off, broadcast every packet */ if (gw_mode == BATADV_GW_MODE_OFF) { do_bcast = true; goto send; } dhcp_rcp = batadv_gw_dhcp_recipient_get(skb, &header_len, chaddr); /* skb->data may have been modified by * batadv_gw_dhcp_recipient_get() */ ethhdr = eth_hdr(skb); /* if gw_mode is on, broadcast any non-DHCP message. * All the DHCP packets are going to be sent as unicast */ if (dhcp_rcp == BATADV_DHCP_NO) { do_bcast = true; goto send; } if (dhcp_rcp == BATADV_DHCP_TO_CLIENT) dst_hint = chaddr; else if ((gw_mode == BATADV_GW_MODE_SERVER) && (dhcp_rcp == BATADV_DHCP_TO_SERVER)) /* gateways should not forward any DHCP message if * directed to a DHCP server */ goto dropped; send: if (do_bcast && !is_broadcast_ether_addr(ethhdr->h_dest)) { forw_mode = batadv_mcast_forw_mode(bat_priv, skb, vid, &mcast_is_routable); switch (forw_mode) { case BATADV_FORW_BCAST: break; case BATADV_FORW_UCASTS: case BATADV_FORW_MCAST: do_bcast = false; break; case BATADV_FORW_NONE: fallthrough; default: goto dropped; } } } batadv_skb_set_priority(skb, 0); /* ethernet packet should be broadcasted */ if (do_bcast) { primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto dropped; /* in case of ARP request, we do not immediately broadcasti the * packet, instead we first wait for DAT to try to retrieve the * correct ARP entry */ if (batadv_dat_snoop_outgoing_arp_request(bat_priv, skb)) brd_delay = msecs_to_jiffies(ARP_REQ_DELAY); if (batadv_skb_head_push(skb, sizeof(*bcast_packet)) < 0) goto dropped; bcast_packet = (struct batadv_bcast_packet *)skb->data; bcast_packet->version = BATADV_COMPAT_VERSION; bcast_packet->ttl = BATADV_TTL - 1; /* batman packet type: broadcast */ bcast_packet->packet_type = BATADV_BCAST; bcast_packet->reserved = 0; /* hw address of first interface is the orig mac because only * this mac is known throughout the mesh */ ether_addr_copy(bcast_packet->orig, primary_if->net_dev->dev_addr); /* set broadcast sequence number */ seqno = atomic_inc_return(&bat_priv->bcast_seqno); bcast_packet->seqno = htonl(seqno); batadv_send_bcast_packet(bat_priv, skb, brd_delay, true); /* unicast packet */ } else { /* DHCP packets going to a server will use the GW feature */ if (dhcp_rcp == BATADV_DHCP_TO_SERVER) { ret = batadv_gw_out_of_range(bat_priv, skb); if (ret) goto dropped; ret = batadv_send_skb_via_gw(bat_priv, skb, vid); } else if (forw_mode == BATADV_FORW_UCASTS) { ret = batadv_mcast_forw_send(bat_priv, skb, vid, mcast_is_routable); } else if (forw_mode == BATADV_FORW_MCAST) { ret = batadv_mcast_forw_mcsend(bat_priv, skb); } else { if (batadv_dat_snoop_outgoing_arp_request(bat_priv, skb)) goto dropped; batadv_dat_snoop_outgoing_arp_reply(bat_priv, skb); ret = batadv_send_skb_via_tt(bat_priv, skb, dst_hint, vid); } if (ret != NET_XMIT_SUCCESS) goto dropped_freed; } batadv_inc_counter(bat_priv, BATADV_CNT_TX); batadv_add_counter(bat_priv, BATADV_CNT_TX_BYTES, data_len); goto end; dropped: kfree_skb(skb); dropped_freed: batadv_inc_counter(bat_priv, BATADV_CNT_TX_DROPPED); end: batadv_hardif_put(primary_if); return NETDEV_TX_OK; } /** * batadv_interface_rx() - receive ethernet frame on local batman-adv interface * @soft_iface: local interface which will receive the ethernet frame * @skb: ethernet frame for @soft_iface * @hdr_size: size of already parsed batman-adv header * @orig_node: originator from which the batman-adv packet was sent * * Sends an ethernet frame to the receive path of the local @soft_iface. * skb->data has still point to the batman-adv header with the size @hdr_size. * The caller has to have parsed this header already and made sure that at least * @hdr_size bytes are still available for pull in @skb. * * The packet may still get dropped. This can happen when the encapsulated * ethernet frame is invalid or contains again an batman-adv packet. Also * unicast packets will be dropped directly when it was sent between two * isolated clients. */ void batadv_interface_rx(struct net_device *soft_iface, struct sk_buff *skb, int hdr_size, struct batadv_orig_node *orig_node) { struct batadv_bcast_packet *batadv_bcast_packet; struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct vlan_ethhdr *vhdr; struct ethhdr *ethhdr; unsigned short vid; int packet_type; batadv_bcast_packet = (struct batadv_bcast_packet *)skb->data; packet_type = batadv_bcast_packet->packet_type; skb_pull_rcsum(skb, hdr_size); skb_reset_mac_header(skb); /* clean the netfilter state now that the batman-adv header has been * removed */ nf_reset_ct(skb); if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) goto dropped; vid = batadv_get_vid(skb, 0); ethhdr = eth_hdr(skb); switch (ntohs(ethhdr->h_proto)) { case ETH_P_8021Q: if (!pskb_may_pull(skb, VLAN_ETH_HLEN)) goto dropped; vhdr = skb_vlan_eth_hdr(skb); /* drop batman-in-batman packets to prevent loops */ if (vhdr->h_vlan_encapsulated_proto != htons(ETH_P_BATMAN)) break; fallthrough; case ETH_P_BATMAN: goto dropped; } /* skb->dev & skb->pkt_type are set here */ skb->protocol = eth_type_trans(skb, soft_iface); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); batadv_inc_counter(bat_priv, BATADV_CNT_RX); batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES, skb->len + ETH_HLEN); /* Let the bridge loop avoidance check the packet. If will * not handle it, we can safely push it up. */ if (batadv_bla_rx(bat_priv, skb, vid, packet_type)) goto out; if (orig_node) batadv_tt_add_temporary_global_entry(bat_priv, orig_node, ethhdr->h_source, vid); if (is_multicast_ether_addr(ethhdr->h_dest)) { /* set the mark on broadcast packets if AP isolation is ON and * the packet is coming from an "isolated" client */ if (batadv_vlan_ap_isola_get(bat_priv, vid) && batadv_tt_global_is_isolated(bat_priv, ethhdr->h_source, vid)) { /* save bits in skb->mark not covered by the mask and * apply the mark on the rest */ skb->mark &= ~bat_priv->isolation_mark_mask; skb->mark |= bat_priv->isolation_mark; } } else if (batadv_is_ap_isolated(bat_priv, ethhdr->h_source, ethhdr->h_dest, vid)) { goto dropped; } netif_rx(skb); goto out; dropped: kfree_skb(skb); out: return; } /** * batadv_softif_vlan_release() - release vlan from lists and queue for free * after rcu grace period * @ref: kref pointer of the vlan object */ void batadv_softif_vlan_release(struct kref *ref) { struct batadv_softif_vlan *vlan; vlan = container_of(ref, struct batadv_softif_vlan, refcount); spin_lock_bh(&vlan->bat_priv->softif_vlan_list_lock); hlist_del_rcu(&vlan->list); spin_unlock_bh(&vlan->bat_priv->softif_vlan_list_lock); kfree_rcu(vlan, rcu); } /** * batadv_softif_vlan_get() - get the vlan object for a specific vid * @bat_priv: the bat priv with all the soft interface information * @vid: the identifier of the vlan object to retrieve * * Return: the private data of the vlan matching the vid passed as argument or * NULL otherwise. The refcounter of the returned object is incremented by 1. */ struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, unsigned short vid) { struct batadv_softif_vlan *vlan_tmp, *vlan = NULL; rcu_read_lock(); hlist_for_each_entry_rcu(vlan_tmp, &bat_priv->softif_vlan_list, list) { if (vlan_tmp->vid != vid) continue; if (!kref_get_unless_zero(&vlan_tmp->refcount)) continue; vlan = vlan_tmp; break; } rcu_read_unlock(); return vlan; } /** * batadv_softif_create_vlan() - allocate the needed resources for a new vlan * @bat_priv: the bat priv with all the soft interface information * @vid: the VLAN identifier * * Return: 0 on success, a negative error otherwise. */ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) { struct batadv_softif_vlan *vlan; spin_lock_bh(&bat_priv->softif_vlan_list_lock); vlan = batadv_softif_vlan_get(bat_priv, vid); if (vlan) { batadv_softif_vlan_put(vlan); spin_unlock_bh(&bat_priv->softif_vlan_list_lock); return -EEXIST; } vlan = kzalloc(sizeof(*vlan), GFP_ATOMIC); if (!vlan) { spin_unlock_bh(&bat_priv->softif_vlan_list_lock); return -ENOMEM; } vlan->bat_priv = bat_priv; vlan->vid = vid; kref_init(&vlan->refcount); atomic_set(&vlan->ap_isolation, 0); kref_get(&vlan->refcount); hlist_add_head_rcu(&vlan->list, &bat_priv->softif_vlan_list); spin_unlock_bh(&bat_priv->softif_vlan_list_lock); /* add a new TT local entry. This one will be marked with the NOPURGE * flag */ batadv_tt_local_add(bat_priv->soft_iface, bat_priv->soft_iface->dev_addr, vid, BATADV_NULL_IFINDEX, BATADV_NO_MARK); /* don't return reference to new softif_vlan */ batadv_softif_vlan_put(vlan); return 0; } /** * batadv_softif_destroy_vlan() - remove and destroy a softif_vlan object * @bat_priv: the bat priv with all the soft interface information * @vlan: the object to remove */ static void batadv_softif_destroy_vlan(struct batadv_priv *bat_priv, struct batadv_softif_vlan *vlan) { /* explicitly remove the associated TT local entry because it is marked * with the NOPURGE flag */ batadv_tt_local_remove(bat_priv, bat_priv->soft_iface->dev_addr, vlan->vid, "vlan interface destroyed", false); batadv_softif_vlan_put(vlan); } /** * batadv_interface_add_vid() - ndo_add_vid API implementation * @dev: the netdev of the mesh interface * @proto: protocol of the vlan id * @vid: identifier of the new vlan * * Set up all the internal structures for handling the new vlan on top of the * mesh interface * * Return: 0 on success or a negative error code in case of failure. */ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto, unsigned short vid) { struct batadv_priv *bat_priv = netdev_priv(dev); struct batadv_softif_vlan *vlan; /* only 802.1Q vlans are supported. * batman-adv does not know how to handle other types */ if (proto != htons(ETH_P_8021Q)) return -EINVAL; /* VID 0 is only used to indicate "priority tag" frames which only * contain priority information and no VID. No management structures * should be created for this VID and it should be handled like an * untagged frame. */ if (vid == 0) return 0; vid |= BATADV_VLAN_HAS_TAG; /* if a new vlan is getting created and it already exists, it means that * it was not deleted yet. batadv_softif_vlan_get() increases the * refcount in order to revive the object. * * if it does not exist then create it. */ vlan = batadv_softif_vlan_get(bat_priv, vid); if (!vlan) return batadv_softif_create_vlan(bat_priv, vid); /* add a new TT local entry. This one will be marked with the NOPURGE * flag. This must be added again, even if the vlan object already * exists, because the entry was deleted by kill_vid() */ batadv_tt_local_add(bat_priv->soft_iface, bat_priv->soft_iface->dev_addr, vid, BATADV_NULL_IFINDEX, BATADV_NO_MARK); return 0; } /** * batadv_interface_kill_vid() - ndo_kill_vid API implementation * @dev: the netdev of the mesh interface * @proto: protocol of the vlan id * @vid: identifier of the deleted vlan * * Destroy all the internal structures used to handle the vlan identified by vid * on top of the mesh interface * * Return: 0 on success, -EINVAL if the specified prototype is not ETH_P_8021Q * or -ENOENT if the specified vlan id wasn't registered. */ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto, unsigned short vid) { struct batadv_priv *bat_priv = netdev_priv(dev); struct batadv_softif_vlan *vlan; /* only 802.1Q vlans are supported. batman-adv does not know how to * handle other types */ if (proto != htons(ETH_P_8021Q)) return -EINVAL; /* "priority tag" frames are handled like "untagged" frames * and no softif_vlan needs to be destroyed */ if (vid == 0) return 0; vlan = batadv_softif_vlan_get(bat_priv, vid | BATADV_VLAN_HAS_TAG); if (!vlan) return -ENOENT; batadv_softif_destroy_vlan(bat_priv, vlan); /* finally free the vlan object */ batadv_softif_vlan_put(vlan); return 0; } /* batman-adv network devices have devices nesting below it and are a special * "super class" of normal network devices; split their locks off into a * separate class since they always nest. */ static struct lock_class_key batadv_netdev_xmit_lock_key; static struct lock_class_key batadv_netdev_addr_lock_key; /** * batadv_set_lockdep_class_one() - Set lockdep class for a single tx queue * @dev: device which owns the tx queue * @txq: tx queue to modify * @_unused: always NULL */ static void batadv_set_lockdep_class_one(struct net_device *dev, struct netdev_queue *txq, void *_unused) { lockdep_set_class(&txq->_xmit_lock, &batadv_netdev_xmit_lock_key); } /** * batadv_set_lockdep_class() - Set txq and addr_list lockdep class * @dev: network device to modify */ static void batadv_set_lockdep_class(struct net_device *dev) { lockdep_set_class(&dev->addr_list_lock, &batadv_netdev_addr_lock_key); netdev_for_each_tx_queue(dev, batadv_set_lockdep_class_one, NULL); } /** * batadv_softif_init_late() - late stage initialization of soft interface * @dev: registered network device to modify * * Return: error code on failures */ static int batadv_softif_init_late(struct net_device *dev) { struct batadv_priv *bat_priv; u32 random_seqno; int ret; size_t cnt_len = sizeof(u64) * BATADV_CNT_NUM; batadv_set_lockdep_class(dev); bat_priv = netdev_priv(dev); bat_priv->soft_iface = dev; /* batadv_interface_stats() needs to be available as soon as * register_netdevice() has been called */ bat_priv->bat_counters = __alloc_percpu(cnt_len, __alignof__(u64)); if (!bat_priv->bat_counters) return -ENOMEM; atomic_set(&bat_priv->aggregated_ogms, 1); atomic_set(&bat_priv->bonding, 0); #ifdef CONFIG_BATMAN_ADV_BLA atomic_set(&bat_priv->bridge_loop_avoidance, 1); #endif #ifdef CONFIG_BATMAN_ADV_DAT atomic_set(&bat_priv->distributed_arp_table, 1); #endif #ifdef CONFIG_BATMAN_ADV_MCAST atomic_set(&bat_priv->multicast_mode, 1); atomic_set(&bat_priv->multicast_fanout, 16); atomic_set(&bat_priv->mcast.num_want_all_unsnoopables, 0); atomic_set(&bat_priv->mcast.num_want_all_ipv4, 0); atomic_set(&bat_priv->mcast.num_want_all_ipv6, 0); atomic_set(&bat_priv->mcast.num_no_mc_ptype_capa, 0); #endif atomic_set(&bat_priv->gw.mode, BATADV_GW_MODE_OFF); atomic_set(&bat_priv->gw.bandwidth_down, 100); atomic_set(&bat_priv->gw.bandwidth_up, 20); atomic_set(&bat_priv->orig_interval, 1000); atomic_set(&bat_priv->hop_penalty, 30); #ifdef CONFIG_BATMAN_ADV_DEBUG atomic_set(&bat_priv->log_level, 0); #endif atomic_set(&bat_priv->fragmentation, 1); atomic_set(&bat_priv->packet_size_max, ETH_DATA_LEN); atomic_set(&bat_priv->bcast_queue_left, BATADV_BCAST_QUEUE_LEN); atomic_set(&bat_priv->batman_queue_left, BATADV_BATMAN_QUEUE_LEN); atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE); atomic_set(&bat_priv->bcast_seqno, 1); atomic_set(&bat_priv->tt.vn, 0); atomic_set(&bat_priv->tt.ogm_append_cnt, 0); #ifdef CONFIG_BATMAN_ADV_BLA atomic_set(&bat_priv->bla.num_requests, 0); #endif atomic_set(&bat_priv->tp_num, 0); WRITE_ONCE(bat_priv->tt.local_changes, 0); bat_priv->tt.last_changeset = NULL; bat_priv->tt.last_changeset_len = 0; bat_priv->isolation_mark = 0; bat_priv->isolation_mark_mask = 0; /* randomize initial seqno to avoid collision */ get_random_bytes(&random_seqno, sizeof(random_seqno)); atomic_set(&bat_priv->frag_seqno, random_seqno); bat_priv->primary_if = NULL; batadv_nc_init_bat_priv(bat_priv); if (!bat_priv->algo_ops) { ret = batadv_algo_select(bat_priv, batadv_routing_algo); if (ret < 0) goto free_bat_counters; } ret = batadv_mesh_init(dev); if (ret < 0) goto free_bat_counters; return 0; free_bat_counters: free_percpu(bat_priv->bat_counters); bat_priv->bat_counters = NULL; return ret; } /** * batadv_softif_slave_add() - Add a slave interface to a batadv_soft_interface * @dev: batadv_soft_interface used as master interface * @slave_dev: net_device which should become the slave interface * @extack: extended ACK report struct * * Return: 0 if successful or error otherwise. */ static int batadv_softif_slave_add(struct net_device *dev, struct net_device *slave_dev, struct netlink_ext_ack *extack) { struct batadv_hard_iface *hard_iface; int ret = -EINVAL; hard_iface = batadv_hardif_get_by_netdev(slave_dev); if (!hard_iface || hard_iface->soft_iface) goto out; ret = batadv_hardif_enable_interface(hard_iface, dev); out: batadv_hardif_put(hard_iface); return ret; } /** * batadv_softif_slave_del() - Delete a slave iface from a batadv_soft_interface * @dev: batadv_soft_interface used as master interface * @slave_dev: net_device which should be removed from the master interface * * Return: 0 if successful or error otherwise. */ static int batadv_softif_slave_del(struct net_device *dev, struct net_device *slave_dev) { struct batadv_hard_iface *hard_iface; int ret = -EINVAL; hard_iface = batadv_hardif_get_by_netdev(slave_dev); if (!hard_iface || hard_iface->soft_iface != dev) goto out; batadv_hardif_disable_interface(hard_iface); ret = 0; out: batadv_hardif_put(hard_iface); return ret; } static const struct net_device_ops batadv_netdev_ops = { .ndo_init = batadv_softif_init_late, .ndo_open = batadv_interface_open, .ndo_stop = batadv_interface_release, .ndo_get_stats = batadv_interface_stats, .ndo_vlan_rx_add_vid = batadv_interface_add_vid, .ndo_vlan_rx_kill_vid = batadv_interface_kill_vid, .ndo_set_mac_address = batadv_interface_set_mac_addr, .ndo_change_mtu = batadv_interface_change_mtu, .ndo_set_rx_mode = batadv_interface_set_rx_mode, .ndo_start_xmit = batadv_interface_tx, .ndo_validate_addr = eth_validate_addr, .ndo_add_slave = batadv_softif_slave_add, .ndo_del_slave = batadv_softif_slave_del, }; static void batadv_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { strscpy(info->driver, "B.A.T.M.A.N. advanced", sizeof(info->driver)); strscpy(info->version, BATADV_SOURCE_VERSION, sizeof(info->version)); strscpy(info->fw_version, "N/A", sizeof(info->fw_version)); strscpy(info->bus_info, "batman", sizeof(info->bus_info)); } /* Inspired by drivers/net/ethernet/dlink/sundance.c:1702 * Declare each description string in struct.name[] to get fixed sized buffer * and compile time checking for strings longer than ETH_GSTRING_LEN. */ static const struct { const char name[ETH_GSTRING_LEN]; } batadv_counters_strings[] = { { "tx" }, { "tx_bytes" }, { "tx_dropped" }, { "rx" }, { "rx_bytes" }, { "forward" }, { "forward_bytes" }, { "mgmt_tx" }, { "mgmt_tx_bytes" }, { "mgmt_rx" }, { "mgmt_rx_bytes" }, { "frag_tx" }, { "frag_tx_bytes" }, { "frag_rx" }, { "frag_rx_bytes" }, { "frag_fwd" }, { "frag_fwd_bytes" }, { "tt_request_tx" }, { "tt_request_rx" }, { "tt_response_tx" }, { "tt_response_rx" }, { "tt_roam_adv_tx" }, { "tt_roam_adv_rx" }, #ifdef CONFIG_BATMAN_ADV_MCAST { "mcast_tx" }, { "mcast_tx_bytes" }, { "mcast_tx_local" }, { "mcast_tx_local_bytes" }, { "mcast_rx" }, { "mcast_rx_bytes" }, { "mcast_rx_local" }, { "mcast_rx_local_bytes" }, { "mcast_fwd" }, { "mcast_fwd_bytes" }, #endif #ifdef CONFIG_BATMAN_ADV_DAT { "dat_get_tx" }, { "dat_get_rx" }, { "dat_put_tx" }, { "dat_put_rx" }, { "dat_cached_reply_tx" }, #endif #ifdef CONFIG_BATMAN_ADV_NC { "nc_code" }, { "nc_code_bytes" }, { "nc_recode" }, { "nc_recode_bytes" }, { "nc_buffer" }, { "nc_decode" }, { "nc_decode_bytes" }, { "nc_decode_failed" }, { "nc_sniffed" }, #endif }; static void batadv_get_strings(struct net_device *dev, u32 stringset, u8 *data) { if (stringset == ETH_SS_STATS) memcpy(data, batadv_counters_strings, sizeof(batadv_counters_strings)); } static void batadv_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { struct batadv_priv *bat_priv = netdev_priv(dev); int i; for (i = 0; i < BATADV_CNT_NUM; i++) data[i] = batadv_sum_counter(bat_priv, i); } static int batadv_get_sset_count(struct net_device *dev, int stringset) { if (stringset == ETH_SS_STATS) return BATADV_CNT_NUM; return -EOPNOTSUPP; } static const struct ethtool_ops batadv_ethtool_ops = { .get_drvinfo = batadv_get_drvinfo, .get_link = ethtool_op_get_link, .get_strings = batadv_get_strings, .get_ethtool_stats = batadv_get_ethtool_stats, .get_sset_count = batadv_get_sset_count, }; /** * batadv_softif_free() - Deconstructor of batadv_soft_interface * @dev: Device to cleanup and remove */ static void batadv_softif_free(struct net_device *dev) { batadv_mesh_free(dev); /* some scheduled RCU callbacks need the bat_priv struct to accomplish * their tasks. Wait for them all to be finished before freeing the * netdev and its private data (bat_priv) */ rcu_barrier(); } /** * batadv_softif_init_early() - early stage initialization of soft interface * @dev: registered network device to modify */ static void batadv_softif_init_early(struct net_device *dev) { ether_setup(dev); dev->netdev_ops = &batadv_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = batadv_softif_free; dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; dev->priv_flags |= IFF_NO_QUEUE; dev->lltx = true; dev->netns_local = true; /* can't call min_mtu, because the needed variables * have not been initialized yet */ dev->mtu = ETH_DATA_LEN; /* generate random address */ eth_hw_addr_random(dev); dev->ethtool_ops = &batadv_ethtool_ops; } /** * batadv_softif_validate() - validate configuration of new batadv link * @tb: IFLA_INFO_DATA netlink attributes * @data: enum batadv_ifla_attrs attributes * @extack: extended ACK report struct * * Return: 0 if successful or error otherwise. */ static int batadv_softif_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct batadv_algo_ops *algo_ops; if (!data) return 0; if (data[IFLA_BATADV_ALGO_NAME]) { algo_ops = batadv_algo_get(nla_data(data[IFLA_BATADV_ALGO_NAME])); if (!algo_ops) return -EINVAL; } return 0; } /** * batadv_softif_newlink() - pre-initialize and register new batadv link * @src_net: the applicable net namespace * @dev: network device to register * @tb: IFLA_INFO_DATA netlink attributes * @data: enum batadv_ifla_attrs attributes * @extack: extended ACK report struct * * Return: 0 if successful or error otherwise. */ static int batadv_softif_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct batadv_priv *bat_priv = netdev_priv(dev); const char *algo_name; int err; if (data && data[IFLA_BATADV_ALGO_NAME]) { algo_name = nla_data(data[IFLA_BATADV_ALGO_NAME]); err = batadv_algo_select(bat_priv, algo_name); if (err) return -EINVAL; } return register_netdevice(dev); } /** * batadv_softif_destroy_netlink() - deletion of batadv_soft_interface via * netlink * @soft_iface: the to-be-removed batman-adv interface * @head: list pointer */ static void batadv_softif_destroy_netlink(struct net_device *soft_iface, struct list_head *head) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct batadv_hard_iface *hard_iface; struct batadv_softif_vlan *vlan; list_for_each_entry(hard_iface, &batadv_hardif_list, list) { if (hard_iface->soft_iface == soft_iface) batadv_hardif_disable_interface(hard_iface); } /* destroy the "untagged" VLAN */ vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS); if (vlan) { batadv_softif_destroy_vlan(bat_priv, vlan); batadv_softif_vlan_put(vlan); } unregister_netdevice_queue(soft_iface, head); } /** * batadv_softif_is_valid() - Check whether device is a batadv soft interface * @net_dev: device which should be checked * * Return: true when net_dev is a batman-adv interface, false otherwise */ bool batadv_softif_is_valid(const struct net_device *net_dev) { if (net_dev->netdev_ops->ndo_start_xmit == batadv_interface_tx) return true; return false; } static const struct nla_policy batadv_ifla_policy[IFLA_BATADV_MAX + 1] = { [IFLA_BATADV_ALGO_NAME] = { .type = NLA_NUL_STRING }, }; struct rtnl_link_ops batadv_link_ops __read_mostly = { .kind = "batadv", .priv_size = sizeof(struct batadv_priv), .setup = batadv_softif_init_early, .maxtype = IFLA_BATADV_MAX, .policy = batadv_ifla_policy, .validate = batadv_softif_validate, .newlink = batadv_softif_newlink, .dellink = batadv_softif_destroy_netlink, };
8173 5034 861 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RCULIST_BL_H #define _LINUX_RCULIST_BL_H /* * RCU-protected bl list version. See include/linux/list_bl.h. */ #include <linux/list_bl.h> #include <linux/rcupdate.h> static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h, struct hlist_bl_node *n) { LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) != LIST_BL_LOCKMASK); rcu_assign_pointer(h->first, (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK)); } static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h) { return (struct hlist_bl_node *) ((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) & ~LIST_BL_LOCKMASK); } /** * hlist_bl_del_rcu - deletes entry from hash list without re-initialization * @n: the element to delete from the hash list. * * Note: hlist_bl_unhashed() on entry does not return true after this, * the entry is in an undefined state. It is useful for RCU based * lockfree traversal. * * In particular, it means that we can not poison the forward * pointers that may still be used for walking the hash list. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_bl_add_head_rcu() * or hlist_bl_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_bl_for_each_entry(). */ static inline void hlist_bl_del_rcu(struct hlist_bl_node *n) { __hlist_bl_del(n); n->pprev = LIST_POISON2; } /** * hlist_bl_add_head_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist_bl, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_bl_add_head_rcu() * or hlist_bl_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_bl_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n, struct hlist_bl_head *h) { struct hlist_bl_node *first; /* don't need hlist_bl_first_rcu because we're under lock */ first = hlist_bl_first(h); n->next = first; if (first) first->pprev = &n->next; n->pprev = &h->first; /* need _rcu because we can have concurrent lock free readers */ hlist_bl_set_first_rcu(h, n); } /** * hlist_bl_for_each_entry_rcu - iterate over rcu list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_bl_node to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_bl_node within the struct. * */ #define hlist_bl_for_each_entry_rcu(tpos, pos, head, member) \ for (pos = hlist_bl_first_rcu(head); \ pos && \ ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \ pos = rcu_dereference_raw(pos->next)) #endif
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_UNWIND_H #define _ASM_X86_UNWIND_H #include <linux/sched.h> #include <linux/ftrace.h> #include <linux/rethook.h> #include <asm/ptrace.h> #include <asm/stacktrace.h> #define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip)) #define IRET_FRAME_SIZE (sizeof(struct pt_regs) - IRET_FRAME_OFFSET) struct unwind_state { struct stack_info stack_info; unsigned long stack_mask; struct task_struct *task; int graph_idx; #if defined(CONFIG_RETHOOK) struct llist_node *kr_cur; #endif bool error; #if defined(CONFIG_UNWINDER_ORC) bool signal, full_regs; unsigned long sp, bp, ip; struct pt_regs *regs, *prev_regs; #elif defined(CONFIG_UNWINDER_FRAME_POINTER) bool got_irq; unsigned long *bp, *orig_sp, ip; /* * If non-NULL: The current frame is incomplete and doesn't contain a * valid BP. When looking for the next frame, use this instead of the * non-existent saved BP. */ unsigned long *next_bp; struct pt_regs *regs; #else unsigned long *sp; #endif }; void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame); bool unwind_next_frame(struct unwind_state *state); unsigned long unwind_get_return_address(struct unwind_state *state); unsigned long *unwind_get_return_address_ptr(struct unwind_state *state); static inline bool unwind_done(struct unwind_state *state) { return state->stack_info.type == STACK_TYPE_UNKNOWN; } static inline bool unwind_error(struct unwind_state *state) { return state->error; } static inline void unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame) { first_frame = first_frame ? : get_stack_pointer(task, regs); __unwind_start(state, task, regs, first_frame); } #if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) /* * If 'partial' returns true, only the iret frame registers are valid. */ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state, bool *partial) { if (unwind_done(state)) return NULL; if (partial) { #ifdef CONFIG_UNWINDER_ORC *partial = !state->full_regs; #else *partial = false; #endif } return state->regs; } #else static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state, bool *partial) { return NULL; } #endif #ifdef CONFIG_UNWINDER_ORC void unwind_init(void); void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, void *orc, size_t orc_size); #else static inline void unwind_init(void) {} static inline void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, void *orc, size_t orc_size) {} #endif static inline unsigned long unwind_recover_rethook(struct unwind_state *state, unsigned long addr, unsigned long *addr_p) { #ifdef CONFIG_RETHOOK if (is_rethook_trampoline(addr)) return rethook_find_ret_addr(state->task, (unsigned long)addr_p, &state->kr_cur); #endif return addr; } /* Recover the return address modified by rethook and ftrace_graph. */ static inline unsigned long unwind_recover_ret_addr(struct unwind_state *state, unsigned long addr, unsigned long *addr_p) { unsigned long ret; ret = ftrace_graph_ret_addr(state->task, &state->graph_idx, addr, addr_p); return unwind_recover_rethook(state, ret, addr_p); } /* * This disables KASAN checking when reading a value from another task's stack, * since the other task could be running on another CPU and could have poisoned * the stack in the meantime. */ #define READ_ONCE_TASK_STACK(task, x) \ ({ \ unsigned long val; \ if (task == current) \ val = READ_ONCE(x); \ else \ val = READ_ONCE_NOCHECK(x); \ val; \ }) static inline bool task_on_another_cpu(struct task_struct *task) { #ifdef CONFIG_SMP return task != current && task->on_cpu; #else return false; #endif } #endif /* _ASM_X86_UNWIND_H */
1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef _LINUX_RCUREF_H #define _LINUX_RCUREF_H #include <linux/atomic.h> #include <linux/bug.h> #include <linux/limits.h> #include <linux/lockdep.h> #include <linux/preempt.h> #include <linux/rcupdate.h> #define RCUREF_ONEREF 0x00000000U #define RCUREF_MAXREF 0x7FFFFFFFU #define RCUREF_SATURATED 0xA0000000U #define RCUREF_RELEASED 0xC0000000U #define RCUREF_DEAD 0xE0000000U #define RCUREF_NOREF 0xFFFFFFFFU /** * rcuref_init - Initialize a rcuref reference count with the given reference count * @ref: Pointer to the reference count * @cnt: The initial reference count typically '1' */ static inline void rcuref_init(rcuref_t *ref, unsigned int cnt) { atomic_set(&ref->refcnt, cnt - 1); } /** * rcuref_read - Read the number of held reference counts of a rcuref * @ref: Pointer to the reference count * * Return: The number of held references (0 ... N) */ static inline unsigned int rcuref_read(rcuref_t *ref) { unsigned int c = atomic_read(&ref->refcnt); /* Return 0 if within the DEAD zone. */ return c >= RCUREF_RELEASED ? 0 : c + 1; } extern __must_check bool rcuref_get_slowpath(rcuref_t *ref); /** * rcuref_get - Acquire one reference on a rcuref reference count * @ref: Pointer to the reference count * * Similar to atomic_inc_not_zero() but saturates at RCUREF_MAXREF. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency * and thereby orders future stores. See documentation in lib/rcuref.c * * Return: * False if the attempt to acquire a reference failed. This happens * when the last reference has been put already * * True if a reference was successfully acquired */ static inline __must_check bool rcuref_get(rcuref_t *ref) { /* * Unconditionally increase the reference count. The saturation and * dead zones provide enough tolerance for this. */ if (likely(!atomic_add_negative_relaxed(1, &ref->refcnt))) return true; /* Handle the cases inside the saturation and dead zones */ return rcuref_get_slowpath(ref); } extern __must_check bool rcuref_put_slowpath(rcuref_t *ref, unsigned int cnt); /* * Internal helper. Do not invoke directly. */ static __always_inline __must_check bool __rcuref_put(rcuref_t *ref) { int cnt; RCU_LOCKDEP_WARN(!rcu_read_lock_held() && preemptible(), "suspicious rcuref_put_rcusafe() usage"); /* * Unconditionally decrease the reference count. The saturation and * dead zones provide enough tolerance for this. */ cnt = atomic_sub_return_release(1, &ref->refcnt); if (likely(cnt >= 0)) return false; /* * Handle the last reference drop and cases inside the saturation * and dead zones. */ return rcuref_put_slowpath(ref, cnt); } /** * rcuref_put_rcusafe -- Release one reference for a rcuref reference count RCU safe * @ref: Pointer to the reference count * * Provides release memory ordering, such that prior loads and stores are done * before, and provides an acquire ordering on success such that free() * must come after. * * Can be invoked from contexts, which guarantee that no grace period can * happen which would free the object concurrently if the decrement drops * the last reference and the slowpath races against a concurrent get() and * put() pair. rcu_read_lock()'ed and atomic contexts qualify. * * Return: * True if this was the last reference with no future references * possible. This signals the caller that it can safely release the * object which is protected by the reference counter. * * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * release the protected object. */ static inline __must_check bool rcuref_put_rcusafe(rcuref_t *ref) { return __rcuref_put(ref); } /** * rcuref_put -- Release one reference for a rcuref reference count * @ref: Pointer to the reference count * * Can be invoked from any context. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides an acquire ordering on success such that free() * must come after. * * Return: * * True if this was the last reference with no future references * possible. This signals the caller that it can safely schedule the * object, which is protected by the reference counter, for * deconstruction. * * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * deconstruct the protected object. */ static inline __must_check bool rcuref_put(rcuref_t *ref) { bool released; preempt_disable(); released = __rcuref_put(ref); preempt_enable(); return released; } #endif
539 556 2 287 287 8 15 15 15 15 15 14 15 9 6 15 13 317 36 36 36 36 36 35 35 156 79 318 314 196 316 4 317 317 317 2 317 318 318 317 317 194 1 3 3 318 3 1 1 314 187 5 1 2 315 317 317 317 3 314 434 1 32 1 32 3 5 5 5 1 318 1 317 2 315 314 74 317 4 1 317 1 316 5 1 1 1 1 4 317 302 302 302 1 301 300 2 2 34 4 33 34 4 34 36 36 36 35 36 36 40 39 4 36 36 36 36 39 4 36 2 35 40 317 317 317 2 317 316 2 318 318 4 317 321 321 1 2 319 1 319 23 181 182 318 358 358 357 36 3 33 36 322 319 6 316 8 8 3 5 8 8 3 5 8 8 7 5 8 8 8 3 3 5 2 2 2 2 2 2 5 1 4 4 7 5 3 2 3 3 2 7 7 5 1 1 1 473 354 311 342 473 292 308 308 7 292 470 470 5 5 5 5 5 5 5 5 5 5 4 1 5 5 5 8 8 5 5 8 2 6 24 24 16 17 16 15 2 3 2 2 1 1 3 306 1 3 1 23 6 17 22 17 4 17 5 13 11 1 10 3 10 1 10 1 1 7 1 24 23 24 24 24 24 23 24 6 4 3 8 6 8 5 24 17 24 17 6 26 32 12 8 22 6 14 2 13 3 9 7 10 6 9 7 11 5 10 6 8 8 8 7 7 7 10 3 16 24 15 9 17 6 18 6 7 17 2 22 34 27 7 32 30 22 2 23 1 24 23 24 24 1 17 17 10 272 272 292 291 20 20 2 2 1 1 2 2 2 3 3 7 6 2 6 2 8 7 8 8 2 4 1 3 4 1 3 310 5 5 5 5 306 307 307 3 3 3 3 3 3 3 8 8 8 8 8 8 3 3 3 3 1 2 3 288 288 288 287 288 287 33 33 25 25 467 466 31 31 31 292 220 73 73 26 1 26 26 1 25 26 465 465 1 465 465 465 1 465 466 466 465 466 466 466 1 466 466 466 465 466 466 465 326 327 300 319 325 327 302 327 302 316 317 306 1 305 306 305 303 305 305 306 305 1 1 306 302 1 1 303 1 1 303 302 302 302 303 303 306 306 8 8 8 8 304 303 304 305 301 302 298 298 298 292 1 275 275 275 292 292 291 228 16 272 271 185 228 228 275 1 10 1 2 1 1 2 5 2 7 2 272 263 22 22 372 372 2 1 284 285 2 2 285 1 285 285 282 1 296 11 285 285 2 282 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. */ #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/slab.h> #include <linux/ratelimit.h> #include <linux/kthread.h> #include <linux/semaphore.h> #include <linux/uuid.h> #include <linux/list_sort.h> #include <linux/namei.h> #include "misc.h" #include "disk-io.h" #include "extent-tree.h" #include "transaction.h" #include "volumes.h" #include "raid56.h" #include "rcu-string.h" #include "dev-replace.h" #include "sysfs.h" #include "tree-checker.h" #include "space-info.h" #include "block-group.h" #include "discard.h" #include "zoned.h" #include "fs.h" #include "accessors.h" #include "uuid-tree.h" #include "ioctl.h" #include "relocation.h" #include "scrub.h" #include "super.h" #include "raid-stripe-tree.h" #define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ BTRFS_BLOCK_GROUP_RAID10 | \ BTRFS_BLOCK_GROUP_RAID56_MASK) struct btrfs_io_geometry { u32 stripe_index; u32 stripe_nr; int mirror_num; int num_stripes; u64 stripe_offset; u64 raid56_full_stripe_start; int max_errors; enum btrfs_map_op op; bool use_rst; }; const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { .sub_stripes = 2, .dev_stripes = 1, .devs_max = 0, /* 0 == as many as possible */ .devs_min = 2, .tolerated_failures = 1, .devs_increment = 2, .ncopies = 2, .nparity = 0, .raid_name = "raid10", .bg_flag = BTRFS_BLOCK_GROUP_RAID10, .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, }, [BTRFS_RAID_RAID1] = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 2, .devs_min = 2, .tolerated_failures = 1, .devs_increment = 2, .ncopies = 2, .nparity = 0, .raid_name = "raid1", .bg_flag = BTRFS_BLOCK_GROUP_RAID1, .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, }, [BTRFS_RAID_RAID1C3] = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 3, .devs_min = 3, .tolerated_failures = 2, .devs_increment = 3, .ncopies = 3, .nparity = 0, .raid_name = "raid1c3", .bg_flag = BTRFS_BLOCK_GROUP_RAID1C3, .mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET, }, [BTRFS_RAID_RAID1C4] = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 4, .devs_min = 4, .tolerated_failures = 3, .devs_increment = 4, .ncopies = 4, .nparity = 0, .raid_name = "raid1c4", .bg_flag = BTRFS_BLOCK_GROUP_RAID1C4, .mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET, }, [BTRFS_RAID_DUP] = { .sub_stripes = 1, .dev_stripes = 2, .devs_max = 1, .devs_min = 1, .tolerated_failures = 0, .devs_increment = 1, .ncopies = 2, .nparity = 0, .raid_name = "dup", .bg_flag = BTRFS_BLOCK_GROUP_DUP, .mindev_error = 0, }, [BTRFS_RAID_RAID0] = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 0, .devs_min = 1, .tolerated_failures = 0, .devs_increment = 1, .ncopies = 1, .nparity = 0, .raid_name = "raid0", .bg_flag = BTRFS_BLOCK_GROUP_RAID0, .mindev_error = 0, }, [BTRFS_RAID_SINGLE] = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 1, .devs_min = 1, .tolerated_failures = 0, .devs_increment = 1, .ncopies = 1, .nparity = 0, .raid_name = "single", .bg_flag = 0, .mindev_error = 0, }, [BTRFS_RAID_RAID5] = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 0, .devs_min = 2, .tolerated_failures = 1, .devs_increment = 1, .ncopies = 1, .nparity = 1, .raid_name = "raid5", .bg_flag = BTRFS_BLOCK_GROUP_RAID5, .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, }, [BTRFS_RAID_RAID6] = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 0, .devs_min = 3, .tolerated_failures = 2, .devs_increment = 1, .ncopies = 1, .nparity = 2, .raid_name = "raid6", .bg_flag = BTRFS_BLOCK_GROUP_RAID6, .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, }, }; /* * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which * can be used as index to access btrfs_raid_array[]. */ enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags) { const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK); if (!profile) return BTRFS_RAID_SINGLE; return BTRFS_BG_FLAG_TO_INDEX(profile); } const char *btrfs_bg_type_to_raid_name(u64 flags) { const int index = btrfs_bg_flags_to_raid_index(flags); if (index >= BTRFS_NR_RAID_TYPES) return NULL; return btrfs_raid_array[index].raid_name; } int btrfs_nr_parity_stripes(u64 type) { enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(type); return btrfs_raid_array[index].nparity; } /* * Fill @buf with textual description of @bg_flags, no more than @size_buf * bytes including terminating null byte. */ void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) { int i; int ret; char *bp = buf; u64 flags = bg_flags; u32 size_bp = size_buf; if (!flags) { strcpy(bp, "NONE"); return; } #define DESCRIBE_FLAG(flag, desc) \ do { \ if (flags & (flag)) { \ ret = snprintf(bp, size_bp, "%s|", (desc)); \ if (ret < 0 || ret >= size_bp) \ goto out_overflow; \ size_bp -= ret; \ bp += ret; \ flags &= ~(flag); \ } \ } while (0) DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data"); DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system"); DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata"); DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single"); for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag, btrfs_raid_array[i].raid_name); #undef DESCRIBE_FLAG if (flags) { ret = snprintf(bp, size_bp, "0x%llx|", flags); size_bp -= ret; } if (size_bp < size_buf) buf[size_buf - size_bp - 1] = '\0'; /* remove last | */ /* * The text is trimmed, it's up to the caller to provide sufficiently * large buffer */ out_overflow:; } static int init_first_rw_device(struct btrfs_trans_handle *trans); static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); /* * Device locking * ============== * * There are several mutexes that protect manipulation of devices and low-level * structures like chunks but not block groups, extents or files * * uuid_mutex (global lock) * ------------------------ * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from * the SCAN_DEV ioctl registration or from mount either implicitly (the first * device) or requested by the device= mount option * * the mutex can be very coarse and can cover long-running operations * * protects: updates to fs_devices counters like missing devices, rw devices, * seeding, structure cloning, opening/closing devices at mount/umount time * * global::fs_devs - add, remove, updates to the global list * * does not protect: manipulation of the fs_devices::devices list in general * but in mount context it could be used to exclude list modifications by eg. * scan ioctl * * btrfs_device::name - renames (write side), read is RCU * * fs_devices::device_list_mutex (per-fs, with RCU) * ------------------------------------------------ * protects updates to fs_devices::devices, ie. adding and deleting * * simple list traversal with read-only actions can be done with RCU protection * * may be used to exclude some operations from running concurrently without any * modifications to the list (see write_all_supers) * * Is not required at mount and close times, because our device list is * protected by the uuid_mutex at that point. * * balance_mutex * ------------- * protects balance structures (status, state) and context accessed from * several places (internally, ioctl) * * chunk_mutex * ----------- * protects chunks, adding or removing during allocation, trim or when a new * device is added/removed. Additionally it also protects post_commit_list of * individual devices, since they can be added to the transaction's * post_commit_list only with chunk_mutex held. * * cleaner_mutex * ------------- * a big lock that is held by the cleaner thread and prevents running subvolume * cleaning together with relocation or delayed iputs * * * Lock nesting * ============ * * uuid_mutex * device_list_mutex * chunk_mutex * balance_mutex * * * Exclusive operations * ==================== * * Maintains the exclusivity of the following operations that apply to the * whole filesystem and cannot run in parallel. * * - Balance (*) * - Device add * - Device remove * - Device replace (*) * - Resize * * The device operations (as above) can be in one of the following states: * * - Running state * - Paused state * - Completed state * * Only device operations marked with (*) can go into the Paused state for the * following reasons: * * - ioctl (only Balance can be Paused through ioctl) * - filesystem remounted as read-only * - filesystem unmounted and mounted as read-only * - system power-cycle and filesystem mounted as read-only * - filesystem or device errors leading to forced read-only * * The status of exclusive operation is set and cleared atomically. * During the course of Paused state, fs_info::exclusive_operation remains set. * A device operation in Paused or Running state can be canceled or resumed * either by ioctl (Balance only) or when remounted as read-write. * The exclusive status is cleared when the device operation is canceled or * completed. */ DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); struct list_head * __attribute_const__ btrfs_get_fs_uuids(void) { return &fs_uuids; } /* * Allocate new btrfs_fs_devices structure identified by a fsid. * * @fsid: if not NULL, copy the UUID to fs_devices::fsid and to * fs_devices::metadata_fsid * * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR(). * The returned struct is not linked onto any lists and can be destroyed with * kfree() right away. */ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) { struct btrfs_fs_devices *fs_devs; fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL); if (!fs_devs) return ERR_PTR(-ENOMEM); mutex_init(&fs_devs->device_list_mutex); INIT_LIST_HEAD(&fs_devs->devices); INIT_LIST_HEAD(&fs_devs->alloc_list); INIT_LIST_HEAD(&fs_devs->fs_list); INIT_LIST_HEAD(&fs_devs->seed_list); if (fsid) { memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE); } return fs_devs; } static void btrfs_free_device(struct btrfs_device *device) { WARN_ON(!list_empty(&device->post_commit_list)); rcu_string_free(device->name); extent_io_tree_release(&device->alloc_state); btrfs_destroy_dev_zone_info(device); kfree(device); } static void free_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device; WARN_ON(fs_devices->opened); while (!list_empty(&fs_devices->devices)) { device = list_entry(fs_devices->devices.next, struct btrfs_device, dev_list); list_del(&device->dev_list); btrfs_free_device(device); } kfree(fs_devices); } void __exit btrfs_cleanup_fs_uuids(void) { struct btrfs_fs_devices *fs_devices; while (!list_empty(&fs_uuids)) { fs_devices = list_entry(fs_uuids.next, struct btrfs_fs_devices, fs_list); list_del(&fs_devices->fs_list); free_fs_devices(fs_devices); } } static bool match_fsid_fs_devices(const struct btrfs_fs_devices *fs_devices, const u8 *fsid, const u8 *metadata_fsid) { if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) != 0) return false; if (!metadata_fsid) return true; if (memcmp(metadata_fsid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE) != 0) return false; return true; } static noinline struct btrfs_fs_devices *find_fsid( const u8 *fsid, const u8 *metadata_fsid) { struct btrfs_fs_devices *fs_devices; ASSERT(fsid); /* Handle non-split brain cases */ list_for_each_entry(fs_devices, &fs_uuids, fs_list) { if (match_fsid_fs_devices(fs_devices, fsid, metadata_fsid)) return fs_devices; } return NULL; } static int btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, int flush, struct file **bdev_file, struct btrfs_super_block **disk_super) { struct block_device *bdev; int ret; *bdev_file = bdev_file_open_by_path(device_path, flags, holder, NULL); if (IS_ERR(*bdev_file)) { ret = PTR_ERR(*bdev_file); btrfs_err(NULL, "failed to open device for path %s with flags 0x%x: %d", device_path, flags, ret); goto error; } bdev = file_bdev(*bdev_file); if (flush) sync_blockdev(bdev); if (holder) { ret = set_blocksize(*bdev_file, BTRFS_BDEV_BLOCKSIZE); if (ret) { fput(*bdev_file); goto error; } } invalidate_bdev(bdev); *disk_super = btrfs_read_dev_super(bdev); if (IS_ERR(*disk_super)) { ret = PTR_ERR(*disk_super); fput(*bdev_file); goto error; } return 0; error: *disk_super = NULL; *bdev_file = NULL; return ret; } /* * Search and remove all stale devices (which are not mounted). When both * inputs are NULL, it will search and release all stale devices. * * @devt: Optional. When provided will it release all unmounted devices * matching this devt only. * @skip_device: Optional. Will skip this device when searching for the stale * devices. * * Return: 0 for success or if @devt is 0. * -EBUSY if @devt is a mounted device. * -ENOENT if @devt does not match any device in the list. */ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device) { struct btrfs_fs_devices *fs_devices, *tmp_fs_devices; struct btrfs_device *device, *tmp_device; int ret; bool freed = false; lockdep_assert_held(&uuid_mutex); /* Return good status if there is no instance of devt. */ ret = 0; list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) { mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, dev_list) { if (skip_device && skip_device == device) continue; if (devt && devt != device->devt) continue; if (fs_devices->opened) { if (devt) ret = -EBUSY; break; } /* delete the stale device */ fs_devices->num_devices--; list_del(&device->dev_list); btrfs_free_device(device); freed = true; } mutex_unlock(&fs_devices->device_list_mutex); if (fs_devices->num_devices == 0) { btrfs_sysfs_remove_fsid(fs_devices); list_del(&fs_devices->fs_list); free_fs_devices(fs_devices); } } /* If there is at least one freed device return 0. */ if (freed) return 0; return ret; } static struct btrfs_fs_devices *find_fsid_by_device( struct btrfs_super_block *disk_super, dev_t devt, bool *same_fsid_diff_dev) { struct btrfs_fs_devices *fsid_fs_devices; struct btrfs_fs_devices *devt_fs_devices; const bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_METADATA_UUID); bool found_by_devt = false; /* Find the fs_device by the usual method, if found use it. */ fsid_fs_devices = find_fsid(disk_super->fsid, has_metadata_uuid ? disk_super->metadata_uuid : NULL); /* The temp_fsid feature is supported only with single device filesystem. */ if (btrfs_super_num_devices(disk_super) != 1) return fsid_fs_devices; /* * A seed device is an integral component of the sprout device, which * functions as a multi-device filesystem. So, temp-fsid feature is * not supported. */ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) return fsid_fs_devices; /* Try to find a fs_devices by matching devt. */ list_for_each_entry(devt_fs_devices, &fs_uuids, fs_list) { struct btrfs_device *device; list_for_each_entry(device, &devt_fs_devices->devices, dev_list) { if (device->devt == devt) { found_by_devt = true; break; } } if (found_by_devt) break; } if (found_by_devt) { /* Existing device. */ if (fsid_fs_devices == NULL) { if (devt_fs_devices->opened == 0) { /* Stale device. */ return NULL; } else { /* temp_fsid is mounting a subvol. */ return devt_fs_devices; } } else { /* Regular or temp_fsid device mounting a subvol. */ return devt_fs_devices; } } else { /* New device. */ if (fsid_fs_devices == NULL) { return NULL; } else { /* sb::fsid is already used create a new temp_fsid. */ *same_fsid_diff_dev = true; return NULL; } } /* Not reached. */ } /* * This is only used on mount, and we are protected from competing things * messing with our fs_devices by the uuid_mutex, thus we do not need the * fs_devices->device_list_mutex here. */ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device, blk_mode_t flags, void *holder) { struct file *bdev_file; struct btrfs_super_block *disk_super; u64 devid; int ret; if (device->bdev) return -EINVAL; if (!device->name) return -EINVAL; ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, &bdev_file, &disk_super); if (ret) return ret; devid = btrfs_stack_device_id(&disk_super->dev_item); if (devid != device->devid) goto error_free_page; if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE)) goto error_free_page; device->generation = btrfs_super_generation(disk_super); if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { if (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_METADATA_UUID) { pr_err( "BTRFS: Invalid seeding and uuid-changed device detected\n"); goto error_free_page; } clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); fs_devices->seeding = true; } else { if (bdev_read_only(file_bdev(bdev_file))) clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); else set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); } if (!bdev_nonrot(file_bdev(bdev_file))) fs_devices->rotating = true; if (bdev_max_discard_sectors(file_bdev(bdev_file))) fs_devices->discardable = true; device->bdev_file = bdev_file; device->bdev = file_bdev(bdev_file); clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); if (device->devt != device->bdev->bd_dev) { btrfs_warn(NULL, "device %s maj:min changed from %d:%d to %d:%d", device->name->str, MAJOR(device->devt), MINOR(device->devt), MAJOR(device->bdev->bd_dev), MINOR(device->bdev->bd_dev)); device->devt = device->bdev->bd_dev; } fs_devices->open_devices++; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_st