| 2370 2369 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 | /* * linux/include/linux/console.h * * Copyright (C) 1993 Hamish Macdonald * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of this archive * for more details. * * Changed: * 10-Mar-94: Arno Griffioen: Conversion for vt100 emulator port from PC LINUX */ #ifndef _LINUX_CONSOLE_H_ #define _LINUX_CONSOLE_H_ 1 #include <linux/atomic.h> #include <linux/bits.h> #include <linux/irq_work.h> #include <linux/rculist.h> #include <linux/rcuwait.h> #include <linux/types.h> #include <linux/vesa.h> struct vc_data; struct console_font_op; struct console_font; struct module; struct tty_struct; struct notifier_block; enum con_scroll { SM_UP, SM_DOWN, }; enum vc_intensity; /** * struct consw - callbacks for consoles * * @owner: the module to get references of when this console is used * @con_startup: set up the console and return its name (like VGA, EGA, ...) * @con_init: initialize the console on @vc. @init is true for the very first * call on this @vc. * @con_deinit: deinitialize the console from @vc. * @con_clear: erase @count characters at [@x, @y] on @vc. @count >= 1. * @con_putc: emit one character with attributes @ca to [@x, @y] on @vc. * (optional -- @con_putcs would be called instead) * @con_putcs: emit @count characters with attributes @s to [@x, @y] on @vc. * @con_cursor: enable/disable cursor depending on @enable * @con_scroll: move lines from @top to @bottom in direction @dir by @lines. * Return true if no generic handling should be done. * Invoked by csi_M and printing to the console. * @con_switch: notifier about the console switch; it is supposed to return * true if a redraw is needed. * @con_blank: blank/unblank the console. The target mode is passed in @blank. * @mode_switch is set if changing from/to text/graphics. The hook * is supposed to return true if a redraw is needed. * @con_font_set: set console @vc font to @font with height @vpitch. @flags can * be %KD_FONT_FLAG_DONT_RECALC. (optional) * @con_font_get: fetch the current font on @vc of height @vpitch into @font. * (optional) * @con_font_default: set default font on @vc. @name can be %NULL or font name * to search for. @font can be filled back. (optional) * @con_resize: resize the @vc console to @width x @height. @from_user is true * when this change comes from the user space. * @con_set_palette: sets the palette of the console @vc to @table (optional) * @con_scrolldelta: the contents of the console should be scrolled by @lines. * Invoked by user. (optional) * @con_set_origin: set origin (see &vc_data::vc_origin) of the @vc. If not * provided or returns false, the origin is set to * @vc->vc_screenbuf. (optional) * @con_save_screen: save screen content into @vc->vc_screenbuf. Called e.g. * upon entering graphics. (optional) * @con_build_attr: build attributes based on @color, @intensity and other * parameters. The result is used for both normal and erase * characters. (optional) * @con_invert_region: invert a region of length @count on @vc starting at @p. * (optional) * @con_debug_enter: prepare the console for the debugger. This includes, but * is not limited to, unblanking the console, loading an * appropriate palette, and allowing debugger generated output. * (optional) * @con_debug_leave: restore the console to its pre-debug state as closely as * possible. (optional) */ struct consw { struct module *owner; const char *(*con_startup)(void); void (*con_init)(struct vc_data *vc, bool init); void (*con_deinit)(struct vc_data *vc); void (*con_clear)(struct vc_data *vc, unsigned int y, unsigned int x, unsigned int count); void (*con_putc)(struct vc_data *vc, u16 ca, unsigned int y, unsigned int x); void (*con_putcs)(struct vc_data *vc, const u16 *s, unsigned int count, unsigned int ypos, unsigned int xpos); void (*con_cursor)(struct vc_data *vc, bool enable); bool (*con_scroll)(struct vc_data *vc, unsigned int top, unsigned int bottom, enum con_scroll dir, unsigned int lines); bool (*con_switch)(struct vc_data *vc); bool (*con_blank)(struct vc_data *vc, enum vesa_blank_mode blank, bool mode_switch); int (*con_font_set)(struct vc_data *vc, const struct console_font *font, unsigned int vpitch, unsigned int flags); int (*con_font_get)(struct vc_data *vc, struct console_font *font, unsigned int vpitch); int (*con_font_default)(struct vc_data *vc, struct console_font *font, const char *name); int (*con_resize)(struct vc_data *vc, unsigned int width, unsigned int height, bool from_user); void (*con_set_palette)(struct vc_data *vc, const unsigned char *table); void (*con_scrolldelta)(struct vc_data *vc, int lines); bool (*con_set_origin)(struct vc_data *vc); void (*con_save_screen)(struct vc_data *vc); u8 (*con_build_attr)(struct vc_data *vc, u8 color, enum vc_intensity intensity, bool blink, bool underline, bool reverse, bool italic); void (*con_invert_region)(struct vc_data *vc, u16 *p, int count); void (*con_debug_enter)(struct vc_data *vc); void (*con_debug_leave)(struct vc_data *vc); }; extern const struct consw *conswitchp; extern const struct consw dummy_con; /* dummy console buffer */ extern const struct consw vga_con; /* VGA text console */ extern const struct consw newport_con; /* SGI Newport console */ struct screen_info; #ifdef CONFIG_VGA_CONSOLE void vgacon_register_screen(struct screen_info *si); #else static inline void vgacon_register_screen(struct screen_info *si) { } #endif int con_is_bound(const struct consw *csw); int do_unregister_con_driver(const struct consw *csw); int do_take_over_console(const struct consw *sw, int first, int last, int deflt); void give_up_console(const struct consw *sw); #ifdef CONFIG_VT void con_debug_enter(struct vc_data *vc); void con_debug_leave(void); #else static inline void con_debug_enter(struct vc_data *vc) { } static inline void con_debug_leave(void) { } #endif /* * The interface for a console, or any other device that wants to capture * console messages (printer driver?) */ /** * enum cons_flags - General console flags * @CON_PRINTBUFFER: Used by newly registered consoles to avoid duplicate * output of messages that were already shown by boot * consoles or read by userspace via syslog() syscall. * @CON_CONSDEV: Indicates that the console driver is backing * /dev/console. * @CON_ENABLED: Indicates if a console is allowed to print records. If * false, the console also will not advance to later * records. * @CON_BOOT: Marks the console driver as early console driver which * is used during boot before the real driver becomes * available. It will be automatically unregistered * when the real console driver is registered unless * "keep_bootcon" parameter is used. * @CON_ANYTIME: A misnomed historical flag which tells the core code * that the legacy @console::write callback can be invoked * on a CPU which is marked OFFLINE. That is misleading as * it suggests that there is no contextual limit for * invoking the callback. The original motivation was * readiness of the per-CPU areas. * @CON_BRL: Indicates a braille device which is exempt from * receiving the printk spam for obvious reasons. * @CON_EXTENDED: The console supports the extended output format of * /dev/kmesg which requires a larger output buffer. * @CON_SUSPENDED: Indicates if a console is suspended. If true, the * printing callbacks must not be called. * @CON_NBCON: Console can operate outside of the legacy style console_lock * constraints. */ enum cons_flags { CON_PRINTBUFFER = BIT(0), CON_CONSDEV = BIT(1), CON_ENABLED = BIT(2), CON_BOOT = BIT(3), CON_ANYTIME = BIT(4), CON_BRL = BIT(5), CON_EXTENDED = BIT(6), CON_SUSPENDED = BIT(7), CON_NBCON = BIT(8), }; /** * struct nbcon_state - console state for nbcon consoles * @atom: Compound of the state fields for atomic operations * * @req_prio: The priority of a handover request * @prio: The priority of the current owner * @unsafe: Console is busy in a non takeover region * @unsafe_takeover: A hostile takeover in an unsafe state happened in the * past. The console cannot be safe until re-initialized. * @cpu: The CPU on which the owner runs * * To be used for reading and preparing of the value stored in the nbcon * state variable @console::nbcon_state. * * The @prio and @req_prio fields are particularly important to allow * spin-waiting to timeout and give up without the risk of a waiter being * assigned the lock after giving up. */ struct nbcon_state { union { unsigned int atom; struct { unsigned int prio : 2; unsigned int req_prio : 2; unsigned int unsafe : 1; unsigned int unsafe_takeover : 1; unsigned int cpu : 24; }; }; }; /* * The nbcon_state struct is used to easily create and interpret values that * are stored in the @console::nbcon_state variable. Ensure this struct stays * within the size boundaries of the atomic variable's underlying type in * order to avoid any accidental truncation. */ static_assert(sizeof(struct nbcon_state) <= sizeof(int)); /** * enum nbcon_prio - console owner priority for nbcon consoles * @NBCON_PRIO_NONE: Unused * @NBCON_PRIO_NORMAL: Normal (non-emergency) usage * @NBCON_PRIO_EMERGENCY: Emergency output (WARN/OOPS...) * @NBCON_PRIO_PANIC: Panic output * @NBCON_PRIO_MAX: The number of priority levels * * A higher priority context can takeover the console when it is * in the safe state. The final attempt to flush consoles in panic() * can be allowed to do so even in an unsafe state (Hope and pray). */ enum nbcon_prio { NBCON_PRIO_NONE = 0, NBCON_PRIO_NORMAL, NBCON_PRIO_EMERGENCY, NBCON_PRIO_PANIC, NBCON_PRIO_MAX, }; struct console; struct printk_buffers; /** * struct nbcon_context - Context for console acquire/release * @console: The associated console * @spinwait_max_us: Limit for spin-wait acquire * @prio: Priority of the context * @allow_unsafe_takeover: Allow performing takeover even if unsafe. Can * be used only with NBCON_PRIO_PANIC @prio. It * might cause a system freeze when the console * is used later. * @backlog: Ringbuffer has pending records * @pbufs: Pointer to the text buffer for this context * @seq: The sequence number to print for this context */ struct nbcon_context { /* members set by caller */ struct console *console; unsigned int spinwait_max_us; enum nbcon_prio prio; unsigned int allow_unsafe_takeover : 1; /* members set by emit */ unsigned int backlog : 1; /* members set by acquire */ struct printk_buffers *pbufs; u64 seq; }; /** * struct nbcon_write_context - Context handed to the nbcon write callbacks * @ctxt: The core console context * @outbuf: Pointer to the text buffer for output * @len: Length to write * @unsafe_takeover: If a hostile takeover in an unsafe state has occurred */ struct nbcon_write_context { struct nbcon_context __private ctxt; char *outbuf; unsigned int len; bool unsafe_takeover; }; /** * struct console - The console descriptor structure * @name: The name of the console driver * @write: Legacy write callback to output messages (Optional) * @read: Read callback for console input (Optional) * @device: The underlying TTY device driver (Optional) * @unblank: Callback to unblank the console (Optional) * @setup: Callback for initializing the console (Optional) * @exit: Callback for teardown of the console (Optional) * @match: Callback for matching a console (Optional) * @flags: Console flags. See enum cons_flags * @index: Console index, e.g. port number * @cflag: TTY control mode flags * @ispeed: TTY input speed * @ospeed: TTY output speed * @seq: Sequence number of the next ringbuffer record to print * @dropped: Number of unreported dropped ringbuffer records * @data: Driver private data * @node: hlist node for the console list * * @nbcon_state: State for nbcon consoles * @nbcon_seq: Sequence number of the next record for nbcon to print * @nbcon_device_ctxt: Context available for non-printing operations * @nbcon_prev_seq: Seq num the previous nbcon owner was assigned to print * @pbufs: Pointer to nbcon private buffer * @kthread: Printer kthread for this console * @rcuwait: RCU-safe wait object for @kthread waking * @irq_work: Defer @kthread waking to IRQ work context */ struct console { char name[16]; void (*write)(struct console *co, const char *s, unsigned int count); int (*read)(struct console *co, char *s, unsigned int count); struct tty_driver *(*device)(struct console *co, int *index); void (*unblank)(void); int (*setup)(struct console *co, char *options); int (*exit)(struct console *co); int (*match)(struct console *co, char *name, int idx, char *options); short flags; short index; int cflag; uint ispeed; uint ospeed; u64 seq; unsigned long dropped; void *data; struct hlist_node node; /* nbcon console specific members */ /** * @write_atomic: * * NBCON callback to write out text in any context. (Optional) * * This callback is called with the console already acquired. However, * a higher priority context is allowed to take it over by default. * * The callback must call nbcon_enter_unsafe() and nbcon_exit_unsafe() * around any code where the takeover is not safe, for example, when * manipulating the serial port registers. * * nbcon_enter_unsafe() will fail if the context has lost the console * ownership in the meantime. In this case, the callback is no longer * allowed to go forward. It must back out immediately and carefully. * The buffer content is also no longer trusted since it no longer * belongs to the context. * * The callback should allow the takeover whenever it is safe. It * increases the chance to see messages when the system is in trouble. * If the driver must reacquire ownership in order to finalize or * revert hardware changes, nbcon_reacquire_nobuf() can be used. * However, on reacquire the buffer content is no longer available. A * reacquire cannot be used to resume printing. * * The callback can be called from any context (including NMI). * Therefore it must avoid usage of any locking and instead rely * on the console ownership for synchronization. */ void (*write_atomic)(struct console *con, struct nbcon_write_context *wctxt); /** * @write_thread: * * NBCON callback to write out text in task context. * * This callback must be called only in task context with both * device_lock() and the nbcon console acquired with * NBCON_PRIO_NORMAL. * * The same rules for console ownership verification and unsafe * sections handling applies as with write_atomic(). * * The console ownership handling is necessary for synchronization * against write_atomic() which is synchronized only via the context. * * The device_lock() provides the primary serialization for operations * on the device. It might be as relaxed (mutex)[*] or as tight * (disabled preemption and interrupts) as needed. It allows * the kthread to operate in the least restrictive mode[**]. * * [*] Standalone nbcon_context_try_acquire() is not safe with * the preemption enabled, see nbcon_owner_matches(). But it * can be safe when always called in the preemptive context * under the device_lock(). * * [**] The device_lock() makes sure that nbcon_context_try_acquire() * would never need to spin which is important especially with * PREEMPT_RT. */ void (*write_thread)(struct console *con, struct nbcon_write_context *wctxt); /** * @device_lock: * * NBCON callback to begin synchronization with driver code. * * Console drivers typically must deal with access to the hardware * via user input/output (such as an interactive login shell) and * output of kernel messages via printk() calls. This callback is * called by the printk-subsystem whenever it needs to synchronize * with hardware access by the driver. It should be implemented to * use whatever synchronization mechanism the driver is using for * itself (for example, the port lock for uart serial consoles). * * The callback is always called from task context. It may use any * synchronization method required by the driver. * * IMPORTANT: The callback MUST disable migration. The console driver * may be using a synchronization mechanism that already takes * care of this (such as spinlocks). Otherwise this function must * explicitly call migrate_disable(). * * The flags argument is provided as a convenience to the driver. It * will be passed again to device_unlock(). It can be ignored if the * driver does not need it. */ void (*device_lock)(struct console *con, unsigned long *flags); /** * @device_unlock: * * NBCON callback to finish synchronization with driver code. * * It is the counterpart to device_lock(). * * This callback is always called from task context. It must * appropriately re-enable migration (depending on how device_lock() * disabled migration). * * The flags argument is the value of the same variable that was * passed to device_lock(). */ void (*device_unlock)(struct console *con, unsigned long flags); atomic_t __private nbcon_state; atomic_long_t __private nbcon_seq; struct nbcon_context __private nbcon_device_ctxt; atomic_long_t __private nbcon_prev_seq; struct printk_buffers *pbufs; struct task_struct *kthread; struct rcuwait rcuwait; struct irq_work irq_work; }; #ifdef CONFIG_LOCKDEP extern void lockdep_assert_console_list_lock_held(void); #else static inline void lockdep_assert_console_list_lock_held(void) { } #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC extern bool console_srcu_read_lock_is_held(void); #else static inline bool console_srcu_read_lock_is_held(void) { return 1; } #endif extern int console_srcu_read_lock(void); extern void console_srcu_read_unlock(int cookie); extern void console_list_lock(void) __acquires(console_mutex); extern void console_list_unlock(void) __releases(console_mutex); extern struct hlist_head console_list; /** * console_srcu_read_flags - Locklessly read flags of a possibly registered * console * @con: struct console pointer of console to read flags from * * Locklessly reading @con->flags provides a consistent read value because * there is at most one CPU modifying @con->flags and that CPU is using only * read-modify-write operations to do so. * * Requires console_srcu_read_lock to be held, which implies that @con might * be a registered console. The purpose of holding console_srcu_read_lock is * to guarantee that the console state is valid (CON_SUSPENDED/CON_ENABLED) * and that no exit/cleanup routines will run if the console is currently * undergoing unregistration. * * If the caller is holding the console_list_lock or it is _certain_ that * @con is not and will not become registered, the caller may read * @con->flags directly instead. * * Context: Any context. * Return: The current value of the @con->flags field. */ static inline short console_srcu_read_flags(const struct console *con) { WARN_ON_ONCE(!console_srcu_read_lock_is_held()); /* * The READ_ONCE() matches the WRITE_ONCE() when @flags are modified * for registered consoles with console_srcu_write_flags(). */ return data_race(READ_ONCE(con->flags)); } /** * console_srcu_write_flags - Write flags for a registered console * @con: struct console pointer of console to write flags to * @flags: new flags value to write * * Only use this function to write flags for registered consoles. It * requires holding the console_list_lock. * * Context: Any context. */ static inline void console_srcu_write_flags(struct console *con, short flags) { lockdep_assert_console_list_lock_held(); /* This matches the READ_ONCE() in console_srcu_read_flags(). */ WRITE_ONCE(con->flags, flags); } /* Variant of console_is_registered() when the console_list_lock is held. */ static inline bool console_is_registered_locked(const struct console *con) { lockdep_assert_console_list_lock_held(); return !hlist_unhashed(&con->node); } /* * console_is_registered - Check if the console is registered * @con: struct console pointer of console to check * * Context: Process context. May sleep while acquiring console list lock. * Return: true if the console is in the console list, otherwise false. * * If false is returned for a console that was previously registered, it * can be assumed that the console's unregistration is fully completed, * including the exit() callback after console list removal. */ static inline bool console_is_registered(const struct console *con) { bool ret; console_list_lock(); ret = console_is_registered_locked(con); console_list_unlock(); return ret; } /** * for_each_console_srcu() - Iterator over registered consoles * @con: struct console pointer used as loop cursor * * Although SRCU guarantees the console list will be consistent, the * struct console fields may be updated by other CPUs while iterating. * * Requires console_srcu_read_lock to be held. Can be invoked from * any context. */ #define for_each_console_srcu(con) \ hlist_for_each_entry_srcu(con, &console_list, node, \ console_srcu_read_lock_is_held()) /** * for_each_console() - Iterator over registered consoles * @con: struct console pointer used as loop cursor * * The console list and the &console.flags are immutable while iterating. * * Requires console_list_lock to be held. */ #define for_each_console(con) \ lockdep_assert_console_list_lock_held(); \ hlist_for_each_entry(con, &console_list, node) #ifdef CONFIG_PRINTK extern void nbcon_cpu_emergency_enter(void); extern void nbcon_cpu_emergency_exit(void); extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt); extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt); extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt); extern void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt); #else static inline void nbcon_cpu_emergency_enter(void) { } static inline void nbcon_cpu_emergency_exit(void) { } static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return false; } static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; } static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; } static inline void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { } #endif extern int console_set_on_cmdline; extern struct console *early_console; enum con_flush_mode { CONSOLE_FLUSH_PENDING, CONSOLE_REPLAY_ALL, }; extern int add_preferred_console(const char *name, const short idx, char *options); extern void console_force_preferred_locked(struct console *con); extern void register_console(struct console *); extern int unregister_console(struct console *); extern void console_lock(void); extern int console_trylock(void); extern void console_unlock(void); extern void console_conditional_schedule(void); extern void console_unblank(void); extern void console_flush_on_panic(enum con_flush_mode mode); extern struct tty_driver *console_device(int *); extern void console_suspend(struct console *); extern void console_resume(struct console *); extern int is_console_locked(void); extern int braille_register_console(struct console *, int index, char *console_options, char *braille_options); extern int braille_unregister_console(struct console *); #ifdef CONFIG_TTY extern void console_sysfs_notify(void); #else static inline void console_sysfs_notify(void) { } #endif extern bool console_suspend_enabled; /* Suspend and resume console messages over PM events */ extern void console_suspend_all(void); extern void console_resume_all(void); int mda_console_init(void); void vcs_make_sysfs(int index); void vcs_remove_sysfs(int index); /* Some debug stub to catch some of the obvious races in the VT code */ #define WARN_CONSOLE_UNLOCKED() \ WARN_ON(!atomic_read(&ignore_console_lock_warning) && \ !is_console_locked() && !oops_in_progress) /* * Increment ignore_console_lock_warning if you need to quiet * WARN_CONSOLE_UNLOCKED() for debugging purposes. */ extern atomic_t ignore_console_lock_warning; extern void console_init(void); /* For deferred console takeover */ void dummycon_register_output_notifier(struct notifier_block *nb); void dummycon_unregister_output_notifier(struct notifier_block *nb); #endif /* _LINUX_CONSOLE_H */ |
| 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 84 1 1 1 1 85 85 86 85 85 83 85 85 85 3 4 86 83 4 85 85 86 85 85 83 84 86 85 86 86 85 85 8 7 8 8 2 3 3 3 2 3 3 3 3 3 3 78 78 78 78 77 77 76 1 76 78 68 78 78 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. * Author: Joerg Roedel <jroedel@suse.de> */ #define pr_fmt(fmt) "iommu: " fmt #include <linux/amba/bus.h> #include <linux/device.h> #include <linux/kernel.h> #include <linux/bits.h> #include <linux/bug.h> #include <linux/types.h> #include <linux/init.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/host1x_context_bus.h> #include <linux/iommu.h> #include <linux/iommufd.h> #include <linux/idr.h> #include <linux/err.h> #include <linux/pci.h> #include <linux/pci-ats.h> #include <linux/bitops.h> #include <linux/platform_device.h> #include <linux/property.h> #include <linux/fsl/mc.h> #include <linux/module.h> #include <linux/cc_platform.h> #include <linux/cdx/cdx_bus.h> #include <trace/events/iommu.h> #include <linux/sched/mm.h> #include <linux/msi.h> #include <uapi/linux/iommufd.h> #include "dma-iommu.h" #include "iommu-priv.h" static struct kset *iommu_group_kset; static DEFINE_IDA(iommu_group_ida); static DEFINE_IDA(iommu_global_pasid_ida); static unsigned int iommu_def_domain_type __read_mostly; static bool iommu_dma_strict __read_mostly = IS_ENABLED(CONFIG_IOMMU_DEFAULT_DMA_STRICT); static u32 iommu_cmd_line __read_mostly; /* Tags used with xa_tag_pointer() in group->pasid_array */ enum { IOMMU_PASID_ARRAY_DOMAIN = 0, IOMMU_PASID_ARRAY_HANDLE = 1 }; struct iommu_group { struct kobject kobj; struct kobject *devices_kobj; struct list_head devices; struct xarray pasid_array; struct mutex mutex; void *iommu_data; void (*iommu_data_release)(void *iommu_data); char *name; int id; struct iommu_domain *default_domain; struct iommu_domain *blocking_domain; struct iommu_domain *domain; struct list_head entry; unsigned int owner_cnt; void *owner; }; struct group_device { struct list_head list; struct device *dev; char *name; }; /* Iterate over each struct group_device in a struct iommu_group */ #define for_each_group_device(group, pos) \ list_for_each_entry(pos, &(group)->devices, list) struct iommu_group_attribute { struct attribute attr; ssize_t (*show)(struct iommu_group *group, char *buf); ssize_t (*store)(struct iommu_group *group, const char *buf, size_t count); }; static const char * const iommu_group_resv_type_string[] = { [IOMMU_RESV_DIRECT] = "direct", [IOMMU_RESV_DIRECT_RELAXABLE] = "direct-relaxable", [IOMMU_RESV_RESERVED] = "reserved", [IOMMU_RESV_MSI] = "msi", [IOMMU_RESV_SW_MSI] = "msi", }; #define IOMMU_CMD_LINE_DMA_API BIT(0) #define IOMMU_CMD_LINE_STRICT BIT(1) static int bus_iommu_probe(const struct bus_type *bus); static int iommu_bus_notifier(struct notifier_block *nb, unsigned long action, void *data); static void iommu_release_device(struct device *dev); static int __iommu_attach_device(struct iommu_domain *domain, struct device *dev); static int __iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group); static struct iommu_domain *__iommu_paging_domain_alloc_flags(struct device *dev, unsigned int type, unsigned int flags); enum { IOMMU_SET_DOMAIN_MUST_SUCCEED = 1 << 0, }; static int __iommu_device_set_domain(struct iommu_group *group, struct device *dev, struct iommu_domain *new_domain, unsigned int flags); static int __iommu_group_set_domain_internal(struct iommu_group *group, struct iommu_domain *new_domain, unsigned int flags); static int __iommu_group_set_domain(struct iommu_group *group, struct iommu_domain *new_domain) { return __iommu_group_set_domain_internal(group, new_domain, 0); } static void __iommu_group_set_domain_nofail(struct iommu_group *group, struct iommu_domain *new_domain) { WARN_ON(__iommu_group_set_domain_internal( group, new_domain, IOMMU_SET_DOMAIN_MUST_SUCCEED)); } static int iommu_setup_default_domain(struct iommu_group *group, int target_type); static int iommu_create_device_direct_mappings(struct iommu_domain *domain, struct device *dev); static ssize_t iommu_group_store_type(struct iommu_group *group, const char *buf, size_t count); static struct group_device *iommu_group_alloc_device(struct iommu_group *group, struct device *dev); static void __iommu_group_free_device(struct iommu_group *group, struct group_device *grp_dev); static void iommu_domain_init(struct iommu_domain *domain, unsigned int type, const struct iommu_ops *ops); #define IOMMU_GROUP_ATTR(_name, _mode, _show, _store) \ struct iommu_group_attribute iommu_group_attr_##_name = \ __ATTR(_name, _mode, _show, _store) #define to_iommu_group_attr(_attr) \ container_of(_attr, struct iommu_group_attribute, attr) #define to_iommu_group(_kobj) \ container_of(_kobj, struct iommu_group, kobj) static LIST_HEAD(iommu_device_list); static DEFINE_SPINLOCK(iommu_device_lock); static const struct bus_type * const iommu_buses[] = { &platform_bus_type, #ifdef CONFIG_PCI &pci_bus_type, #endif #ifdef CONFIG_ARM_AMBA &amba_bustype, #endif #ifdef CONFIG_FSL_MC_BUS &fsl_mc_bus_type, #endif #ifdef CONFIG_TEGRA_HOST1X_CONTEXT_BUS &host1x_context_device_bus_type, #endif #ifdef CONFIG_CDX_BUS &cdx_bus_type, #endif }; /* * Use a function instead of an array here because the domain-type is a * bit-field, so an array would waste memory. */ static const char *iommu_domain_type_str(unsigned int t) { switch (t) { case IOMMU_DOMAIN_BLOCKED: return "Blocked"; case IOMMU_DOMAIN_IDENTITY: return "Passthrough"; case IOMMU_DOMAIN_UNMANAGED: return "Unmanaged"; case IOMMU_DOMAIN_DMA: case IOMMU_DOMAIN_DMA_FQ: return "Translated"; case IOMMU_DOMAIN_PLATFORM: return "Platform"; default: return "Unknown"; } } static int __init iommu_subsys_init(void) { struct notifier_block *nb; if (!(iommu_cmd_line & IOMMU_CMD_LINE_DMA_API)) { if (IS_ENABLED(CONFIG_IOMMU_DEFAULT_PASSTHROUGH)) iommu_set_default_passthrough(false); else iommu_set_default_translated(false); if (iommu_default_passthrough() && cc_platform_has(CC_ATTR_MEM_ENCRYPT)) { pr_info("Memory encryption detected - Disabling default IOMMU Passthrough\n"); iommu_set_default_translated(false); } } if (!iommu_default_passthrough() && !iommu_dma_strict) iommu_def_domain_type = IOMMU_DOMAIN_DMA_FQ; pr_info("Default domain type: %s%s\n", iommu_domain_type_str(iommu_def_domain_type), (iommu_cmd_line & IOMMU_CMD_LINE_DMA_API) ? " (set via kernel command line)" : ""); if (!iommu_default_passthrough()) pr_info("DMA domain TLB invalidation policy: %s mode%s\n", iommu_dma_strict ? "strict" : "lazy", (iommu_cmd_line & IOMMU_CMD_LINE_STRICT) ? " (set via kernel command line)" : ""); nb = kcalloc(ARRAY_SIZE(iommu_buses), sizeof(*nb), GFP_KERNEL); if (!nb) return -ENOMEM; for (int i = 0; i < ARRAY_SIZE(iommu_buses); i++) { nb[i].notifier_call = iommu_bus_notifier; bus_register_notifier(iommu_buses[i], &nb[i]); } return 0; } subsys_initcall(iommu_subsys_init); static int remove_iommu_group(struct device *dev, void *data) { if (dev->iommu && dev->iommu->iommu_dev == data) iommu_release_device(dev); return 0; } /** * iommu_device_register() - Register an IOMMU hardware instance * @iommu: IOMMU handle for the instance * @ops: IOMMU ops to associate with the instance * @hwdev: (optional) actual instance device, used for fwnode lookup * * Return: 0 on success, or an error. */ int iommu_device_register(struct iommu_device *iommu, const struct iommu_ops *ops, struct device *hwdev) { int err = 0; /* We need to be able to take module references appropriately */ if (WARN_ON(is_module_address((unsigned long)ops) && !ops->owner)) return -EINVAL; iommu->ops = ops; if (hwdev) iommu->fwnode = dev_fwnode(hwdev); spin_lock(&iommu_device_lock); list_add_tail(&iommu->list, &iommu_device_list); spin_unlock(&iommu_device_lock); for (int i = 0; i < ARRAY_SIZE(iommu_buses) && !err; i++) err = bus_iommu_probe(iommu_buses[i]); if (err) iommu_device_unregister(iommu); else WRITE_ONCE(iommu->ready, true); return err; } EXPORT_SYMBOL_GPL(iommu_device_register); void iommu_device_unregister(struct iommu_device *iommu) { for (int i = 0; i < ARRAY_SIZE(iommu_buses); i++) bus_for_each_dev(iommu_buses[i], NULL, iommu, remove_iommu_group); spin_lock(&iommu_device_lock); list_del(&iommu->list); spin_unlock(&iommu_device_lock); /* Pairs with the alloc in generic_single_device_group() */ iommu_group_put(iommu->singleton_group); iommu->singleton_group = NULL; } EXPORT_SYMBOL_GPL(iommu_device_unregister); #if IS_ENABLED(CONFIG_IOMMUFD_TEST) void iommu_device_unregister_bus(struct iommu_device *iommu, const struct bus_type *bus, struct notifier_block *nb) { bus_unregister_notifier(bus, nb); iommu_device_unregister(iommu); } EXPORT_SYMBOL_GPL(iommu_device_unregister_bus); /* * Register an iommu driver against a single bus. This is only used by iommufd * selftest to create a mock iommu driver. The caller must provide * some memory to hold a notifier_block. */ int iommu_device_register_bus(struct iommu_device *iommu, const struct iommu_ops *ops, const struct bus_type *bus, struct notifier_block *nb) { int err; iommu->ops = ops; nb->notifier_call = iommu_bus_notifier; err = bus_register_notifier(bus, nb); if (err) return err; spin_lock(&iommu_device_lock); list_add_tail(&iommu->list, &iommu_device_list); spin_unlock(&iommu_device_lock); err = bus_iommu_probe(bus); if (err) { iommu_device_unregister_bus(iommu, bus, nb); return err; } return 0; } EXPORT_SYMBOL_GPL(iommu_device_register_bus); #endif static struct dev_iommu *dev_iommu_get(struct device *dev) { struct dev_iommu *param = dev->iommu; lockdep_assert_held(&iommu_probe_device_lock); if (param) return param; param = kzalloc(sizeof(*param), GFP_KERNEL); if (!param) return NULL; mutex_init(¶m->lock); dev->iommu = param; return param; } void dev_iommu_free(struct device *dev) { struct dev_iommu *param = dev->iommu; dev->iommu = NULL; if (param->fwspec) { fwnode_handle_put(param->fwspec->iommu_fwnode); kfree(param->fwspec); } kfree(param); } /* * Internal equivalent of device_iommu_mapped() for when we care that a device * actually has API ops, and don't want false positives from VFIO-only groups. */ static bool dev_has_iommu(struct device *dev) { return dev->iommu && dev->iommu->iommu_dev; } static u32 dev_iommu_get_max_pasids(struct device *dev) { u32 max_pasids = 0, bits = 0; int ret; if (dev_is_pci(dev)) { ret = pci_max_pasids(to_pci_dev(dev)); if (ret > 0) max_pasids = ret; } else { ret = device_property_read_u32(dev, "pasid-num-bits", &bits); if (!ret) max_pasids = 1UL << bits; } return min_t(u32, max_pasids, dev->iommu->iommu_dev->max_pasids); } void dev_iommu_priv_set(struct device *dev, void *priv) { /* FSL_PAMU does something weird */ if (!IS_ENABLED(CONFIG_FSL_PAMU)) lockdep_assert_held(&iommu_probe_device_lock); dev->iommu->priv = priv; } EXPORT_SYMBOL_GPL(dev_iommu_priv_set); /* * Init the dev->iommu and dev->iommu_group in the struct device and get the * driver probed */ static int iommu_init_device(struct device *dev) { const struct iommu_ops *ops; struct iommu_device *iommu_dev; struct iommu_group *group; int ret; if (!dev_iommu_get(dev)) return -ENOMEM; /* * For FDT-based systems and ACPI IORT/VIOT, the common firmware parsing * is buried in the bus dma_configure path. Properly unpicking that is * still a big job, so for now just invoke the whole thing. The device * already having a driver bound means dma_configure has already run and * found no IOMMU to wait for, so there's no point calling it again. */ if (!dev->iommu->fwspec && !dev->driver && dev->bus->dma_configure) { mutex_unlock(&iommu_probe_device_lock); dev->bus->dma_configure(dev); mutex_lock(&iommu_probe_device_lock); /* If another instance finished the job for us, skip it */ if (!dev->iommu || dev->iommu_group) return -ENODEV; } /* * At this point, relevant devices either now have a fwspec which will * match ops registered with a non-NULL fwnode, or we can reasonably * assume that only one of Intel, AMD, s390, PAMU or legacy SMMUv2 can * be present, and that any of their registered instances has suitable * ops for probing, and thus cheekily co-opt the same mechanism. */ ops = iommu_fwspec_ops(dev->iommu->fwspec); if (!ops) { ret = -ENODEV; goto err_free; } if (!try_module_get(ops->owner)) { ret = -EINVAL; goto err_free; } iommu_dev = ops->probe_device(dev); if (IS_ERR(iommu_dev)) { ret = PTR_ERR(iommu_dev); goto err_module_put; } dev->iommu->iommu_dev = iommu_dev; ret = iommu_device_link(iommu_dev, dev); if (ret) goto err_release; group = ops->device_group(dev); if (WARN_ON_ONCE(group == NULL)) group = ERR_PTR(-EINVAL); if (IS_ERR(group)) { ret = PTR_ERR(group); goto err_unlink; } dev->iommu_group = group; dev->iommu->max_pasids = dev_iommu_get_max_pasids(dev); if (ops->is_attach_deferred) dev->iommu->attach_deferred = ops->is_attach_deferred(dev); return 0; err_unlink: iommu_device_unlink(iommu_dev, dev); err_release: if (ops->release_device) ops->release_device(dev); err_module_put: module_put(ops->owner); err_free: dev->iommu->iommu_dev = NULL; dev_iommu_free(dev); return ret; } static void iommu_deinit_device(struct device *dev) { struct iommu_group *group = dev->iommu_group; const struct iommu_ops *ops = dev_iommu_ops(dev); lockdep_assert_held(&group->mutex); iommu_device_unlink(dev->iommu->iommu_dev, dev); /* * release_device() must stop using any attached domain on the device. * If there are still other devices in the group, they are not affected * by this callback. * * If the iommu driver provides release_domain, the core code ensures * that domain is attached prior to calling release_device. Drivers can * use this to enforce a translation on the idle iommu. Typically, the * global static blocked_domain is a good choice. * * Otherwise, the iommu driver must set the device to either an identity * or a blocking translation in release_device() and stop using any * domain pointer, as it is going to be freed. * * Regardless, if a delayed attach never occurred, then the release * should still avoid touching any hardware configuration either. */ if (!dev->iommu->attach_deferred && ops->release_domain) ops->release_domain->ops->attach_dev(ops->release_domain, dev); if (ops->release_device) ops->release_device(dev); /* * If this is the last driver to use the group then we must free the * domains before we do the module_put(). */ if (list_empty(&group->devices)) { if (group->default_domain) { iommu_domain_free(group->default_domain); group->default_domain = NULL; } if (group->blocking_domain) { iommu_domain_free(group->blocking_domain); group->blocking_domain = NULL; } group->domain = NULL; } /* Caller must put iommu_group */ dev->iommu_group = NULL; module_put(ops->owner); dev_iommu_free(dev); #ifdef CONFIG_IOMMU_DMA dev->dma_iommu = false; #endif } static struct iommu_domain *pasid_array_entry_to_domain(void *entry) { if (xa_pointer_tag(entry) == IOMMU_PASID_ARRAY_DOMAIN) return xa_untag_pointer(entry); return ((struct iommu_attach_handle *)xa_untag_pointer(entry))->domain; } DEFINE_MUTEX(iommu_probe_device_lock); static int __iommu_probe_device(struct device *dev, struct list_head *group_list) { struct iommu_group *group; struct group_device *gdev; int ret; /* * Serialise to avoid races between IOMMU drivers registering in * parallel and/or the "replay" calls from ACPI/OF code via client * driver probe. Once the latter have been cleaned up we should * probably be able to use device_lock() here to minimise the scope, * but for now enforcing a simple global ordering is fine. */ lockdep_assert_held(&iommu_probe_device_lock); /* Device is probed already if in a group */ if (dev->iommu_group) return 0; ret = iommu_init_device(dev); if (ret) return ret; /* * And if we do now see any replay calls, they would indicate someone * misusing the dma_configure path outside bus code. */ if (dev->driver) dev_WARN(dev, "late IOMMU probe at driver bind, something fishy here!\n"); group = dev->iommu_group; gdev = iommu_group_alloc_device(group, dev); mutex_lock(&group->mutex); if (IS_ERR(gdev)) { ret = PTR_ERR(gdev); goto err_put_group; } /* * The gdev must be in the list before calling * iommu_setup_default_domain() */ list_add_tail(&gdev->list, &group->devices); WARN_ON(group->default_domain && !group->domain); if (group->default_domain) iommu_create_device_direct_mappings(group->default_domain, dev); if (group->domain) { ret = __iommu_device_set_domain(group, dev, group->domain, 0); if (ret) goto err_remove_gdev; } else if (!group->default_domain && !group_list) { ret = iommu_setup_default_domain(group, 0); if (ret) goto err_remove_gdev; } else if (!group->default_domain) { /* * With a group_list argument we defer the default_domain setup * to the caller by providing a de-duplicated list of groups * that need further setup. */ if (list_empty(&group->entry)) list_add_tail(&group->entry, group_list); } if (group->default_domain) iommu_setup_dma_ops(dev); mutex_unlock(&group->mutex); return 0; err_remove_gdev: list_del(&gdev->list); __iommu_group_free_device(group, gdev); err_put_group: iommu_deinit_device(dev); mutex_unlock(&group->mutex); iommu_group_put(group); return ret; } int iommu_probe_device(struct device *dev) { const struct iommu_ops *ops; int ret; mutex_lock(&iommu_probe_device_lock); ret = __iommu_probe_device(dev, NULL); mutex_unlock(&iommu_probe_device_lock); if (ret) return ret; ops = dev_iommu_ops(dev); if (ops->probe_finalize) ops->probe_finalize(dev); return 0; } static void __iommu_group_free_device(struct iommu_group *group, struct group_device *grp_dev) { struct device *dev = grp_dev->dev; sysfs_remove_link(group->devices_kobj, grp_dev->name); sysfs_remove_link(&dev->kobj, "iommu_group"); trace_remove_device_from_group(group->id, dev); /* * If the group has become empty then ownership must have been * released, and the current domain must be set back to NULL or * the default domain. */ if (list_empty(&group->devices)) WARN_ON(group->owner_cnt || group->domain != group->default_domain); kfree(grp_dev->name); kfree(grp_dev); } /* Remove the iommu_group from the struct device. */ static void __iommu_group_remove_device(struct device *dev) { struct iommu_group *group = dev->iommu_group; struct group_device *device; mutex_lock(&group->mutex); for_each_group_device(group, device) { if (device->dev != dev) continue; list_del(&device->list); __iommu_group_free_device(group, device); if (dev_has_iommu(dev)) iommu_deinit_device(dev); else dev->iommu_group = NULL; break; } mutex_unlock(&group->mutex); /* * Pairs with the get in iommu_init_device() or * iommu_group_add_device() */ iommu_group_put(group); } static void iommu_release_device(struct device *dev) { struct iommu_group *group = dev->iommu_group; if (group) __iommu_group_remove_device(dev); /* Free any fwspec if no iommu_driver was ever attached */ if (dev->iommu) dev_iommu_free(dev); } static int __init iommu_set_def_domain_type(char *str) { bool pt; int ret; ret = kstrtobool(str, &pt); if (ret) return ret; if (pt) iommu_set_default_passthrough(true); else iommu_set_default_translated(true); return 0; } early_param("iommu.passthrough", iommu_set_def_domain_type); static int __init iommu_dma_setup(char *str) { int ret = kstrtobool(str, &iommu_dma_strict); if (!ret) iommu_cmd_line |= IOMMU_CMD_LINE_STRICT; return ret; } early_param("iommu.strict", iommu_dma_setup); void iommu_set_dma_strict(void) { iommu_dma_strict = true; if (iommu_def_domain_type == IOMMU_DOMAIN_DMA_FQ) iommu_def_domain_type = IOMMU_DOMAIN_DMA; } static ssize_t iommu_group_attr_show(struct kobject *kobj, struct attribute *__attr, char *buf) { struct iommu_group_attribute *attr = to_iommu_group_attr(__attr); struct iommu_group *group = to_iommu_group(kobj); ssize_t ret = -EIO; if (attr->show) ret = attr->show(group, buf); return ret; } static ssize_t iommu_group_attr_store(struct kobject *kobj, struct attribute *__attr, const char *buf, size_t count) { struct iommu_group_attribute *attr = to_iommu_group_attr(__attr); struct iommu_group *group = to_iommu_group(kobj); ssize_t ret = -EIO; if (attr->store) ret = attr->store(group, buf, count); return ret; } static const struct sysfs_ops iommu_group_sysfs_ops = { .show = iommu_group_attr_show, .store = iommu_group_attr_store, }; static int iommu_group_create_file(struct iommu_group *group, struct iommu_group_attribute *attr) { return sysfs_create_file(&group->kobj, &attr->attr); } static void iommu_group_remove_file(struct iommu_group *group, struct iommu_group_attribute *attr) { sysfs_remove_file(&group->kobj, &attr->attr); } static ssize_t iommu_group_show_name(struct iommu_group *group, char *buf) { return sysfs_emit(buf, "%s\n", group->name); } /** * iommu_insert_resv_region - Insert a new region in the * list of reserved regions. * @new: new region to insert * @regions: list of regions * * Elements are sorted by start address and overlapping segments * of the same type are merged. */ static int iommu_insert_resv_region(struct iommu_resv_region *new, struct list_head *regions) { struct iommu_resv_region *iter, *tmp, *nr, *top; LIST_HEAD(stack); nr = iommu_alloc_resv_region(new->start, new->length, new->prot, new->type, GFP_KERNEL); if (!nr) return -ENOMEM; /* First add the new element based on start address sorting */ list_for_each_entry(iter, regions, list) { if (nr->start < iter->start || (nr->start == iter->start && nr->type <= iter->type)) break; } list_add_tail(&nr->list, &iter->list); /* Merge overlapping segments of type nr->type in @regions, if any */ list_for_each_entry_safe(iter, tmp, regions, list) { phys_addr_t top_end, iter_end = iter->start + iter->length - 1; /* no merge needed on elements of different types than @new */ if (iter->type != new->type) { list_move_tail(&iter->list, &stack); continue; } /* look for the last stack element of same type as @iter */ list_for_each_entry_reverse(top, &stack, list) if (top->type == iter->type) goto check_overlap; list_move_tail(&iter->list, &stack); continue; check_overlap: top_end = top->start + top->length - 1; if (iter->start > top_end + 1) { list_move_tail(&iter->list, &stack); } else { top->length = max(top_end, iter_end) - top->start + 1; list_del(&iter->list); kfree(iter); } } list_splice(&stack, regions); return 0; } static int iommu_insert_device_resv_regions(struct list_head *dev_resv_regions, struct list_head *group_resv_regions) { struct iommu_resv_region *entry; int ret = 0; list_for_each_entry(entry, dev_resv_regions, list) { ret = iommu_insert_resv_region(entry, group_resv_regions); if (ret) break; } return ret; } int iommu_get_group_resv_regions(struct iommu_group *group, struct list_head *head) { struct group_device *device; int ret = 0; mutex_lock(&group->mutex); for_each_group_device(group, device) { struct list_head dev_resv_regions; /* * Non-API groups still expose reserved_regions in sysfs, * so filter out calls that get here that way. */ if (!dev_has_iommu(device->dev)) break; INIT_LIST_HEAD(&dev_resv_regions); iommu_get_resv_regions(device->dev, &dev_resv_regions); ret = iommu_insert_device_resv_regions(&dev_resv_regions, head); iommu_put_resv_regions(device->dev, &dev_resv_regions); if (ret) break; } mutex_unlock(&group->mutex); return ret; } EXPORT_SYMBOL_GPL(iommu_get_group_resv_regions); static ssize_t iommu_group_show_resv_regions(struct iommu_group *group, char *buf) { struct iommu_resv_region *region, *next; struct list_head group_resv_regions; int offset = 0; INIT_LIST_HEAD(&group_resv_regions); iommu_get_group_resv_regions(group, &group_resv_regions); list_for_each_entry_safe(region, next, &group_resv_regions, list) { offset += sysfs_emit_at(buf, offset, "0x%016llx 0x%016llx %s\n", (long long)region->start, (long long)(region->start + region->length - 1), iommu_group_resv_type_string[region->type]); kfree(region); } return offset; } static ssize_t iommu_group_show_type(struct iommu_group *group, char *buf) { char *type = "unknown"; mutex_lock(&group->mutex); if (group->default_domain) { switch (group->default_domain->type) { case IOMMU_DOMAIN_BLOCKED: type = "blocked"; break; case IOMMU_DOMAIN_IDENTITY: type = "identity"; break; case IOMMU_DOMAIN_UNMANAGED: type = "unmanaged"; break; case IOMMU_DOMAIN_DMA: type = "DMA"; break; case IOMMU_DOMAIN_DMA_FQ: type = "DMA-FQ"; break; } } mutex_unlock(&group->mutex); return sysfs_emit(buf, "%s\n", type); } static IOMMU_GROUP_ATTR(name, S_IRUGO, iommu_group_show_name, NULL); static IOMMU_GROUP_ATTR(reserved_regions, 0444, iommu_group_show_resv_regions, NULL); static IOMMU_GROUP_ATTR(type, 0644, iommu_group_show_type, iommu_group_store_type); static void iommu_group_release(struct kobject *kobj) { struct iommu_group *group = to_iommu_group(kobj); pr_debug("Releasing group %d\n", group->id); if (group->iommu_data_release) group->iommu_data_release(group->iommu_data); ida_free(&iommu_group_ida, group->id); /* Domains are free'd by iommu_deinit_device() */ WARN_ON(group->default_domain); WARN_ON(group->blocking_domain); kfree(group->name); kfree(group); } static const struct kobj_type iommu_group_ktype = { .sysfs_ops = &iommu_group_sysfs_ops, .release = iommu_group_release, }; /** * iommu_group_alloc - Allocate a new group * * This function is called by an iommu driver to allocate a new iommu * group. The iommu group represents the minimum granularity of the iommu. * Upon successful return, the caller holds a reference to the supplied * group in order to hold the group until devices are added. Use * iommu_group_put() to release this extra reference count, allowing the * group to be automatically reclaimed once it has no devices or external * references. */ struct iommu_group *iommu_group_alloc(void) { struct iommu_group *group; int ret; group = kzalloc(sizeof(*group), GFP_KERNEL); if (!group) return ERR_PTR(-ENOMEM); group->kobj.kset = iommu_group_kset; mutex_init(&group->mutex); INIT_LIST_HEAD(&group->devices); INIT_LIST_HEAD(&group->entry); xa_init(&group->pasid_array); ret = ida_alloc(&iommu_group_ida, GFP_KERNEL); if (ret < 0) { kfree(group); return ERR_PTR(ret); } group->id = ret; ret = kobject_init_and_add(&group->kobj, &iommu_group_ktype, NULL, "%d", group->id); if (ret) { kobject_put(&group->kobj); return ERR_PTR(ret); } group->devices_kobj = kobject_create_and_add("devices", &group->kobj); if (!group->devices_kobj) { kobject_put(&group->kobj); /* triggers .release & free */ return ERR_PTR(-ENOMEM); } /* * The devices_kobj holds a reference on the group kobject, so * as long as that exists so will the group. We can therefore * use the devices_kobj for reference counting. */ kobject_put(&group->kobj); ret = iommu_group_create_file(group, &iommu_group_attr_reserved_regions); if (ret) { kobject_put(group->devices_kobj); return ERR_PTR(ret); } ret = iommu_group_create_file(group, &iommu_group_attr_type); if (ret) { kobject_put(group->devices_kobj); return ERR_PTR(ret); } pr_debug("Allocated group %d\n", group->id); return group; } EXPORT_SYMBOL_GPL(iommu_group_alloc); /** * iommu_group_get_iommudata - retrieve iommu_data registered for a group * @group: the group * * iommu drivers can store data in the group for use when doing iommu * operations. This function provides a way to retrieve it. Caller * should hold a group reference. */ void *iommu_group_get_iommudata(struct iommu_group *group) { return group->iommu_data; } EXPORT_SYMBOL_GPL(iommu_group_get_iommudata); /** * iommu_group_set_iommudata - set iommu_data for a group * @group: the group * @iommu_data: new data * @release: release function for iommu_data * * iommu drivers can store data in the group for use when doing iommu * operations. This function provides a way to set the data after * the group has been allocated. Caller should hold a group reference. */ void iommu_group_set_iommudata(struct iommu_group *group, void *iommu_data, void (*release)(void *iommu_data)) { group->iommu_data = iommu_data; group->iommu_data_release = release; } EXPORT_SYMBOL_GPL(iommu_group_set_iommudata); /** * iommu_group_set_name - set name for a group * @group: the group * @name: name * * Allow iommu driver to set a name for a group. When set it will * appear in a name attribute file under the group in sysfs. */ int iommu_group_set_name(struct iommu_group *group, const char *name) { int ret; if (group->name) { iommu_group_remove_file(group, &iommu_group_attr_name); kfree(group->name); group->name = NULL; if (!name) return 0; } group->name = kstrdup(name, GFP_KERNEL); if (!group->name) return -ENOMEM; ret = iommu_group_create_file(group, &iommu_group_attr_name); if (ret) { kfree(group->name); group->name = NULL; return ret; } return 0; } EXPORT_SYMBOL_GPL(iommu_group_set_name); static int iommu_create_device_direct_mappings(struct iommu_domain *domain, struct device *dev) { struct iommu_resv_region *entry; struct list_head mappings; unsigned long pg_size; int ret = 0; pg_size = domain->pgsize_bitmap ? 1UL << __ffs(domain->pgsize_bitmap) : 0; INIT_LIST_HEAD(&mappings); if (WARN_ON_ONCE(iommu_is_dma_domain(domain) && !pg_size)) return -EINVAL; iommu_get_resv_regions(dev, &mappings); /* We need to consider overlapping regions for different devices */ list_for_each_entry(entry, &mappings, list) { dma_addr_t start, end, addr; size_t map_size = 0; if (entry->type == IOMMU_RESV_DIRECT) dev->iommu->require_direct = 1; if ((entry->type != IOMMU_RESV_DIRECT && entry->type != IOMMU_RESV_DIRECT_RELAXABLE) || !iommu_is_dma_domain(domain)) continue; start = ALIGN(entry->start, pg_size); end = ALIGN(entry->start + entry->length, pg_size); for (addr = start; addr <= end; addr += pg_size) { phys_addr_t phys_addr; if (addr == end) goto map_end; phys_addr = iommu_iova_to_phys(domain, addr); if (!phys_addr) { map_size += pg_size; continue; } map_end: if (map_size) { ret = iommu_map(domain, addr - map_size, addr - map_size, map_size, entry->prot, GFP_KERNEL); if (ret) goto out; map_size = 0; } } } out: iommu_put_resv_regions(dev, &mappings); return ret; } /* This is undone by __iommu_group_free_device() */ static struct group_device *iommu_group_alloc_device(struct iommu_group *group, struct device *dev) { int ret, i = 0; struct group_device *device; device = kzalloc(sizeof(*device), GFP_KERNEL); if (!device) return ERR_PTR(-ENOMEM); device->dev = dev; ret = sysfs_create_link(&dev->kobj, &group->kobj, "iommu_group"); if (ret) goto err_free_device; device->name = kasprintf(GFP_KERNEL, "%s", kobject_name(&dev->kobj)); rename: if (!device->name) { ret = -ENOMEM; goto err_remove_link; } ret = sysfs_create_link_nowarn(group->devices_kobj, &dev->kobj, device->name); if (ret) { if (ret == -EEXIST && i >= 0) { /* * Account for the slim chance of collision * and append an instance to the name. */ kfree(device->name); device->name = kasprintf(GFP_KERNEL, "%s.%d", kobject_name(&dev->kobj), i++); goto rename; } goto err_free_name; } trace_add_device_to_group(group->id, dev); dev_info(dev, "Adding to iommu group %d\n", group->id); return device; err_free_name: kfree(device->name); err_remove_link: sysfs_remove_link(&dev->kobj, "iommu_group"); err_free_device: kfree(device); dev_err(dev, "Failed to add to iommu group %d: %d\n", group->id, ret); return ERR_PTR(ret); } /** * iommu_group_add_device - add a device to an iommu group * @group: the group into which to add the device (reference should be held) * @dev: the device * * This function is called by an iommu driver to add a device into a * group. Adding a device increments the group reference count. */ int iommu_group_add_device(struct iommu_group *group, struct device *dev) { struct group_device *gdev; gdev = iommu_group_alloc_device(group, dev); if (IS_ERR(gdev)) return PTR_ERR(gdev); iommu_group_ref_get(group); dev->iommu_group = group; mutex_lock(&group->mutex); list_add_tail(&gdev->list, &group->devices); mutex_unlock(&group->mutex); return 0; } EXPORT_SYMBOL_GPL(iommu_group_add_device); /** * iommu_group_remove_device - remove a device from it's current group * @dev: device to be removed * * This function is called by an iommu driver to remove the device from * it's current group. This decrements the iommu group reference count. */ void iommu_group_remove_device(struct device *dev) { struct iommu_group *group = dev->iommu_group; if (!group) return; dev_info(dev, "Removing from iommu group %d\n", group->id); __iommu_group_remove_device(dev); } EXPORT_SYMBOL_GPL(iommu_group_remove_device); #if IS_ENABLED(CONFIG_LOCKDEP) && IS_ENABLED(CONFIG_IOMMU_API) /** * iommu_group_mutex_assert - Check device group mutex lock * @dev: the device that has group param set * * This function is called by an iommu driver to check whether it holds * group mutex lock for the given device or not. * * Note that this function must be called after device group param is set. */ void iommu_group_mutex_assert(struct device *dev) { struct iommu_group *group = dev->iommu_group; lockdep_assert_held(&group->mutex); } EXPORT_SYMBOL_GPL(iommu_group_mutex_assert); #endif static struct device *iommu_group_first_dev(struct iommu_group *group) { lockdep_assert_held(&group->mutex); return list_first_entry(&group->devices, struct group_device, list)->dev; } /** * iommu_group_for_each_dev - iterate over each device in the group * @group: the group * @data: caller opaque data to be passed to callback function * @fn: caller supplied callback function * * This function is called by group users to iterate over group devices. * Callers should hold a reference count to the group during callback. * The group->mutex is held across callbacks, which will block calls to * iommu_group_add/remove_device. */ int iommu_group_for_each_dev(struct iommu_group *group, void *data, int (*fn)(struct device *, void *)) { struct group_device *device; int ret = 0; mutex_lock(&group->mutex); for_each_group_device(group, device) { ret = fn(device->dev, data); if (ret) break; } mutex_unlock(&group->mutex); return ret; } EXPORT_SYMBOL_GPL(iommu_group_for_each_dev); /** * iommu_group_get - Return the group for a device and increment reference * @dev: get the group that this device belongs to * * This function is called by iommu drivers and users to get the group * for the specified device. If found, the group is returned and the group * reference in incremented, else NULL. */ struct iommu_group *iommu_group_get(struct device *dev) { struct iommu_group *group = dev->iommu_group; if (group) kobject_get(group->devices_kobj); return group; } EXPORT_SYMBOL_GPL(iommu_group_get); /** * iommu_group_ref_get - Increment reference on a group * @group: the group to use, must not be NULL * * This function is called by iommu drivers to take additional references on an * existing group. Returns the given group for convenience. */ struct iommu_group *iommu_group_ref_get(struct iommu_group *group) { kobject_get(group->devices_kobj); return group; } EXPORT_SYMBOL_GPL(iommu_group_ref_get); /** * iommu_group_put - Decrement group reference * @group: the group to use * * This function is called by iommu drivers and users to release the * iommu group. Once the reference count is zero, the group is released. */ void iommu_group_put(struct iommu_group *group) { if (group) kobject_put(group->devices_kobj); } EXPORT_SYMBOL_GPL(iommu_group_put); /** * iommu_group_id - Return ID for a group * @group: the group to ID * * Return the unique ID for the group matching the sysfs group number. */ int iommu_group_id(struct iommu_group *group) { return group->id; } EXPORT_SYMBOL_GPL(iommu_group_id); static struct iommu_group *get_pci_alias_group(struct pci_dev *pdev, unsigned long *devfns); /* * To consider a PCI device isolated, we require ACS to support Source * Validation, Request Redirection, Completer Redirection, and Upstream * Forwarding. This effectively means that devices cannot spoof their * requester ID, requests and completions cannot be redirected, and all * transactions are forwarded upstream, even as it passes through a * bridge where the target device is downstream. */ #define REQ_ACS_FLAGS (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF) /* * For multifunction devices which are not isolated from each other, find * all the other non-isolated functions and look for existing groups. For * each function, we also need to look for aliases to or from other devices * that may already have a group. */ static struct iommu_group *get_pci_function_alias_group(struct pci_dev *pdev, unsigned long *devfns) { struct pci_dev *tmp = NULL; struct iommu_group *group; if (!pdev->multifunction || pci_acs_enabled(pdev, REQ_ACS_FLAGS)) return NULL; for_each_pci_dev(tmp) { if (tmp == pdev || tmp->bus != pdev->bus || PCI_SLOT(tmp->devfn) != PCI_SLOT(pdev->devfn) || pci_acs_enabled(tmp, REQ_ACS_FLAGS)) continue; group = get_pci_alias_group(tmp, devfns); if (group) { pci_dev_put(tmp); return group; } } return NULL; } /* * Look for aliases to or from the given device for existing groups. DMA * aliases are only supported on the same bus, therefore the search * space is quite small (especially since we're really only looking at pcie * device, and therefore only expect multiple slots on the root complex or * downstream switch ports). It's conceivable though that a pair of * multifunction devices could have aliases between them that would cause a * loop. To prevent this, we use a bitmap to track where we've been. */ static struct iommu_group *get_pci_alias_group(struct pci_dev *pdev, unsigned long *devfns) { struct pci_dev *tmp = NULL; struct iommu_group *group; if (test_and_set_bit(pdev->devfn & 0xff, devfns)) return NULL; group = iommu_group_get(&pdev->dev); if (group) return group; for_each_pci_dev(tmp) { if (tmp == pdev || tmp->bus != pdev->bus) continue; /* We alias them or they alias us */ if (pci_devs_are_dma_aliases(pdev, tmp)) { group = get_pci_alias_group(tmp, devfns); if (group) { pci_dev_put(tmp); return group; } group = get_pci_function_alias_group(tmp, devfns); if (group) { pci_dev_put(tmp); return group; } } } return NULL; } struct group_for_pci_data { struct pci_dev *pdev; struct iommu_group *group; }; /* * DMA alias iterator callback, return the last seen device. Stop and return * the IOMMU group if we find one along the way. */ static int get_pci_alias_or_group(struct pci_dev *pdev, u16 alias, void *opaque) { struct group_for_pci_data *data = opaque; data->pdev = pdev; data->group = iommu_group_get(&pdev->dev); return data->group != NULL; } /* * Generic device_group call-back function. It just allocates one * iommu-group per device. */ struct iommu_group *generic_device_group(struct device *dev) { return iommu_group_alloc(); } EXPORT_SYMBOL_GPL(generic_device_group); /* * Generic device_group call-back function. It just allocates one * iommu-group per iommu driver instance shared by every device * probed by that iommu driver. */ struct iommu_group *generic_single_device_group(struct device *dev) { struct iommu_device *iommu = dev->iommu->iommu_dev; if (!iommu->singleton_group) { struct iommu_group *group; group = iommu_group_alloc(); if (IS_ERR(group)) return group; iommu->singleton_group = group; } return iommu_group_ref_get(iommu->singleton_group); } EXPORT_SYMBOL_GPL(generic_single_device_group); /* * Use standard PCI bus topology, isolation features, and DMA alias quirks * to find or create an IOMMU group for a device. */ struct iommu_group *pci_device_group(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); struct group_for_pci_data data; struct pci_bus *bus; struct iommu_group *group = NULL; u64 devfns[4] = { 0 }; if (WARN_ON(!dev_is_pci(dev))) return ERR_PTR(-EINVAL); /* * Find the upstream DMA alias for the device. A device must not * be aliased due to topology in order to have its own IOMMU group. * If we find an alias along the way that already belongs to a * group, use it. */ if (pci_for_each_dma_alias(pdev, get_pci_alias_or_group, &data)) return data.group; pdev = data.pdev; /* * Continue upstream from the point of minimum IOMMU granularity * due to aliases to the point where devices are protected from * peer-to-peer DMA by PCI ACS. Again, if we find an existing * group, use it. */ for (bus = pdev->bus; !pci_is_root_bus(bus); bus = bus->parent) { if (!bus->self) continue; if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS)) break; pdev = bus->self; group = iommu_group_get(&pdev->dev); if (group) return group; } /* * Look for existing groups on device aliases. If we alias another * device or another device aliases us, use the same group. */ group = get_pci_alias_group(pdev, (unsigned long *)devfns); if (group) return group; /* * Look for existing groups on non-isolated functions on the same * slot and aliases of those funcions, if any. No need to clear * the search bitmap, the tested devfns are still valid. */ group = get_pci_function_alias_group(pdev, (unsigned long *)devfns); if (group) return group; /* No shared group found, allocate new */ return iommu_group_alloc(); } EXPORT_SYMBOL_GPL(pci_device_group); /* Get the IOMMU group for device on fsl-mc bus */ struct iommu_group *fsl_mc_device_group(struct device *dev) { struct device *cont_dev = fsl_mc_cont_dev(dev); struct iommu_group *group; group = iommu_group_get(cont_dev); if (!group) group = iommu_group_alloc(); return group; } EXPORT_SYMBOL_GPL(fsl_mc_device_group); static struct iommu_domain *__iommu_alloc_identity_domain(struct device *dev) { const struct iommu_ops *ops = dev_iommu_ops(dev); struct iommu_domain *domain; if (ops->identity_domain) return ops->identity_domain; if (ops->domain_alloc_identity) { domain = ops->domain_alloc_identity(dev); if (IS_ERR(domain)) return domain; } else { return ERR_PTR(-EOPNOTSUPP); } iommu_domain_init(domain, IOMMU_DOMAIN_IDENTITY, ops); return domain; } static struct iommu_domain * __iommu_group_alloc_default_domain(struct iommu_group *group, int req_type) { struct device *dev = iommu_group_first_dev(group); struct iommu_domain *dom; if (group->default_domain && group->default_domain->type == req_type) return group->default_domain; /* * When allocating the DMA API domain assume that the driver is going to * use PASID and make sure the RID's domain is PASID compatible. */ if (req_type & __IOMMU_DOMAIN_PAGING) { dom = __iommu_paging_domain_alloc_flags(dev, req_type, dev->iommu->max_pasids ? IOMMU_HWPT_ALLOC_PASID : 0); /* * If driver does not support PASID feature then * try to allocate non-PASID domain */ if (PTR_ERR(dom) == -EOPNOTSUPP) dom = __iommu_paging_domain_alloc_flags(dev, req_type, 0); return dom; } if (req_type == IOMMU_DOMAIN_IDENTITY) return __iommu_alloc_identity_domain(dev); return ERR_PTR(-EINVAL); } /* * req_type of 0 means "auto" which means to select a domain based on * iommu_def_domain_type or what the driver actually supports. */ static struct iommu_domain * iommu_group_alloc_default_domain(struct iommu_group *group, int req_type) { const struct iommu_ops *ops = dev_iommu_ops(iommu_group_first_dev(group)); struct iommu_domain *dom; lockdep_assert_held(&group->mutex); /* * Allow legacy drivers to specify the domain that will be the default * domain. This should always be either an IDENTITY/BLOCKED/PLATFORM * domain. Do not use in new drivers. */ if (ops->default_domain) { if (req_type != ops->default_domain->type) return ERR_PTR(-EINVAL); return ops->default_domain; } if (req_type) return __iommu_group_alloc_default_domain(group, req_type); /* The driver gave no guidance on what type to use, try the default */ dom = __iommu_group_alloc_default_domain(group, iommu_def_domain_type); if (!IS_ERR(dom)) return dom; /* Otherwise IDENTITY and DMA_FQ defaults will try DMA */ if (iommu_def_domain_type == IOMMU_DOMAIN_DMA) return ERR_PTR(-EINVAL); dom = __iommu_group_alloc_default_domain(group, IOMMU_DOMAIN_DMA); if (IS_ERR(dom)) return dom; pr_warn("Failed to allocate default IOMMU domain of type %u for group %s - Falling back to IOMMU_DOMAIN_DMA", iommu_def_domain_type, group->name); return dom; } struct iommu_domain *iommu_group_default_domain(struct iommu_group *group) { return group->default_domain; } static int probe_iommu_group(struct device *dev, void *data) { struct list_head *group_list = data; int ret; mutex_lock(&iommu_probe_device_lock); ret = __iommu_probe_device(dev, group_list); mutex_unlock(&iommu_probe_device_lock); if (ret == -ENODEV) ret = 0; return ret; } static int iommu_bus_notifier(struct notifier_block *nb, unsigned long action, void *data) { struct device *dev = data; if (action == BUS_NOTIFY_ADD_DEVICE) { int ret; ret = iommu_probe_device(dev); return (ret) ? NOTIFY_DONE : NOTIFY_OK; } else if (action == BUS_NOTIFY_REMOVED_DEVICE) { iommu_release_device(dev); return NOTIFY_OK; } return 0; } /* * Combine the driver's chosen def_domain_type across all the devices in a * group. Drivers must give a consistent result. */ static int iommu_get_def_domain_type(struct iommu_group *group, struct device *dev, int cur_type) { const struct iommu_ops *ops = dev_iommu_ops(dev); int type; if (ops->default_domain) { /* * Drivers that declare a global static default_domain will * always choose that. */ type = ops->default_domain->type; } else { if (ops->def_domain_type) type = ops->def_domain_type(dev); else return cur_type; } if (!type || cur_type == type) return cur_type; if (!cur_type) return type; dev_err_ratelimited( dev, "IOMMU driver error, requesting conflicting def_domain_type, %s and %s, for devices in group %u.\n", iommu_domain_type_str(cur_type), iommu_domain_type_str(type), group->id); /* * Try to recover, drivers are allowed to force IDENTITY or DMA, IDENTITY * takes precedence. */ if (type == IOMMU_DOMAIN_IDENTITY) return type; return cur_type; } /* * A target_type of 0 will select the best domain type. 0 can be returned in * this case meaning the global default should be used. */ static int iommu_get_default_domain_type(struct iommu_group *group, int target_type) { struct device *untrusted = NULL; struct group_device *gdev; int driver_type = 0; lockdep_assert_held(&group->mutex); /* * ARM32 drivers supporting CONFIG_ARM_DMA_USE_IOMMU can declare an * identity_domain and it will automatically become their default * domain. Later on ARM_DMA_USE_IOMMU will install its UNMANAGED domain. * Override the selection to IDENTITY. */ if (IS_ENABLED(CONFIG_ARM_DMA_USE_IOMMU)) { static_assert(!(IS_ENABLED(CONFIG_ARM_DMA_USE_IOMMU) && IS_ENABLED(CONFIG_IOMMU_DMA))); driver_type = IOMMU_DOMAIN_IDENTITY; } for_each_group_device(group, gdev) { driver_type = iommu_get_def_domain_type(group, gdev->dev, driver_type); if (dev_is_pci(gdev->dev) && to_pci_dev(gdev->dev)->untrusted) { /* * No ARM32 using systems will set untrusted, it cannot * work. */ if (WARN_ON(IS_ENABLED(CONFIG_ARM_DMA_USE_IOMMU))) return -1; untrusted = gdev->dev; } } /* * If the common dma ops are not selected in kconfig then we cannot use * IOMMU_DOMAIN_DMA at all. Force IDENTITY if nothing else has been * selected. */ if (!IS_ENABLED(CONFIG_IOMMU_DMA)) { if (WARN_ON(driver_type == IOMMU_DOMAIN_DMA)) return -1; if (!driver_type) driver_type = IOMMU_DOMAIN_IDENTITY; } if (untrusted) { if (driver_type && driver_type != IOMMU_DOMAIN_DMA) { dev_err_ratelimited( untrusted, "Device is not trusted, but driver is overriding group %u to %s, refusing to probe.\n", group->id, iommu_domain_type_str(driver_type)); return -1; } driver_type = IOMMU_DOMAIN_DMA; } if (target_type) { if (driver_type && target_type != driver_type) return -1; return target_type; } return driver_type; } static void iommu_group_do_probe_finalize(struct device *dev) { const struct iommu_ops *ops = dev_iommu_ops(dev); if (ops->probe_finalize) ops->probe_finalize(dev); } static int bus_iommu_probe(const struct bus_type *bus) { struct iommu_group *group, *next; LIST_HEAD(group_list); int ret; ret = bus_for_each_dev(bus, NULL, &group_list, probe_iommu_group); if (ret) return ret; list_for_each_entry_safe(group, next, &group_list, entry) { struct group_device *gdev; mutex_lock(&group->mutex); /* Remove item from the list */ list_del_init(&group->entry); /* * We go to the trouble of deferred default domain creation so * that the cross-group default domain type and the setup of the * IOMMU_RESV_DIRECT will work correctly in non-hotpug scenarios. */ ret = iommu_setup_default_domain(group, 0); if (ret) { mutex_unlock(&group->mutex); return ret; } for_each_group_device(group, gdev) iommu_setup_dma_ops(gdev->dev); mutex_unlock(&group->mutex); /* * FIXME: Mis-locked because the ops->probe_finalize() call-back * of some IOMMU drivers calls arm_iommu_attach_device() which * in-turn might call back into IOMMU core code, where it tries * to take group->mutex, resulting in a deadlock. */ for_each_group_device(group, gdev) iommu_group_do_probe_finalize(gdev->dev); } return 0; } /** * device_iommu_capable() - check for a general IOMMU capability * @dev: device to which the capability would be relevant, if available * @cap: IOMMU capability * * Return: true if an IOMMU is present and supports the given capability * for the given device, otherwise false. */ bool device_iommu_capable(struct device *dev, enum iommu_cap cap) { const struct iommu_ops *ops; if (!dev_has_iommu(dev)) return false; ops = dev_iommu_ops(dev); if (!ops->capable) return false; return ops->capable(dev, cap); } EXPORT_SYMBOL_GPL(device_iommu_capable); /** * iommu_group_has_isolated_msi() - Compute msi_device_has_isolated_msi() * for a group * @group: Group to query * * IOMMU groups should not have differing values of * msi_device_has_isolated_msi() for devices in a group. However nothing * directly prevents this, so ensure mistakes don't result in isolation failures * by checking that all the devices are the same. */ bool iommu_group_has_isolated_msi(struct iommu_group *group) { struct group_device *group_dev; bool ret = true; mutex_lock(&group->mutex); for_each_group_device(group, group_dev) ret &= msi_device_has_isolated_msi(group_dev->dev); mutex_unlock(&group->mutex); return ret; } EXPORT_SYMBOL_GPL(iommu_group_has_isolated_msi); /** * iommu_set_fault_handler() - set a fault handler for an iommu domain * @domain: iommu domain * @handler: fault handler * @token: user data, will be passed back to the fault handler * * This function should be used by IOMMU users which want to be notified * whenever an IOMMU fault happens. * * The fault handler itself should return 0 on success, and an appropriate * error code otherwise. */ void iommu_set_fault_handler(struct iommu_domain *domain, iommu_fault_handler_t handler, void *token) { if (WARN_ON(!domain || domain->cookie_type != IOMMU_COOKIE_NONE)) return; domain->cookie_type = IOMMU_COOKIE_FAULT_HANDLER; domain->handler = handler; domain->handler_token = token; } EXPORT_SYMBOL_GPL(iommu_set_fault_handler); static void iommu_domain_init(struct iommu_domain *domain, unsigned int type, const struct iommu_ops *ops) { domain->type = type; domain->owner = ops; if (!domain->ops) domain->ops = ops->default_domain_ops; } static struct iommu_domain * __iommu_paging_domain_alloc_flags(struct device *dev, unsigned int type, unsigned int flags) { const struct iommu_ops *ops; struct iommu_domain *domain; if (!dev_has_iommu(dev)) return ERR_PTR(-ENODEV); ops = dev_iommu_ops(dev); if (ops->domain_alloc_paging && !flags) domain = ops->domain_alloc_paging(dev); else if (ops->domain_alloc_paging_flags) domain = ops->domain_alloc_paging_flags(dev, flags, NULL); #if IS_ENABLED(CONFIG_FSL_PAMU) else if (ops->domain_alloc && !flags) domain = ops->domain_alloc(IOMMU_DOMAIN_UNMANAGED); #endif else return ERR_PTR(-EOPNOTSUPP); if (IS_ERR(domain)) return domain; if (!domain) return ERR_PTR(-ENOMEM); iommu_domain_init(domain, type, ops); return domain; } /** * iommu_paging_domain_alloc_flags() - Allocate a paging domain * @dev: device for which the domain is allocated * @flags: Bitmap of iommufd_hwpt_alloc_flags * * Allocate a paging domain which will be managed by a kernel driver. Return * allocated domain if successful, or an ERR pointer for failure. */ struct iommu_domain *iommu_paging_domain_alloc_flags(struct device *dev, unsigned int flags) { return __iommu_paging_domain_alloc_flags(dev, IOMMU_DOMAIN_UNMANAGED, flags); } EXPORT_SYMBOL_GPL(iommu_paging_domain_alloc_flags); void iommu_domain_free(struct iommu_domain *domain) { switch (domain->cookie_type) { case IOMMU_COOKIE_DMA_IOVA: iommu_put_dma_cookie(domain); break; case IOMMU_COOKIE_DMA_MSI: iommu_put_msi_cookie(domain); break; case IOMMU_COOKIE_SVA: mmdrop(domain->mm); break; default: break; } if (domain->ops->free) domain->ops->free(domain); } EXPORT_SYMBOL_GPL(iommu_domain_free); /* * Put the group's domain back to the appropriate core-owned domain - either the * standard kernel-mode DMA configuration or an all-DMA-blocked domain. */ static void __iommu_group_set_core_domain(struct iommu_group *group) { struct iommu_domain *new_domain; if (group->owner) new_domain = group->blocking_domain; else new_domain = group->default_domain; __iommu_group_set_domain_nofail(group, new_domain); } static int __iommu_attach_device(struct iommu_domain *domain, struct device *dev) { int ret; if (unlikely(domain->ops->attach_dev == NULL)) return -ENODEV; ret = domain->ops->attach_dev(domain, dev); if (ret) return ret; dev->iommu->attach_deferred = 0; trace_attach_device_to_domain(dev); return 0; } /** * iommu_attach_device - Attach an IOMMU domain to a device * @domain: IOMMU domain to attach * @dev: Device that will be attached * * Returns 0 on success and error code on failure * * Note that EINVAL can be treated as a soft failure, indicating * that certain configuration of the domain is incompatible with * the device. In this case attaching a different domain to the * device may succeed. */ int iommu_attach_device(struct iommu_domain *domain, struct device *dev) { /* Caller must be a probed driver on dev */ struct iommu_group *group = dev->iommu_group; int ret; if (!group) return -ENODEV; /* * Lock the group to make sure the device-count doesn't * change while we are attaching */ mutex_lock(&group->mutex); ret = -EINVAL; if (list_count_nodes(&group->devices) != 1) goto out_unlock; ret = __iommu_attach_group(domain, group); out_unlock: mutex_unlock(&group->mutex); return ret; } EXPORT_SYMBOL_GPL(iommu_attach_device); int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain) { if (dev->iommu && dev->iommu->attach_deferred) return __iommu_attach_device(domain, dev); return 0; } void iommu_detach_device(struct iommu_domain *domain, struct device *dev) { /* Caller must be a probed driver on dev */ struct iommu_group *group = dev->iommu_group; if (!group) return; mutex_lock(&group->mutex); if (WARN_ON(domain != group->domain) || WARN_ON(list_count_nodes(&group->devices) != 1)) goto out_unlock; __iommu_group_set_core_domain(group); out_unlock: mutex_unlock(&group->mutex); } EXPORT_SYMBOL_GPL(iommu_detach_device); struct iommu_domain *iommu_get_domain_for_dev(struct device *dev) { /* Caller must be a probed driver on dev */ struct iommu_group *group = dev->iommu_group; if (!group) return NULL; return group->domain; } EXPORT_SYMBOL_GPL(iommu_get_domain_for_dev); /* * For IOMMU_DOMAIN_DMA implementations which already provide their own * guarantees that the group and its default domain are valid and correct. */ struct iommu_domain *iommu_get_dma_domain(struct device *dev) { return dev->iommu_group->default_domain; } static void *iommu_make_pasid_array_entry(struct iommu_domain *domain, struct iommu_attach_handle *handle) { if (handle) { handle->domain = domain; return xa_tag_pointer(handle, IOMMU_PASID_ARRAY_HANDLE); } return xa_tag_pointer(domain, IOMMU_PASID_ARRAY_DOMAIN); } static bool domain_iommu_ops_compatible(const struct iommu_ops *ops, struct iommu_domain *domain) { if (domain->owner == ops) return true; /* For static domains, owner isn't set. */ if (domain == ops->blocked_domain || domain == ops->identity_domain) return true; return false; } static int __iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group) { struct device *dev; if (group->domain && group->domain != group->default_domain && group->domain != group->blocking_domain) return -EBUSY; dev = iommu_group_first_dev(group); if (!dev_has_iommu(dev) || !domain_iommu_ops_compatible(dev_iommu_ops(dev), domain)) return -EINVAL; return __iommu_group_set_domain(group, domain); } /** * iommu_attach_group - Attach an IOMMU domain to an IOMMU group * @domain: IOMMU domain to attach * @group: IOMMU group that will be attached * * Returns 0 on success and error code on failure * * Note that EINVAL can be treated as a soft failure, indicating * that certain configuration of the domain is incompatible with * the group. In this case attaching a different domain to the * group may succeed. */ int iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group) { int ret; mutex_lock(&group->mutex); ret = __iommu_attach_group(domain, group); mutex_unlock(&group->mutex); return ret; } EXPORT_SYMBOL_GPL(iommu_attach_group); static int __iommu_device_set_domain(struct iommu_group *group, struct device *dev, struct iommu_domain *new_domain, unsigned int flags) { int ret; /* * If the device requires IOMMU_RESV_DIRECT then we cannot allow * the blocking domain to be attached as it does not contain the * required 1:1 mapping. This test effectively excludes the device * being used with iommu_group_claim_dma_owner() which will block * vfio and iommufd as well. */ if (dev->iommu->require_direct && (new_domain->type == IOMMU_DOMAIN_BLOCKED || new_domain == group->blocking_domain)) { dev_warn(dev, "Firmware has requested this device have a 1:1 IOMMU mapping, rejecting configuring the device without a 1:1 mapping. Contact your platform vendor.\n"); return -EINVAL; } if (dev->iommu->attach_deferred) { if (new_domain == group->default_domain) return 0; dev->iommu->attach_deferred = 0; } ret = __iommu_attach_device(new_domain, dev); if (ret) { /* * If we have a blocking domain then try to attach that in hopes * of avoiding a UAF. Modern drivers should implement blocking * domains as global statics that cannot fail. */ if ((flags & IOMMU_SET_DOMAIN_MUST_SUCCEED) && group->blocking_domain && group->blocking_domain != new_domain) __iommu_attach_device(group->blocking_domain, dev); return ret; } return 0; } /* * If 0 is returned the group's domain is new_domain. If an error is returned * then the group's domain will be set back to the existing domain unless * IOMMU_SET_DOMAIN_MUST_SUCCEED, otherwise an error is returned and the group's * domains is left inconsistent. This is a driver bug to fail attach with a * previously good domain. We try to avoid a kernel UAF because of this. * * IOMMU groups are really the natural working unit of the IOMMU, but the IOMMU * API works on domains and devices. Bridge that gap by iterating over the * devices in a group. Ideally we'd have a single device which represents the * requestor ID of the group, but we also allow IOMMU drivers to create policy * defined minimum sets, where the physical hardware may be able to distiguish * members, but we wish to group them at a higher level (ex. untrusted * multi-function PCI devices). Thus we attach each device. */ static int __iommu_group_set_domain_internal(struct iommu_group *group, struct iommu_domain *new_domain, unsigned int flags) { struct group_device *last_gdev; struct group_device *gdev; int result; int ret; lockdep_assert_held(&group->mutex); if (group->domain == new_domain) return 0; if (WARN_ON(!new_domain)) return -EINVAL; /* * Changing the domain is done by calling attach_dev() on the new * domain. This switch does not have to be atomic and DMA can be * discarded during the transition. DMA must only be able to access * either new_domain or group->domain, never something else. */ result = 0; for_each_group_device(group, gdev) { ret = __iommu_device_set_domain(group, gdev->dev, new_domain, flags); if (ret) { result = ret; /* * Keep trying the other devices in the group. If a * driver fails attach to an otherwise good domain, and * does not support blocking domains, it should at least * drop its reference on the current domain so we don't * UAF. */ if (flags & IOMMU_SET_DOMAIN_MUST_SUCCEED) continue; goto err_revert; } } group->domain = new_domain; return result; err_revert: /* * This is called in error unwind paths. A well behaved driver should * always allow us to attach to a domain that was already attached. */ last_gdev = gdev; for_each_group_device(group, gdev) { /* * A NULL domain can happen only for first probe, in which case * we leave group->domain as NULL and let release clean * everything up. */ if (group->domain) WARN_ON(__iommu_device_set_domain( group, gdev->dev, group->domain, IOMMU_SET_DOMAIN_MUST_SUCCEED)); if (gdev == last_gdev) break; } return ret; } void iommu_detach_group(struct iommu_domain *domain, struct iommu_group *group) { mutex_lock(&group->mutex); __iommu_group_set_core_domain(group); mutex_unlock(&group->mutex); } EXPORT_SYMBOL_GPL(iommu_detach_group); phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) { if (domain->type == IOMMU_DOMAIN_IDENTITY) return iova; if (domain->type == IOMMU_DOMAIN_BLOCKED) return 0; return domain->ops->iova_to_phys(domain, iova); } EXPORT_SYMBOL_GPL(iommu_iova_to_phys); static size_t iommu_pgsize(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, size_t *count) { unsigned int pgsize_idx, pgsize_idx_next; unsigned long pgsizes; size_t offset, pgsize, pgsize_next; size_t offset_end; unsigned long addr_merge = paddr | iova; /* Page sizes supported by the hardware and small enough for @size */ pgsizes = domain->pgsize_bitmap & GENMASK(__fls(size), 0); /* Constrain the page sizes further based on the maximum alignment */ if (likely(addr_merge)) pgsizes &= GENMASK(__ffs(addr_merge), 0); /* Make sure we have at least one suitable page size */ BUG_ON(!pgsizes); /* Pick the biggest page size remaining */ pgsize_idx = __fls(pgsizes); pgsize = BIT(pgsize_idx); if (!count) return pgsize; /* Find the next biggest support page size, if it exists */ pgsizes = domain->pgsize_bitmap & ~GENMASK(pgsize_idx, 0); if (!pgsizes) goto out_set_count; pgsize_idx_next = __ffs(pgsizes); pgsize_next = BIT(pgsize_idx_next); /* * There's no point trying a bigger page size unless the virtual * and physical addresses are similarly offset within the larger page. */ if ((iova ^ paddr) & (pgsize_next - 1)) goto out_set_count; /* Calculate the offset to the next page size alignment boundary */ offset = pgsize_next - (addr_merge & (pgsize_next - 1)); /* * If size is big enough to accommodate the larger page, reduce * the number of smaller pages. */ if (!check_add_overflow(offset, pgsize_next, &offset_end) && offset_end <= size) size = offset; out_set_count: *count = size >> pgsize_idx; return pgsize; } int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp) { const struct iommu_domain_ops *ops = domain->ops; unsigned long orig_iova = iova; unsigned int min_pagesz; size_t orig_size = size; phys_addr_t orig_paddr = paddr; int ret = 0; might_sleep_if(gfpflags_allow_blocking(gfp)); if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING))) return -EINVAL; if (WARN_ON(!ops->map_pages || domain->pgsize_bitmap == 0UL)) return -ENODEV; /* Discourage passing strange GFP flags */ if (WARN_ON_ONCE(gfp & (__GFP_COMP | __GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM))) return -EINVAL; /* find out the minimum page size supported */ min_pagesz = 1 << __ffs(domain->pgsize_bitmap); /* * both the virtual address and the physical one, as well as * the size of the mapping, must be aligned (at least) to the * size of the smallest page supported by the hardware */ if (!IS_ALIGNED(iova | paddr | size, min_pagesz)) { pr_err("unaligned: iova 0x%lx pa %pa size 0x%zx min_pagesz 0x%x\n", iova, &paddr, size, min_pagesz); return -EINVAL; } pr_debug("map: iova 0x%lx pa %pa size 0x%zx\n", iova, &paddr, size); while (size) { size_t pgsize, count, mapped = 0; pgsize = iommu_pgsize(domain, iova, paddr, size, &count); pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx count %zu\n", iova, &paddr, pgsize, count); ret = ops->map_pages(domain, iova, paddr, pgsize, count, prot, gfp, &mapped); /* * Some pages may have been mapped, even if an error occurred, * so we should account for those so they can be unmapped. */ size -= mapped; if (ret) break; iova += mapped; paddr += mapped; } /* unroll mapping in case something went wrong */ if (ret) iommu_unmap(domain, orig_iova, orig_size - size); else trace_map(orig_iova, orig_paddr, orig_size); return ret; } int iommu_sync_map(struct iommu_domain *domain, unsigned long iova, size_t size) { const struct iommu_domain_ops *ops = domain->ops; if (!ops->iotlb_sync_map) return 0; return ops->iotlb_sync_map(domain, iova, size); } int iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp) { int ret; ret = iommu_map_nosync(domain, iova, paddr, size, prot, gfp); if (ret) return ret; ret = iommu_sync_map(domain, iova, size); if (ret) iommu_unmap(domain, iova, size); return ret; } EXPORT_SYMBOL_GPL(iommu_map); static size_t __iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *iotlb_gather) { const struct iommu_domain_ops *ops = domain->ops; size_t unmapped_page, unmapped = 0; unsigned long orig_iova = iova; unsigned int min_pagesz; if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING))) return 0; if (WARN_ON(!ops->unmap_pages || domain->pgsize_bitmap == 0UL)) return 0; /* find out the minimum page size supported */ min_pagesz = 1 << __ffs(domain->pgsize_bitmap); /* * The virtual address, as well as the size of the mapping, must be * aligned (at least) to the size of the smallest page supported * by the hardware */ if (!IS_ALIGNED(iova | size, min_pagesz)) { pr_err("unaligned: iova 0x%lx size 0x%zx min_pagesz 0x%x\n", iova, size, min_pagesz); return 0; } pr_debug("unmap this: iova 0x%lx size 0x%zx\n", iova, size); /* * Keep iterating until we either unmap 'size' bytes (or more) * or we hit an area that isn't mapped. */ while (unmapped < size) { size_t pgsize, count; pgsize = iommu_pgsize(domain, iova, iova, size - unmapped, &count); unmapped_page = ops->unmap_pages(domain, iova, pgsize, count, iotlb_gather); if (!unmapped_page) break; pr_debug("unmapped: iova 0x%lx size 0x%zx\n", iova, unmapped_page); iova += unmapped_page; unmapped += unmapped_page; } trace_unmap(orig_iova, size, unmapped); return unmapped; } /** * iommu_unmap() - Remove mappings from a range of IOVA * @domain: Domain to manipulate * @iova: IO virtual address to start * @size: Length of the range starting from @iova * * iommu_unmap() will remove a translation created by iommu_map(). It cannot * subdivide a mapping created by iommu_map(), so it should be called with IOVA * ranges that match what was passed to iommu_map(). The range can aggregate * contiguous iommu_map() calls so long as no individual range is split. * * Returns: Number of bytes of IOVA unmapped. iova + res will be the point * unmapping stopped. */ size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size) { struct iommu_iotlb_gather iotlb_gather; size_t ret; iommu_iotlb_gather_init(&iotlb_gather); ret = __iommu_unmap(domain, iova, size, &iotlb_gather); iommu_iotlb_sync(domain, &iotlb_gather); return ret; } EXPORT_SYMBOL_GPL(iommu_unmap); /** * iommu_unmap_fast() - Remove mappings from a range of IOVA without IOTLB sync * @domain: Domain to manipulate * @iova: IO virtual address to start * @size: Length of the range starting from @iova * @iotlb_gather: range information for a pending IOTLB flush * * iommu_unmap_fast() will remove a translation created by iommu_map(). * It can't subdivide a mapping created by iommu_map(), so it should be * called with IOVA ranges that match what was passed to iommu_map(). The * range can aggregate contiguous iommu_map() calls so long as no individual * range is split. * * Basically iommu_unmap_fast() is the same as iommu_unmap() but for callers * which manage the IOTLB flushing externally to perform a batched sync. * * Returns: Number of bytes of IOVA unmapped. iova + res will be the point * unmapping stopped. */ size_t iommu_unmap_fast(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *iotlb_gather) { return __iommu_unmap(domain, iova, size, iotlb_gather); } EXPORT_SYMBOL_GPL(iommu_unmap_fast); ssize_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, struct scatterlist *sg, unsigned int nents, int prot, gfp_t gfp) { size_t len = 0, mapped = 0; phys_addr_t start; unsigned int i = 0; int ret; while (i <= nents) { phys_addr_t s_phys = sg_phys(sg); if (len && s_phys != start + len) { ret = iommu_map_nosync(domain, iova + mapped, start, len, prot, gfp); if (ret) goto out_err; mapped += len; len = 0; } if (sg_dma_is_bus_address(sg)) goto next; if (len) { len += sg->length; } else { len = sg->length; start = s_phys; } next: if (++i < nents) sg = sg_next(sg); } ret = iommu_sync_map(domain, iova, mapped); if (ret) goto out_err; return mapped; out_err: /* undo mappings already done */ iommu_unmap(domain, iova, mapped); return ret; } EXPORT_SYMBOL_GPL(iommu_map_sg); /** * report_iommu_fault() - report about an IOMMU fault to the IOMMU framework * @domain: the iommu domain where the fault has happened * @dev: the device where the fault has happened * @iova: the faulting address * @flags: mmu fault flags (e.g. IOMMU_FAULT_READ/IOMMU_FAULT_WRITE/...) * * This function should be called by the low-level IOMMU implementations * whenever IOMMU faults happen, to allow high-level users, that are * interested in such events, to know about them. * * This event may be useful for several possible use cases: * - mere logging of the event * - dynamic TLB/PTE loading * - if restarting of the faulting device is required * * Returns 0 on success and an appropriate error code otherwise (if dynamic * PTE/TLB loading will one day be supported, implementations will be able * to tell whether it succeeded or not according to this return value). * * Specifically, -ENOSYS is returned if a fault handler isn't installed * (though fault handlers can also return -ENOSYS, in case they want to * elicit the default behavior of the IOMMU drivers). */ int report_iommu_fault(struct iommu_domain *domain, struct device *dev, unsigned long iova, int flags) { int ret = -ENOSYS; /* * if upper layers showed interest and installed a fault handler, * invoke it. */ if (domain->cookie_type == IOMMU_COOKIE_FAULT_HANDLER && domain->handler) ret = domain->handler(domain, dev, iova, flags, domain->handler_token); trace_io_page_fault(dev, iova, flags); return ret; } EXPORT_SYMBOL_GPL(report_iommu_fault); static int __init iommu_init(void) { iommu_group_kset = kset_create_and_add("iommu_groups", NULL, kernel_kobj); BUG_ON(!iommu_group_kset); iommu_debugfs_setup(); return 0; } core_initcall(iommu_init); int iommu_set_pgtable_quirks(struct iommu_domain *domain, unsigned long quirk) { if (domain->type != IOMMU_DOMAIN_UNMANAGED) return -EINVAL; if (!domain->ops->set_pgtable_quirks) return -EINVAL; return domain->ops->set_pgtable_quirks(domain, quirk); } EXPORT_SYMBOL_GPL(iommu_set_pgtable_quirks); /** * iommu_get_resv_regions - get reserved regions * @dev: device for which to get reserved regions * @list: reserved region list for device * * This returns a list of reserved IOVA regions specific to this device. * A domain user should not map IOVA in these ranges. */ void iommu_get_resv_regions(struct device *dev, struct list_head *list) { const struct iommu_ops *ops = dev_iommu_ops(dev); if (ops->get_resv_regions) ops->get_resv_regions(dev, list); } EXPORT_SYMBOL_GPL(iommu_get_resv_regions); /** * iommu_put_resv_regions - release reserved regions * @dev: device for which to free reserved regions * @list: reserved region list for device * * This releases a reserved region list acquired by iommu_get_resv_regions(). */ void iommu_put_resv_regions(struct device *dev, struct list_head *list) { struct iommu_resv_region *entry, *next; list_for_each_entry_safe(entry, next, list, list) { if (entry->free) entry->free(dev, entry); else kfree(entry); } } EXPORT_SYMBOL(iommu_put_resv_regions); struct iommu_resv_region *iommu_alloc_resv_region(phys_addr_t start, size_t length, int prot, enum iommu_resv_type type, gfp_t gfp) { struct iommu_resv_region *region; region = kzalloc(sizeof(*region), gfp); if (!region) return NULL; INIT_LIST_HEAD(®ion->list); region->start = start; region->length = length; region->prot = prot; region->type = type; return region; } EXPORT_SYMBOL_GPL(iommu_alloc_resv_region); void iommu_set_default_passthrough(bool cmd_line) { if (cmd_line) iommu_cmd_line |= IOMMU_CMD_LINE_DMA_API; iommu_def_domain_type = IOMMU_DOMAIN_IDENTITY; } void iommu_set_default_translated(bool cmd_line) { if (cmd_line) iommu_cmd_line |= IOMMU_CMD_LINE_DMA_API; iommu_def_domain_type = IOMMU_DOMAIN_DMA; } bool iommu_default_passthrough(void) { return iommu_def_domain_type == IOMMU_DOMAIN_IDENTITY; } EXPORT_SYMBOL_GPL(iommu_default_passthrough); static const struct iommu_device *iommu_from_fwnode(const struct fwnode_handle *fwnode) { const struct iommu_device *iommu, *ret = NULL; spin_lock(&iommu_device_lock); list_for_each_entry(iommu, &iommu_device_list, list) if (iommu->fwnode == fwnode) { ret = iommu; break; } spin_unlock(&iommu_device_lock); return ret; } const struct iommu_ops *iommu_ops_from_fwnode(const struct fwnode_handle *fwnode) { const struct iommu_device *iommu = iommu_from_fwnode(fwnode); return iommu ? iommu->ops : NULL; } int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode) { const struct iommu_device *iommu = iommu_from_fwnode(iommu_fwnode); struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); if (!iommu) return driver_deferred_probe_check_state(dev); if (!dev->iommu && !READ_ONCE(iommu->ready)) return -EPROBE_DEFER; if (fwspec) return iommu->ops == iommu_fwspec_ops(fwspec) ? 0 : -EINVAL; if (!dev_iommu_get(dev)) return -ENOMEM; /* Preallocate for the overwhelmingly common case of 1 ID */ fwspec = kzalloc(struct_size(fwspec, ids, 1), GFP_KERNEL); if (!fwspec) return -ENOMEM; fwnode_handle_get(iommu_fwnode); fwspec->iommu_fwnode = iommu_fwnode; dev_iommu_fwspec_set(dev, fwspec); return 0; } EXPORT_SYMBOL_GPL(iommu_fwspec_init); void iommu_fwspec_free(struct device *dev) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); if (fwspec) { fwnode_handle_put(fwspec->iommu_fwnode); kfree(fwspec); dev_iommu_fwspec_set(dev, NULL); } } int iommu_fwspec_add_ids(struct device *dev, const u32 *ids, int num_ids) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); int i, new_num; if (!fwspec) return -EINVAL; new_num = fwspec->num_ids + num_ids; if (new_num > 1) { fwspec = krealloc(fwspec, struct_size(fwspec, ids, new_num), GFP_KERNEL); if (!fwspec) return -ENOMEM; dev_iommu_fwspec_set(dev, fwspec); } for (i = 0; i < num_ids; i++) fwspec->ids[fwspec->num_ids + i] = ids[i]; fwspec->num_ids = new_num; return 0; } EXPORT_SYMBOL_GPL(iommu_fwspec_add_ids); /** * iommu_setup_default_domain - Set the default_domain for the group * @group: Group to change * @target_type: Domain type to set as the default_domain * * Allocate a default domain and set it as the current domain on the group. If * the group already has a default domain it will be changed to the target_type. * When target_type is 0 the default domain is selected based on driver and * system preferences. */ static int iommu_setup_default_domain(struct iommu_group *group, int target_type) { struct iommu_domain *old_dom = group->default_domain; struct group_device *gdev; struct iommu_domain *dom; bool direct_failed; int req_type; int ret; lockdep_assert_held(&group->mutex); req_type = iommu_get_default_domain_type(group, target_type); if (req_type < 0) return -EINVAL; dom = iommu_group_alloc_default_domain(group, req_type); if (IS_ERR(dom)) return PTR_ERR(dom); if (group->default_domain == dom) return 0; if (iommu_is_dma_domain(dom)) { ret = iommu_get_dma_cookie(dom); if (ret) { iommu_domain_free(dom); return ret; } } /* * IOMMU_RESV_DIRECT and IOMMU_RESV_DIRECT_RELAXABLE regions must be * mapped before their device is attached, in order to guarantee * continuity with any FW activity */ direct_failed = false; for_each_group_device(group, gdev) { if (iommu_create_device_direct_mappings(dom, gdev->dev)) { direct_failed = true; dev_warn_once( gdev->dev->iommu->iommu_dev->dev, "IOMMU driver was not able to establish FW requested direct mapping."); } } /* We must set default_domain early for __iommu_device_set_domain */ group->default_domain = dom; if (!group->domain) { /* * Drivers are not allowed to fail the first domain attach. * The only way to recover from this is to fail attaching the * iommu driver and call ops->release_device. Put the domain * in group->default_domain so it is freed after. */ ret = __iommu_group_set_domain_internal( group, dom, IOMMU_SET_DOMAIN_MUST_SUCCEED); if (WARN_ON(ret)) goto out_free_old; } else { ret = __iommu_group_set_domain(group, dom); if (ret) goto err_restore_def_domain; } /* * Drivers are supposed to allow mappings to be installed in a domain * before device attachment, but some don't. Hack around this defect by * trying again after attaching. If this happens it means the device * will not continuously have the IOMMU_RESV_DIRECT map. */ if (direct_failed) { for_each_group_device(group, gdev) { ret = iommu_create_device_direct_mappings(dom, gdev->dev); if (ret) goto err_restore_domain; } } out_free_old: if (old_dom) iommu_domain_free(old_dom); return ret; err_restore_domain: if (old_dom) __iommu_group_set_domain_internal( group, old_dom, IOMMU_SET_DOMAIN_MUST_SUCCEED); err_restore_def_domain: if (old_dom) { iommu_domain_free(dom); group->default_domain = old_dom; } return ret; } /* * Changing the default domain through sysfs requires the users to unbind the * drivers from the devices in the iommu group, except for a DMA -> DMA-FQ * transition. Return failure if this isn't met. * * We need to consider the race between this and the device release path. * group->mutex is used here to guarantee that the device release path * will not be entered at the same time. */ static ssize_t iommu_group_store_type(struct iommu_group *group, const char *buf, size_t count) { struct group_device *gdev; int ret, req_type; if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RAWIO)) return -EACCES; if (WARN_ON(!group) || !group->default_domain) return -EINVAL; if (sysfs_streq(buf, "identity")) req_type = IOMMU_DOMAIN_IDENTITY; else if (sysfs_streq(buf, "DMA")) req_type = IOMMU_DOMAIN_DMA; else if (sysfs_streq(buf, "DMA-FQ")) req_type = IOMMU_DOMAIN_DMA_FQ; else if (sysfs_streq(buf, "auto")) req_type = 0; else return -EINVAL; mutex_lock(&group->mutex); /* We can bring up a flush queue without tearing down the domain. */ if (req_type == IOMMU_DOMAIN_DMA_FQ && group->default_domain->type == IOMMU_DOMAIN_DMA) { ret = iommu_dma_init_fq(group->default_domain); if (ret) goto out_unlock; group->default_domain->type = IOMMU_DOMAIN_DMA_FQ; ret = count; goto out_unlock; } /* Otherwise, ensure that device exists and no driver is bound. */ if (list_empty(&group->devices) || group->owner_cnt) { ret = -EPERM; goto out_unlock; } ret = iommu_setup_default_domain(group, req_type); if (ret) goto out_unlock; /* Make sure dma_ops is appropriatley set */ for_each_group_device(group, gdev) iommu_setup_dma_ops(gdev->dev); out_unlock: mutex_unlock(&group->mutex); return ret ?: count; } /** * iommu_device_use_default_domain() - Device driver wants to handle device * DMA through the kernel DMA API. * @dev: The device. * * The device driver about to bind @dev wants to do DMA through the kernel * DMA API. Return 0 if it is allowed, otherwise an error. */ int iommu_device_use_default_domain(struct device *dev) { /* Caller is the driver core during the pre-probe path */ struct iommu_group *group = dev->iommu_group; int ret = 0; if (!group) return 0; mutex_lock(&group->mutex); /* We may race against bus_iommu_probe() finalising groups here */ if (!group->default_domain) { ret = -EPROBE_DEFER; goto unlock_out; } if (group->owner_cnt) { if (group->domain != group->default_domain || group->owner || !xa_empty(&group->pasid_array)) { ret = -EBUSY; goto unlock_out; } } group->owner_cnt++; unlock_out: mutex_unlock(&group->mutex); return ret; } /** * iommu_device_unuse_default_domain() - Device driver stops handling device * DMA through the kernel DMA API. * @dev: The device. * * The device driver doesn't want to do DMA through kernel DMA API anymore. * It must be called after iommu_device_use_default_domain(). */ void iommu_device_unuse_default_domain(struct device *dev) { /* Caller is the driver core during the post-probe path */ struct iommu_group *group = dev->iommu_group; if (!group) return; mutex_lock(&group->mutex); if (!WARN_ON(!group->owner_cnt || !xa_empty(&group->pasid_array))) group->owner_cnt--; mutex_unlock(&group->mutex); } static int __iommu_group_alloc_blocking_domain(struct iommu_group *group) { struct device *dev = iommu_group_first_dev(group); const struct iommu_ops *ops = dev_iommu_ops(dev); struct iommu_domain *domain; if (group->blocking_domain) return 0; if (ops->blocked_domain) { group->blocking_domain = ops->blocked_domain; return 0; } /* * For drivers that do not yet understand IOMMU_DOMAIN_BLOCKED create an * empty PAGING domain instead. */ domain = iommu_paging_domain_alloc(dev); if (IS_ERR(domain)) return PTR_ERR(domain); group->blocking_domain = domain; return 0; } static int __iommu_take_dma_ownership(struct iommu_group *group, void *owner) { int ret; if ((group->domain && group->domain != group->default_domain) || !xa_empty(&group->pasid_array)) return -EBUSY; ret = __iommu_group_alloc_blocking_domain(group); if (ret) return ret; ret = __iommu_group_set_domain(group, group->blocking_domain); if (ret) return ret; group->owner = owner; group->owner_cnt++; return 0; } /** * iommu_group_claim_dma_owner() - Set DMA ownership of a group * @group: The group. * @owner: Caller specified pointer. Used for exclusive ownership. * * This is to support backward compatibility for vfio which manages the dma * ownership in iommu_group level. New invocations on this interface should be * prohibited. Only a single owner may exist for a group. */ int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner) { int ret = 0; if (WARN_ON(!owner)) return -EINVAL; mutex_lock(&group->mutex); if (group->owner_cnt) { ret = -EPERM; goto unlock_out; } ret = __iommu_take_dma_ownership(group, owner); unlock_out: mutex_unlock(&group->mutex); return ret; } EXPORT_SYMBOL_GPL(iommu_group_claim_dma_owner); /** * iommu_device_claim_dma_owner() - Set DMA ownership of a device * @dev: The device. * @owner: Caller specified pointer. Used for exclusive ownership. * * Claim the DMA ownership of a device. Multiple devices in the same group may * concurrently claim ownership if they present the same owner value. Returns 0 * on success and error code on failure */ int iommu_device_claim_dma_owner(struct device *dev, void *owner) { /* Caller must be a probed driver on dev */ struct iommu_group *group = dev->iommu_group; int ret = 0; if (WARN_ON(!owner)) return -EINVAL; if (!group) return -ENODEV; mutex_lock(&group->mutex); if (group->owner_cnt) { if (group->owner != owner) { ret = -EPERM; goto unlock_out; } group->owner_cnt++; goto unlock_out; } ret = __iommu_take_dma_ownership(group, owner); unlock_out: mutex_unlock(&group->mutex); return ret; } EXPORT_SYMBOL_GPL(iommu_device_claim_dma_owner); static void __iommu_release_dma_ownership(struct iommu_group *group) { if (WARN_ON(!group->owner_cnt || !group->owner || !xa_empty(&group->pasid_array))) return; group->owner_cnt = 0; group->owner = NULL; __iommu_group_set_domain_nofail(group, group->default_domain); } /** * iommu_group_release_dma_owner() - Release DMA ownership of a group * @group: The group * * Release the DMA ownership claimed by iommu_group_claim_dma_owner(). */ void iommu_group_release_dma_owner(struct iommu_group *group) { mutex_lock(&group->mutex); __iommu_release_dma_ownership(group); mutex_unlock(&group->mutex); } EXPORT_SYMBOL_GPL(iommu_group_release_dma_owner); /** * iommu_device_release_dma_owner() - Release DMA ownership of a device * @dev: The device. * * Release the DMA ownership claimed by iommu_device_claim_dma_owner(). */ void iommu_device_release_dma_owner(struct device *dev) { /* Caller must be a probed driver on dev */ struct iommu_group *group = dev->iommu_group; mutex_lock(&group->mutex); if (group->owner_cnt > 1) group->owner_cnt--; else __iommu_release_dma_ownership(group); mutex_unlock(&group->mutex); } EXPORT_SYMBOL_GPL(iommu_device_release_dma_owner); /** * iommu_group_dma_owner_claimed() - Query group dma ownership status * @group: The group. * * This provides status query on a given group. It is racy and only for * non-binding status reporting. */ bool iommu_group_dma_owner_claimed(struct iommu_group *group) { unsigned int user; mutex_lock(&group->mutex); user = group->owner_cnt; mutex_unlock(&group->mutex); return user; } EXPORT_SYMBOL_GPL(iommu_group_dma_owner_claimed); static void iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, struct iommu_domain *domain) { const struct iommu_ops *ops = dev_iommu_ops(dev); struct iommu_domain *blocked_domain = ops->blocked_domain; WARN_ON(blocked_domain->ops->set_dev_pasid(blocked_domain, dev, pasid, domain)); } static int __iommu_set_group_pasid(struct iommu_domain *domain, struct iommu_group *group, ioasid_t pasid, struct iommu_domain *old) { struct group_device *device, *last_gdev; int ret; for_each_group_device(group, device) { if (device->dev->iommu->max_pasids > 0) { ret = domain->ops->set_dev_pasid(domain, device->dev, pasid, old); if (ret) goto err_revert; } } return 0; err_revert: last_gdev = device; for_each_group_device(group, device) { if (device == last_gdev) break; if (device->dev->iommu->max_pasids > 0) { /* * If no old domain, undo the succeeded devices/pasid. * Otherwise, rollback the succeeded devices/pasid to * the old domain. And it is a driver bug to fail * attaching with a previously good domain. */ if (!old || WARN_ON(old->ops->set_dev_pasid(old, device->dev, pasid, domain))) iommu_remove_dev_pasid(device->dev, pasid, domain); } } return ret; } static void __iommu_remove_group_pasid(struct iommu_group *group, ioasid_t pasid, struct iommu_domain *domain) { struct group_device *device; for_each_group_device(group, device) { if (device->dev->iommu->max_pasids > 0) iommu_remove_dev_pasid(device->dev, pasid, domain); } } /* * iommu_attach_device_pasid() - Attach a domain to pasid of device * @domain: the iommu domain. * @dev: the attached device. * @pasid: the pasid of the device. * @handle: the attach handle. * * Caller should always provide a new handle to avoid race with the paths * that have lockless reference to handle if it intends to pass a valid handle. * * Return: 0 on success, or an error. */ int iommu_attach_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid, struct iommu_attach_handle *handle) { /* Caller must be a probed driver on dev */ struct iommu_group *group = dev->iommu_group; struct group_device *device; const struct iommu_ops *ops; void *entry; int ret; if (!group) return -ENODEV; ops = dev_iommu_ops(dev); if (!domain->ops->set_dev_pasid || !ops->blocked_domain || !ops->blocked_domain->ops->set_dev_pasid) return -EOPNOTSUPP; if (!domain_iommu_ops_compatible(ops, domain) || pasid == IOMMU_NO_PASID) return -EINVAL; mutex_lock(&group->mutex); for_each_group_device(group, device) { /* * Skip PASID validation for devices without PASID support * (max_pasids = 0). These devices cannot issue transactions * with PASID, so they don't affect group's PASID usage. */ if ((device->dev->iommu->max_pasids > 0) && (pasid >= device->dev->iommu->max_pasids)) { ret = -EINVAL; goto out_unlock; } } entry = iommu_make_pasid_array_entry(domain, handle); /* * Entry present is a failure case. Use xa_insert() instead of * xa_reserve(). */ ret = xa_insert(&group->pasid_array, pasid, XA_ZERO_ENTRY, GFP_KERNEL); if (ret) goto out_unlock; ret = __iommu_set_group_pasid(domain, group, pasid, NULL); if (ret) { xa_release(&group->pasid_array, pasid); goto out_unlock; } /* * The xa_insert() above reserved the memory, and the group->mutex is * held, this cannot fail. The new domain cannot be visible until the * operation succeeds as we cannot tolerate PRIs becoming concurrently * queued and then failing attach. */ WARN_ON(xa_is_err(xa_store(&group->pasid_array, pasid, entry, GFP_KERNEL))); out_unlock: mutex_unlock(&group->mutex); return ret; } EXPORT_SYMBOL_GPL(iommu_attach_device_pasid); /** * iommu_replace_device_pasid - Replace the domain that a specific pasid * of the device is attached to * @domain: the new iommu domain * @dev: the attached device. * @pasid: the pasid of the device. * @handle: the attach handle. * * This API allows the pasid to switch domains. The @pasid should have been * attached. Otherwise, this fails. The pasid will keep the old configuration * if replacement failed. * * Caller should always provide a new handle to avoid race with the paths * that have lockless reference to handle if it intends to pass a valid handle. * * Return 0 on success, or an error. */ int iommu_replace_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid, struct iommu_attach_handle *handle) { /* Caller must be a probed driver on dev */ struct iommu_group *group = dev->iommu_group; struct iommu_attach_handle *entry; struct iommu_domain *curr_domain; void *curr; int ret; if (!group) return -ENODEV; if (!domain->ops->set_dev_pasid) return -EOPNOTSUPP; if (!domain_iommu_ops_compatible(dev_iommu_ops(dev), domain) || pasid == IOMMU_NO_PASID || !handle) return -EINVAL; mutex_lock(&group->mutex); entry = iommu_make_pasid_array_entry(domain, handle); curr = xa_cmpxchg(&group->pasid_array, pasid, NULL, XA_ZERO_ENTRY, GFP_KERNEL); if (xa_is_err(curr)) { ret = xa_err(curr); goto out_unlock; } /* * No domain (with or without handle) attached, hence not * a replace case. */ if (!curr) { xa_release(&group->pasid_array, pasid); ret = -EINVAL; goto out_unlock; } /* * Reusing handle is problematic as there are paths that refers * the handle without lock. To avoid race, reject the callers that * attempt it. */ if (curr == entry) { WARN_ON(1); ret = -EINVAL; goto out_unlock; } curr_domain = pasid_array_entry_to_domain(curr); ret = 0; if (curr_domain != domain) { ret = __iommu_set_group_pasid(domain, group, pasid, curr_domain); if (ret) goto out_unlock; } /* * The above xa_cmpxchg() reserved the memory, and the * group->mutex is held, this cannot fail. */ WARN_ON(xa_is_err(xa_store(&group->pasid_array, pasid, entry, GFP_KERNEL))); out_unlock: mutex_unlock(&group->mutex); return ret; } EXPORT_SYMBOL_NS_GPL(iommu_replace_device_pasid, "IOMMUFD_INTERNAL"); /* * iommu_detach_device_pasid() - Detach the domain from pasid of device * @domain: the iommu domain. * @dev: the attached device. * @pasid: the pasid of the device. * * The @domain must have been attached to @pasid of the @dev with * iommu_attach_device_pasid(). */ void iommu_detach_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid) { /* Caller must be a probed driver on dev */ struct iommu_group *group = dev->iommu_group; mutex_lock(&group->mutex); __iommu_remove_group_pasid(group, pasid, domain); xa_erase(&group->pasid_array, pasid); mutex_unlock(&group->mutex); } EXPORT_SYMBOL_GPL(iommu_detach_device_pasid); ioasid_t iommu_alloc_global_pasid(struct device *dev) { int ret; /* max_pasids == 0 means that the device does not support PASID */ if (!dev->iommu->max_pasids) return IOMMU_PASID_INVALID; /* * max_pasids is set up by vendor driver based on number of PASID bits * supported but the IDA allocation is inclusive. */ ret = ida_alloc_range(&iommu_global_pasid_ida, IOMMU_FIRST_GLOBAL_PASID, dev->iommu->max_pasids - 1, GFP_KERNEL); return ret < 0 ? IOMMU_PASID_INVALID : ret; } EXPORT_SYMBOL_GPL(iommu_alloc_global_pasid); void iommu_free_global_pasid(ioasid_t pasid) { if (WARN_ON(pasid == IOMMU_PASID_INVALID)) return; ida_free(&iommu_global_pasid_ida, pasid); } EXPORT_SYMBOL_GPL(iommu_free_global_pasid); /** * iommu_attach_handle_get - Return the attach handle * @group: the iommu group that domain was attached to * @pasid: the pasid within the group * @type: matched domain type, 0 for any match * * Return handle or ERR_PTR(-ENOENT) on none, ERR_PTR(-EBUSY) on mismatch. * * Return the attach handle to the caller. The life cycle of an iommu attach * handle is from the time when the domain is attached to the time when the * domain is detached. Callers are required to synchronize the call of * iommu_attach_handle_get() with domain attachment and detachment. The attach * handle can only be used during its life cycle. */ struct iommu_attach_handle * iommu_attach_handle_get(struct iommu_group *group, ioasid_t pasid, unsigned int type) { struct iommu_attach_handle *handle; void *entry; xa_lock(&group->pasid_array); entry = xa_load(&group->pasid_array, pasid); if (!entry || xa_pointer_tag(entry) != IOMMU_PASID_ARRAY_HANDLE) { handle = ERR_PTR(-ENOENT); } else { handle = xa_untag_pointer(entry); if (type && handle->domain->type != type) handle = ERR_PTR(-EBUSY); } xa_unlock(&group->pasid_array); return handle; } EXPORT_SYMBOL_NS_GPL(iommu_attach_handle_get, "IOMMUFD_INTERNAL"); /** * iommu_attach_group_handle - Attach an IOMMU domain to an IOMMU group * @domain: IOMMU domain to attach * @group: IOMMU group that will be attached * @handle: attach handle * * Returns 0 on success and error code on failure. * * This is a variant of iommu_attach_group(). It allows the caller to provide * an attach handle and use it when the domain is attached. This is currently * used by IOMMUFD to deliver the I/O page faults. * * Caller should always provide a new handle to avoid race with the paths * that have lockless reference to handle. */ int iommu_attach_group_handle(struct iommu_domain *domain, struct iommu_group *group, struct iommu_attach_handle *handle) { void *entry; int ret; if (!handle) return -EINVAL; mutex_lock(&group->mutex); entry = iommu_make_pasid_array_entry(domain, handle); ret = xa_insert(&group->pasid_array, IOMMU_NO_PASID, XA_ZERO_ENTRY, GFP_KERNEL); if (ret) goto out_unlock; ret = __iommu_attach_group(domain, group); if (ret) { xa_release(&group->pasid_array, IOMMU_NO_PASID); goto out_unlock; } /* * The xa_insert() above reserved the memory, and the group->mutex is * held, this cannot fail. The new domain cannot be visible until the * operation succeeds as we cannot tolerate PRIs becoming concurrently * queued and then failing attach. */ WARN_ON(xa_is_err(xa_store(&group->pasid_array, IOMMU_NO_PASID, entry, GFP_KERNEL))); out_unlock: mutex_unlock(&group->mutex); return ret; } EXPORT_SYMBOL_NS_GPL(iommu_attach_group_handle, "IOMMUFD_INTERNAL"); /** * iommu_detach_group_handle - Detach an IOMMU domain from an IOMMU group * @domain: IOMMU domain to attach * @group: IOMMU group that will be attached * * Detach the specified IOMMU domain from the specified IOMMU group. * It must be used in conjunction with iommu_attach_group_handle(). */ void iommu_detach_group_handle(struct iommu_domain *domain, struct iommu_group *group) { mutex_lock(&group->mutex); __iommu_group_set_core_domain(group); xa_erase(&group->pasid_array, IOMMU_NO_PASID); mutex_unlock(&group->mutex); } EXPORT_SYMBOL_NS_GPL(iommu_detach_group_handle, "IOMMUFD_INTERNAL"); /** * iommu_replace_group_handle - replace the domain that a group is attached to * @group: IOMMU group that will be attached to the new domain * @new_domain: new IOMMU domain to replace with * @handle: attach handle * * This API allows the group to switch domains without being forced to go to * the blocking domain in-between. It allows the caller to provide an attach * handle for the new domain and use it when the domain is attached. * * If the currently attached domain is a core domain (e.g. a default_domain), * it will act just like the iommu_attach_group_handle(). * * Caller should always provide a new handle to avoid race with the paths * that have lockless reference to handle. */ int iommu_replace_group_handle(struct iommu_group *group, struct iommu_domain *new_domain, struct iommu_attach_handle *handle) { void *curr, *entry; int ret; if (!new_domain || !handle) return -EINVAL; mutex_lock(&group->mutex); entry = iommu_make_pasid_array_entry(new_domain, handle); ret = xa_reserve(&group->pasid_array, IOMMU_NO_PASID, GFP_KERNEL); if (ret) goto err_unlock; ret = __iommu_group_set_domain(group, new_domain); if (ret) goto err_release; curr = xa_store(&group->pasid_array, IOMMU_NO_PASID, entry, GFP_KERNEL); WARN_ON(xa_is_err(curr)); mutex_unlock(&group->mutex); return 0; err_release: xa_release(&group->pasid_array, IOMMU_NO_PASID); err_unlock: mutex_unlock(&group->mutex); return ret; } EXPORT_SYMBOL_NS_GPL(iommu_replace_group_handle, "IOMMUFD_INTERNAL"); #if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) /** * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain * @desc: MSI descriptor, will store the MSI page * @msi_addr: MSI target address to be mapped * * The implementation of sw_msi() should take msi_addr and map it to * an IOVA in the domain and call msi_desc_set_iommu_msi_iova() with the * mapping information. * * Return: 0 on success or negative error code if the mapping failed. */ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) { struct device *dev = msi_desc_to_dev(desc); struct iommu_group *group = dev->iommu_group; int ret = 0; if (!group) return 0; mutex_lock(&group->mutex); /* An IDENTITY domain must pass through */ if (group->domain && group->domain->type != IOMMU_DOMAIN_IDENTITY) { switch (group->domain->cookie_type) { case IOMMU_COOKIE_DMA_MSI: case IOMMU_COOKIE_DMA_IOVA: ret = iommu_dma_sw_msi(group->domain, desc, msi_addr); break; case IOMMU_COOKIE_IOMMUFD: ret = iommufd_sw_msi(group->domain, desc, msi_addr); break; default: ret = -EOPNOTSUPP; break; } } mutex_unlock(&group->mutex); return ret; } #endif /* CONFIG_IRQ_MSI_IOMMU */ |
| 18 18 18 17 18 18 18 18 18 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 | // SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 /******************************************************************************* * * Module Name: nssearch - Namespace search * ******************************************************************************/ #include <acpi/acpi.h> #include "accommon.h" #include "acnamesp.h" #ifdef ACPI_ASL_COMPILER #include "amlcode.h" #endif #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nssearch") /* Local prototypes */ static acpi_status acpi_ns_search_parent_tree(u32 target_name, struct acpi_namespace_node *node, acpi_object_type type, struct acpi_namespace_node **return_node); /******************************************************************************* * * FUNCTION: acpi_ns_search_one_scope * * PARAMETERS: target_name - Ascii ACPI name to search for * parent_node - Starting node where search will begin * type - Object type to match * return_node - Where the matched Named obj is returned * * RETURN: Status * * DESCRIPTION: Search a single level of the namespace. Performs a * simple search of the specified level, and does not add * entries or search parents. * * * Named object lists are built (and subsequently dumped) in the * order in which the names are encountered during the namespace load; * * All namespace searching is linear in this implementation, but * could be easily modified to support any improved search * algorithm. However, the linear search was chosen for simplicity * and because the trees are small and the other interpreter * execution overhead is relatively high. * * Note: CPU execution analysis has shown that the AML interpreter spends * a very small percentage of its time searching the namespace. Therefore, * the linear search seems to be sufficient, as there would seem to be * little value in improving the search. * ******************************************************************************/ acpi_status acpi_ns_search_one_scope(u32 target_name, struct acpi_namespace_node *parent_node, acpi_object_type type, struct acpi_namespace_node **return_node) { struct acpi_namespace_node *node; ACPI_FUNCTION_TRACE(ns_search_one_scope); #ifdef ACPI_DEBUG_OUTPUT if (ACPI_LV_NAMES & acpi_dbg_level) { char *scope_name; scope_name = acpi_ns_get_normalized_pathname(parent_node, TRUE); if (scope_name) { ACPI_DEBUG_PRINT((ACPI_DB_NAMES, "Searching %s (%p) For [%4.4s] (%s)\n", scope_name, parent_node, ACPI_CAST_PTR(char, &target_name), acpi_ut_get_type_name(type))); ACPI_FREE(scope_name); } } #endif /* * Search for name at this namespace level, which is to say that we * must search for the name among the children of this object */ node = parent_node->child; while (node) { /* Check for match against the name */ if (node->name.integer == target_name) { /* Resolve a control method alias if any */ if (acpi_ns_get_type(node) == ACPI_TYPE_LOCAL_METHOD_ALIAS) { node = ACPI_CAST_PTR(struct acpi_namespace_node, node->object); } /* Found matching entry */ ACPI_DEBUG_PRINT((ACPI_DB_NAMES, "Name [%4.4s] (%s) %p found in scope [%4.4s] %p\n", ACPI_CAST_PTR(char, &target_name), acpi_ut_get_type_name(node->type), node, acpi_ut_get_node_name(parent_node), parent_node)); *return_node = node; return_ACPI_STATUS(AE_OK); } /* Didn't match name, move on to the next peer object */ node = node->peer; } /* Searched entire namespace level, not found */ ACPI_DEBUG_PRINT((ACPI_DB_NAMES, "Name [%4.4s] (%s) not found in search in scope [%4.4s] " "%p first child %p\n", ACPI_CAST_PTR(char, &target_name), acpi_ut_get_type_name(type), acpi_ut_get_node_name(parent_node), parent_node, parent_node->child)); return_ACPI_STATUS(AE_NOT_FOUND); } /******************************************************************************* * * FUNCTION: acpi_ns_search_parent_tree * * PARAMETERS: target_name - Ascii ACPI name to search for * node - Starting node where search will begin * type - Object type to match * return_node - Where the matched Node is returned * * RETURN: Status * * DESCRIPTION: Called when a name has not been found in the current namespace * level. Before adding it or giving up, ACPI scope rules require * searching enclosing scopes in cases identified by acpi_ns_local(). * * "A name is located by finding the matching name in the current * name space, and then in the parent name space. If the parent * name space does not contain the name, the search continues * recursively until either the name is found or the name space * does not have a parent (the root of the name space). This * indicates that the name is not found" (From ACPI Specification, * section 5.3) * ******************************************************************************/ static acpi_status acpi_ns_search_parent_tree(u32 target_name, struct acpi_namespace_node *node, acpi_object_type type, struct acpi_namespace_node **return_node) { acpi_status status; struct acpi_namespace_node *parent_node; ACPI_FUNCTION_TRACE(ns_search_parent_tree); parent_node = node->parent; /* * If there is no parent (i.e., we are at the root) or type is "local", * we won't be searching the parent tree. */ if (!parent_node) { ACPI_DEBUG_PRINT((ACPI_DB_NAMES, "[%4.4s] has no parent\n", ACPI_CAST_PTR(char, &target_name))); return_ACPI_STATUS(AE_NOT_FOUND); } if (acpi_ns_local(type)) { ACPI_DEBUG_PRINT((ACPI_DB_NAMES, "[%4.4s] type [%s] must be local to this scope (no parent search)\n", ACPI_CAST_PTR(char, &target_name), acpi_ut_get_type_name(type))); return_ACPI_STATUS(AE_NOT_FOUND); } /* Search the parent tree */ ACPI_DEBUG_PRINT((ACPI_DB_NAMES, "Searching parent [%4.4s] for [%4.4s]\n", acpi_ut_get_node_name(parent_node), ACPI_CAST_PTR(char, &target_name))); /* Search parents until target is found or we have backed up to the root */ while (parent_node) { /* * Search parent scope. Use TYPE_ANY because we don't care about the * object type at this point, we only care about the existence of * the actual name we are searching for. Typechecking comes later. */ status = acpi_ns_search_one_scope(target_name, parent_node, ACPI_TYPE_ANY, return_node); if (ACPI_SUCCESS(status)) { return_ACPI_STATUS(status); } /* Not found here, go up another level (until we reach the root) */ parent_node = parent_node->parent; } /* Not found in parent tree */ return_ACPI_STATUS(AE_NOT_FOUND); } /******************************************************************************* * * FUNCTION: acpi_ns_search_and_enter * * PARAMETERS: target_name - Ascii ACPI name to search for (4 chars) * walk_state - Current state of the walk * node - Starting node where search will begin * interpreter_mode - Add names only in ACPI_MODE_LOAD_PASS_x. * Otherwise,search only. * type - Object type to match * flags - Flags describing the search restrictions * return_node - Where the Node is returned * * RETURN: Status * * DESCRIPTION: Search for a name segment in a single namespace level, * optionally adding it if it is not found. If the passed * Type is not Any and the type previously stored in the * entry was Any (i.e. unknown), update the stored type. * * In ACPI_IMODE_EXECUTE, search only. * In other modes, search and add if not found. * ******************************************************************************/ acpi_status acpi_ns_search_and_enter(u32 target_name, struct acpi_walk_state *walk_state, struct acpi_namespace_node *node, acpi_interpreter_mode interpreter_mode, acpi_object_type type, u32 flags, struct acpi_namespace_node **return_node) { acpi_status status; struct acpi_namespace_node *new_node; ACPI_FUNCTION_TRACE(ns_search_and_enter); /* Parameter validation */ if (!node || !target_name || !return_node) { ACPI_ERROR((AE_INFO, "Null parameter: Node %p Name 0x%X ReturnNode %p", node, target_name, return_node)); return_ACPI_STATUS(AE_BAD_PARAMETER); } /* * Name must consist of valid ACPI characters. We will repair the name if * necessary because we don't want to abort because of this, but we want * all namespace names to be printable. A warning message is appropriate. * * This issue came up because there are in fact machines that exhibit * this problem, and we want to be able to enable ACPI support for them, * even though there are a few bad names. */ acpi_ut_repair_name(ACPI_CAST_PTR(char, &target_name)); /* Try to find the name in the namespace level specified by the caller */ *return_node = ACPI_ENTRY_NOT_FOUND; status = acpi_ns_search_one_scope(target_name, node, type, return_node); if (status != AE_NOT_FOUND) { /* * If we found it AND the request specifies that a find is an error, * return the error */ if (status == AE_OK) { /* The node was found in the namespace */ /* * If the namespace override feature is enabled for this node, * delete any existing attached sub-object and make the node * look like a new node that is owned by the override table. */ if (flags & ACPI_NS_OVERRIDE_IF_FOUND) { ACPI_DEBUG_PRINT((ACPI_DB_NAMES, "Namespace override: %4.4s pass %u type %X Owner %X\n", ACPI_CAST_PTR(char, &target_name), interpreter_mode, (*return_node)->type, walk_state->owner_id)); acpi_ns_delete_children(*return_node); if (acpi_gbl_runtime_namespace_override) { acpi_ut_remove_reference((*return_node)->object); (*return_node)->object = NULL; (*return_node)->owner_id = walk_state->owner_id; } else { acpi_ns_remove_node(*return_node); *return_node = ACPI_ENTRY_NOT_FOUND; } } /* Return an error if we don't expect to find the object */ else if (flags & ACPI_NS_ERROR_IF_FOUND) { status = AE_ALREADY_EXISTS; } } #ifdef ACPI_ASL_COMPILER if (*return_node && (*return_node)->type == ACPI_TYPE_ANY) { (*return_node)->flags |= ANOBJ_IS_EXTERNAL; } #endif /* Either found it or there was an error: finished either way */ return_ACPI_STATUS(status); } /* * The name was not found. If we are NOT performing the first pass * (name entry) of loading the namespace, search the parent tree (all the * way to the root if necessary.) We don't want to perform the parent * search when the namespace is actually being loaded. We want to perform * the search when namespace references are being resolved (load pass 2) * and during the execution phase. */ if ((interpreter_mode != ACPI_IMODE_LOAD_PASS1) && (flags & ACPI_NS_SEARCH_PARENT)) { /* * Not found at this level - search parent tree according to the * ACPI specification */ status = acpi_ns_search_parent_tree(target_name, node, type, return_node); if (ACPI_SUCCESS(status)) { return_ACPI_STATUS(status); } } /* In execute mode, just search, never add names. Exit now */ if (interpreter_mode == ACPI_IMODE_EXECUTE) { ACPI_DEBUG_PRINT((ACPI_DB_NAMES, "%4.4s Not found in %p [Not adding]\n", ACPI_CAST_PTR(char, &target_name), node)); return_ACPI_STATUS(AE_NOT_FOUND); } /* Create the new named object */ new_node = acpi_ns_create_node(target_name); if (!new_node) { return_ACPI_STATUS(AE_NO_MEMORY); } #ifdef ACPI_ASL_COMPILER /* Node is an object defined by an External() statement */ if (flags & ACPI_NS_EXTERNAL || (walk_state && walk_state->opcode == AML_SCOPE_OP)) { new_node->flags |= ANOBJ_IS_EXTERNAL; } #endif if (flags & ACPI_NS_TEMPORARY) { new_node->flags |= ANOBJ_TEMPORARY; } /* Install the new object into the parent's list of children */ acpi_ns_install_node(walk_state, node, new_node, type); *return_node = new_node; return_ACPI_STATUS(AE_OK); } |
| 1 7 5 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 | /* SPDX-License-Identifier: GPL-2.0+ */ /* * comedidev.h * header file for kernel-only structures, variables, and constants * * COMEDI - Linux Control and Measurement Device Interface * Copyright (C) 1997-2000 David A. Schleef <ds@schleef.org> */ #ifndef _COMEDIDEV_H #define _COMEDIDEV_H #include <linux/dma-mapping.h> #include <linux/mutex.h> #include <linux/spinlock_types.h> #include <linux/rwsem.h> #include <linux/kref.h> #include <linux/comedi.h> #define COMEDI_VERSION(a, b, c) (((a) << 16) + ((b) << 8) + (c)) #define COMEDI_VERSION_CODE COMEDI_VERSION(COMEDI_MAJORVERSION, \ COMEDI_MINORVERSION, COMEDI_MICROVERSION) #define COMEDI_RELEASE VERSION #define COMEDI_NUM_BOARD_MINORS 0x30 /** * struct comedi_subdevice - Working data for a COMEDI subdevice * @device: COMEDI device to which this subdevice belongs. (Initialized by * comedi_alloc_subdevices().) * @index: Index of this subdevice within device's array of subdevices. * (Initialized by comedi_alloc_subdevices().) * @type: Type of subdevice from &enum comedi_subdevice_type. (Initialized by * the low-level driver.) * @n_chan: Number of channels the subdevice supports. (Initialized by the * low-level driver.) * @subdev_flags: Various "SDF" flags indicating aspects of the subdevice to * the COMEDI core and user application. (Initialized by the low-level * driver.) * @len_chanlist: Maximum length of a channel list if the subdevice supports * asynchronous acquisition commands. (Optionally initialized by the * low-level driver, or changed from 0 to 1 during post-configuration.) * @private: Private data pointer which is either set by the low-level driver * itself, or by a call to comedi_alloc_spriv() which allocates storage. * In the latter case, the storage is automatically freed after the * low-level driver's "detach" handler is called for the device. * (Initialized by the low-level driver.) * @async: Pointer to &struct comedi_async id the subdevice supports * asynchronous acquisition commands. (Allocated and initialized during * post-configuration if needed.) * @lock: Pointer to a file object that performed a %COMEDI_LOCK ioctl on the * subdevice. (Initially NULL.) * @busy: Pointer to a file object that is performing an asynchronous * acquisition command on the subdevice. (Initially NULL.) * @runflags: Internal flags for use by COMEDI core, mostly indicating whether * an asynchronous acquisition command is running. * @spin_lock: Generic spin-lock for use by the COMEDI core and the low-level * driver. (Initialized by comedi_alloc_subdevices().) * @io_bits: Bit-mask indicating the channel directions for a DIO subdevice * with no more than 32 channels. A '1' at a bit position indicates the * corresponding channel is configured as an output. (Initialized by the * low-level driver for a DIO subdevice. Forced to all-outputs during * post-configuration for a digital output subdevice.) * @maxdata: If non-zero, this is the maximum raw data value of each channel. * If zero, the maximum data value is channel-specific. (Initialized by * the low-level driver.) * @maxdata_list: If the maximum data value is channel-specific, this points * to an array of maximum data values indexed by channel index. * (Initialized by the low-level driver.) * @range_table: If non-NULL, this points to a COMEDI range table for the * subdevice. If NULL, the range table is channel-specific. (Initialized * by the low-level driver, will be set to an "invalid" range table during * post-configuration if @range_table and @range_table_list are both * NULL.) * @range_table_list: If the COMEDI range table is channel-specific, this * points to an array of pointers to COMEDI range tables indexed by * channel number. (Initialized by the low-level driver.) * @chanlist: Not used. * @insn_read: Optional pointer to a handler for the %INSN_READ instruction. * (Initialized by the low-level driver, or set to a default handler * during post-configuration.) * @insn_write: Optional pointer to a handler for the %INSN_WRITE instruction. * (Initialized by the low-level driver, or set to a default handler * during post-configuration.) * @insn_bits: Optional pointer to a handler for the %INSN_BITS instruction * for a digital input, digital output or digital input/output subdevice. * (Initialized by the low-level driver, or set to a default handler * during post-configuration.) * @insn_config: Optional pointer to a handler for the %INSN_CONFIG * instruction. (Initialized by the low-level driver, or set to a default * handler during post-configuration.) * @do_cmd: If the subdevice supports asynchronous acquisition commands, this * points to a handler to set it up in hardware. (Initialized by the * low-level driver.) * @do_cmdtest: If the subdevice supports asynchronous acquisition commands, * this points to a handler used to check and possibly tweak a prospective * acquisition command without setting it up in hardware. (Initialized by * the low-level driver.) * @poll: If the subdevice supports asynchronous acquisition commands, this * is an optional pointer to a handler for the %COMEDI_POLL ioctl which * instructs the low-level driver to synchronize buffers. (Initialized by * the low-level driver if needed.) * @cancel: If the subdevice supports asynchronous acquisition commands, this * points to a handler used to terminate a running command. (Initialized * by the low-level driver.) * @buf_change: If the subdevice supports asynchronous acquisition commands, * this is an optional pointer to a handler that is called when the data * buffer for handling asynchronous commands is allocated or reallocated. * (Initialized by the low-level driver if needed.) * @munge: If the subdevice supports asynchronous acquisition commands and * uses DMA to transfer data from the hardware to the acquisition buffer, * this points to a function used to "munge" the data values from the * hardware into the format expected by COMEDI. (Initialized by the * low-level driver if needed.) * @async_dma_dir: If the subdevice supports asynchronous acquisition commands * and uses DMA to transfer data from the hardware to the acquisition * buffer, this sets the DMA direction for the buffer. (initialized to * %DMA_NONE by comedi_alloc_subdevices() and changed by the low-level * driver if necessary.) * @state: Handy bit-mask indicating the output states for a DIO or digital * output subdevice with no more than 32 channels. (Initialized by the * low-level driver.) * @class_dev: If the subdevice supports asynchronous acquisition commands, * this points to a sysfs comediX_subdY device where X is the minor device * number of the COMEDI device and Y is the subdevice number. The minor * device number for the sysfs device is allocated dynamically in the * range 48 to 255. This is used to allow the COMEDI device to be opened * with a different default read or write subdevice. (Allocated during * post-configuration if needed.) * @minor: If @class_dev is set, this is its dynamically allocated minor * device number. (Set during post-configuration if necessary.) * @readback: Optional pointer to memory allocated by * comedi_alloc_subdev_readback() used to hold the values written to * analog output channels so they can be read back. The storage is * automatically freed after the low-level driver's "detach" handler is * called for the device. (Initialized by the low-level driver.) * * This is the main control structure for a COMEDI subdevice. If the subdevice * supports asynchronous acquisition commands, additional information is stored * in the &struct comedi_async pointed to by @async. * * Most of the subdevice is initialized by the low-level driver's "attach" or * "auto_attach" handlers but parts of it are initialized by * comedi_alloc_subdevices(), and other parts are initialized during * post-configuration on return from that handler. * * A low-level driver that sets @insn_bits for a digital input, digital output, * or DIO subdevice may leave @insn_read and @insn_write uninitialized, in * which case they will be set to a default handler during post-configuration * that uses @insn_bits to emulate the %INSN_READ and %INSN_WRITE instructions. */ struct comedi_subdevice { struct comedi_device *device; int index; int type; int n_chan; int subdev_flags; int len_chanlist; /* maximum length of channel/gain list */ void *private; struct comedi_async *async; void *lock; void *busy; unsigned int runflags; spinlock_t spin_lock; /* generic spin-lock for COMEDI and drivers */ unsigned int io_bits; unsigned int maxdata; /* if maxdata==0, use list */ const unsigned int *maxdata_list; /* list is channel specific */ const struct comedi_lrange *range_table; const struct comedi_lrange *const *range_table_list; unsigned int *chanlist; /* driver-owned chanlist (not used) */ int (*insn_read)(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned int *data); int (*insn_write)(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned int *data); int (*insn_bits)(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned int *data); int (*insn_config)(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned int *data); int (*do_cmd)(struct comedi_device *dev, struct comedi_subdevice *s); int (*do_cmdtest)(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_cmd *cmd); int (*poll)(struct comedi_device *dev, struct comedi_subdevice *s); int (*cancel)(struct comedi_device *dev, struct comedi_subdevice *s); /* called when the buffer changes */ int (*buf_change)(struct comedi_device *dev, struct comedi_subdevice *s); void (*munge)(struct comedi_device *dev, struct comedi_subdevice *s, void *data, unsigned int num_bytes, unsigned int start_chan_index); enum dma_data_direction async_dma_dir; unsigned int state; struct device *class_dev; int minor; unsigned int *readback; }; /** * struct comedi_buf_page - Describe a page of a COMEDI buffer * @virt_addr: Kernel address of page. * @dma_addr: DMA address of page if in DMA coherent memory. */ struct comedi_buf_page { void *virt_addr; dma_addr_t dma_addr; }; /** * struct comedi_buf_map - Describe pages in a COMEDI buffer * @dma_hw_dev: Low-level hardware &struct device pointer copied from the * COMEDI device's hw_dev member. * @page_list: Pointer to array of &struct comedi_buf_page, one for each * page in the buffer. * @n_pages: Number of pages in the buffer. * @dma_dir: DMA direction used to allocate pages of DMA coherent memory, * or %DMA_NONE if pages allocated from regular memory. * @refcount: &struct kref reference counter used to free the buffer. * * A COMEDI data buffer is allocated as individual pages, either in * conventional memory or DMA coherent memory, depending on the attached, * low-level hardware device. * * The buffer is normally freed when the COMEDI device is detached from the * low-level driver (which may happen due to device removal), but if it happens * to be mmapped at the time, the pages cannot be freed until the buffer has * been munmapped. That is what the reference counter is for. */ struct comedi_buf_map { struct device *dma_hw_dev; struct comedi_buf_page *page_list; unsigned int n_pages; enum dma_data_direction dma_dir; struct kref refcount; }; /** * struct comedi_async - Control data for asynchronous COMEDI commands * @prealloc_bufsz: Buffer size (in bytes). * @buf_map: Map of buffer pages. * @max_bufsize: Maximum allowed buffer size (in bytes). * @buf_write_count: "Write completed" count (in bytes, modulo 2**32). * @buf_write_alloc_count: "Allocated for writing" count (in bytes, * modulo 2**32). * @buf_read_count: "Read completed" count (in bytes, modulo 2**32). * @buf_read_alloc_count: "Allocated for reading" count (in bytes, * modulo 2**32). * @buf_write_ptr: Buffer position for writer. * @buf_read_ptr: Buffer position for reader. * @cur_chan: Current position in chanlist for scan (for those drivers that * use it). * @scans_done: The number of scans completed. * @scan_progress: Amount received or sent for current scan (in bytes). * @munge_chan: Current position in chanlist for "munging". * @munge_count: "Munge" count (in bytes, modulo 2**32). * @munge_ptr: Buffer position for "munging". * @events: Bit-vector of events that have occurred. * @cmd: Details of comedi command in progress. * @wait_head: Task wait queue for file reader or writer. * @cb_mask: Bit-vector of events that should wake waiting tasks. * @inttrig: Software trigger function for command, or NULL. * * Note about the ..._count and ..._ptr members: * * Think of the _Count values being integers of unlimited size, indexing * into a buffer of infinite length (though only an advancing portion * of the buffer of fixed length prealloc_bufsz is accessible at any * time). Then: * * Buf_Read_Count <= Buf_Read_Alloc_Count <= Munge_Count <= * Buf_Write_Count <= Buf_Write_Alloc_Count <= * (Buf_Read_Count + prealloc_bufsz) * * (Those aren't the actual members, apart from prealloc_bufsz.) When the * buffer is reset, those _Count values start at 0 and only increase in value, * maintaining the above inequalities until the next time the buffer is * reset. The buffer is divided into the following regions by the inequalities: * * [0, Buf_Read_Count): * old region no longer accessible * * [Buf_Read_Count, Buf_Read_Alloc_Count): * filled and munged region allocated for reading but not yet read * * [Buf_Read_Alloc_Count, Munge_Count): * filled and munged region not yet allocated for reading * * [Munge_Count, Buf_Write_Count): * filled region not yet munged * * [Buf_Write_Count, Buf_Write_Alloc_Count): * unfilled region allocated for writing but not yet written * * [Buf_Write_Alloc_Count, Buf_Read_Count + prealloc_bufsz): * unfilled region not yet allocated for writing * * [Buf_Read_Count + prealloc_bufsz, infinity): * unfilled region not yet accessible * * Data needs to be written into the buffer before it can be read out, * and may need to be converted (or "munged") between the two * operations. Extra unfilled buffer space may need to allocated for * writing (advancing Buf_Write_Alloc_Count) before new data is written. * After writing new data, the newly filled space needs to be released * (advancing Buf_Write_Count). This also results in the new data being * "munged" (advancing Munge_Count). Before data is read out of the * buffer, extra space may need to be allocated for reading (advancing * Buf_Read_Alloc_Count). After the data has been read out, the space * needs to be released (advancing Buf_Read_Count). * * The actual members, buf_read_count, buf_read_alloc_count, * munge_count, buf_write_count, and buf_write_alloc_count take the * value of the corresponding capitalized _Count values modulo 2^32 * (UINT_MAX+1). Subtracting a "higher" _count value from a "lower" * _count value gives the same answer as subtracting a "higher" _Count * value from a lower _Count value because prealloc_bufsz < UINT_MAX+1. * The modulo operation is done implicitly. * * The buf_read_ptr, munge_ptr, and buf_write_ptr members take the value * of the corresponding capitalized _Count values modulo prealloc_bufsz. * These correspond to byte indices in the physical buffer. The modulo * operation is done by subtracting prealloc_bufsz when the value * exceeds prealloc_bufsz (assuming prealloc_bufsz plus the increment is * less than or equal to UINT_MAX). */ struct comedi_async { unsigned int prealloc_bufsz; struct comedi_buf_map *buf_map; unsigned int max_bufsize; unsigned int buf_write_count; unsigned int buf_write_alloc_count; unsigned int buf_read_count; unsigned int buf_read_alloc_count; unsigned int buf_write_ptr; unsigned int buf_read_ptr; unsigned int cur_chan; unsigned int scans_done; unsigned int scan_progress; unsigned int munge_chan; unsigned int munge_count; unsigned int munge_ptr; unsigned int events; struct comedi_cmd cmd; wait_queue_head_t wait_head; unsigned int cb_mask; int (*inttrig)(struct comedi_device *dev, struct comedi_subdevice *s, unsigned int x); }; /** * enum comedi_cb - &struct comedi_async callback "events" * @COMEDI_CB_EOS: end-of-scan * @COMEDI_CB_EOA: end-of-acquisition/output * @COMEDI_CB_BLOCK: data has arrived, wakes up read() / write() * @COMEDI_CB_EOBUF: DEPRECATED: end of buffer * @COMEDI_CB_ERROR: card error during acquisition * @COMEDI_CB_OVERFLOW: buffer overflow/underflow * @COMEDI_CB_ERROR_MASK: events that indicate an error has occurred * @COMEDI_CB_CANCEL_MASK: events that will cancel an async command */ enum comedi_cb { COMEDI_CB_EOS = BIT(0), COMEDI_CB_EOA = BIT(1), COMEDI_CB_BLOCK = BIT(2), COMEDI_CB_EOBUF = BIT(3), COMEDI_CB_ERROR = BIT(4), COMEDI_CB_OVERFLOW = BIT(5), /* masks */ COMEDI_CB_ERROR_MASK = (COMEDI_CB_ERROR | COMEDI_CB_OVERFLOW), COMEDI_CB_CANCEL_MASK = (COMEDI_CB_EOA | COMEDI_CB_ERROR_MASK) }; /** * struct comedi_driver - COMEDI driver registration * @driver_name: Name of driver. * @module: Owning module. * @attach: The optional "attach" handler for manually configured COMEDI * devices. * @detach: The "detach" handler for deconfiguring COMEDI devices. * @auto_attach: The optional "auto_attach" handler for automatically * configured COMEDI devices. * @num_names: Optional number of "board names" supported. * @board_name: Optional pointer to a pointer to a board name. The pointer * to a board name is embedded in an element of a driver-defined array * of static, read-only board type information. * @offset: Optional size of each element of the driver-defined array of * static, read-only board type information, i.e. the offset between each * pointer to a board name. * * This is used with comedi_driver_register() and comedi_driver_unregister() to * register and unregister a low-level COMEDI driver with the COMEDI core. * * If @num_names is non-zero, @board_name should be non-NULL, and @offset * should be at least sizeof(*board_name). These are used by the handler for * the %COMEDI_DEVCONFIG ioctl to match a hardware device and its driver by * board name. If @num_names is zero, the %COMEDI_DEVCONFIG ioctl matches a * hardware device and its driver by driver name. This is only useful if the * @attach handler is set. If @num_names is non-zero, the driver's @attach * handler will be called with the COMEDI device structure's board_ptr member * pointing to the matched pointer to a board name within the driver's private * array of static, read-only board type information. * * The @detach handler has two roles. If a COMEDI device was successfully * configured by the @attach or @auto_attach handler, it is called when the * device is being deconfigured (by the %COMEDI_DEVCONFIG ioctl, or due to * unloading of the driver, or due to device removal). It is also called when * the @attach or @auto_attach handler returns an error. Therefore, the * @attach or @auto_attach handlers can defer clean-up on error until the * @detach handler is called. If the @attach or @auto_attach handlers free * any resources themselves, they must prevent the @detach handler from * freeing the same resources. The @detach handler must not assume that all * resources requested by the @attach or @auto_attach handler were * successfully allocated. */ struct comedi_driver { /* private: */ struct comedi_driver *next; /* Next in list of COMEDI drivers. */ /* public: */ const char *driver_name; struct module *module; int (*attach)(struct comedi_device *dev, struct comedi_devconfig *it); void (*detach)(struct comedi_device *dev); int (*auto_attach)(struct comedi_device *dev, unsigned long context); unsigned int num_names; const char *const *board_name; int offset; }; /** * struct comedi_device - Working data for a COMEDI device * @use_count: Number of open file objects. * @driver: Low-level COMEDI driver attached to this COMEDI device. * @pacer: Optional pointer to a dynamically allocated acquisition pacer * control. It is freed automatically after the COMEDI device is * detached from the low-level driver. * @private: Optional pointer to private data allocated by the low-level * driver. It is freed automatically after the COMEDI device is * detached from the low-level driver. * @class_dev: Sysfs comediX device. * @minor: Minor device number of COMEDI char device (0-47). * @detach_count: Counter incremented every time the COMEDI device is detached. * Used for checking a previous attachment is still valid. * @hw_dev: Optional pointer to the low-level hardware &struct device. It is * required for automatically configured COMEDI devices and optional for * COMEDI devices configured by the %COMEDI_DEVCONFIG ioctl, although * the bus-specific COMEDI functions only work if it is set correctly. * It is also passed to dma_alloc_coherent() for COMEDI subdevices that * have their 'async_dma_dir' member set to something other than * %DMA_NONE. * @board_name: Pointer to a COMEDI board name or a COMEDI driver name. When * the low-level driver's "attach" handler is called by the handler for * the %COMEDI_DEVCONFIG ioctl, it either points to a matched board name * string if the 'num_names' member of the &struct comedi_driver is * non-zero, otherwise it points to the low-level driver name string. * When the low-lever driver's "auto_attach" handler is called for an * automatically configured COMEDI device, it points to the low-level * driver name string. The low-level driver is free to change it in its * "attach" or "auto_attach" handler if it wishes. * @board_ptr: Optional pointer to private, read-only board type information in * the low-level driver. If the 'num_names' member of the &struct * comedi_driver is non-zero, the handler for the %COMEDI_DEVCONFIG ioctl * will point it to a pointer to a matched board name string within the * driver's private array of static, read-only board type information when * calling the driver's "attach" handler. The low-level driver is free to * change it. * @attached: Flag indicating that the COMEDI device is attached to a low-level * driver. * @ioenabled: Flag used to indicate that a PCI device has been enabled and * its regions requested. * @spinlock: Generic spin-lock for use by the low-level driver. * @mutex: Generic mutex for use by the COMEDI core module. * @attach_lock: &struct rw_semaphore used to guard against the COMEDI device * being detached while an operation is in progress. The down_write() * operation is only allowed while @mutex is held and is used when * changing @attached and @detach_count and calling the low-level driver's * "detach" handler. The down_read() operation is generally used without * holding @mutex. * @refcount: &struct kref reference counter for freeing COMEDI device. * @n_subdevices: Number of COMEDI subdevices allocated by the low-level * driver for this device. * @subdevices: Dynamically allocated array of COMEDI subdevices. * @mmio: Optional pointer to a remapped MMIO region set by the low-level * driver. * @iobase: Optional base of an I/O port region requested by the low-level * driver. * @iolen: Length of I/O port region requested at @iobase. * @irq: Optional IRQ number requested by the low-level driver. * @read_subdev: Optional pointer to a default COMEDI subdevice operated on by * the read() file operation. Set by the low-level driver. * @write_subdev: Optional pointer to a default COMEDI subdevice operated on by * the write() file operation. Set by the low-level driver. * @async_queue: Storage for fasync_helper(). * @open: Optional pointer to a function set by the low-level driver to be * called when @use_count changes from 0 to 1. * @close: Optional pointer to a function set by the low-level driver to be * called when @use_count changed from 1 to 0. * @insn_device_config: Optional pointer to a handler for all sub-instructions * except %INSN_DEVICE_CONFIG_GET_ROUTES of the %INSN_DEVICE_CONFIG * instruction. If this is not initialized by the low-level driver, a * default handler will be set during post-configuration. * @get_valid_routes: Optional pointer to a handler for the * %INSN_DEVICE_CONFIG_GET_ROUTES sub-instruction of the * %INSN_DEVICE_CONFIG instruction set. If this is not initialized by the * low-level driver, a default handler that copies zero routes back to the * user will be used. * * This is the main control data structure for a COMEDI device (as far as the * COMEDI core is concerned). There are two groups of COMEDI devices - * "legacy" devices that are configured by the handler for the * %COMEDI_DEVCONFIG ioctl, and automatically configured devices resulting * from a call to comedi_auto_config() as a result of a bus driver probe in * a low-level COMEDI driver. The "legacy" COMEDI devices are allocated * during module initialization if the "comedi_num_legacy_minors" module * parameter is non-zero and use minor device numbers from 0 to * comedi_num_legacy_minors minus one. The automatically configured COMEDI * devices are allocated on demand and use minor device numbers from * comedi_num_legacy_minors to 47. */ struct comedi_device { int use_count; struct comedi_driver *driver; struct comedi_8254 *pacer; void *private; struct device *class_dev; int minor; unsigned int detach_count; struct device *hw_dev; const char *board_name; const void *board_ptr; unsigned int attached:1; unsigned int ioenabled:1; spinlock_t spinlock; /* generic spin-lock for low-level driver */ struct mutex mutex; /* generic mutex for COMEDI core */ struct rw_semaphore attach_lock; struct kref refcount; int n_subdevices; struct comedi_subdevice *subdevices; /* dumb */ void __iomem *mmio; unsigned long iobase; unsigned long iolen; unsigned int irq; struct comedi_subdevice *read_subdev; struct comedi_subdevice *write_subdev; struct fasync_struct *async_queue; int (*open)(struct comedi_device *dev); void (*close)(struct comedi_device *dev); int (*insn_device_config)(struct comedi_device *dev, struct comedi_insn *insn, unsigned int *data); unsigned int (*get_valid_routes)(struct comedi_device *dev, unsigned int n_pairs, unsigned int *pair_data); }; /* * function prototypes */ void comedi_event(struct comedi_device *dev, struct comedi_subdevice *s); struct comedi_device *comedi_dev_get_from_minor(unsigned int minor); int comedi_dev_put(struct comedi_device *dev); bool comedi_is_subdevice_running(struct comedi_subdevice *s); void *comedi_alloc_spriv(struct comedi_subdevice *s, size_t size); void comedi_set_spriv_auto_free(struct comedi_subdevice *s); int comedi_check_chanlist(struct comedi_subdevice *s, int n, unsigned int *chanlist); /* range stuff */ #define RANGE(a, b) {(a) * 1e6, (b) * 1e6, 0} #define RANGE_ext(a, b) {(a) * 1e6, (b) * 1e6, RF_EXTERNAL} #define RANGE_mA(a, b) {(a) * 1e6, (b) * 1e6, UNIT_mA} #define RANGE_unitless(a, b) {(a) * 1e6, (b) * 1e6, 0} #define BIP_RANGE(a) {-(a) * 1e6, (a) * 1e6, 0} #define UNI_RANGE(a) {0, (a) * 1e6, 0} extern const struct comedi_lrange range_bipolar10; extern const struct comedi_lrange range_bipolar5; extern const struct comedi_lrange range_bipolar2_5; extern const struct comedi_lrange range_unipolar10; extern const struct comedi_lrange range_unipolar5; extern const struct comedi_lrange range_unipolar2_5; extern const struct comedi_lrange range_0_20mA; extern const struct comedi_lrange range_4_20mA; extern const struct comedi_lrange range_0_32mA; extern const struct comedi_lrange range_unknown; #define range_digital range_unipolar5 /** * struct comedi_lrange - Describes a COMEDI range table * @length: Number of entries in the range table. * @range: Array of &struct comedi_krange, one for each range. * * Each element of @range[] describes the minimum and maximum physical range * and the type of units. Typically, the type of unit is %UNIT_volt * (i.e. volts) and the minimum and maximum are in millionths of a volt. * There may also be a flag that indicates the minimum and maximum are merely * scale factors for an unknown, external reference. */ struct comedi_lrange { int length; struct comedi_krange range[] __counted_by(length); }; /** * comedi_range_is_bipolar() - Test if subdevice range is bipolar * @s: COMEDI subdevice. * @range: Index of range within a range table. * * Tests whether a range is bipolar by checking whether its minimum value * is negative. * * Assumes @range is valid. Does not work for subdevices using a * channel-specific range table list. * * Return: * %true if the range is bipolar. * %false if the range is unipolar. */ static inline bool comedi_range_is_bipolar(struct comedi_subdevice *s, unsigned int range) { return s->range_table->range[range].min < 0; } /** * comedi_range_is_unipolar() - Test if subdevice range is unipolar * @s: COMEDI subdevice. * @range: Index of range within a range table. * * Tests whether a range is unipolar by checking whether its minimum value * is at least 0. * * Assumes @range is valid. Does not work for subdevices using a * channel-specific range table list. * * Return: * %true if the range is unipolar. * %false if the range is bipolar. */ static inline bool comedi_range_is_unipolar(struct comedi_subdevice *s, unsigned int range) { return s->range_table->range[range].min >= 0; } /** * comedi_range_is_external() - Test if subdevice range is external * @s: COMEDI subdevice. * @range: Index of range within a range table. * * Tests whether a range is externally reference by checking whether its * %RF_EXTERNAL flag is set. * * Assumes @range is valid. Does not work for subdevices using a * channel-specific range table list. * * Return: * %true if the range is external. * %false if the range is internal. */ static inline bool comedi_range_is_external(struct comedi_subdevice *s, unsigned int range) { return !!(s->range_table->range[range].flags & RF_EXTERNAL); } /** * comedi_chan_range_is_bipolar() - Test if channel-specific range is bipolar * @s: COMEDI subdevice. * @chan: The channel number. * @range: Index of range within a range table. * * Tests whether a range is bipolar by checking whether its minimum value * is negative. * * Assumes @chan and @range are valid. Only works for subdevices with a * channel-specific range table list. * * Return: * %true if the range is bipolar. * %false if the range is unipolar. */ static inline bool comedi_chan_range_is_bipolar(struct comedi_subdevice *s, unsigned int chan, unsigned int range) { return s->range_table_list[chan]->range[range].min < 0; } /** * comedi_chan_range_is_unipolar() - Test if channel-specific range is unipolar * @s: COMEDI subdevice. * @chan: The channel number. * @range: Index of range within a range table. * * Tests whether a range is unipolar by checking whether its minimum value * is at least 0. * * Assumes @chan and @range are valid. Only works for subdevices with a * channel-specific range table list. * * Return: * %true if the range is unipolar. * %false if the range is bipolar. */ static inline bool comedi_chan_range_is_unipolar(struct comedi_subdevice *s, unsigned int chan, unsigned int range) { return s->range_table_list[chan]->range[range].min >= 0; } /** * comedi_chan_range_is_external() - Test if channel-specific range is external * @s: COMEDI subdevice. * @chan: The channel number. * @range: Index of range within a range table. * * Tests whether a range is externally reference by checking whether its * %RF_EXTERNAL flag is set. * * Assumes @chan and @range are valid. Only works for subdevices with a * channel-specific range table list. * * Return: * %true if the range is bipolar. * %false if the range is unipolar. */ static inline bool comedi_chan_range_is_external(struct comedi_subdevice *s, unsigned int chan, unsigned int range) { return !!(s->range_table_list[chan]->range[range].flags & RF_EXTERNAL); } /** * comedi_offset_munge() - Convert between offset binary and 2's complement * @s: COMEDI subdevice. * @val: Value to be converted. * * Toggles the highest bit of a sample value to toggle between offset binary * and 2's complement. Assumes that @s->maxdata is a power of 2 minus 1. * * Return: The converted value. */ static inline unsigned int comedi_offset_munge(struct comedi_subdevice *s, unsigned int val) { return val ^ s->maxdata ^ (s->maxdata >> 1); } /** * comedi_bytes_per_sample() - Determine subdevice sample size * @s: COMEDI subdevice. * * The sample size will be 4 (sizeof int) or 2 (sizeof short) depending on * whether the %SDF_LSAMPL subdevice flag is set or not. * * Return: The subdevice sample size. */ static inline unsigned int comedi_bytes_per_sample(struct comedi_subdevice *s) { return s->subdev_flags & SDF_LSAMPL ? sizeof(int) : sizeof(short); } /** * comedi_sample_shift() - Determine log2 of subdevice sample size * @s: COMEDI subdevice. * * The sample size will be 4 (sizeof int) or 2 (sizeof short) depending on * whether the %SDF_LSAMPL subdevice flag is set or not. The log2 of the * sample size will be 2 or 1 and can be used as the right operand of a * bit-shift operator to multiply or divide something by the sample size. * * Return: log2 of the subdevice sample size. */ static inline unsigned int comedi_sample_shift(struct comedi_subdevice *s) { return s->subdev_flags & SDF_LSAMPL ? 2 : 1; } /** * comedi_bytes_to_samples() - Convert a number of bytes to a number of samples * @s: COMEDI subdevice. * @nbytes: Number of bytes * * Return: The number of bytes divided by the subdevice sample size. */ static inline unsigned int comedi_bytes_to_samples(struct comedi_subdevice *s, unsigned int nbytes) { return nbytes >> comedi_sample_shift(s); } /** * comedi_samples_to_bytes() - Convert a number of samples to a number of bytes * @s: COMEDI subdevice. * @nsamples: Number of samples. * * Return: The number of samples multiplied by the subdevice sample size. * (Does not check for arithmetic overflow.) */ static inline unsigned int comedi_samples_to_bytes(struct comedi_subdevice *s, unsigned int nsamples) { return nsamples << comedi_sample_shift(s); } /** * comedi_check_trigger_src() - Trivially validate a comedi_cmd trigger source * @src: Pointer to the trigger source to validate. * @flags: Bitmask of valid %TRIG_* for the trigger. * * This is used in "step 1" of the do_cmdtest functions of comedi drivers * to validate the comedi_cmd triggers. The mask of the @src against the * @flags allows the userspace comedilib to pass all the comedi_cmd * triggers as %TRIG_ANY and get back a bitmask of the valid trigger sources. * * Return: * 0 if trigger sources in *@src are all supported. * -EINVAL if any trigger source in *@src is unsupported. */ static inline int comedi_check_trigger_src(unsigned int *src, unsigned int flags) { unsigned int orig_src = *src; *src = orig_src & flags; if (*src == TRIG_INVALID || *src != orig_src) return -EINVAL; return 0; } /** * comedi_check_trigger_is_unique() - Make sure a trigger source is unique * @src: The trigger source to check. * * Return: * 0 if no more than one trigger source is set. * -EINVAL if more than one trigger source is set. */ static inline int comedi_check_trigger_is_unique(unsigned int src) { /* this test is true if more than one _src bit is set */ if ((src & (src - 1)) != 0) return -EINVAL; return 0; } /** * comedi_check_trigger_arg_is() - Trivially validate a trigger argument * @arg: Pointer to the trigger arg to validate. * @val: The value the argument should be. * * Forces *@arg to be @val. * * Return: * 0 if *@arg was already @val. * -EINVAL if *@arg differed from @val. */ static inline int comedi_check_trigger_arg_is(unsigned int *arg, unsigned int val) { if (*arg != val) { *arg = val; return -EINVAL; } return 0; } /** * comedi_check_trigger_arg_min() - Trivially validate a trigger argument min * @arg: Pointer to the trigger arg to validate. * @val: The minimum value the argument should be. * * Forces *@arg to be at least @val, setting it to @val if necessary. * * Return: * 0 if *@arg was already at least @val. * -EINVAL if *@arg was less than @val. */ static inline int comedi_check_trigger_arg_min(unsigned int *arg, unsigned int val) { if (*arg < val) { *arg = val; return -EINVAL; } return 0; } /** * comedi_check_trigger_arg_max() - Trivially validate a trigger argument max * @arg: Pointer to the trigger arg to validate. * @val: The maximum value the argument should be. * * Forces *@arg to be no more than @val, setting it to @val if necessary. * * Return: * 0 if*@arg was already no more than @val. * -EINVAL if *@arg was greater than @val. */ static inline int comedi_check_trigger_arg_max(unsigned int *arg, unsigned int val) { if (*arg > val) { *arg = val; return -EINVAL; } return 0; } /* * Must set dev->hw_dev if you wish to dma directly into comedi's buffer. * Also useful for retrieving a previously configured hardware device of * known bus type. Set automatically for auto-configured devices. * Automatically set to NULL when detaching hardware device. */ int comedi_set_hw_dev(struct comedi_device *dev, struct device *hw_dev); /** * comedi_buf_n_bytes_ready - Determine amount of unread data in buffer * @s: COMEDI subdevice. * * Determines the number of bytes of unread data in the asynchronous * acquisition data buffer for a subdevice. The data in question might not * have been fully "munged" yet. * * Returns: The amount of unread data in bytes. */ static inline unsigned int comedi_buf_n_bytes_ready(struct comedi_subdevice *s) { return s->async->buf_write_count - s->async->buf_read_count; } unsigned int comedi_buf_write_alloc(struct comedi_subdevice *s, unsigned int n); unsigned int comedi_buf_write_free(struct comedi_subdevice *s, unsigned int n); unsigned int comedi_buf_read_n_available(struct comedi_subdevice *s); unsigned int comedi_buf_read_alloc(struct comedi_subdevice *s, unsigned int n); unsigned int comedi_buf_read_free(struct comedi_subdevice *s, unsigned int n); unsigned int comedi_buf_write_samples(struct comedi_subdevice *s, const void *data, unsigned int nsamples); unsigned int comedi_buf_read_samples(struct comedi_subdevice *s, void *data, unsigned int nsamples); /* drivers.c - general comedi driver functions */ #define COMEDI_TIMEOUT_MS 1000 int comedi_timeout(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, int (*cb)(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned long context), unsigned long context); unsigned int comedi_handle_events(struct comedi_device *dev, struct comedi_subdevice *s); int comedi_dio_insn_config(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned int *data, unsigned int mask); unsigned int comedi_dio_update_state(struct comedi_subdevice *s, unsigned int *data); unsigned int comedi_bytes_per_scan_cmd(struct comedi_subdevice *s, struct comedi_cmd *cmd); unsigned int comedi_bytes_per_scan(struct comedi_subdevice *s); unsigned int comedi_nscans_left(struct comedi_subdevice *s, unsigned int nscans); unsigned int comedi_nsamples_left(struct comedi_subdevice *s, unsigned int nsamples); void comedi_inc_scan_progress(struct comedi_subdevice *s, unsigned int num_bytes); void *comedi_alloc_devpriv(struct comedi_device *dev, size_t size); int comedi_alloc_subdevices(struct comedi_device *dev, int num_subdevices); int comedi_alloc_subdev_readback(struct comedi_subdevice *s); int comedi_readback_insn_read(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned int *data); int comedi_load_firmware(struct comedi_device *dev, struct device *hw_dev, const char *name, int (*cb)(struct comedi_device *dev, const u8 *data, size_t size, unsigned long context), unsigned long context); int __comedi_request_region(struct comedi_device *dev, unsigned long start, unsigned long len); int comedi_request_region(struct comedi_device *dev, unsigned long start, unsigned long len); void comedi_legacy_detach(struct comedi_device *dev); int comedi_auto_config(struct device *hardware_device, struct comedi_driver *driver, unsigned long context); void comedi_auto_unconfig(struct device *hardware_device); int comedi_driver_register(struct comedi_driver *driver); void comedi_driver_unregister(struct comedi_driver *driver); /** * module_comedi_driver() - Helper macro for registering a comedi driver * @__comedi_driver: comedi_driver struct * * Helper macro for comedi drivers which do not do anything special in module * init/exit. This eliminates a lot of boilerplate. Each module may only use * this macro once, and calling it replaces module_init() and module_exit(). */ #define module_comedi_driver(__comedi_driver) \ module_driver(__comedi_driver, comedi_driver_register, \ comedi_driver_unregister) #endif /* _COMEDIDEV_H */ |
| 8 10 5 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * drivers/net/bond/bond_options.h - bonding options * Copyright (c) 2013 Nikolay Aleksandrov <nikolay@redhat.com> */ #ifndef _NET_BOND_OPTIONS_H #define _NET_BOND_OPTIONS_H #include <linux/bits.h> #include <linux/limits.h> #include <linux/types.h> #include <linux/string.h> struct netlink_ext_ack; struct nlattr; #define BOND_OPT_MAX_NAMELEN 32 #define BOND_OPT_VALID(opt) ((opt) < BOND_OPT_LAST) #define BOND_MODE_ALL_EX(x) (~(x)) /* Option flags: * BOND_OPTFLAG_NOSLAVES - check if the bond device is empty before setting * BOND_OPTFLAG_IFDOWN - check if the bond device is down before setting * BOND_OPTFLAG_RAWVAL - the option parses the value itself */ enum { BOND_OPTFLAG_NOSLAVES = BIT(0), BOND_OPTFLAG_IFDOWN = BIT(1), BOND_OPTFLAG_RAWVAL = BIT(2) }; /* Value type flags: * BOND_VALFLAG_DEFAULT - mark the value as default * BOND_VALFLAG_(MIN|MAX) - mark the value as min/max */ enum { BOND_VALFLAG_DEFAULT = BIT(0), BOND_VALFLAG_MIN = BIT(1), BOND_VALFLAG_MAX = BIT(2) }; /* Option IDs, their bit positions correspond to their IDs */ enum { BOND_OPT_MODE, BOND_OPT_PACKETS_PER_SLAVE, BOND_OPT_XMIT_HASH, BOND_OPT_ARP_VALIDATE, BOND_OPT_ARP_ALL_TARGETS, BOND_OPT_FAIL_OVER_MAC, BOND_OPT_ARP_INTERVAL, BOND_OPT_ARP_TARGETS, BOND_OPT_DOWNDELAY, BOND_OPT_UPDELAY, BOND_OPT_LACP_RATE, BOND_OPT_MINLINKS, BOND_OPT_AD_SELECT, BOND_OPT_NUM_PEER_NOTIF, BOND_OPT_MIIMON, BOND_OPT_PRIMARY, BOND_OPT_PRIMARY_RESELECT, BOND_OPT_USE_CARRIER, BOND_OPT_ACTIVE_SLAVE, BOND_OPT_QUEUE_ID, BOND_OPT_ALL_SLAVES_ACTIVE, BOND_OPT_RESEND_IGMP, BOND_OPT_LP_INTERVAL, BOND_OPT_SLAVES, BOND_OPT_TLB_DYNAMIC_LB, BOND_OPT_AD_ACTOR_SYS_PRIO, BOND_OPT_AD_ACTOR_SYSTEM, BOND_OPT_AD_USER_PORT_KEY, BOND_OPT_NUM_PEER_NOTIF_ALIAS, BOND_OPT_PEER_NOTIF_DELAY, BOND_OPT_LACP_ACTIVE, BOND_OPT_MISSED_MAX, BOND_OPT_NS_TARGETS, BOND_OPT_PRIO, BOND_OPT_COUPLED_CONTROL, BOND_OPT_BROADCAST_NEIGH, BOND_OPT_LAST }; /* This structure is used for storing option values and for passing option * values when changing an option. The logic when used as an arg is as follows: * - if value != ULLONG_MAX -> parse value * - if string != NULL -> parse string * - if the opt is RAW data and length less than maxlen, * copy the data to extra storage */ #define BOND_OPT_EXTRA_MAXLEN 16 struct bond_opt_value { char *string; u64 value; u32 flags; union { char extra[BOND_OPT_EXTRA_MAXLEN]; struct net_device *slave_dev; }; }; struct bonding; struct bond_option { int id; const char *name; const char *desc; u32 flags; /* unsuppmodes is used to denote modes in which the option isn't * supported. */ unsigned long unsuppmodes; /* supported values which this option can have, can be a subset of * BOND_OPTVAL_RANGE's value range */ const struct bond_opt_value *values; int (*set)(struct bonding *bond, const struct bond_opt_value *val); }; int __bond_opt_set(struct bonding *bond, unsigned int option, struct bond_opt_value *val, struct nlattr *bad_attr, struct netlink_ext_ack *extack); int __bond_opt_set_notify(struct bonding *bond, unsigned int option, struct bond_opt_value *val); int bond_opt_tryset_rtnl(struct bonding *bond, unsigned int option, char *buf); const struct bond_opt_value *bond_opt_parse(const struct bond_option *opt, struct bond_opt_value *val); const struct bond_option *bond_opt_get(unsigned int option); const struct bond_option *bond_opt_get_by_name(const char *name); const struct bond_opt_value *bond_opt_get_val(unsigned int option, u64 val); /* This helper is used to initialize a bond_opt_value structure for parameter * passing. There should be either a valid string or value, but not both. * When value is ULLONG_MAX then string will be used. */ static inline void __bond_opt_init(struct bond_opt_value *optval, char *string, u64 value, void *extra, size_t extra_len) { memset(optval, 0, sizeof(*optval)); optval->value = ULLONG_MAX; if (value != ULLONG_MAX) optval->value = value; else if (string) optval->string = string; if (extra && extra_len <= BOND_OPT_EXTRA_MAXLEN) memcpy(optval->extra, extra, extra_len); } #define bond_opt_initval(optval, value) __bond_opt_init(optval, NULL, value, NULL, 0) #define bond_opt_initstr(optval, str) __bond_opt_init(optval, str, ULLONG_MAX, NULL, 0) #define bond_opt_initextra(optval, extra, extra_len) \ __bond_opt_init(optval, NULL, ULLONG_MAX, extra, extra_len) #define bond_opt_slave_initval(optval, slave_dev, value) \ __bond_opt_init(optval, NULL, value, slave_dev, sizeof(struct net_device *)) void bond_option_arp_ip_targets_clear(struct bonding *bond); #if IS_ENABLED(CONFIG_IPV6) void bond_option_ns_ip6_targets_clear(struct bonding *bond); #endif void bond_slave_ns_maddrs_add(struct bonding *bond, struct slave *slave); void bond_slave_ns_maddrs_del(struct bonding *bond, struct slave *slave); #endif /* _NET_BOND_OPTIONS_H */ |
| 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 2 2 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/ethtool.h> #include <linux/firmware.h> #include <linux/sfp.h> #include <net/devlink.h> #include <net/netdev_lock.h> #include "netlink.h" #include "common.h" #include "bitset.h" #include "module_fw.h" struct module_req_info { struct ethnl_req_info base; }; struct module_reply_data { struct ethnl_reply_data base; struct ethtool_module_power_mode_params power; }; #define MODULE_REPDATA(__reply_base) \ container_of(__reply_base, struct module_reply_data, base) /* MODULE_GET */ const struct nla_policy ethnl_module_get_policy[ETHTOOL_A_MODULE_HEADER + 1] = { [ETHTOOL_A_MODULE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int module_get_power_mode(struct net_device *dev, struct module_reply_data *data, struct netlink_ext_ack *extack) { const struct ethtool_ops *ops = dev->ethtool_ops; if (!ops->get_module_power_mode) return 0; if (dev->ethtool->module_fw_flash_in_progress) { NL_SET_ERR_MSG(extack, "Module firmware flashing is in progress"); return -EBUSY; } return ops->get_module_power_mode(dev, &data->power, extack); } static int module_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct module_reply_data *data = MODULE_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; ret = module_get_power_mode(dev, data, info->extack); if (ret < 0) goto out_complete; out_complete: ethnl_ops_complete(dev); return ret; } static int module_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { struct module_reply_data *data = MODULE_REPDATA(reply_base); int len = 0; if (data->power.policy) len += nla_total_size(sizeof(u8)); /* _MODULE_POWER_MODE_POLICY */ if (data->power.mode) len += nla_total_size(sizeof(u8)); /* _MODULE_POWER_MODE */ return len; } static int module_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct module_reply_data *data = MODULE_REPDATA(reply_base); if (data->power.policy && nla_put_u8(skb, ETHTOOL_A_MODULE_POWER_MODE_POLICY, data->power.policy)) return -EMSGSIZE; if (data->power.mode && nla_put_u8(skb, ETHTOOL_A_MODULE_POWER_MODE, data->power.mode)) return -EMSGSIZE; return 0; } /* MODULE_SET */ const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MODE_POLICY + 1] = { [ETHTOOL_A_MODULE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_MODULE_POWER_MODE_POLICY] = NLA_POLICY_RANGE(NLA_U8, ETHTOOL_MODULE_POWER_MODE_POLICY_HIGH, ETHTOOL_MODULE_POWER_MODE_POLICY_AUTO), }; static int ethnl_set_module_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; struct nlattr **tb = info->attrs; if (!tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY]) return 0; if (req_info->dev->ethtool->module_fw_flash_in_progress) { NL_SET_ERR_MSG(info->extack, "Module firmware flashing is in progress"); return -EBUSY; } if (!ops->get_module_power_mode || !ops->set_module_power_mode) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY], "Setting power mode policy is not supported by this device"); return -EOPNOTSUPP; } return 1; } static int ethnl_set_module(struct ethnl_req_info *req_info, struct genl_info *info) { struct ethtool_module_power_mode_params power = {}; struct ethtool_module_power_mode_params power_new; const struct ethtool_ops *ops; struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; int ret; ops = dev->ethtool_ops; power_new.policy = nla_get_u8(tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY]); ret = ops->get_module_power_mode(dev, &power, info->extack); if (ret < 0) return ret; if (power_new.policy == power.policy) return 0; ret = ops->set_module_power_mode(dev, &power_new, info->extack); return ret < 0 ? ret : 1; } const struct ethnl_request_ops ethnl_module_request_ops = { .request_cmd = ETHTOOL_MSG_MODULE_GET, .reply_cmd = ETHTOOL_MSG_MODULE_GET_REPLY, .hdr_attr = ETHTOOL_A_MODULE_HEADER, .req_info_size = sizeof(struct module_req_info), .reply_data_size = sizeof(struct module_reply_data), .prepare_data = module_prepare_data, .reply_size = module_reply_size, .fill_reply = module_fill_reply, .set_validate = ethnl_set_module_validate, .set = ethnl_set_module, .set_ntf_cmd = ETHTOOL_MSG_MODULE_NTF, }; /* MODULE_FW_FLASH_ACT */ const struct nla_policy ethnl_module_fw_flash_act_policy[ETHTOOL_A_MODULE_FW_FLASH_PASSWORD + 1] = { [ETHTOOL_A_MODULE_FW_FLASH_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_MODULE_FW_FLASH_FILE_NAME] = { .type = NLA_NUL_STRING }, [ETHTOOL_A_MODULE_FW_FLASH_PASSWORD] = { .type = NLA_U32 }, }; static LIST_HEAD(module_fw_flash_work_list); static DEFINE_SPINLOCK(module_fw_flash_work_list_lock); static int module_flash_fw_work_list_add(struct ethtool_module_fw_flash *module_fw, struct genl_info *info) { struct ethtool_module_fw_flash *work; /* First, check if already registered. */ spin_lock(&module_fw_flash_work_list_lock); list_for_each_entry(work, &module_fw_flash_work_list, list) { if (work->fw_update.ntf_params.portid == info->snd_portid && work->fw_update.dev == module_fw->fw_update.dev) { spin_unlock(&module_fw_flash_work_list_lock); return -EALREADY; } } list_add_tail(&module_fw->list, &module_fw_flash_work_list); spin_unlock(&module_fw_flash_work_list_lock); return 0; } static void module_flash_fw_work_list_del(struct list_head *list) { spin_lock(&module_fw_flash_work_list_lock); list_del(list); spin_unlock(&module_fw_flash_work_list_lock); } static void module_flash_fw_work(struct work_struct *work) { struct ethtool_module_fw_flash *module_fw; module_fw = container_of(work, struct ethtool_module_fw_flash, work); ethtool_cmis_fw_update(&module_fw->fw_update); module_flash_fw_work_list_del(&module_fw->list); module_fw->fw_update.dev->ethtool->module_fw_flash_in_progress = false; netdev_put(module_fw->fw_update.dev, &module_fw->dev_tracker); release_firmware(module_fw->fw_update.fw); kfree(module_fw); } #define MODULE_EEPROM_PHYS_ID_PAGE 0 #define MODULE_EEPROM_PHYS_ID_I2C_ADDR 0x50 static int module_flash_fw_work_init(struct ethtool_module_fw_flash *module_fw, struct net_device *dev, struct netlink_ext_ack *extack) { const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_module_eeprom page_data = {}; u8 phys_id; int err; /* Fetch the SFF-8024 Identifier Value. For all supported standards, it * is located at I2C address 0x50, byte 0. See section 4.1 in SFF-8024, * revision 4.9. */ page_data.page = MODULE_EEPROM_PHYS_ID_PAGE; page_data.offset = SFP_PHYS_ID; page_data.length = sizeof(phys_id); page_data.i2c_address = MODULE_EEPROM_PHYS_ID_I2C_ADDR; page_data.data = &phys_id; err = ops->get_module_eeprom_by_page(dev, &page_data, extack); if (err < 0) return err; switch (phys_id) { case SFF8024_ID_QSFP_DD: case SFF8024_ID_OSFP: case SFF8024_ID_DSFP: case SFF8024_ID_QSFP_PLUS_CMIS: case SFF8024_ID_SFP_DD_CMIS: case SFF8024_ID_SFP_PLUS_CMIS: INIT_WORK(&module_fw->work, module_flash_fw_work); break; default: NL_SET_ERR_MSG(extack, "Module type does not support firmware flashing"); return -EOPNOTSUPP; } return 0; } void ethnl_module_fw_flash_sock_destroy(struct ethnl_sock_priv *sk_priv) { struct ethtool_module_fw_flash *work; spin_lock(&module_fw_flash_work_list_lock); list_for_each_entry(work, &module_fw_flash_work_list, list) { if (work->fw_update.dev == sk_priv->dev && work->fw_update.ntf_params.portid == sk_priv->portid) { work->fw_update.ntf_params.closed_sock = true; break; } } spin_unlock(&module_fw_flash_work_list_lock); } static int module_flash_fw_schedule(struct net_device *dev, const char *file_name, struct ethtool_module_fw_flash_params *params, struct sk_buff *skb, struct genl_info *info) { struct ethtool_cmis_fw_update_params *fw_update; struct ethtool_module_fw_flash *module_fw; int err; module_fw = kzalloc(sizeof(*module_fw), GFP_KERNEL); if (!module_fw) return -ENOMEM; fw_update = &module_fw->fw_update; fw_update->params = *params; err = request_firmware_direct(&fw_update->fw, file_name, &dev->dev); if (err) { NL_SET_ERR_MSG(info->extack, "Failed to request module firmware image"); goto err_free; } err = module_flash_fw_work_init(module_fw, dev, info->extack); if (err < 0) goto err_release_firmware; dev->ethtool->module_fw_flash_in_progress = true; netdev_hold(dev, &module_fw->dev_tracker, GFP_KERNEL); fw_update->dev = dev; fw_update->ntf_params.portid = info->snd_portid; fw_update->ntf_params.seq = info->snd_seq; fw_update->ntf_params.closed_sock = false; err = ethnl_sock_priv_set(skb, dev, fw_update->ntf_params.portid, ETHTOOL_SOCK_TYPE_MODULE_FW_FLASH); if (err < 0) goto err_release_firmware; err = module_flash_fw_work_list_add(module_fw, info); if (err < 0) goto err_release_firmware; schedule_work(&module_fw->work); return 0; err_release_firmware: release_firmware(fw_update->fw); err_free: kfree(module_fw); return err; } static int module_flash_fw(struct net_device *dev, struct nlattr **tb, struct sk_buff *skb, struct genl_info *info) { struct ethtool_module_fw_flash_params params = {}; const char *file_name; struct nlattr *attr; if (GENL_REQ_ATTR_CHECK(info, ETHTOOL_A_MODULE_FW_FLASH_FILE_NAME)) return -EINVAL; file_name = nla_data(tb[ETHTOOL_A_MODULE_FW_FLASH_FILE_NAME]); attr = tb[ETHTOOL_A_MODULE_FW_FLASH_PASSWORD]; if (attr) { params.password = cpu_to_be32(nla_get_u32(attr)); params.password_valid = true; } return module_flash_fw_schedule(dev, file_name, ¶ms, skb, info); } static int ethnl_module_fw_flash_validate(struct net_device *dev, struct netlink_ext_ack *extack) { struct devlink_port *devlink_port = dev->devlink_port; const struct ethtool_ops *ops = dev->ethtool_ops; if (!ops->set_module_eeprom_by_page || !ops->get_module_eeprom_by_page) { NL_SET_ERR_MSG(extack, "Flashing module firmware is not supported by this device"); return -EOPNOTSUPP; } if (!ops->reset) { NL_SET_ERR_MSG(extack, "Reset module is not supported by this device, so flashing is not permitted"); return -EOPNOTSUPP; } if (dev->ethtool->module_fw_flash_in_progress) { NL_SET_ERR_MSG(extack, "Module firmware flashing already in progress"); return -EBUSY; } if (dev->flags & IFF_UP) { NL_SET_ERR_MSG(extack, "Netdevice is up, so flashing is not permitted"); return -EBUSY; } if (devlink_port && devlink_port->attrs.split) { NL_SET_ERR_MSG(extack, "Can't perform firmware flashing on a split port"); return -EOPNOTSUPP; } return 0; } int ethnl_act_module_fw_flash(struct sk_buff *skb, struct genl_info *info) { struct ethnl_req_info req_info = {}; struct nlattr **tb = info->attrs; struct net_device *dev; int ret; ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_MODULE_FW_FLASH_HEADER], genl_info_net(info), info->extack, true); if (ret < 0) return ret; dev = req_info.dev; rtnl_lock(); netdev_lock_ops(dev); ret = ethnl_ops_begin(dev); if (ret < 0) goto out_unlock; ret = ethnl_module_fw_flash_validate(dev, info->extack); if (ret < 0) goto out_unlock; ret = module_flash_fw(dev, tb, skb, info); ethnl_ops_complete(dev); out_unlock: netdev_unlock_ops(dev); rtnl_unlock(); ethnl_parse_header_dev_put(&req_info); return ret; } /* MODULE_FW_FLASH_NTF */ static int ethnl_module_fw_flash_ntf_put_err(struct sk_buff *skb, char *err_msg, char *sub_err_msg) { int err_msg_len, sub_err_msg_len, total_len; struct nlattr *attr; if (!err_msg) return 0; err_msg_len = strlen(err_msg); total_len = err_msg_len + 2; /* For period and NUL. */ if (sub_err_msg) { sub_err_msg_len = strlen(sub_err_msg); total_len += sub_err_msg_len + 2; /* For ", ". */ } attr = nla_reserve(skb, ETHTOOL_A_MODULE_FW_FLASH_STATUS_MSG, total_len); if (!attr) return -ENOMEM; if (sub_err_msg) sprintf(nla_data(attr), "%s, %s.", err_msg, sub_err_msg); else sprintf(nla_data(attr), "%s.", err_msg); return 0; } static void ethnl_module_fw_flash_ntf(struct net_device *dev, enum ethtool_module_fw_flash_status status, struct ethnl_module_fw_flash_ntf_params *ntf_params, char *err_msg, char *sub_err_msg, u64 done, u64 total) { struct sk_buff *skb; void *hdr; int ret; if (ntf_params->closed_sock) return; skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return; hdr = ethnl_unicast_put(skb, ntf_params->portid, ++ntf_params->seq, ETHTOOL_MSG_MODULE_FW_FLASH_NTF); if (!hdr) goto err_skb; ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_MODULE_FW_FLASH_HEADER); if (ret < 0) goto err_skb; if (nla_put_u32(skb, ETHTOOL_A_MODULE_FW_FLASH_STATUS, status)) goto err_skb; ret = ethnl_module_fw_flash_ntf_put_err(skb, err_msg, sub_err_msg); if (ret < 0) goto err_skb; if (nla_put_uint(skb, ETHTOOL_A_MODULE_FW_FLASH_DONE, done)) goto err_skb; if (nla_put_uint(skb, ETHTOOL_A_MODULE_FW_FLASH_TOTAL, total)) goto err_skb; genlmsg_end(skb, hdr); genlmsg_unicast(dev_net(dev), skb, ntf_params->portid); return; err_skb: nlmsg_free(skb); } void ethnl_module_fw_flash_ntf_err(struct net_device *dev, struct ethnl_module_fw_flash_ntf_params *params, char *err_msg, char *sub_err_msg) { ethnl_module_fw_flash_ntf(dev, ETHTOOL_MODULE_FW_FLASH_STATUS_ERROR, params, err_msg, sub_err_msg, 0, 0); } void ethnl_module_fw_flash_ntf_start(struct net_device *dev, struct ethnl_module_fw_flash_ntf_params *params) { ethnl_module_fw_flash_ntf(dev, ETHTOOL_MODULE_FW_FLASH_STATUS_STARTED, params, NULL, NULL, 0, 0); } void ethnl_module_fw_flash_ntf_complete(struct net_device *dev, struct ethnl_module_fw_flash_ntf_params *params) { ethnl_module_fw_flash_ntf(dev, ETHTOOL_MODULE_FW_FLASH_STATUS_COMPLETED, params, NULL, NULL, 0, 0); } void ethnl_module_fw_flash_ntf_in_progress(struct net_device *dev, struct ethnl_module_fw_flash_ntf_params *params, u64 done, u64 total) { ethnl_module_fw_flash_ntf(dev, ETHTOOL_MODULE_FW_FLASH_STATUS_IN_PROGRESS, params, NULL, NULL, done, total); } |
| 5 5 5 4 5 5 1 4 4 4 4 4 2 2 5 5 1 1 1 1 3 3 3 3 3 3 3 3 1 2 12 12 5 2 2 1 3 3 2 2 2 2 2 2 2 1 1 14 3 3 3 2 2 2 2 1 1 1 1 1 1 1 2 2 2 2 1 1 1 1 1 2 2 2 2 1 1 1 2 173 1 2 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 5 5 5 3 1 2 3 2 1 2 14 174 186 1 5 1 1 1 1 2 1 186 174 43 130 132 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 | // SPDX-License-Identifier: GPL-2.0 #include <linux/capability.h> #include <linux/compat.h> #include <linux/blkdev.h> #include <linux/export.h> #include <linux/gfp.h> #include <linux/blkpg.h> #include <linux/hdreg.h> #include <linux/backing-dev.h> #include <linux/fs.h> #include <linux/blktrace_api.h> #include <linux/pr.h> #include <linux/uaccess.h> #include <linux/pagemap.h> #include <linux/io_uring/cmd.h> #include <linux/blk-integrity.h> #include <uapi/linux/blkdev.h> #include "blk.h" #include "blk-crypto-internal.h" static int blkpg_do_ioctl(struct block_device *bdev, struct blkpg_partition __user *upart, int op) { struct gendisk *disk = bdev->bd_disk; struct blkpg_partition p; sector_t start, length, capacity, end; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (copy_from_user(&p, upart, sizeof(struct blkpg_partition))) return -EFAULT; if (bdev_is_partition(bdev)) return -EINVAL; if (p.pno <= 0) return -EINVAL; if (op == BLKPG_DEL_PARTITION) return bdev_del_partition(disk, p.pno); if (p.start < 0 || p.length <= 0 || LLONG_MAX - p.length < p.start) return -EINVAL; /* Check that the partition is aligned to the block size */ if (!IS_ALIGNED(p.start | p.length, bdev_logical_block_size(bdev))) return -EINVAL; start = p.start >> SECTOR_SHIFT; length = p.length >> SECTOR_SHIFT; capacity = get_capacity(disk); if (check_add_overflow(start, length, &end)) return -EINVAL; if (start >= capacity || end > capacity) return -EINVAL; switch (op) { case BLKPG_ADD_PARTITION: return bdev_add_partition(disk, p.pno, start, length); case BLKPG_RESIZE_PARTITION: return bdev_resize_partition(disk, p.pno, start, length); default: return -EINVAL; } } static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg) { struct blkpg_partition __user *udata; int op; if (get_user(op, &arg->op) || get_user(udata, &arg->data)) return -EFAULT; return blkpg_do_ioctl(bdev, udata, op); } #ifdef CONFIG_COMPAT struct compat_blkpg_ioctl_arg { compat_int_t op; compat_int_t flags; compat_int_t datalen; compat_caddr_t data; }; static int compat_blkpg_ioctl(struct block_device *bdev, struct compat_blkpg_ioctl_arg __user *arg) { compat_caddr_t udata; int op; if (get_user(op, &arg->op) || get_user(udata, &arg->data)) return -EFAULT; return blkpg_do_ioctl(bdev, compat_ptr(udata), op); } #endif /* * Check that [start, start + len) is a valid range from the block device's * perspective, including verifying that it can be correctly translated into * logical block addresses. */ static int blk_validate_byte_range(struct block_device *bdev, uint64_t start, uint64_t len) { unsigned int bs_mask = bdev_logical_block_size(bdev) - 1; uint64_t end; if ((start | len) & bs_mask) return -EINVAL; if (!len) return -EINVAL; if (check_add_overflow(start, len, &end) || end > bdev_nr_bytes(bdev)) return -EINVAL; return 0; } static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, unsigned long arg) { uint64_t range[2], start, len; struct bio *prev = NULL, *bio; sector_t sector, nr_sects; struct blk_plug plug; int err; if (copy_from_user(range, (void __user *)arg, sizeof(range))) return -EFAULT; start = range[0]; len = range[1]; if (!bdev_max_discard_sectors(bdev)) return -EOPNOTSUPP; if (!(mode & BLK_OPEN_WRITE)) return -EBADF; if (bdev_read_only(bdev)) return -EPERM; err = blk_validate_byte_range(bdev, start, len); if (err) return err; inode_lock(bdev->bd_mapping->host); filemap_invalidate_lock(bdev->bd_mapping); err = truncate_bdev_range(bdev, mode, start, start + len - 1); if (err) goto fail; sector = start >> SECTOR_SHIFT; nr_sects = len >> SECTOR_SHIFT; blk_start_plug(&plug); while (1) { if (fatal_signal_pending(current)) { if (prev) bio_await_chain(prev); err = -EINTR; goto out_unplug; } bio = blk_alloc_discard_bio(bdev, §or, &nr_sects, GFP_KERNEL); if (!bio) break; prev = bio_chain_and_submit(prev, bio); } if (prev) { err = submit_bio_wait(prev); if (err == -EOPNOTSUPP) err = 0; bio_put(prev); } out_unplug: blk_finish_plug(&plug); fail: filemap_invalidate_unlock(bdev->bd_mapping); inode_unlock(bdev->bd_mapping->host); return err; } static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode, void __user *argp) { uint64_t start, len, end; uint64_t range[2]; int err; if (!(mode & BLK_OPEN_WRITE)) return -EBADF; if (!bdev_max_secure_erase_sectors(bdev)) return -EOPNOTSUPP; if (copy_from_user(range, argp, sizeof(range))) return -EFAULT; start = range[0]; len = range[1]; if ((start & 511) || (len & 511)) return -EINVAL; if (check_add_overflow(start, len, &end) || end > bdev_nr_bytes(bdev)) return -EINVAL; inode_lock(bdev->bd_mapping->host); filemap_invalidate_lock(bdev->bd_mapping); err = truncate_bdev_range(bdev, mode, start, end - 1); if (!err) err = blkdev_issue_secure_erase(bdev, start >> 9, len >> 9, GFP_KERNEL); filemap_invalidate_unlock(bdev->bd_mapping); inode_unlock(bdev->bd_mapping->host); return err; } static int blk_ioctl_zeroout(struct block_device *bdev, blk_mode_t mode, unsigned long arg) { uint64_t range[2]; uint64_t start, end, len; int err; if (!(mode & BLK_OPEN_WRITE)) return -EBADF; if (copy_from_user(range, (void __user *)arg, sizeof(range))) return -EFAULT; start = range[0]; len = range[1]; end = start + len - 1; if (start & 511) return -EINVAL; if (len & 511) return -EINVAL; if (end >= (uint64_t)bdev_nr_bytes(bdev)) return -EINVAL; if (end < start) return -EINVAL; /* Invalidate the page cache, including dirty pages */ inode_lock(bdev->bd_mapping->host); filemap_invalidate_lock(bdev->bd_mapping); err = truncate_bdev_range(bdev, mode, start, end); if (err) goto fail; err = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL, BLKDEV_ZERO_NOUNMAP | BLKDEV_ZERO_KILLABLE); fail: filemap_invalidate_unlock(bdev->bd_mapping); inode_unlock(bdev->bd_mapping->host); return err; } static int put_ushort(unsigned short __user *argp, unsigned short val) { return put_user(val, argp); } static int put_int(int __user *argp, int val) { return put_user(val, argp); } static int put_uint(unsigned int __user *argp, unsigned int val) { return put_user(val, argp); } static int put_long(long __user *argp, long val) { return put_user(val, argp); } static int put_ulong(unsigned long __user *argp, unsigned long val) { return put_user(val, argp); } static int put_u64(u64 __user *argp, u64 val) { return put_user(val, argp); } #ifdef CONFIG_COMPAT static int compat_put_long(compat_long_t __user *argp, long val) { return put_user(val, argp); } static int compat_put_ulong(compat_ulong_t __user *argp, compat_ulong_t val) { return put_user(val, argp); } #endif #ifdef CONFIG_COMPAT /* * This is the equivalent of compat_ptr_ioctl(), to be used by block * drivers that implement only commands that are completely compatible * between 32-bit and 64-bit user space */ int blkdev_compat_ptr_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned cmd, unsigned long arg) { struct gendisk *disk = bdev->bd_disk; if (disk->fops->ioctl) return disk->fops->ioctl(bdev, mode, cmd, (unsigned long)compat_ptr(arg)); return -ENOIOCTLCMD; } EXPORT_SYMBOL(blkdev_compat_ptr_ioctl); #endif static bool blkdev_pr_allowed(struct block_device *bdev, blk_mode_t mode) { /* no sense to make reservations for partitions */ if (bdev_is_partition(bdev)) return false; if (capable(CAP_SYS_ADMIN)) return true; /* * Only allow unprivileged reservations if the file descriptor is open * for writing. */ return mode & BLK_OPEN_WRITE; } static int blkdev_pr_register(struct block_device *bdev, blk_mode_t mode, struct pr_registration __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_registration reg; if (!blkdev_pr_allowed(bdev, mode)) return -EPERM; if (!ops || !ops->pr_register) return -EOPNOTSUPP; if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; if (reg.flags & ~PR_FL_IGNORE_KEY) return -EOPNOTSUPP; return ops->pr_register(bdev, reg.old_key, reg.new_key, reg.flags); } static int blkdev_pr_reserve(struct block_device *bdev, blk_mode_t mode, struct pr_reservation __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_reservation rsv; if (!blkdev_pr_allowed(bdev, mode)) return -EPERM; if (!ops || !ops->pr_reserve) return -EOPNOTSUPP; if (copy_from_user(&rsv, arg, sizeof(rsv))) return -EFAULT; if (rsv.flags & ~PR_FL_IGNORE_KEY) return -EOPNOTSUPP; return ops->pr_reserve(bdev, rsv.key, rsv.type, rsv.flags); } static int blkdev_pr_release(struct block_device *bdev, blk_mode_t mode, struct pr_reservation __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_reservation rsv; if (!blkdev_pr_allowed(bdev, mode)) return -EPERM; if (!ops || !ops->pr_release) return -EOPNOTSUPP; if (copy_from_user(&rsv, arg, sizeof(rsv))) return -EFAULT; if (rsv.flags) return -EOPNOTSUPP; return ops->pr_release(bdev, rsv.key, rsv.type); } static int blkdev_pr_preempt(struct block_device *bdev, blk_mode_t mode, struct pr_preempt __user *arg, bool abort) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_preempt p; if (!blkdev_pr_allowed(bdev, mode)) return -EPERM; if (!ops || !ops->pr_preempt) return -EOPNOTSUPP; if (copy_from_user(&p, arg, sizeof(p))) return -EFAULT; if (p.flags) return -EOPNOTSUPP; return ops->pr_preempt(bdev, p.old_key, p.new_key, p.type, abort); } static int blkdev_pr_clear(struct block_device *bdev, blk_mode_t mode, struct pr_clear __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_clear c; if (!blkdev_pr_allowed(bdev, mode)) return -EPERM; if (!ops || !ops->pr_clear) return -EOPNOTSUPP; if (copy_from_user(&c, arg, sizeof(c))) return -EFAULT; if (c.flags) return -EOPNOTSUPP; return ops->pr_clear(bdev, c.key); } static int blkdev_flushbuf(struct block_device *bdev, unsigned cmd, unsigned long arg) { if (!capable(CAP_SYS_ADMIN)) return -EACCES; mutex_lock(&bdev->bd_holder_lock); if (bdev->bd_holder_ops && bdev->bd_holder_ops->sync) bdev->bd_holder_ops->sync(bdev); else { mutex_unlock(&bdev->bd_holder_lock); sync_blockdev(bdev); } invalidate_bdev(bdev); return 0; } static int blkdev_roset(struct block_device *bdev, unsigned cmd, unsigned long arg) { int ret, n; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (get_user(n, (int __user *)arg)) return -EFAULT; if (bdev->bd_disk->fops->set_read_only) { ret = bdev->bd_disk->fops->set_read_only(bdev, n); if (ret) return ret; } if (n) bdev_set_flag(bdev, BD_READ_ONLY); else bdev_clear_flag(bdev, BD_READ_ONLY); return 0; } static int blkdev_getgeo(struct block_device *bdev, struct hd_geometry __user *argp) { struct gendisk *disk = bdev->bd_disk; struct hd_geometry geo; int ret; if (!argp) return -EINVAL; if (!disk->fops->getgeo) return -ENOTTY; /* * We need to set the startsect first, the driver may * want to override it. */ memset(&geo, 0, sizeof(geo)); geo.start = get_start_sect(bdev); ret = disk->fops->getgeo(bdev, &geo); if (ret) return ret; if (copy_to_user(argp, &geo, sizeof(geo))) return -EFAULT; return 0; } #ifdef CONFIG_COMPAT struct compat_hd_geometry { unsigned char heads; unsigned char sectors; unsigned short cylinders; u32 start; }; static int compat_hdio_getgeo(struct block_device *bdev, struct compat_hd_geometry __user *ugeo) { struct gendisk *disk = bdev->bd_disk; struct hd_geometry geo; int ret; if (!ugeo) return -EINVAL; if (!disk->fops->getgeo) return -ENOTTY; memset(&geo, 0, sizeof(geo)); /* * We need to set the startsect first, the driver may * want to override it. */ geo.start = get_start_sect(bdev); ret = disk->fops->getgeo(bdev, &geo); if (ret) return ret; ret = copy_to_user(ugeo, &geo, 4); ret |= put_user(geo.start, &ugeo->start); if (ret) ret = -EFAULT; return ret; } #endif /* set the logical block size */ static int blkdev_bszset(struct file *file, blk_mode_t mode, int __user *argp) { // this one might be file_inode(file)->i_rdev - a rare valid // use of file_inode() for those. dev_t dev = I_BDEV(file->f_mapping->host)->bd_dev; struct file *excl_file; int ret, n; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (!argp) return -EINVAL; if (get_user(n, argp)) return -EFAULT; if (mode & BLK_OPEN_EXCL) return set_blocksize(file, n); excl_file = bdev_file_open_by_dev(dev, mode, &dev, NULL); if (IS_ERR(excl_file)) return -EBUSY; ret = set_blocksize(excl_file, n); fput(excl_file); return ret; } /* * Common commands that are handled the same way on native and compat * user space. Note the separate arg/argp parameters that are needed * to deal with the compat_ptr() conversion. */ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg, void __user *argp) { unsigned int max_sectors; switch (cmd) { case BLKFLSBUF: return blkdev_flushbuf(bdev, cmd, arg); case BLKROSET: return blkdev_roset(bdev, cmd, arg); case BLKDISCARD: return blk_ioctl_discard(bdev, mode, arg); case BLKSECDISCARD: return blk_ioctl_secure_erase(bdev, mode, argp); case BLKZEROOUT: return blk_ioctl_zeroout(bdev, mode, arg); case BLKGETDISKSEQ: return put_u64(argp, bdev->bd_disk->diskseq); case BLKREPORTZONE: return blkdev_report_zones_ioctl(bdev, cmd, arg); case BLKRESETZONE: case BLKOPENZONE: case BLKCLOSEZONE: case BLKFINISHZONE: return blkdev_zone_mgmt_ioctl(bdev, mode, cmd, arg); case BLKGETZONESZ: return put_uint(argp, bdev_zone_sectors(bdev)); case BLKGETNRZONES: return put_uint(argp, bdev_nr_zones(bdev)); case BLKROGET: return put_int(argp, bdev_read_only(bdev) != 0); case BLKSSZGET: /* get block device logical block size */ return put_int(argp, bdev_logical_block_size(bdev)); case BLKPBSZGET: /* get block device physical block size */ return put_uint(argp, bdev_physical_block_size(bdev)); case BLKIOMIN: return put_uint(argp, bdev_io_min(bdev)); case BLKIOOPT: return put_uint(argp, bdev_io_opt(bdev)); case BLKALIGNOFF: return put_int(argp, bdev_alignment_offset(bdev)); case BLKDISCARDZEROES: return put_uint(argp, 0); case BLKSECTGET: max_sectors = min_t(unsigned int, USHRT_MAX, queue_max_sectors(bdev_get_queue(bdev))); return put_ushort(argp, max_sectors); case BLKROTATIONAL: return put_ushort(argp, !bdev_nonrot(bdev)); case BLKRASET: case BLKFRASET: if(!capable(CAP_SYS_ADMIN)) return -EACCES; bdev->bd_disk->bdi->ra_pages = (arg * 512) / PAGE_SIZE; return 0; case BLKRRPART: if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (bdev_is_partition(bdev)) return -EINVAL; return disk_scan_partitions(bdev->bd_disk, mode | BLK_OPEN_STRICT_SCAN); case BLKTRACESTART: case BLKTRACESTOP: case BLKTRACETEARDOWN: return blk_trace_ioctl(bdev, cmd, argp); case BLKCRYPTOIMPORTKEY: case BLKCRYPTOGENERATEKEY: case BLKCRYPTOPREPAREKEY: return blk_crypto_ioctl(bdev, cmd, argp); case IOC_PR_REGISTER: return blkdev_pr_register(bdev, mode, argp); case IOC_PR_RESERVE: return blkdev_pr_reserve(bdev, mode, argp); case IOC_PR_RELEASE: return blkdev_pr_release(bdev, mode, argp); case IOC_PR_PREEMPT: return blkdev_pr_preempt(bdev, mode, argp, false); case IOC_PR_PREEMPT_ABORT: return blkdev_pr_preempt(bdev, mode, argp, true); case IOC_PR_CLEAR: return blkdev_pr_clear(bdev, mode, argp); default: return blk_get_meta_cap(bdev, cmd, argp); } } /* * Always keep this in sync with compat_blkdev_ioctl() * to handle all incompatible commands in both functions. * * New commands must be compatible and go into blkdev_common_ioctl */ long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) { struct block_device *bdev = I_BDEV(file->f_mapping->host); void __user *argp = (void __user *)arg; blk_mode_t mode = file_to_blk_mode(file); int ret; switch (cmd) { /* These need separate implementations for the data structure */ case HDIO_GETGEO: return blkdev_getgeo(bdev, argp); case BLKPG: return blkpg_ioctl(bdev, argp); /* Compat mode returns 32-bit data instead of 'long' */ case BLKRAGET: case BLKFRAGET: if (!argp) return -EINVAL; return put_long(argp, (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512); case BLKGETSIZE: if (bdev_nr_sectors(bdev) > ~0UL) return -EFBIG; return put_ulong(argp, bdev_nr_sectors(bdev)); /* The data is compatible, but the command number is different */ case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */ return put_int(argp, block_size(bdev)); case BLKBSZSET: return blkdev_bszset(file, mode, argp); case BLKGETSIZE64: return put_u64(argp, bdev_nr_bytes(bdev)); /* Incompatible alignment on i386 */ case BLKTRACESETUP: return blk_trace_ioctl(bdev, cmd, argp); default: break; } ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp); if (ret != -ENOIOCTLCMD) return ret; if (!bdev->bd_disk->fops->ioctl) return -ENOTTY; return bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); } #ifdef CONFIG_COMPAT #define BLKBSZGET_32 _IOR(0x12, 112, int) #define BLKBSZSET_32 _IOW(0x12, 113, int) #define BLKGETSIZE64_32 _IOR(0x12, 114, int) /* Most of the generic ioctls are handled in the normal fallback path. This assumes the blkdev's low level compat_ioctl always returns ENOIOCTLCMD for unknown ioctls. */ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) { int ret; void __user *argp = compat_ptr(arg); struct block_device *bdev = I_BDEV(file->f_mapping->host); struct gendisk *disk = bdev->bd_disk; blk_mode_t mode = file_to_blk_mode(file); switch (cmd) { /* These need separate implementations for the data structure */ case HDIO_GETGEO: return compat_hdio_getgeo(bdev, argp); case BLKPG: return compat_blkpg_ioctl(bdev, argp); /* Compat mode returns 32-bit data instead of 'long' */ case BLKRAGET: case BLKFRAGET: if (!argp) return -EINVAL; return compat_put_long(argp, (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512); case BLKGETSIZE: if (bdev_nr_sectors(bdev) > ~(compat_ulong_t)0) return -EFBIG; return compat_put_ulong(argp, bdev_nr_sectors(bdev)); /* The data is compatible, but the command number is different */ case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */ return put_int(argp, bdev_logical_block_size(bdev)); case BLKBSZSET_32: return blkdev_bszset(file, mode, argp); case BLKGETSIZE64_32: return put_u64(argp, bdev_nr_bytes(bdev)); /* Incompatible alignment on i386 */ case BLKTRACESETUP32: return blk_trace_ioctl(bdev, cmd, argp); default: break; } ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp); if (ret == -ENOIOCTLCMD && disk->fops->compat_ioctl) ret = disk->fops->compat_ioctl(bdev, mode, cmd, arg); return ret; } #endif struct blk_iou_cmd { int res; bool nowait; }; static void blk_cmd_complete(struct io_uring_cmd *cmd, unsigned int issue_flags) { struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd); if (bic->res == -EAGAIN && bic->nowait) io_uring_cmd_issue_blocking(cmd); else io_uring_cmd_done(cmd, bic->res, 0, issue_flags); } static void bio_cmd_bio_end_io(struct bio *bio) { struct io_uring_cmd *cmd = bio->bi_private; struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd); if (unlikely(bio->bi_status) && !bic->res) bic->res = blk_status_to_errno(bio->bi_status); io_uring_cmd_do_in_task_lazy(cmd, blk_cmd_complete); bio_put(bio); } static int blkdev_cmd_discard(struct io_uring_cmd *cmd, struct block_device *bdev, uint64_t start, uint64_t len, bool nowait) { struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd); gfp_t gfp = nowait ? GFP_NOWAIT : GFP_KERNEL; sector_t sector = start >> SECTOR_SHIFT; sector_t nr_sects = len >> SECTOR_SHIFT; struct bio *prev = NULL, *bio; int err; if (!bdev_max_discard_sectors(bdev)) return -EOPNOTSUPP; if (!(file_to_blk_mode(cmd->file) & BLK_OPEN_WRITE)) return -EBADF; if (bdev_read_only(bdev)) return -EPERM; err = blk_validate_byte_range(bdev, start, len); if (err) return err; err = filemap_invalidate_pages(bdev->bd_mapping, start, start + len - 1, nowait); if (err) return err; while (true) { bio = blk_alloc_discard_bio(bdev, §or, &nr_sects, gfp); if (!bio) break; if (nowait) { /* * Don't allow multi-bio non-blocking submissions as * subsequent bios may fail but we won't get a direct * indication of that. Normally, the caller should * retry from a blocking context. */ if (unlikely(nr_sects)) { bio_put(bio); return -EAGAIN; } bio->bi_opf |= REQ_NOWAIT; } prev = bio_chain_and_submit(prev, bio); } if (unlikely(!prev)) return -EAGAIN; if (unlikely(nr_sects)) bic->res = -EAGAIN; prev->bi_private = cmd; prev->bi_end_io = bio_cmd_bio_end_io; submit_bio(prev); return -EIOCBQUEUED; } int blkdev_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { struct block_device *bdev = I_BDEV(cmd->file->f_mapping->host); struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd); const struct io_uring_sqe *sqe = cmd->sqe; u32 cmd_op = cmd->cmd_op; uint64_t start, len; if (unlikely(sqe->ioprio || sqe->__pad1 || sqe->len || sqe->rw_flags || sqe->file_index)) return -EINVAL; bic->res = 0; bic->nowait = issue_flags & IO_URING_F_NONBLOCK; start = READ_ONCE(sqe->addr); len = READ_ONCE(sqe->addr3); switch (cmd_op) { case BLOCK_URING_CMD_DISCARD: return blkdev_cmd_discard(cmd, bdev, start, len, bic->nowait); } return -EINVAL; } |
| 24 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* SCTP kernel reference Implementation * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001-2003 International Business Machines, Corp. * * This file is part of the SCTP kernel reference Implementation * * SCTP Checksum functions * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Dinakaran Joseph * Jon Grimm <jgrimm@us.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> * Vlad Yasevich <vladislav.yasevich@hp.com> */ #ifndef __sctp_checksum_h__ #define __sctp_checksum_h__ #include <linux/types.h> #include <linux/sctp.h> static inline __le32 sctp_compute_cksum(const struct sk_buff *skb, unsigned int offset) { struct sctphdr *sh = (struct sctphdr *)(skb->data + offset); __le32 old = sh->checksum; u32 new; sh->checksum = 0; new = ~skb_crc32c(skb, offset, skb->len - offset, ~0); sh->checksum = old; return cpu_to_le32(new); } #endif /* __sctp_checksum_h__ */ |
| 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Public Key Signature Algorithm * * Copyright (c) 2023 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_SIG_H #define _CRYPTO_SIG_H #include <linux/crypto.h> /** * struct crypto_sig - user-instantiated objects which encapsulate * algorithms and core processing logic * * @base: Common crypto API algorithm data structure */ struct crypto_sig { struct crypto_tfm base; }; /** * struct sig_alg - generic public key signature algorithm * * @sign: Function performs a sign operation as defined by public key * algorithm. On success, the signature size is returned. * Optional. * @verify: Function performs a complete verify operation as defined by * public key algorithm, returning verification status. Optional. * @set_pub_key: Function invokes the algorithm specific set public key * function, which knows how to decode and interpret * the BER encoded public key and parameters. Mandatory. * @set_priv_key: Function invokes the algorithm specific set private key * function, which knows how to decode and interpret * the BER encoded private key and parameters. Optional. * @key_size: Function returns key size. Mandatory. * @digest_size: Function returns maximum digest size. Optional. * @max_size: Function returns maximum signature size. Optional. * @init: Initialize the cryptographic transformation object. * This function is used to initialize the cryptographic * transformation object. This function is called only once at * the instantiation time, right after the transformation context * was allocated. In case the cryptographic hardware has some * special requirements which need to be handled by software, this * function shall check for the precise requirement of the * transformation and put any software fallbacks in place. * @exit: Deinitialize the cryptographic transformation object. This is a * counterpart to @init, used to remove various changes set in * @init. * * @base: Common crypto API algorithm data structure */ struct sig_alg { int (*sign)(struct crypto_sig *tfm, const void *src, unsigned int slen, void *dst, unsigned int dlen); int (*verify)(struct crypto_sig *tfm, const void *src, unsigned int slen, const void *digest, unsigned int dlen); int (*set_pub_key)(struct crypto_sig *tfm, const void *key, unsigned int keylen); int (*set_priv_key)(struct crypto_sig *tfm, const void *key, unsigned int keylen); unsigned int (*key_size)(struct crypto_sig *tfm); unsigned int (*digest_size)(struct crypto_sig *tfm); unsigned int (*max_size)(struct crypto_sig *tfm); int (*init)(struct crypto_sig *tfm); void (*exit)(struct crypto_sig *tfm); struct crypto_alg base; }; /** * DOC: Generic Public Key Signature API * * The Public Key Signature API is used with the algorithms of type * CRYPTO_ALG_TYPE_SIG (listed as type "sig" in /proc/crypto) */ /** * crypto_alloc_sig() - allocate signature tfm handle * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * signing algorithm e.g. "ecdsa" * @type: specifies the type of the algorithm * @mask: specifies the mask for the algorithm * * Allocate a handle for public key signature algorithm. The returned struct * crypto_sig is the handle that is required for any subsequent * API invocation for signature operations. * * Return: allocated handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. */ struct crypto_sig *crypto_alloc_sig(const char *alg_name, u32 type, u32 mask); static inline struct crypto_tfm *crypto_sig_tfm(struct crypto_sig *tfm) { return &tfm->base; } static inline struct crypto_sig *__crypto_sig_tfm(struct crypto_tfm *tfm) { return container_of(tfm, struct crypto_sig, base); } static inline struct sig_alg *__crypto_sig_alg(struct crypto_alg *alg) { return container_of(alg, struct sig_alg, base); } static inline struct sig_alg *crypto_sig_alg(struct crypto_sig *tfm) { return __crypto_sig_alg(crypto_sig_tfm(tfm)->__crt_alg); } /** * crypto_free_sig() - free signature tfm handle * * @tfm: signature tfm handle allocated with crypto_alloc_sig() * * If @tfm is a NULL or error pointer, this function does nothing. */ static inline void crypto_free_sig(struct crypto_sig *tfm) { crypto_destroy_tfm(tfm, crypto_sig_tfm(tfm)); } /** * crypto_sig_keysize() - Get key size * * Function returns the key size in bits. * Function assumes that the key is already set in the transformation. If this * function is called without a setkey or with a failed setkey, you may end up * in a NULL dereference. * * @tfm: signature tfm handle allocated with crypto_alloc_sig() */ static inline unsigned int crypto_sig_keysize(struct crypto_sig *tfm) { struct sig_alg *alg = crypto_sig_alg(tfm); return alg->key_size(tfm); } /** * crypto_sig_digestsize() - Get maximum digest size * * Function returns the maximum digest size in bytes. * Function assumes that the key is already set in the transformation. If this * function is called without a setkey or with a failed setkey, you may end up * in a NULL dereference. * * @tfm: signature tfm handle allocated with crypto_alloc_sig() */ static inline unsigned int crypto_sig_digestsize(struct crypto_sig *tfm) { struct sig_alg *alg = crypto_sig_alg(tfm); return alg->digest_size(tfm); } /** * crypto_sig_maxsize() - Get maximum signature size * * Function returns the maximum signature size in bytes. * Function assumes that the key is already set in the transformation. If this * function is called without a setkey or with a failed setkey, you may end up * in a NULL dereference. * * @tfm: signature tfm handle allocated with crypto_alloc_sig() */ static inline unsigned int crypto_sig_maxsize(struct crypto_sig *tfm) { struct sig_alg *alg = crypto_sig_alg(tfm); return alg->max_size(tfm); } /** * crypto_sig_sign() - Invoke signing operation * * Function invokes the specific signing operation for a given algorithm * * @tfm: signature tfm handle allocated with crypto_alloc_sig() * @src: source buffer * @slen: source length * @dst: destination obuffer * @dlen: destination length * * Return: signature size on success; error code in case of error */ static inline int crypto_sig_sign(struct crypto_sig *tfm, const void *src, unsigned int slen, void *dst, unsigned int dlen) { struct sig_alg *alg = crypto_sig_alg(tfm); return alg->sign(tfm, src, slen, dst, dlen); } /** * crypto_sig_verify() - Invoke signature verification * * Function invokes the specific signature verification operation * for a given algorithm. * * @tfm: signature tfm handle allocated with crypto_alloc_sig() * @src: source buffer * @slen: source length * @digest: digest * @dlen: digest length * * Return: zero on verification success; error code in case of error. */ static inline int crypto_sig_verify(struct crypto_sig *tfm, const void *src, unsigned int slen, const void *digest, unsigned int dlen) { struct sig_alg *alg = crypto_sig_alg(tfm); return alg->verify(tfm, src, slen, digest, dlen); } /** * crypto_sig_set_pubkey() - Invoke set public key operation * * Function invokes the algorithm specific set key function, which knows * how to decode and interpret the encoded key and parameters * * @tfm: tfm handle * @key: BER encoded public key, algo OID, paramlen, BER encoded * parameters * @keylen: length of the key (not including other data) * * Return: zero on success; error code in case of error */ static inline int crypto_sig_set_pubkey(struct crypto_sig *tfm, const void *key, unsigned int keylen) { struct sig_alg *alg = crypto_sig_alg(tfm); return alg->set_pub_key(tfm, key, keylen); } /** * crypto_sig_set_privkey() - Invoke set private key operation * * Function invokes the algorithm specific set key function, which knows * how to decode and interpret the encoded key and parameters * * @tfm: tfm handle * @key: BER encoded private key, algo OID, paramlen, BER encoded * parameters * @keylen: length of the key (not including other data) * * Return: zero on success; error code in case of error */ static inline int crypto_sig_set_privkey(struct crypto_sig *tfm, const void *key, unsigned int keylen) { struct sig_alg *alg = crypto_sig_alg(tfm); return alg->set_priv_key(tfm, key, keylen); } #endif |
| 59 61 34 34 25 25 29 5 28 6 6 6 6 6 2 9 8 9 4 9 63 63 63 64 62 64 60 37 37 60 64 63 18 14 18 4 4 4 1 4 4 4 3 2 4 76 78 78 6 5 4 4 4 4 76 78 5 65 3 5 5 4 1 5 5 1 73 73 73 72 72 1 1 72 72 71 71 12 70 72 72 60 61 60 59 1 7 70 61 70 71 61 28 26 28 27 11 10 2 1 1 1 26 25 26 10 26 17 18 26 26 26 9 7 25 1 26 24 25 26 26 26 26 27 11 27 3 3 2 20 2 107 39 38 116 110 113 112 1 114 5 115 3 3 92 92 91 90 91 94 94 45 88 31 29 88 22 75 73 4 88 87 45 39 65 64 20 65 88 60 89 86 84 39 8 95 95 93 88 95 1 31 1 32 3 3 31 3 22 22 22 10 22 49 48 49 29 10 10 7 1 10 20 20 1 20 17 20 20 1 20 20 19 19 1 21 20 19 2 20 17 19 1 19 18 6 47 25 25 47 30 30 20 28 29 21 2 31 2 48 19 47 9 9 9 2 8 8 4 2 2 4 8 9 2 3 4 4 4 4 3 10 10 9 9 8 8 8 13 14 14 14 4 14 14 14 10 4 14 49 47 47 49 49 19 8 7 7 7 5 7 16 3 3 1 3 2 2 2 10 8 8 8 1 8 3 7 4 2 1 5 4 2 2 7 5 2 4 7 7 1 1 6 6 6 6 3 6 7 6 8 8 9 9 1 9 8 9 7 1 6 6 4 9 9 9 9 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 | // SPDX-License-Identifier: GPL-2.0-only /* * "splice": joining two ropes together by interweaving their strands. * * This is the "extended pipe" functionality, where a pipe is used as * an arbitrary in-memory buffer. Think of a pipe as a small kernel * buffer that you can use to transfer data from one end to the other. * * The traditional unix read/write is extended with a "splice()" operation * that transfers data buffers to or from a pipe buffer. * * Named by Larry McVoy, original implementation from Linus, extended by * Jens to support splicing to files, network, direct splicing, etc and * fixing lots of bugs. * * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk> * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org> * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu> * */ #include <linux/bvec.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/pagemap.h> #include <linux/splice.h> #include <linux/memcontrol.h> #include <linux/mm_inline.h> #include <linux/swap.h> #include <linux/writeback.h> #include <linux/export.h> #include <linux/syscalls.h> #include <linux/uio.h> #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/gfp.h> #include <linux/net.h> #include <linux/socket.h> #include <linux/sched/signal.h> #include "internal.h" /* * Splice doesn't support FMODE_NOWAIT. Since pipes may set this flag to * indicate they support non-blocking reads or writes, we must clear it * here if set to avoid blocking other users of this pipe if splice is * being done on it. */ static noinline void pipe_clear_nowait(struct file *file) { fmode_t fmode = READ_ONCE(file->f_mode); do { if (!(fmode & FMODE_NOWAIT)) break; } while (!try_cmpxchg(&file->f_mode, &fmode, fmode & ~FMODE_NOWAIT)); } /* * Attempt to steal a page from a pipe buffer. This should perhaps go into * a vm helper function, it's already simplified quite a bit by the * addition of remove_mapping(). If success is returned, the caller may * attempt to reuse this page for another destination. */ static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct folio *folio = page_folio(buf->page); struct address_space *mapping; folio_lock(folio); mapping = folio_mapping(folio); if (mapping) { WARN_ON(!folio_test_uptodate(folio)); /* * At least for ext2 with nobh option, we need to wait on * writeback completing on this folio, since we'll remove it * from the pagecache. Otherwise truncate wont wait on the * folio, allowing the disk blocks to be reused by someone else * before we actually wrote our data to them. fs corruption * ensues. */ folio_wait_writeback(folio); if (!filemap_release_folio(folio, GFP_KERNEL)) goto out_unlock; /* * If we succeeded in removing the mapping, set LRU flag * and return good. */ if (remove_mapping(mapping, folio)) { buf->flags |= PIPE_BUF_FLAG_LRU; return true; } } /* * Raced with truncate or failed to remove folio from current * address space, unlock and return failure. */ out_unlock: folio_unlock(folio); return false; } static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { put_page(buf->page); buf->flags &= ~PIPE_BUF_FLAG_LRU; } /* * Check whether the contents of buf is OK to access. Since the content * is a page cache page, IO may be in flight. */ static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct folio *folio = page_folio(buf->page); int err; if (!folio_test_uptodate(folio)) { folio_lock(folio); /* * Folio got truncated/unhashed. This will cause a 0-byte * splice, if this is the first page. */ if (!folio->mapping) { err = -ENODATA; goto error; } /* * Uh oh, read-error from disk. */ if (!folio_test_uptodate(folio)) { err = -EIO; goto error; } /* Folio is ok after all, we are done */ folio_unlock(folio); } return 0; error: folio_unlock(folio); return err; } const struct pipe_buf_operations page_cache_pipe_buf_ops = { .confirm = page_cache_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .try_steal = page_cache_pipe_buf_try_steal, .get = generic_pipe_buf_get, }; static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) return false; buf->flags |= PIPE_BUF_FLAG_LRU; return generic_pipe_buf_try_steal(pipe, buf); } static const struct pipe_buf_operations user_page_pipe_buf_ops = { .release = page_cache_pipe_buf_release, .try_steal = user_page_pipe_buf_try_steal, .get = generic_pipe_buf_get, }; static void wakeup_pipe_readers(struct pipe_inode_info *pipe) { smp_mb(); if (waitqueue_active(&pipe->rd_wait)) wake_up_interruptible(&pipe->rd_wait); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } /** * splice_to_pipe - fill passed data into a pipe * @pipe: pipe to fill * @spd: data to fill * * Description: * @spd contains a map of pages and len/offset tuples, along with * the struct pipe_buf_operations associated with these pages. This * function will link that data to the pipe. * */ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) { unsigned int spd_pages = spd->nr_pages; unsigned int tail = pipe->tail; unsigned int head = pipe->head; ssize_t ret = 0; int page_nr = 0; if (!spd_pages) return 0; if (unlikely(!pipe->readers)) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; goto out; } while (!pipe_full(head, tail, pipe->max_usage)) { struct pipe_buffer *buf = pipe_buf(pipe, head); buf->page = spd->pages[page_nr]; buf->offset = spd->partial[page_nr].offset; buf->len = spd->partial[page_nr].len; buf->private = spd->partial[page_nr].private; buf->ops = spd->ops; buf->flags = 0; head++; pipe->head = head; page_nr++; ret += buf->len; if (!--spd->nr_pages) break; } if (!ret) ret = -EAGAIN; out: while (page_nr < spd_pages) spd->spd_release(spd, page_nr++); return ret; } EXPORT_SYMBOL_GPL(splice_to_pipe); ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { unsigned int head = pipe->head; unsigned int tail = pipe->tail; int ret; if (unlikely(!pipe->readers)) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; } else if (pipe_full(head, tail, pipe->max_usage)) { ret = -EAGAIN; } else { *pipe_buf(pipe, head) = *buf; pipe->head = head + 1; return buf->len; } pipe_buf_release(pipe, buf); return ret; } EXPORT_SYMBOL(add_to_pipe); /* * Check if we need to grow the arrays holding pages and partial page * descriptions. */ int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) { unsigned int max_usage = READ_ONCE(pipe->max_usage); spd->nr_pages_max = max_usage; if (max_usage <= PIPE_DEF_BUFFERS) return 0; spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL); spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page), GFP_KERNEL); if (spd->pages && spd->partial) return 0; kfree(spd->pages); kfree(spd->partial); return -ENOMEM; } void splice_shrink_spd(struct splice_pipe_desc *spd) { if (spd->nr_pages_max <= PIPE_DEF_BUFFERS) return; kfree(spd->pages); kfree(spd->partial); } /** * copy_splice_read - Copy data from a file and splice the copy into a pipe * @in: The file to read from * @ppos: Pointer to the file position to read from * @pipe: The pipe to splice into * @len: The amount to splice * @flags: The SPLICE_F_* flags * * This function allocates a bunch of pages sufficient to hold the requested * amount of data (but limited by the remaining pipe capacity), passes it to * the file's ->read_iter() to read into and then splices the used pages into * the pipe. * * Return: On success, the number of bytes read will be returned and *@ppos * will be updated if appropriate; 0 will be returned if there is no more data * to be read; -EAGAIN will be returned if the pipe had no space, and some * other negative error code will be returned on error. A short read may occur * if the pipe has insufficient space, we reach the end of the data or we hit a * hole. */ ssize_t copy_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct iov_iter to; struct bio_vec *bv; struct kiocb kiocb; struct page **pages; ssize_t ret; size_t used, npages, chunk, remain, keep = 0; int i; /* Work out how much data we can actually add into the pipe */ used = pipe_buf_usage(pipe); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); npages = DIV_ROUND_UP(len, PAGE_SIZE); bv = kzalloc(array_size(npages, sizeof(bv[0])) + array_size(npages, sizeof(struct page *)), GFP_KERNEL); if (!bv) return -ENOMEM; pages = (struct page **)(bv + npages); npages = alloc_pages_bulk(GFP_USER, npages, pages); if (!npages) { kfree(bv); return -ENOMEM; } remain = len = min_t(size_t, len, npages * PAGE_SIZE); for (i = 0; i < npages; i++) { chunk = min_t(size_t, PAGE_SIZE, remain); bv[i].bv_page = pages[i]; bv[i].bv_offset = 0; bv[i].bv_len = chunk; remain -= chunk; } /* Do the I/O */ iov_iter_bvec(&to, ITER_DEST, bv, npages, len); init_sync_kiocb(&kiocb, in); kiocb.ki_pos = *ppos; ret = in->f_op->read_iter(&kiocb, &to); if (ret > 0) { keep = DIV_ROUND_UP(ret, PAGE_SIZE); *ppos = kiocb.ki_pos; } /* * Callers of ->splice_read() expect -EAGAIN on "can't put anything in * there", rather than -EFAULT. */ if (ret == -EFAULT) ret = -EAGAIN; /* Free any pages that didn't get touched at all. */ if (keep < npages) release_pages(pages + keep, npages - keep); /* Push the remaining pages into the pipe. */ remain = ret; for (i = 0; i < keep; i++) { struct pipe_buffer *buf = pipe_head_buf(pipe); chunk = min_t(size_t, remain, PAGE_SIZE); *buf = (struct pipe_buffer) { .ops = &default_pipe_buf_ops, .page = bv[i].bv_page, .offset = 0, .len = chunk, }; pipe->head++; remain -= chunk; } kfree(bv); return ret; } EXPORT_SYMBOL(copy_splice_read); const struct pipe_buf_operations default_pipe_buf_ops = { .release = generic_pipe_buf_release, .try_steal = generic_pipe_buf_try_steal, .get = generic_pipe_buf_get, }; /* Pipe buffer operations for a socket and similar. */ const struct pipe_buf_operations nosteal_pipe_buf_ops = { .release = generic_pipe_buf_release, .get = generic_pipe_buf_get, }; EXPORT_SYMBOL(nosteal_pipe_buf_ops); static void wakeup_pipe_writers(struct pipe_inode_info *pipe) { smp_mb(); if (waitqueue_active(&pipe->wr_wait)) wake_up_interruptible(&pipe->wr_wait); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } /** * splice_from_pipe_feed - feed available data from a pipe to a file * @pipe: pipe to splice from * @sd: information to @actor * @actor: handler that splices the data * * Description: * This function loops over the pipe and calls @actor to do the * actual moving of a single struct pipe_buffer to the desired * destination. It returns when there's no more buffers left in * the pipe or if the requested number of bytes (@sd->total_len) * have been copied. It returns a positive number (one) if the * pipe needs to be filled with more data, zero if the required * number of bytes have been copied and -errno on error. * * This, together with splice_from_pipe_{begin,end,next}, may be * used to implement the functionality of __splice_from_pipe() when * locking is required around copying the pipe buffers to the * destination. */ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, splice_actor *actor) { unsigned int head = pipe->head; unsigned int tail = pipe->tail; int ret; while (!pipe_empty(head, tail)) { struct pipe_buffer *buf = pipe_buf(pipe, tail); sd->len = buf->len; if (sd->len > sd->total_len) sd->len = sd->total_len; ret = pipe_buf_confirm(pipe, buf); if (unlikely(ret)) { if (ret == -ENODATA) ret = 0; return ret; } ret = actor(pipe, buf, sd); if (ret <= 0) return ret; buf->offset += ret; buf->len -= ret; sd->num_spliced += ret; sd->len -= ret; sd->pos += ret; sd->total_len -= ret; if (!buf->len) { pipe_buf_release(pipe, buf); tail++; pipe->tail = tail; if (pipe->files) sd->need_wakeup = true; } if (!sd->total_len) return 0; } return 1; } /* We know we have a pipe buffer, but maybe it's empty? */ static inline bool eat_empty_buffer(struct pipe_inode_info *pipe) { unsigned int tail = pipe->tail; struct pipe_buffer *buf = pipe_buf(pipe, tail); if (unlikely(!buf->len)) { pipe_buf_release(pipe, buf); pipe->tail = tail+1; return true; } return false; } /** * splice_from_pipe_next - wait for some data to splice from * @pipe: pipe to splice from * @sd: information about the splice operation * * Description: * This function will wait for some data and return a positive * value (one) if pipe buffers are available. It will return zero * or -errno if no more data needs to be spliced. */ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) { /* * Check for signal early to make process killable when there are * always buffers available */ if (signal_pending(current)) return -ERESTARTSYS; repeat: while (pipe_is_empty(pipe)) { if (!pipe->writers) return 0; if (sd->num_spliced) return 0; if (sd->flags & SPLICE_F_NONBLOCK) return -EAGAIN; if (signal_pending(current)) return -ERESTARTSYS; if (sd->need_wakeup) { wakeup_pipe_writers(pipe); sd->need_wakeup = false; } pipe_wait_readable(pipe); } if (eat_empty_buffer(pipe)) goto repeat; return 1; } /** * splice_from_pipe_begin - start splicing from pipe * @sd: information about the splice operation * * Description: * This function should be called before a loop containing * splice_from_pipe_next() and splice_from_pipe_feed() to * initialize the necessary fields of @sd. */ static void splice_from_pipe_begin(struct splice_desc *sd) { sd->num_spliced = 0; sd->need_wakeup = false; } /** * splice_from_pipe_end - finish splicing from pipe * @pipe: pipe to splice from * @sd: information about the splice operation * * Description: * This function will wake up pipe writers if necessary. It should * be called after a loop containing splice_from_pipe_next() and * splice_from_pipe_feed(). */ static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd) { if (sd->need_wakeup) wakeup_pipe_writers(pipe); } /** * __splice_from_pipe - splice data from a pipe to given actor * @pipe: pipe to splice from * @sd: information to @actor * @actor: handler that splices the data * * Description: * This function does little more than loop over the pipe and call * @actor to do the actual moving of a single struct pipe_buffer to * the desired destination. See pipe_to_file, pipe_to_sendmsg, or * pipe_to_user. * */ ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, splice_actor *actor) { int ret; splice_from_pipe_begin(sd); do { cond_resched(); ret = splice_from_pipe_next(pipe, sd); if (ret > 0) ret = splice_from_pipe_feed(pipe, sd, actor); } while (ret > 0); splice_from_pipe_end(pipe, sd); return sd->num_spliced ? sd->num_spliced : ret; } EXPORT_SYMBOL(__splice_from_pipe); /** * splice_from_pipe - splice data from a pipe to a file * @pipe: pipe to splice from * @out: file to splice to * @ppos: position in @out * @len: how many bytes to splice * @flags: splice modifier flags * @actor: handler that splices the data * * Description: * See __splice_from_pipe. This function locks the pipe inode, * otherwise it's identical to __splice_from_pipe(). * */ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags, splice_actor *actor) { ssize_t ret; struct splice_desc sd = { .total_len = len, .flags = flags, .pos = *ppos, .u.file = out, }; pipe_lock(pipe); ret = __splice_from_pipe(pipe, &sd, actor); pipe_unlock(pipe); return ret; } /** * iter_file_splice_write - splice data from a pipe to a file * @pipe: pipe info * @out: file to write to * @ppos: position in @out * @len: number of bytes to splice * @flags: splice modifier flags * * Description: * Will either move or copy pages (determined by @flags options) from * the given pipe inode to the given file. * This one is ->write_iter-based. * */ ssize_t iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { struct splice_desc sd = { .total_len = len, .flags = flags, .pos = *ppos, .u.file = out, }; int nbufs = pipe->max_usage; struct bio_vec *array; ssize_t ret; if (!out->f_op->write_iter) return -EINVAL; array = kcalloc(nbufs, sizeof(struct bio_vec), GFP_KERNEL); if (unlikely(!array)) return -ENOMEM; pipe_lock(pipe); splice_from_pipe_begin(&sd); while (sd.total_len) { struct kiocb kiocb; struct iov_iter from; unsigned int head, tail; size_t left; int n; ret = splice_from_pipe_next(pipe, &sd); if (ret <= 0) break; if (unlikely(nbufs < pipe->max_usage)) { kfree(array); nbufs = pipe->max_usage; array = kcalloc(nbufs, sizeof(struct bio_vec), GFP_KERNEL); if (!array) { ret = -ENOMEM; break; } } head = pipe->head; tail = pipe->tail; /* build the vector */ left = sd.total_len; for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) { struct pipe_buffer *buf = pipe_buf(pipe, tail); size_t this_len = buf->len; /* zero-length bvecs are not supported, skip them */ if (!this_len) continue; this_len = min(this_len, left); ret = pipe_buf_confirm(pipe, buf); if (unlikely(ret)) { if (ret == -ENODATA) ret = 0; goto done; } bvec_set_page(&array[n], buf->page, this_len, buf->offset); left -= this_len; n++; } iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left); init_sync_kiocb(&kiocb, out); kiocb.ki_pos = sd.pos; ret = out->f_op->write_iter(&kiocb, &from); sd.pos = kiocb.ki_pos; if (ret <= 0) break; sd.num_spliced += ret; sd.total_len -= ret; *ppos = sd.pos; /* dismiss the fully eaten buffers, adjust the partial one */ tail = pipe->tail; while (ret) { struct pipe_buffer *buf = pipe_buf(pipe, tail); if (ret >= buf->len) { ret -= buf->len; buf->len = 0; pipe_buf_release(pipe, buf); tail++; pipe->tail = tail; if (pipe->files) sd.need_wakeup = true; } else { buf->offset += ret; buf->len -= ret; ret = 0; } } } done: kfree(array); splice_from_pipe_end(pipe, &sd); pipe_unlock(pipe); if (sd.num_spliced) ret = sd.num_spliced; return ret; } EXPORT_SYMBOL(iter_file_splice_write); #ifdef CONFIG_NET /** * splice_to_socket - splice data from a pipe to a socket * @pipe: pipe to splice from * @out: socket to write to * @ppos: position in @out * @len: number of bytes to splice * @flags: splice modifier flags * * Description: * Will send @len bytes from the pipe to a network socket. No data copying * is involved. * */ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { struct socket *sock = sock_from_file(out); struct bio_vec bvec[16]; struct msghdr msg = {}; ssize_t ret = 0; size_t spliced = 0; bool need_wakeup = false; pipe_lock(pipe); while (len > 0) { unsigned int head, tail, bc = 0; size_t remain = len; /* * Check for signal early to make process killable when there * are always buffers available */ ret = -ERESTARTSYS; if (signal_pending(current)) break; while (pipe_is_empty(pipe)) { ret = 0; if (!pipe->writers) goto out; if (spliced) goto out; ret = -EAGAIN; if (flags & SPLICE_F_NONBLOCK) goto out; ret = -ERESTARTSYS; if (signal_pending(current)) goto out; if (need_wakeup) { wakeup_pipe_writers(pipe); need_wakeup = false; } pipe_wait_readable(pipe); } head = pipe->head; tail = pipe->tail; while (!pipe_empty(head, tail)) { struct pipe_buffer *buf = pipe_buf(pipe, tail); size_t seg; if (!buf->len) { tail++; continue; } seg = min_t(size_t, remain, buf->len); ret = pipe_buf_confirm(pipe, buf); if (unlikely(ret)) { if (ret == -ENODATA) ret = 0; break; } bvec_set_page(&bvec[bc++], buf->page, seg, buf->offset); remain -= seg; if (remain == 0 || bc >= ARRAY_SIZE(bvec)) break; tail++; } if (!bc) break; msg.msg_flags = MSG_SPLICE_PAGES; if (flags & SPLICE_F_MORE) msg.msg_flags |= MSG_MORE; if (remain && pipe_occupancy(pipe->head, tail) > 0) msg.msg_flags |= MSG_MORE; if (out->f_flags & O_NONBLOCK) msg.msg_flags |= MSG_DONTWAIT; iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, bc, len - remain); ret = sock_sendmsg(sock, &msg); if (ret <= 0) break; spliced += ret; len -= ret; tail = pipe->tail; while (ret > 0) { struct pipe_buffer *buf = pipe_buf(pipe, tail); size_t seg = min_t(size_t, ret, buf->len); buf->offset += seg; buf->len -= seg; ret -= seg; if (!buf->len) { pipe_buf_release(pipe, buf); tail++; } } if (tail != pipe->tail) { pipe->tail = tail; if (pipe->files) need_wakeup = true; } } out: pipe_unlock(pipe); if (need_wakeup) wakeup_pipe_writers(pipe); return spliced ?: ret; } #endif static int warn_unsupported(struct file *file, const char *op) { pr_debug_ratelimited( "splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n", op, file, current->pid, current->comm); return -EINVAL; } /* * Attempt to initiate a splice from pipe to file. */ static ssize_t do_splice_from(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { if (unlikely(!out->f_op->splice_write)) return warn_unsupported(out, "write"); return out->f_op->splice_write(pipe, out, ppos, len, flags); } /* * Indicate to the caller that there was a premature EOF when reading from the * source and the caller didn't indicate they would be sending more data after * this. */ static void do_splice_eof(struct splice_desc *sd) { if (sd->splice_eof) sd->splice_eof(sd); } /* * Callers already called rw_verify_area() on the entire range. * No need to call it for sub ranges. */ static ssize_t do_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { unsigned int p_space; if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; if (!len) return 0; /* Don't try to read more the pipe has space for. */ p_space = pipe->max_usage - pipe_buf_usage(pipe); len = min_t(size_t, len, p_space << PAGE_SHIFT); if (unlikely(len > MAX_RW_COUNT)) len = MAX_RW_COUNT; if (unlikely(!in->f_op->splice_read)) return warn_unsupported(in, "read"); /* * O_DIRECT and DAX don't deal with the pagecache, so we allocate a * buffer, copy into it and splice that into the pipe. */ if ((in->f_flags & O_DIRECT) || IS_DAX(in->f_mapping->host)) return copy_splice_read(in, ppos, pipe, len, flags); return in->f_op->splice_read(in, ppos, pipe, len, flags); } /** * vfs_splice_read - Read data from a file and splice it into a pipe * @in: File to splice from * @ppos: Input file offset * @pipe: Pipe to splice to * @len: Number of bytes to splice * @flags: Splice modifier flags (SPLICE_F_*) * * Splice the requested amount of data from the input file to the pipe. This * is synchronous as the caller must hold the pipe lock across the entire * operation. * * If successful, it returns the amount of data spliced, 0 if it hit the EOF or * a hole and a negative error code otherwise. */ ssize_t vfs_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { ssize_t ret; ret = rw_verify_area(READ, in, ppos, len); if (unlikely(ret < 0)) return ret; return do_splice_read(in, ppos, pipe, len, flags); } EXPORT_SYMBOL_GPL(vfs_splice_read); /** * splice_direct_to_actor - splices data directly between two non-pipes * @in: file to splice from * @sd: actor information on where to splice to * @actor: handles the data splicing * * Description: * This is a special case helper to splice directly between two * points, without requiring an explicit pipe. Internally an allocated * pipe is cached in the process, and reused during the lifetime of * that process. * */ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, splice_direct_actor *actor) { struct pipe_inode_info *pipe; ssize_t ret, bytes; size_t len; int i, flags, more; /* * We require the input to be seekable, as we don't want to randomly * drop data for eg socket -> socket splicing. Use the piped splicing * for that! */ if (unlikely(!(in->f_mode & FMODE_LSEEK))) return -EINVAL; /* * neither in nor out is a pipe, setup an internal pipe attached to * 'out' and transfer the wanted data from 'in' to 'out' through that */ pipe = current->splice_pipe; if (unlikely(!pipe)) { pipe = alloc_pipe_info(); if (!pipe) return -ENOMEM; /* * We don't have an immediate reader, but we'll read the stuff * out of the pipe right after the splice_to_pipe(). So set * PIPE_READERS appropriately. */ pipe->readers = 1; current->splice_pipe = pipe; } /* * Do the splice. */ bytes = 0; len = sd->total_len; /* Don't block on output, we have to drain the direct pipe. */ flags = sd->flags; sd->flags &= ~SPLICE_F_NONBLOCK; /* * We signal MORE until we've read sufficient data to fulfill the * request and we keep signalling it if the caller set it. */ more = sd->flags & SPLICE_F_MORE; sd->flags |= SPLICE_F_MORE; WARN_ON_ONCE(!pipe_is_empty(pipe)); while (len) { size_t read_len; loff_t pos = sd->pos, prev_pos = pos; ret = do_splice_read(in, &pos, pipe, len, flags); if (unlikely(ret <= 0)) goto read_failure; read_len = ret; sd->total_len = read_len; /* * If we now have sufficient data to fulfill the request then * we clear SPLICE_F_MORE if it was not set initially. */ if (read_len >= len && !more) sd->flags &= ~SPLICE_F_MORE; /* * NOTE: nonblocking mode only applies to the input. We * must not do the output in nonblocking mode as then we * could get stuck data in the internal pipe: */ ret = actor(pipe, sd); if (unlikely(ret <= 0)) { sd->pos = prev_pos; goto out_release; } bytes += ret; len -= ret; sd->pos = pos; if (ret < read_len) { sd->pos = prev_pos + ret; goto out_release; } } done: pipe->tail = pipe->head = 0; file_accessed(in); return bytes; read_failure: /* * If the user did *not* set SPLICE_F_MORE *and* we didn't hit that * "use all of len" case that cleared SPLICE_F_MORE, *and* we did a * "->splice_in()" that returned EOF (ie zero) *and* we have sent at * least 1 byte *then* we will also do the ->splice_eof() call. */ if (ret == 0 && !more && len > 0 && bytes) do_splice_eof(sd); out_release: /* * If we did an incomplete transfer we must release * the pipe buffers in question: */ for (i = 0; i < pipe->ring_size; i++) { struct pipe_buffer *buf = &pipe->bufs[i]; if (buf->ops) pipe_buf_release(pipe, buf); } if (!bytes) bytes = ret; goto done; } EXPORT_SYMBOL(splice_direct_to_actor); static int direct_splice_actor(struct pipe_inode_info *pipe, struct splice_desc *sd) { struct file *file = sd->u.file; long ret; file_start_write(file); ret = do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags); file_end_write(file); return ret; } static int splice_file_range_actor(struct pipe_inode_info *pipe, struct splice_desc *sd) { struct file *file = sd->u.file; return do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags); } static void direct_file_splice_eof(struct splice_desc *sd) { struct file *file = sd->u.file; if (file->f_op->splice_eof) file->f_op->splice_eof(file); } static ssize_t do_splice_direct_actor(struct file *in, loff_t *ppos, struct file *out, loff_t *opos, size_t len, unsigned int flags, splice_direct_actor *actor) { struct splice_desc sd = { .len = len, .total_len = len, .flags = flags, .pos = *ppos, .u.file = out, .splice_eof = direct_file_splice_eof, .opos = opos, }; ssize_t ret; if (unlikely(!(out->f_mode & FMODE_WRITE))) return -EBADF; if (unlikely(out->f_flags & O_APPEND)) return -EINVAL; ret = splice_direct_to_actor(in, &sd, actor); if (ret > 0) *ppos = sd.pos; return ret; } /** * do_splice_direct - splices data directly between two files * @in: file to splice from * @ppos: input file offset * @out: file to splice to * @opos: output file offset * @len: number of bytes to splice * @flags: splice modifier flags * * Description: * For use by do_sendfile(). splice can easily emulate sendfile, but * doing it in the application would incur an extra system call * (splice in + splice out, as compared to just sendfile()). So this helper * can splice directly through a process-private pipe. * * Callers already called rw_verify_area() on the entire range. */ ssize_t do_splice_direct(struct file *in, loff_t *ppos, struct file *out, loff_t *opos, size_t len, unsigned int flags) { return do_splice_direct_actor(in, ppos, out, opos, len, flags, direct_splice_actor); } EXPORT_SYMBOL(do_splice_direct); /** * splice_file_range - splices data between two files for copy_file_range() * @in: file to splice from * @ppos: input file offset * @out: file to splice to * @opos: output file offset * @len: number of bytes to splice * * Description: * For use by ->copy_file_range() methods. * Like do_splice_direct(), but vfs_copy_file_range() already holds * start_file_write() on @out file. * * Callers already called rw_verify_area() on the entire range. */ ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out, loff_t *opos, size_t len) { lockdep_assert(file_write_started(out)); return do_splice_direct_actor(in, ppos, out, opos, min_t(size_t, len, MAX_RW_COUNT), 0, splice_file_range_actor); } EXPORT_SYMBOL(splice_file_range); static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) { for (;;) { if (unlikely(!pipe->readers)) { send_sig(SIGPIPE, current, 0); return -EPIPE; } if (!pipe_is_full(pipe)) return 0; if (flags & SPLICE_F_NONBLOCK) return -EAGAIN; if (signal_pending(current)) return -ERESTARTSYS; pipe_wait_writable(pipe); } } static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags); ssize_t splice_file_to_pipe(struct file *in, struct pipe_inode_info *opipe, loff_t *offset, size_t len, unsigned int flags) { ssize_t ret; pipe_lock(opipe); ret = wait_for_space(opipe, flags); if (!ret) ret = do_splice_read(in, offset, opipe, len, flags); pipe_unlock(opipe); if (ret > 0) wakeup_pipe_readers(opipe); return ret; } /* * Determine where to splice to/from. */ ssize_t do_splice(struct file *in, loff_t *off_in, struct file *out, loff_t *off_out, size_t len, unsigned int flags) { struct pipe_inode_info *ipipe; struct pipe_inode_info *opipe; loff_t offset; ssize_t ret; if (unlikely(!(in->f_mode & FMODE_READ) || !(out->f_mode & FMODE_WRITE))) return -EBADF; ipipe = get_pipe_info(in, true); opipe = get_pipe_info(out, true); if (ipipe && opipe) { if (off_in || off_out) return -ESPIPE; /* Splicing to self would be fun, but... */ if (ipipe == opipe) return -EINVAL; if ((in->f_flags | out->f_flags) & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; ret = splice_pipe_to_pipe(ipipe, opipe, len, flags); } else if (ipipe) { if (off_in) return -ESPIPE; if (off_out) { if (!(out->f_mode & FMODE_PWRITE)) return -EINVAL; offset = *off_out; } else { offset = out->f_pos; } if (unlikely(out->f_flags & O_APPEND)) return -EINVAL; ret = rw_verify_area(WRITE, out, &offset, len); if (unlikely(ret < 0)) return ret; if (in->f_flags & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; file_start_write(out); ret = do_splice_from(ipipe, out, &offset, len, flags); file_end_write(out); if (!off_out) out->f_pos = offset; else *off_out = offset; } else if (opipe) { if (off_out) return -ESPIPE; if (off_in) { if (!(in->f_mode & FMODE_PREAD)) return -EINVAL; offset = *off_in; } else { offset = in->f_pos; } ret = rw_verify_area(READ, in, &offset, len); if (unlikely(ret < 0)) return ret; if (out->f_flags & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; ret = splice_file_to_pipe(in, opipe, &offset, len, flags); if (!off_in) in->f_pos = offset; else *off_in = offset; } else { ret = -EINVAL; } if (ret > 0) { /* * Generate modify out before access in: * do_splice_from() may've already sent modify out, * and this ensures the events get merged. */ fsnotify_modify(out); fsnotify_access(in); } return ret; } static ssize_t __do_splice(struct file *in, loff_t __user *off_in, struct file *out, loff_t __user *off_out, size_t len, unsigned int flags) { struct pipe_inode_info *ipipe; struct pipe_inode_info *opipe; loff_t offset, *__off_in = NULL, *__off_out = NULL; ssize_t ret; ipipe = get_pipe_info(in, true); opipe = get_pipe_info(out, true); if (ipipe) { if (off_in) return -ESPIPE; pipe_clear_nowait(in); } if (opipe) { if (off_out) return -ESPIPE; pipe_clear_nowait(out); } if (off_out) { if (copy_from_user(&offset, off_out, sizeof(loff_t))) return -EFAULT; __off_out = &offset; } if (off_in) { if (copy_from_user(&offset, off_in, sizeof(loff_t))) return -EFAULT; __off_in = &offset; } ret = do_splice(in, __off_in, out, __off_out, len, flags); if (ret < 0) return ret; if (__off_out && copy_to_user(off_out, __off_out, sizeof(loff_t))) return -EFAULT; if (__off_in && copy_to_user(off_in, __off_in, sizeof(loff_t))) return -EFAULT; return ret; } static ssize_t iter_to_pipe(struct iov_iter *from, struct pipe_inode_info *pipe, unsigned int flags) { struct pipe_buffer buf = { .ops = &user_page_pipe_buf_ops, .flags = flags }; size_t total = 0; ssize_t ret = 0; while (iov_iter_count(from)) { struct page *pages[16]; ssize_t left; size_t start; int i, n; left = iov_iter_get_pages2(from, pages, ~0UL, 16, &start); if (left <= 0) { ret = left; break; } n = DIV_ROUND_UP(left + start, PAGE_SIZE); for (i = 0; i < n; i++) { int size = min_t(int, left, PAGE_SIZE - start); buf.page = pages[i]; buf.offset = start; buf.len = size; ret = add_to_pipe(pipe, &buf); if (unlikely(ret < 0)) { iov_iter_revert(from, left); // this one got dropped by add_to_pipe() while (++i < n) put_page(pages[i]); goto out; } total += ret; left -= size; start = 0; } } out: return total ? total : ret; } static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data); return n == sd->len ? n : -EFAULT; } /* * For lack of a better implementation, implement vmsplice() to userspace * as a simple copy of the pipes pages to the user iov. */ static ssize_t vmsplice_to_user(struct file *file, struct iov_iter *iter, unsigned int flags) { struct pipe_inode_info *pipe = get_pipe_info(file, true); struct splice_desc sd = { .total_len = iov_iter_count(iter), .flags = flags, .u.data = iter }; ssize_t ret = 0; if (!pipe) return -EBADF; pipe_clear_nowait(file); if (sd.total_len) { pipe_lock(pipe); ret = __splice_from_pipe(pipe, &sd, pipe_to_user); pipe_unlock(pipe); } if (ret > 0) fsnotify_access(file); return ret; } /* * vmsplice splices a user address range into a pipe. It can be thought of * as splice-from-memory, where the regular splice is splice-from-file (or * to file). In both cases the output is a pipe, naturally. */ static ssize_t vmsplice_to_pipe(struct file *file, struct iov_iter *iter, unsigned int flags) { struct pipe_inode_info *pipe; ssize_t ret = 0; unsigned buf_flag = 0; if (flags & SPLICE_F_GIFT) buf_flag = PIPE_BUF_FLAG_GIFT; pipe = get_pipe_info(file, true); if (!pipe) return -EBADF; pipe_clear_nowait(file); pipe_lock(pipe); ret = wait_for_space(pipe, flags); if (!ret) ret = iter_to_pipe(iter, pipe, buf_flag); pipe_unlock(pipe); if (ret > 0) { wakeup_pipe_readers(pipe); fsnotify_modify(file); } return ret; } /* * Note that vmsplice only really supports true splicing _from_ user memory * to a pipe, not the other way around. Splicing from user memory is a simple * operation that can be supported without any funky alignment restrictions * or nasty vm tricks. We simply map in the user memory and fill them into * a pipe. The reverse isn't quite as easy, though. There are two possible * solutions for that: * * - memcpy() the data internally, at which point we might as well just * do a regular read() on the buffer anyway. * - Lots of nasty vm tricks, that are neither fast nor flexible (it * has restriction limitations on both ends of the pipe). * * Currently we punt and implement it as a normal copy, see pipe_to_user(). * */ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, unsigned long, nr_segs, unsigned int, flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; ssize_t error; int type; if (unlikely(flags & ~SPLICE_F_ALL)) return -EINVAL; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; if (fd_file(f)->f_mode & FMODE_WRITE) type = ITER_SOURCE; else if (fd_file(f)->f_mode & FMODE_READ) type = ITER_DEST; else return -EBADF; error = import_iovec(type, uiov, nr_segs, ARRAY_SIZE(iovstack), &iov, &iter); if (error < 0) return error; if (!iov_iter_count(&iter)) error = 0; else if (type == ITER_SOURCE) error = vmsplice_to_pipe(fd_file(f), &iter, flags); else error = vmsplice_to_user(fd_file(f), &iter, flags); kfree(iov); return error; } SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags) { if (unlikely(!len)) return 0; if (unlikely(flags & ~SPLICE_F_ALL)) return -EINVAL; CLASS(fd, in)(fd_in); if (fd_empty(in)) return -EBADF; CLASS(fd, out)(fd_out); if (fd_empty(out)) return -EBADF; return __do_splice(fd_file(in), off_in, fd_file(out), off_out, len, flags); } /* * Make sure there's data to read. Wait for input if we can, otherwise * return an appropriate error. */ static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) { int ret; /* * Check the pipe occupancy without the inode lock first. This function * is speculative anyways, so missing one is ok. */ if (!pipe_is_empty(pipe)) return 0; ret = 0; pipe_lock(pipe); while (pipe_is_empty(pipe)) { if (signal_pending(current)) { ret = -ERESTARTSYS; break; } if (!pipe->writers) break; if (flags & SPLICE_F_NONBLOCK) { ret = -EAGAIN; break; } pipe_wait_readable(pipe); } pipe_unlock(pipe); return ret; } /* * Make sure there's writeable room. Wait for room if we can, otherwise * return an appropriate error. */ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) { int ret; /* * Check pipe occupancy without the inode lock first. This function * is speculative anyways, so missing one is ok. */ if (!pipe_is_full(pipe)) return 0; ret = 0; pipe_lock(pipe); while (pipe_is_full(pipe)) { if (!pipe->readers) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; break; } if (flags & SPLICE_F_NONBLOCK) { ret = -EAGAIN; break; } if (signal_pending(current)) { ret = -ERESTARTSYS; break; } pipe_wait_writable(pipe); } pipe_unlock(pipe); return ret; } /* * Splice contents of ipipe to opipe. */ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags) { struct pipe_buffer *ibuf, *obuf; unsigned int i_head, o_head; unsigned int i_tail, o_tail; int ret = 0; bool input_wakeup = false; retry: ret = ipipe_prep(ipipe, flags); if (ret) return ret; ret = opipe_prep(opipe, flags); if (ret) return ret; /* * Potential ABBA deadlock, work around it by ordering lock * grabbing by pipe info address. Otherwise two different processes * could deadlock (one doing tee from A -> B, the other from B -> A). */ pipe_double_lock(ipipe, opipe); i_tail = ipipe->tail; o_head = opipe->head; do { size_t o_len; if (!opipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } i_head = ipipe->head; o_tail = opipe->tail; if (pipe_empty(i_head, i_tail) && !ipipe->writers) break; /* * Cannot make any progress, because either the input * pipe is empty or the output pipe is full. */ if (pipe_empty(i_head, i_tail) || pipe_full(o_head, o_tail, opipe->max_usage)) { /* Already processed some buffers, break */ if (ret) break; if (flags & SPLICE_F_NONBLOCK) { ret = -EAGAIN; break; } /* * We raced with another reader/writer and haven't * managed to process any buffers. A zero return * value means EOF, so retry instead. */ pipe_unlock(ipipe); pipe_unlock(opipe); goto retry; } ibuf = pipe_buf(ipipe, i_tail); obuf = pipe_buf(opipe, o_head); if (len >= ibuf->len) { /* * Simply move the whole buffer from ipipe to opipe */ *obuf = *ibuf; ibuf->ops = NULL; i_tail++; ipipe->tail = i_tail; input_wakeup = true; o_len = obuf->len; o_head++; opipe->head = o_head; } else { /* * Get a reference to this pipe buffer, * so we can copy the contents over. */ if (!pipe_buf_get(ipipe, ibuf)) { if (ret == 0) ret = -EFAULT; break; } *obuf = *ibuf; /* * Don't inherit the gift and merge flags, we need to * prevent multiple steals of this page. */ obuf->flags &= ~PIPE_BUF_FLAG_GIFT; obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE; obuf->len = len; ibuf->offset += len; ibuf->len -= len; o_len = len; o_head++; opipe->head = o_head; } ret += o_len; len -= o_len; } while (len); pipe_unlock(ipipe); pipe_unlock(opipe); /* * If we put data in the output pipe, wakeup any potential readers. */ if (ret > 0) wakeup_pipe_readers(opipe); if (input_wakeup) wakeup_pipe_writers(ipipe); return ret; } /* * Link contents of ipipe to opipe. */ static ssize_t link_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags) { struct pipe_buffer *ibuf, *obuf; unsigned int i_head, o_head; unsigned int i_tail, o_tail; ssize_t ret = 0; /* * Potential ABBA deadlock, work around it by ordering lock * grabbing by pipe info address. Otherwise two different processes * could deadlock (one doing tee from A -> B, the other from B -> A). */ pipe_double_lock(ipipe, opipe); i_tail = ipipe->tail; o_head = opipe->head; do { if (!opipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } i_head = ipipe->head; o_tail = opipe->tail; /* * If we have iterated all input buffers or run out of * output room, break. */ if (pipe_empty(i_head, i_tail) || pipe_full(o_head, o_tail, opipe->max_usage)) break; ibuf = pipe_buf(ipipe, i_tail); obuf = pipe_buf(opipe, o_head); /* * Get a reference to this pipe buffer, * so we can copy the contents over. */ if (!pipe_buf_get(ipipe, ibuf)) { if (ret == 0) ret = -EFAULT; break; } *obuf = *ibuf; /* * Don't inherit the gift and merge flag, we need to prevent * multiple steals of this page. */ obuf->flags &= ~PIPE_BUF_FLAG_GIFT; obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE; if (obuf->len > len) obuf->len = len; ret += obuf->len; len -= obuf->len; o_head++; opipe->head = o_head; i_tail++; } while (len); pipe_unlock(ipipe); pipe_unlock(opipe); /* * If we put data in the output pipe, wakeup any potential readers. */ if (ret > 0) wakeup_pipe_readers(opipe); return ret; } /* * This is a tee(1) implementation that works on pipes. It doesn't copy * any data, it simply references the 'in' pages on the 'out' pipe. * The 'flags' used are the SPLICE_F_* variants, currently the only * applicable one is SPLICE_F_NONBLOCK. */ ssize_t do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) { struct pipe_inode_info *ipipe = get_pipe_info(in, true); struct pipe_inode_info *opipe = get_pipe_info(out, true); ssize_t ret = -EINVAL; if (unlikely(!(in->f_mode & FMODE_READ) || !(out->f_mode & FMODE_WRITE))) return -EBADF; /* * Duplicate the contents of ipipe to opipe without actually * copying the data. */ if (ipipe && opipe && ipipe != opipe) { if ((in->f_flags | out->f_flags) & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; /* * Keep going, unless we encounter an error. The ipipe/opipe * ordering doesn't really matter. */ ret = ipipe_prep(ipipe, flags); if (!ret) { ret = opipe_prep(opipe, flags); if (!ret) ret = link_pipe(ipipe, opipe, len, flags); } } if (ret > 0) { fsnotify_access(in); fsnotify_modify(out); } return ret; } SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) { if (unlikely(flags & ~SPLICE_F_ALL)) return -EINVAL; if (unlikely(!len)) return 0; CLASS(fd, in)(fdin); if (fd_empty(in)) return -EBADF; CLASS(fd, out)(fdout); if (fd_empty(out)) return -EBADF; return do_tee(fd_file(in), fd_file(out), len, flags); } |
| 9 9 9 8 5 18 10 9 10 10 10 10 10 2 8 9 9 9 9 9 9 20 20 18 18 18 18 20 9 13 2 11 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 | // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook */ #include "percpu_freelist.h" int pcpu_freelist_init(struct pcpu_freelist *s) { int cpu; s->freelist = alloc_percpu(struct pcpu_freelist_head); if (!s->freelist) return -ENOMEM; for_each_possible_cpu(cpu) { struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu); raw_res_spin_lock_init(&head->lock); head->first = NULL; } return 0; } void pcpu_freelist_destroy(struct pcpu_freelist *s) { free_percpu(s->freelist); } static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head, struct pcpu_freelist_node *node) { node->next = head->first; WRITE_ONCE(head->first, node); } static inline bool ___pcpu_freelist_push(struct pcpu_freelist_head *head, struct pcpu_freelist_node *node) { if (raw_res_spin_lock(&head->lock)) return false; pcpu_freelist_push_node(head, node); raw_res_spin_unlock(&head->lock); return true; } void __pcpu_freelist_push(struct pcpu_freelist *s, struct pcpu_freelist_node *node) { struct pcpu_freelist_head *head; int cpu; if (___pcpu_freelist_push(this_cpu_ptr(s->freelist), node)) return; while (true) { for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { if (cpu == raw_smp_processor_id()) continue; head = per_cpu_ptr(s->freelist, cpu); if (raw_res_spin_lock(&head->lock)) continue; pcpu_freelist_push_node(head, node); raw_res_spin_unlock(&head->lock); return; } } } void pcpu_freelist_push(struct pcpu_freelist *s, struct pcpu_freelist_node *node) { unsigned long flags; local_irq_save(flags); __pcpu_freelist_push(s, node); local_irq_restore(flags); } void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, u32 nr_elems) { struct pcpu_freelist_head *head; unsigned int cpu, cpu_idx, i, j, n, m; n = nr_elems / num_possible_cpus(); m = nr_elems % num_possible_cpus(); cpu_idx = 0; for_each_possible_cpu(cpu) { head = per_cpu_ptr(s->freelist, cpu); j = n + (cpu_idx < m ? 1 : 0); for (i = 0; i < j; i++) { /* No locking required as this is not visible yet. */ pcpu_freelist_push_node(head, buf); buf += elem_size; } cpu_idx++; } } static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) { struct pcpu_freelist_node *node = NULL; struct pcpu_freelist_head *head; int cpu; for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { head = per_cpu_ptr(s->freelist, cpu); if (!READ_ONCE(head->first)) continue; if (raw_res_spin_lock(&head->lock)) continue; node = head->first; if (node) { WRITE_ONCE(head->first, node->next); raw_res_spin_unlock(&head->lock); return node; } raw_res_spin_unlock(&head->lock); } return node; } struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) { return ___pcpu_freelist_pop(s); } struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) { struct pcpu_freelist_node *ret; unsigned long flags; local_irq_save(flags); ret = __pcpu_freelist_pop(s); local_irq_restore(flags); return ret; } |
| 17 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 | // SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 /******************************************************************************* * * Module Name: nsaccess - Top-level functions for accessing ACPI namespace * ******************************************************************************/ #include <acpi/acpi.h> #include "accommon.h" #include "amlcode.h" #include "acnamesp.h" #include "acdispat.h" #ifdef ACPI_ASL_COMPILER #include "acdisasm.h" #endif #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nsaccess") /******************************************************************************* * * FUNCTION: acpi_ns_root_initialize * * PARAMETERS: None * * RETURN: Status * * DESCRIPTION: Allocate and initialize the default root named objects * * MUTEX: Locks namespace for entire execution * ******************************************************************************/ acpi_status acpi_ns_root_initialize(void) { acpi_status status; const struct acpi_predefined_names *init_val = NULL; struct acpi_namespace_node *new_node; struct acpi_namespace_node *prev_node = NULL; union acpi_operand_object *obj_desc; acpi_string val = NULL; ACPI_FUNCTION_TRACE(ns_root_initialize); status = acpi_ut_acquire_mutex(ACPI_MTX_NAMESPACE); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); } /* * The global root ptr is initially NULL, so a non-NULL value indicates * that acpi_ns_root_initialize() has already been called; just return. */ if (acpi_gbl_root_node) { status = AE_OK; goto unlock_and_exit; } /* * Tell the rest of the subsystem that the root is initialized * (This is OK because the namespace is locked) */ acpi_gbl_root_node = &acpi_gbl_root_node_struct; /* Enter the predefined names in the name table */ ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Entering predefined entries into namespace\n")); /* * Create the initial (default) namespace. * This namespace looks like something similar to this: * * ACPI Namespace (from Namespace Root): * 0 _GPE Scope 00203160 00 * 0 _PR_ Scope 002031D0 00 * 0 _SB_ Device 00203240 00 Notify Object: 0020ADD8 * 0 _SI_ Scope 002032B0 00 * 0 _TZ_ Device 00203320 00 * 0 _REV Integer 00203390 00 = 0000000000000002 * 0 _OS_ String 00203488 00 Len 14 "Microsoft Windows NT" * 0 _GL_ Mutex 00203580 00 Object 002035F0 * 0 _OSI Method 00203678 00 Args 1 Len 0000 Aml 00000000 */ for (init_val = acpi_gbl_pre_defined_names; init_val->name; init_val++) { status = AE_OK; /* _OSI is optional for now, will be permanent later */ if (!strcmp(init_val->name, "_OSI") && !acpi_gbl_create_osi_method) { continue; } /* * Create, init, and link the new predefined name * Note: No need to use acpi_ns_lookup here because all the * predefined names are at the root level. It is much easier to * just create and link the new node(s) here. */ new_node = acpi_ns_create_node(*ACPI_CAST_PTR(u32, init_val->name)); if (!new_node) { status = AE_NO_MEMORY; goto unlock_and_exit; } new_node->descriptor_type = ACPI_DESC_TYPE_NAMED; new_node->type = init_val->type; if (!prev_node) { acpi_gbl_root_node_struct.child = new_node; } else { prev_node->peer = new_node; } new_node->parent = &acpi_gbl_root_node_struct; prev_node = new_node; /* * Name entered successfully. If entry in pre_defined_names[] specifies * an initial value, create the initial value. */ if (init_val->val) { status = acpi_os_predefined_override(init_val, &val); if (ACPI_FAILURE(status)) { ACPI_ERROR((AE_INFO, "Could not override predefined %s", init_val->name)); } if (!val) { val = init_val->val; } /* * Entry requests an initial value, allocate a * descriptor for it. */ obj_desc = acpi_ut_create_internal_object(init_val->type); if (!obj_desc) { status = AE_NO_MEMORY; goto unlock_and_exit; } /* * Convert value string from table entry to * internal representation. Only types actually * used for initial values are implemented here. */ switch (init_val->type) { case ACPI_TYPE_METHOD: obj_desc->method.param_count = (u8) ACPI_TO_INTEGER(val); obj_desc->common.flags |= AOPOBJ_DATA_VALID; #if defined (ACPI_ASL_COMPILER) /* Save the parameter count for the iASL compiler */ new_node->value = obj_desc->method.param_count; #else /* Mark this as a very SPECIAL method (_OSI) */ obj_desc->method.info_flags = ACPI_METHOD_INTERNAL_ONLY; obj_desc->method.dispatch.implementation = acpi_ut_osi_implementation; #endif break; case ACPI_TYPE_INTEGER: obj_desc->integer.value = ACPI_TO_INTEGER(val); break; case ACPI_TYPE_STRING: /* Build an object around the static string */ obj_desc->string.length = (u32)strlen(val); obj_desc->string.pointer = val; obj_desc->common.flags |= AOPOBJ_STATIC_POINTER; break; case ACPI_TYPE_MUTEX: obj_desc->mutex.node = new_node; obj_desc->mutex.sync_level = (u8) (ACPI_TO_INTEGER(val) - 1); /* Create a mutex */ status = acpi_os_create_mutex(&obj_desc->mutex. os_mutex); if (ACPI_FAILURE(status)) { acpi_ut_remove_reference(obj_desc); goto unlock_and_exit; } /* Special case for ACPI Global Lock */ if (strcmp(init_val->name, "_GL_") == 0) { acpi_gbl_global_lock_mutex = obj_desc; /* Create additional counting semaphore for global lock */ status = acpi_os_create_semaphore(1, 0, &acpi_gbl_global_lock_semaphore); if (ACPI_FAILURE(status)) { acpi_ut_remove_reference (obj_desc); goto unlock_and_exit; } } break; default: ACPI_ERROR((AE_INFO, "Unsupported initial type value 0x%X", init_val->type)); acpi_ut_remove_reference(obj_desc); obj_desc = NULL; continue; } /* Store pointer to value descriptor in the Node */ status = acpi_ns_attach_object(new_node, obj_desc, obj_desc->common.type); /* Remove local reference to the object */ acpi_ut_remove_reference(obj_desc); } } unlock_and_exit: (void)acpi_ut_release_mutex(ACPI_MTX_NAMESPACE); /* Save a handle to "_GPE", it is always present */ if (ACPI_SUCCESS(status)) { status = acpi_ns_get_node(NULL, "\\_GPE", ACPI_NS_NO_UPSEARCH, &acpi_gbl_fadt_gpe_device); } return_ACPI_STATUS(status); } /******************************************************************************* * * FUNCTION: acpi_ns_lookup * * PARAMETERS: scope_info - Current scope info block * pathname - Search pathname, in internal format * (as represented in the AML stream) * type - Type associated with name * interpreter_mode - IMODE_LOAD_PASS2 => add name if not found * flags - Flags describing the search restrictions * walk_state - Current state of the walk * return_node - Where the Node is placed (if found * or created successfully) * * RETURN: Status * * DESCRIPTION: Find or enter the passed name in the name space. * Log an error if name not found in Exec mode. * * MUTEX: Assumes namespace is locked. * ******************************************************************************/ acpi_status acpi_ns_lookup(union acpi_generic_state *scope_info, char *pathname, acpi_object_type type, acpi_interpreter_mode interpreter_mode, u32 flags, struct acpi_walk_state *walk_state, struct acpi_namespace_node **return_node) { acpi_status status; char *path = pathname; char *external_path; struct acpi_namespace_node *prefix_node; struct acpi_namespace_node *current_node = NULL; struct acpi_namespace_node *this_node = NULL; u32 num_segments; u32 num_carats; acpi_name simple_name; acpi_object_type type_to_check_for; acpi_object_type this_search_type; u32 search_parent_flag = ACPI_NS_SEARCH_PARENT; u32 local_flags; acpi_interpreter_mode local_interpreter_mode; ACPI_FUNCTION_TRACE(ns_lookup); if (!return_node) { return_ACPI_STATUS(AE_BAD_PARAMETER); } local_flags = flags & ~(ACPI_NS_ERROR_IF_FOUND | ACPI_NS_OVERRIDE_IF_FOUND | ACPI_NS_SEARCH_PARENT); *return_node = ACPI_ENTRY_NOT_FOUND; acpi_gbl_ns_lookup_count++; if (!acpi_gbl_root_node) { return_ACPI_STATUS(AE_NO_NAMESPACE); } /* Get the prefix scope. A null scope means use the root scope */ if ((!scope_info) || (!scope_info->scope.node)) { ACPI_DEBUG_PRINT((ACPI_DB_NAMES, "Null scope prefix, using root node (%p)\n", acpi_gbl_root_node)); prefix_node = acpi_gbl_root_node; } else { prefix_node = scope_info->scope.node; if (ACPI_GET_DESCRIPTOR_TYPE(prefix_node) != ACPI_DESC_TYPE_NAMED) { ACPI_ERROR((AE_INFO, "%p is not a namespace node [%s]", prefix_node, acpi_ut_get_descriptor_name(prefix_node))); return_ACPI_STATUS(AE_AML_INTERNAL); } if (!(flags & ACPI_NS_PREFIX_IS_SCOPE)) { /* * This node might not be a actual "scope" node (such as a * Device/Method, etc.) It could be a Package or other object * node. Backup up the tree to find the containing scope node. */ while (!acpi_ns_opens_scope(prefix_node->type) && prefix_node->type != ACPI_TYPE_ANY) { prefix_node = prefix_node->parent; } } } /* Save type. TBD: may be no longer necessary */ type_to_check_for = type; /* * Begin examination of the actual pathname */ if (!pathname) { /* A Null name_path is allowed and refers to the root */ num_segments = 0; this_node = acpi_gbl_root_node; path = ""; ACPI_DEBUG_PRINT((ACPI_DB_NAMES, "Null Pathname (Zero segments), Flags=%X\n", flags)); } else { /* * Name pointer is valid (and must be in internal name format) * * Check for scope prefixes: * * As represented in the AML stream, a namepath consists of an * optional scope prefix followed by a name segment part. * * If present, the scope prefix is either a Root Prefix (in * which case the name is fully qualified), or one or more * Parent Prefixes (in which case the name's scope is relative * to the current scope). */ if (*path == (u8) AML_ROOT_PREFIX) { /* Pathname is fully qualified, start from the root */ this_node = acpi_gbl_root_node; search_parent_flag = ACPI_NS_NO_UPSEARCH; /* Point to name segment part */ path++; ACPI_DEBUG_PRINT((ACPI_DB_NAMES, "Path is absolute from root [%p]\n", this_node)); } else { /* Pathname is relative to current scope, start there */ ACPI_DEBUG_PRINT((ACPI_DB_NAMES, "Searching relative to prefix scope [%4.4s] (%p)\n", acpi_ut_get_node_name(prefix_node), prefix_node)); /* * Handle multiple Parent Prefixes (carat) by just getting * the parent node for each prefix instance. */ this_node = prefix_node; num_carats = 0; while (*path == (u8) AML_PARENT_PREFIX) { /* Name is fully qualified, no search rules apply */ search_parent_flag = ACPI_NS_NO_UPSEARCH; /* * Point past this prefix to the name segment * part or the next Parent Prefix */ path++; /* Backup to the parent node */ num_carats++; this_node = this_node->parent; if (!this_node) { /* * Current scope has no parent scope. Externalize * the internal path for error message. */ status = acpi_ns_externalize_name (ACPI_UINT32_MAX, pathname, NULL, &external_path); if (ACPI_SUCCESS(status)) { ACPI_ERROR((AE_INFO, "%s: Path has too many parent prefixes (^)", external_path)); ACPI_FREE(external_path); } return_ACPI_STATUS(AE_NOT_FOUND); } } if (search_parent_flag == ACPI_NS_NO_UPSEARCH) { ACPI_DEBUG_PRINT((ACPI_DB_NAMES, "Search scope is [%4.4s], path has %u carat(s)\n", acpi_ut_get_node_name (this_node), num_carats)); } } /* * Determine the number of ACPI name segments in this pathname. * * The segment part consists of either: * - A Null name segment (0) * - A dual_name_prefix followed by two 4-byte name segments * - A multi_name_prefix followed by a byte indicating the * number of segments and the segments themselves. * - A single 4-byte name segment * * Examine the name prefix opcode, if any, to determine the number of * segments. */ switch (*path) { case 0: /* * Null name after a root or parent prefixes. We already * have the correct target node and there are no name segments. */ num_segments = 0; type = this_node->type; ACPI_DEBUG_PRINT((ACPI_DB_NAMES, "Prefix-only Pathname (Zero name segments), Flags=%X\n", flags)); break; case AML_DUAL_NAME_PREFIX: /* More than one name_seg, search rules do not apply */ search_parent_flag = ACPI_NS_NO_UPSEARCH; /* Two segments, point to first name segment */ num_segments = 2; path++; ACPI_DEBUG_PRINT((ACPI_DB_NAMES, "Dual Pathname (2 segments, Flags=%X)\n", flags)); break; case AML_MULTI_NAME_PREFIX: /* More than one name_seg, search rules do not apply */ search_parent_flag = ACPI_NS_NO_UPSEARCH; /* Extract segment count, point to first name segment */ path++; num_segments = (u32) (u8) * path; path++; ACPI_DEBUG_PRINT((ACPI_DB_NAMES, "Multi Pathname (%u Segments, Flags=%X)\n", num_segments, flags)); break; default: /* * Not a Null name, no Dual or Multi prefix, hence there is * only one name segment and Pathname is already pointing to it. */ num_segments = 1; ACPI_DEBUG_PRINT((ACPI_DB_NAMES, "Simple Pathname (1 segment, Flags=%X)\n", flags)); break; } ACPI_DEBUG_EXEC(acpi_ns_print_pathname(num_segments, path)); } /* * Search namespace for each segment of the name. Loop through and * verify (or add to the namespace) each name segment. * * The object type is significant only at the last name * segment. (We don't care about the types along the path, only * the type of the final target object.) */ this_search_type = ACPI_TYPE_ANY; current_node = this_node; while (num_segments && current_node) { num_segments--; if (!num_segments) { /* This is the last segment, enable typechecking */ this_search_type = type; /* * Only allow automatic parent search (search rules) if the caller * requested it AND we have a single, non-fully-qualified name_seg */ if ((search_parent_flag != ACPI_NS_NO_UPSEARCH) && (flags & ACPI_NS_SEARCH_PARENT)) { local_flags |= ACPI_NS_SEARCH_PARENT; } /* Set error flag according to caller */ if (flags & ACPI_NS_ERROR_IF_FOUND) { local_flags |= ACPI_NS_ERROR_IF_FOUND; } /* Set override flag according to caller */ if (flags & ACPI_NS_OVERRIDE_IF_FOUND) { local_flags |= ACPI_NS_OVERRIDE_IF_FOUND; } } /* Handle opcodes that create a new name_seg via a full name_path */ local_interpreter_mode = interpreter_mode; if ((flags & ACPI_NS_PREFIX_MUST_EXIST) && (num_segments > 0)) { /* Every element of the path must exist (except for the final name_seg) */ local_interpreter_mode = ACPI_IMODE_EXECUTE; } /* Extract one ACPI name from the front of the pathname */ ACPI_MOVE_32_TO_32(&simple_name, path); /* Try to find the single (4 character) ACPI name */ status = acpi_ns_search_and_enter(simple_name, walk_state, current_node, local_interpreter_mode, this_search_type, local_flags, &this_node); if (ACPI_FAILURE(status)) { if (status == AE_NOT_FOUND) { #if !defined ACPI_ASL_COMPILER /* Note: iASL reports this error by itself, not needed here */ if (flags & ACPI_NS_PREFIX_MUST_EXIST) { acpi_os_printf(ACPI_MSG_BIOS_ERROR "Object does not exist: %4.4s\n", (char *)&simple_name); } #endif /* Name not found in ACPI namespace */ ACPI_DEBUG_PRINT((ACPI_DB_NAMES, "Name [%4.4s] not found in scope [%4.4s] %p\n", (char *)&simple_name, (char *)¤t_node->name, current_node)); } #ifdef ACPI_EXEC_APP if ((status == AE_ALREADY_EXISTS) && (this_node->flags & ANOBJ_NODE_EARLY_INIT)) { this_node->flags &= ~ANOBJ_NODE_EARLY_INIT; status = AE_OK; } #endif #ifdef ACPI_ASL_COMPILER /* * If this ACPI name already exists within the namespace as an * external declaration, then mark the external as a conflicting * declaration and proceed to process the current node as if it did * not exist in the namespace. If this node is not processed as * normal, then it could cause improper namespace resolution * by failing to open a new scope. */ if (acpi_gbl_disasm_flag && (status == AE_ALREADY_EXISTS) && ((this_node->flags & ANOBJ_IS_EXTERNAL) || (walk_state && walk_state->opcode == AML_EXTERNAL_OP))) { this_node->flags &= ~ANOBJ_IS_EXTERNAL; this_node->type = (u8)this_search_type; if (walk_state->opcode != AML_EXTERNAL_OP) { acpi_dm_mark_external_conflict (this_node); } break; } #endif *return_node = this_node; return_ACPI_STATUS(status); } /* More segments to follow? */ if (num_segments > 0) { /* * If we have an alias to an object that opens a scope (such as a * device or processor), we need to dereference the alias here so * that we can access any children of the original node (via the * remaining segments). */ if (this_node->type == ACPI_TYPE_LOCAL_ALIAS) { if (!this_node->object) { return_ACPI_STATUS(AE_NOT_EXIST); } if (acpi_ns_opens_scope (((struct acpi_namespace_node *) this_node->object)->type)) { this_node = (struct acpi_namespace_node *) this_node->object; } } } /* Special handling for the last segment (num_segments == 0) */ else { /* * Sanity typecheck of the target object: * * If 1) This is the last segment (num_segments == 0) * 2) And we are looking for a specific type * (Not checking for TYPE_ANY) * 3) Which is not an alias * 4) Which is not a local type (TYPE_SCOPE) * 5) And the type of target object is known (not TYPE_ANY) * 6) And target object does not match what we are looking for * * Then we have a type mismatch. Just warn and ignore it. */ if ((type_to_check_for != ACPI_TYPE_ANY) && (type_to_check_for != ACPI_TYPE_LOCAL_ALIAS) && (type_to_check_for != ACPI_TYPE_LOCAL_METHOD_ALIAS) && (type_to_check_for != ACPI_TYPE_LOCAL_SCOPE) && (this_node->type != ACPI_TYPE_ANY) && (this_node->type != type_to_check_for)) { /* Complain about a type mismatch */ ACPI_WARNING((AE_INFO, "NsLookup: Type mismatch on %4.4s (%s), searching for (%s)", ACPI_CAST_PTR(char, &simple_name), acpi_ut_get_type_name(this_node-> type), acpi_ut_get_type_name (type_to_check_for))); } /* * If this is the last name segment and we are not looking for a * specific type, but the type of found object is known, use that * type to (later) see if it opens a scope. */ if (type == ACPI_TYPE_ANY) { type = this_node->type; } } /* Point to next name segment and make this node current */ path += ACPI_NAMESEG_SIZE; current_node = this_node; } /* Always check if we need to open a new scope */ if (!(flags & ACPI_NS_DONT_OPEN_SCOPE) && (walk_state)) { /* * If entry is a type which opens a scope, push the new scope on the * scope stack. */ if (acpi_ns_opens_scope(type)) { status = acpi_ds_scope_stack_push(this_node, type, walk_state); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); } } } #ifdef ACPI_EXEC_APP if (flags & ACPI_NS_EARLY_INIT) { this_node->flags |= ANOBJ_NODE_EARLY_INIT; } #endif *return_node = this_node; return_ACPI_STATUS(AE_OK); } |
| 13 13 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef IOU_NAPI_H #define IOU_NAPI_H #include <linux/kernel.h> #include <linux/io_uring.h> #include <net/busy_poll.h> #ifdef CONFIG_NET_RX_BUSY_POLL void io_napi_init(struct io_ring_ctx *ctx); void io_napi_free(struct io_ring_ctx *ctx); int io_register_napi(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg); int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id); void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq); int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx); static inline bool io_napi(struct io_ring_ctx *ctx) { return !list_empty(&ctx->napi_list); } static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { if (!io_napi(ctx)) return; __io_napi_busy_loop(ctx, iowq); } /* * io_napi_add() - Add napi id to the busy poll list * @req: pointer to io_kiocb request * * Add the napi id of the socket to the napi busy poll list and hash table. */ static inline void io_napi_add(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct socket *sock; if (READ_ONCE(ctx->napi_track_mode) != IO_URING_NAPI_TRACKING_DYNAMIC) return; sock = sock_from_file(req->file); if (sock && sock->sk) __io_napi_add_id(ctx, READ_ONCE(sock->sk->sk_napi_id)); } #else static inline void io_napi_init(struct io_ring_ctx *ctx) { } static inline void io_napi_free(struct io_ring_ctx *ctx) { } static inline int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) { return -EOPNOTSUPP; } static inline int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) { return -EOPNOTSUPP; } static inline bool io_napi(struct io_ring_ctx *ctx) { return false; } static inline void io_napi_add(struct io_kiocb *req) { } static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { } static inline int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx) { return 0; } #endif /* CONFIG_NET_RX_BUSY_POLL */ #endif |
| 253 255 201 10 10 10 16 16 16 16 14 2 2 15 15 13 16 16 16 15 16 2 16 15 14 16 16 16 16 16 16 14 16 16 16 14 49 49 49 47 2 49 49 49 49 49 48 49 2 2 2 7 7 283 281 282 283 282 25 27 282 2 2 281 282 3 284 284 191 194 27 27 24 90 283 278 2 283 1 282 281 88 285 283 175 175 177 177 177 175 175 204 203 1 202 204 202 192 194 203 136 204 204 201 202 202 200 202 70 70 70 69 70 68 69 69 70 70 70 68 70 70 70 70 70 69 66 70 69 70 69 70 69 70 70 69 69 1 167 117 159 275 114 117 117 69 70 70 70 70 69 70 68 69 150 150 152 10 10 82 83 83 82 151 151 148 152 150 83 69 151 69 266 269 267 269 264 269 1 39 116 114 39 39 39 11 116 117 26 117 44 44 44 43 44 123 123 124 122 124 124 123 124 124 1 2 2 2 2 2 1 7 1 1 1 182 181 182 182 181 180 198 10 2 2 183 168 4 168 182 4 6 181 181 183 181 183 183 181 2 181 183 6 181 6 6 5 6 6 6 2 6 6 182 5 3 10 10 7 7 2 7 10 10 21 21 201 255 253 254 197 247 184 183 15 183 53 61 53 61 60 60 60 165 116 53 167 10 10 10 10 10 10 10 10 167 168 167 167 30 114 117 115 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 70 86 70 68 16 32 35 34 14 14 1 35 34 32 35 35 32 24 10 35 303 298 84 114 113 86 35 303 116 113 113 116 116 116 114 113 1 113 121 119 117 120 36 122 121 121 122 122 3 117 119 116 115 117 116 116 117 1 117 117 117 114 115 117 115 122 26 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 2 3 3 3 3 2 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 2 3 3 3 3 3 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 | // SPDX-License-Identifier: GPL-2.0 /* * Block multiqueue core code * * Copyright (C) 2013-2014 Jens Axboe * Copyright (C) 2013-2014 Christoph Hellwig */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/backing-dev.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/blk-integrity.h> #include <linux/kmemleak.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/workqueue.h> #include <linux/smp.h> #include <linux/interrupt.h> #include <linux/llist.h> #include <linux/cpu.h> #include <linux/cache.h> #include <linux/sched/topology.h> #include <linux/sched/signal.h> #include <linux/delay.h> #include <linux/crash_dump.h> #include <linux/prefetch.h> #include <linux/blk-crypto.h> #include <linux/part_stat.h> #include <linux/sched/isolation.h> #include <trace/events/block.h> #include <linux/t10-pi.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-pm.h" #include "blk-stat.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd); static DEFINE_MUTEX(blk_mq_cpuhp_lock); static void blk_mq_insert_request(struct request *rq, blk_insert_t flags); static void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags); static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list); static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob, unsigned int flags); /* * Check if any of the ctx, dispatch list or elevator * have pending work in this hardware queue. */ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) { return !list_empty_careful(&hctx->dispatch) || sbitmap_any_bit_set(&hctx->ctx_map) || blk_mq_sched_has_work(hctx); } /* * Mark this ctx as having pending work in this hardware queue */ static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { const int bit = ctx->index_hw[hctx->type]; if (!sbitmap_test_bit(&hctx->ctx_map, bit)) sbitmap_set_bit(&hctx->ctx_map, bit); } static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { const int bit = ctx->index_hw[hctx->type]; sbitmap_clear_bit(&hctx->ctx_map, bit); } struct mq_inflight { struct block_device *part; unsigned int inflight[2]; }; static bool blk_mq_check_in_driver(struct request *rq, void *priv) { struct mq_inflight *mi = priv; if (rq->rq_flags & RQF_IO_STAT && (!bdev_is_partition(mi->part) || rq->part == mi->part) && blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) mi->inflight[rq_data_dir(rq)]++; return true; } void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2]) { struct mq_inflight mi = { .part = part }; blk_mq_queue_tag_busy_iter(bdev_get_queue(part), blk_mq_check_in_driver, &mi); inflight[READ] = mi.inflight[READ]; inflight[WRITE] = mi.inflight[WRITE]; } #ifdef CONFIG_LOCKDEP static bool blk_freeze_set_owner(struct request_queue *q, struct task_struct *owner) { if (!owner) return false; if (!q->mq_freeze_depth) { q->mq_freeze_owner = owner; q->mq_freeze_owner_depth = 1; q->mq_freeze_disk_dead = !q->disk || test_bit(GD_DEAD, &q->disk->state) || !blk_queue_registered(q); q->mq_freeze_queue_dying = blk_queue_dying(q); return true; } if (owner == q->mq_freeze_owner) q->mq_freeze_owner_depth += 1; return false; } /* verify the last unfreeze in owner context */ static bool blk_unfreeze_check_owner(struct request_queue *q) { if (q->mq_freeze_owner != current) return false; if (--q->mq_freeze_owner_depth == 0) { q->mq_freeze_owner = NULL; return true; } return false; } #else static bool blk_freeze_set_owner(struct request_queue *q, struct task_struct *owner) { return false; } static bool blk_unfreeze_check_owner(struct request_queue *q) { return false; } #endif bool __blk_freeze_queue_start(struct request_queue *q, struct task_struct *owner) { bool freeze; mutex_lock(&q->mq_freeze_lock); freeze = blk_freeze_set_owner(q, owner); if (++q->mq_freeze_depth == 1) { percpu_ref_kill(&q->q_usage_counter); mutex_unlock(&q->mq_freeze_lock); if (queue_is_mq(q)) blk_mq_run_hw_queues(q, false); } else { mutex_unlock(&q->mq_freeze_lock); } return freeze; } void blk_freeze_queue_start(struct request_queue *q) { if (__blk_freeze_queue_start(q, current)) blk_freeze_acquire_lock(q); } EXPORT_SYMBOL_GPL(blk_freeze_queue_start); void blk_mq_freeze_queue_wait(struct request_queue *q) { wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter)); } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout) { return wait_event_timeout(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter), timeout); } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout); void blk_mq_freeze_queue_nomemsave(struct request_queue *q) { blk_freeze_queue_start(q); blk_mq_freeze_queue_wait(q); } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_nomemsave); bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) { bool unfreeze; mutex_lock(&q->mq_freeze_lock); if (force_atomic) q->q_usage_counter.data->force_atomic = true; q->mq_freeze_depth--; WARN_ON_ONCE(q->mq_freeze_depth < 0); if (!q->mq_freeze_depth) { percpu_ref_resurrect(&q->q_usage_counter); wake_up_all(&q->mq_freeze_wq); } unfreeze = blk_unfreeze_check_owner(q); mutex_unlock(&q->mq_freeze_lock); return unfreeze; } void blk_mq_unfreeze_queue_nomemrestore(struct request_queue *q) { if (__blk_mq_unfreeze_queue(q, false)) blk_unfreeze_release_lock(q); } EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_nomemrestore); /* * non_owner variant of blk_freeze_queue_start * * Unlike blk_freeze_queue_start, the queue doesn't need to be unfrozen * by the same task. This is fragile and should not be used if at all * possible. */ void blk_freeze_queue_start_non_owner(struct request_queue *q) { __blk_freeze_queue_start(q, NULL); } EXPORT_SYMBOL_GPL(blk_freeze_queue_start_non_owner); /* non_owner variant of blk_mq_unfreeze_queue */ void blk_mq_unfreeze_queue_non_owner(struct request_queue *q) { __blk_mq_unfreeze_queue(q, false); } EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_non_owner); /* * FIXME: replace the scsi_internal_device_*block_nowait() calls in the * mpt3sas driver such that this function can be removed. */ void blk_mq_quiesce_queue_nowait(struct request_queue *q) { unsigned long flags; spin_lock_irqsave(&q->queue_lock, flags); if (!q->quiesce_depth++) blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q); spin_unlock_irqrestore(&q->queue_lock, flags); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); /** * blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done * @set: tag_set to wait on * * Note: it is driver's responsibility for making sure that quiesce has * been started on or more of the request_queues of the tag_set. This * function only waits for the quiesce on those request_queues that had * the quiesce flag set using blk_mq_quiesce_queue_nowait. */ void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set) { if (set->flags & BLK_MQ_F_BLOCKING) synchronize_srcu(set->srcu); else synchronize_rcu(); } EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done); /** * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished * @q: request queue. * * Note: this function does not prevent that the struct request end_io() * callback function is invoked. Once this function is returned, we make * sure no dispatch can happen until the queue is unquiesced via * blk_mq_unquiesce_queue(). */ void blk_mq_quiesce_queue(struct request_queue *q) { blk_mq_quiesce_queue_nowait(q); /* nothing to wait for non-mq queues */ if (queue_is_mq(q)) blk_mq_wait_quiesce_done(q->tag_set); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); /* * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue() * @q: request queue. * * This function recovers queue into the state before quiescing * which is done by blk_mq_quiesce_queue. */ void blk_mq_unquiesce_queue(struct request_queue *q) { unsigned long flags; bool run_queue = false; spin_lock_irqsave(&q->queue_lock, flags); if (WARN_ON_ONCE(q->quiesce_depth <= 0)) { ; } else if (!--q->quiesce_depth) { blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q); run_queue = true; } spin_unlock_irqrestore(&q->queue_lock, flags); /* dispatch requests which are inserted during quiescing */ if (run_queue) blk_mq_run_hw_queues(q, true); } EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set) { struct request_queue *q; mutex_lock(&set->tag_list_lock); list_for_each_entry(q, &set->tag_list, tag_set_list) { if (!blk_queue_skip_tagset_quiesce(q)) blk_mq_quiesce_queue_nowait(q); } mutex_unlock(&set->tag_list_lock); blk_mq_wait_quiesce_done(set); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset); void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set) { struct request_queue *q; mutex_lock(&set->tag_list_lock); list_for_each_entry(q, &set->tag_list, tag_set_list) { if (!blk_queue_skip_tagset_quiesce(q)) blk_mq_unquiesce_queue(q); } mutex_unlock(&set->tag_list_lock); } EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset); void blk_mq_wake_waiters(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_wakeup_all(hctx->tags, true); } void blk_rq_init(struct request_queue *q, struct request *rq) { memset(rq, 0, sizeof(*rq)); INIT_LIST_HEAD(&rq->queuelist); rq->q = q; rq->__sector = (sector_t) -1; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = BLK_MQ_NO_TAG; rq->start_time_ns = blk_time_get_ns(); blk_crypto_rq_set_defaults(rq); } EXPORT_SYMBOL(blk_rq_init); /* Set start and alloc time when the allocated request is actually used */ static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns) { #ifdef CONFIG_BLK_RQ_ALLOC_TIME if (blk_queue_rq_alloc_time(rq->q)) rq->alloc_time_ns = alloc_time_ns; else rq->alloc_time_ns = 0; #endif } static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, struct blk_mq_tags *tags, unsigned int tag) { struct blk_mq_ctx *ctx = data->ctx; struct blk_mq_hw_ctx *hctx = data->hctx; struct request_queue *q = data->q; struct request *rq = tags->static_rqs[tag]; rq->q = q; rq->mq_ctx = ctx; rq->mq_hctx = hctx; rq->cmd_flags = data->cmd_flags; if (data->flags & BLK_MQ_REQ_PM) data->rq_flags |= RQF_PM; rq->rq_flags = data->rq_flags; if (data->rq_flags & RQF_SCHED_TAGS) { rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = tag; } else { rq->tag = tag; rq->internal_tag = BLK_MQ_NO_TAG; } rq->timeout = 0; rq->part = NULL; rq->io_start_time_ns = 0; rq->stats_sectors = 0; rq->nr_phys_segments = 0; rq->nr_integrity_segments = 0; rq->end_io = NULL; rq->end_io_data = NULL; blk_crypto_rq_set_defaults(rq); INIT_LIST_HEAD(&rq->queuelist); /* tag was already set */ WRITE_ONCE(rq->deadline, 0); req_ref_set(rq, 1); if (rq->rq_flags & RQF_USE_SCHED) { struct elevator_queue *e = data->q->elevator; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); if (e->type->ops.prepare_request) e->type->ops.prepare_request(rq); } return rq; } static inline struct request * __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data) { unsigned int tag, tag_offset; struct blk_mq_tags *tags; struct request *rq; unsigned long tag_mask; int i, nr = 0; tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset); if (unlikely(!tag_mask)) return NULL; tags = blk_mq_tags_from_data(data); for (i = 0; tag_mask; i++) { if (!(tag_mask & (1UL << i))) continue; tag = tag_offset + i; prefetch(tags->static_rqs[tag]); tag_mask &= ~(1UL << i); rq = blk_mq_rq_ctx_init(data, tags, tag); rq_list_add_head(data->cached_rqs, rq); nr++; } if (!(data->rq_flags & RQF_SCHED_TAGS)) blk_mq_add_active_requests(data->hctx, nr); /* caller already holds a reference, add for remainder */ percpu_ref_get_many(&data->q->q_usage_counter, nr - 1); data->nr_tags -= nr; return rq_list_pop(data->cached_rqs); } static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) { struct request_queue *q = data->q; u64 alloc_time_ns = 0; struct request *rq; unsigned int tag; /* alloc_time includes depth and tag waits */ if (blk_queue_rq_alloc_time(q)) alloc_time_ns = blk_time_get_ns(); if (data->cmd_flags & REQ_NOWAIT) data->flags |= BLK_MQ_REQ_NOWAIT; retry: data->ctx = blk_mq_get_ctx(q); data->hctx = blk_mq_map_queue(data->cmd_flags, data->ctx); if (q->elevator) { /* * All requests use scheduler tags when an I/O scheduler is * enabled for the queue. */ data->rq_flags |= RQF_SCHED_TAGS; /* * Flush/passthrough requests are special and go directly to the * dispatch list. */ if ((data->cmd_flags & REQ_OP_MASK) != REQ_OP_FLUSH && !blk_op_is_passthrough(data->cmd_flags)) { struct elevator_mq_ops *ops = &q->elevator->type->ops; WARN_ON_ONCE(data->flags & BLK_MQ_REQ_RESERVED); data->rq_flags |= RQF_USE_SCHED; if (ops->limit_depth) ops->limit_depth(data->cmd_flags, data); } } else { blk_mq_tag_busy(data->hctx); } if (data->flags & BLK_MQ_REQ_RESERVED) data->rq_flags |= RQF_RESV; /* * Try batched alloc if we want more than 1 tag. */ if (data->nr_tags > 1) { rq = __blk_mq_alloc_requests_batch(data); if (rq) { blk_mq_rq_time_init(rq, alloc_time_ns); return rq; } data->nr_tags = 1; } /* * Waiting allocations only fail because of an inactive hctx. In that * case just retry the hctx assignment and tag allocation as CPU hotplug * should have migrated us to an online CPU by now. */ tag = blk_mq_get_tag(data); if (tag == BLK_MQ_NO_TAG) { if (data->flags & BLK_MQ_REQ_NOWAIT) return NULL; /* * Give up the CPU and sleep for a random short time to * ensure that thread using a realtime scheduling class * are migrated off the CPU, and thus off the hctx that * is going away. */ msleep(3); goto retry; } if (!(data->rq_flags & RQF_SCHED_TAGS)) blk_mq_inc_active_requests(data->hctx); rq = blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag); blk_mq_rq_time_init(rq, alloc_time_ns); return rq; } static struct request *blk_mq_rq_cache_fill(struct request_queue *q, struct blk_plug *plug, blk_opf_t opf, blk_mq_req_flags_t flags) { struct blk_mq_alloc_data data = { .q = q, .flags = flags, .shallow_depth = 0, .cmd_flags = opf, .rq_flags = 0, .nr_tags = plug->nr_ios, .cached_rqs = &plug->cached_rqs, .ctx = NULL, .hctx = NULL }; struct request *rq; if (blk_queue_enter(q, flags)) return NULL; plug->nr_ios = 1; rq = __blk_mq_alloc_requests(&data); if (unlikely(!rq)) blk_queue_exit(q); return rq; } static struct request *blk_mq_alloc_cached_request(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags) { struct blk_plug *plug = current->plug; struct request *rq; if (!plug) return NULL; if (rq_list_empty(&plug->cached_rqs)) { if (plug->nr_ios == 1) return NULL; rq = blk_mq_rq_cache_fill(q, plug, opf, flags); if (!rq) return NULL; } else { rq = rq_list_peek(&plug->cached_rqs); if (!rq || rq->q != q) return NULL; if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type) return NULL; if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) return NULL; rq_list_pop(&plug->cached_rqs); blk_mq_rq_time_init(rq, blk_time_get_ns()); } rq->cmd_flags = opf; INIT_LIST_HEAD(&rq->queuelist); return rq; } struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags) { struct request *rq; rq = blk_mq_alloc_cached_request(q, opf, flags); if (!rq) { struct blk_mq_alloc_data data = { .q = q, .flags = flags, .shallow_depth = 0, .cmd_flags = opf, .rq_flags = 0, .nr_tags = 1, .cached_rqs = NULL, .ctx = NULL, .hctx = NULL }; int ret; ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret); rq = __blk_mq_alloc_requests(&data); if (!rq) goto out_queue_exit; } rq->__data_len = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; return rq; out_queue_exit: blk_queue_exit(q); return ERR_PTR(-EWOULDBLOCK); } EXPORT_SYMBOL(blk_mq_alloc_request); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags, unsigned int hctx_idx) { struct blk_mq_alloc_data data = { .q = q, .flags = flags, .shallow_depth = 0, .cmd_flags = opf, .rq_flags = 0, .nr_tags = 1, .cached_rqs = NULL, .ctx = NULL, .hctx = NULL }; u64 alloc_time_ns = 0; struct request *rq; unsigned int cpu; unsigned int tag; int ret; /* alloc_time includes depth and tag waits */ if (blk_queue_rq_alloc_time(q)) alloc_time_ns = blk_time_get_ns(); /* * If the tag allocator sleeps we could get an allocation for a * different hardware context. No need to complicate the low level * allocator for this for the rare use case of a command tied to * a specific queue. */ if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)) || WARN_ON_ONCE(!(flags & BLK_MQ_REQ_RESERVED))) return ERR_PTR(-EINVAL); if (hctx_idx >= q->nr_hw_queues) return ERR_PTR(-EIO); ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret); /* * Check if the hardware context is actually mapped to anything. * If not tell the caller that it should skip this queue. */ ret = -EXDEV; data.hctx = xa_load(&q->hctx_table, hctx_idx); if (!blk_mq_hw_queue_mapped(data.hctx)) goto out_queue_exit; cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask); if (cpu >= nr_cpu_ids) goto out_queue_exit; data.ctx = __blk_mq_get_ctx(q, cpu); if (q->elevator) data.rq_flags |= RQF_SCHED_TAGS; else blk_mq_tag_busy(data.hctx); if (flags & BLK_MQ_REQ_RESERVED) data.rq_flags |= RQF_RESV; ret = -EWOULDBLOCK; tag = blk_mq_get_tag(&data); if (tag == BLK_MQ_NO_TAG) goto out_queue_exit; if (!(data.rq_flags & RQF_SCHED_TAGS)) blk_mq_inc_active_requests(data.hctx); rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag); blk_mq_rq_time_init(rq, alloc_time_ns); rq->__data_len = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; return rq; out_queue_exit: blk_queue_exit(q); return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); static void blk_mq_finish_request(struct request *rq) { struct request_queue *q = rq->q; blk_zone_finish_request(rq); if (rq->rq_flags & RQF_USE_SCHED) { q->elevator->type->ops.finish_request(rq); /* * For postflush request that may need to be * completed twice, we should clear this flag * to avoid double finish_request() on the rq. */ rq->rq_flags &= ~RQF_USE_SCHED; } } static void __blk_mq_free_request(struct request *rq) { struct request_queue *q = rq->q; struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; const int sched_tag = rq->internal_tag; blk_crypto_free_request(rq); blk_pm_mark_last_busy(rq); rq->mq_hctx = NULL; if (rq->tag != BLK_MQ_NO_TAG) { blk_mq_dec_active_requests(hctx); blk_mq_put_tag(hctx->tags, ctx, rq->tag); } if (sched_tag != BLK_MQ_NO_TAG) blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag); blk_mq_sched_restart(hctx); blk_queue_exit(q); } void blk_mq_free_request(struct request *rq) { struct request_queue *q = rq->q; blk_mq_finish_request(rq); if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) laptop_io_completion(q->disk->bdi); rq_qos_done(q, rq); WRITE_ONCE(rq->state, MQ_RQ_IDLE); if (req_ref_put_and_test(rq)) __blk_mq_free_request(rq); } EXPORT_SYMBOL_GPL(blk_mq_free_request); void blk_mq_free_plug_rqs(struct blk_plug *plug) { struct request *rq; while ((rq = rq_list_pop(&plug->cached_rqs)) != NULL) blk_mq_free_request(rq); } void blk_dump_rq_flags(struct request *rq, char *msg) { printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg, rq->q->disk ? rq->q->disk->disk_name : "?", (__force unsigned long long) rq->cmd_flags); printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", (unsigned long long)blk_rq_pos(rq), blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); printk(KERN_INFO " bio %p, biotail %p, len %u\n", rq->bio, rq->biotail, blk_rq_bytes(rq)); } EXPORT_SYMBOL(blk_dump_rq_flags); static void blk_account_io_completion(struct request *req, unsigned int bytes) { if (req->rq_flags & RQF_IO_STAT) { const int sgrp = op_stat_group(req_op(req)); part_stat_lock(); part_stat_add(req->part, sectors[sgrp], bytes >> 9); part_stat_unlock(); } } static void blk_print_req_error(struct request *req, blk_status_t status) { printk_ratelimited(KERN_ERR "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " "phys_seg %u prio class %u\n", blk_status_to_str(status), req->q->disk ? req->q->disk->disk_name : "?", blk_rq_pos(req), (__force u32)req_op(req), blk_op_str(req_op(req)), (__force u32)(req->cmd_flags & ~REQ_OP_MASK), req->nr_phys_segments, IOPRIO_PRIO_CLASS(req_get_ioprio(req))); } /* * Fully end IO on a request. Does not support partial completions, or * errors. */ static void blk_complete_request(struct request *req) { const bool is_flush = (req->rq_flags & RQF_FLUSH_SEQ) != 0; int total_bytes = blk_rq_bytes(req); struct bio *bio = req->bio; trace_block_rq_complete(req, BLK_STS_OK, total_bytes); if (!bio) return; if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ) blk_integrity_complete(req, total_bytes); /* * Upper layers may call blk_crypto_evict_key() anytime after the last * bio_endio(). Therefore, the keyslot must be released before that. */ blk_crypto_rq_put_keyslot(req); blk_account_io_completion(req, total_bytes); do { struct bio *next = bio->bi_next; /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); if (blk_req_bio_is_zone_append(req, bio)) blk_zone_append_update_request_bio(req, bio); if (!is_flush) bio_endio(bio); bio = next; } while (bio); /* * Reset counters so that the request stacking driver * can find how many bytes remain in the request * later. */ if (!req->end_io) { req->bio = NULL; req->__data_len = 0; } } /** * blk_update_request - Complete multiple bytes without completing the request * @req: the request being processed * @error: block status code * @nr_bytes: number of bytes to complete for @req * * Description: * Ends I/O on a number of bytes attached to @req, but doesn't complete * the request structure even if @req doesn't have leftover. * If @req has leftover, sets it up for the next range of segments. * * Passing the result of blk_rq_bytes() as @nr_bytes guarantees * %false return from this function. * * Note: * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function * except in the consistency check at the end of this function. * * Return: * %false - this request doesn't have any more data * %true - this request has more data **/ bool blk_update_request(struct request *req, blk_status_t error, unsigned int nr_bytes) { bool is_flush = req->rq_flags & RQF_FLUSH_SEQ; bool quiet = req->rq_flags & RQF_QUIET; int total_bytes; trace_block_rq_complete(req, error, nr_bytes); if (!req->bio) return false; if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && error == BLK_STS_OK) blk_integrity_complete(req, nr_bytes); /* * Upper layers may call blk_crypto_evict_key() anytime after the last * bio_endio(). Therefore, the keyslot must be released before that. */ if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req)) __blk_crypto_rq_put_keyslot(req); if (unlikely(error && !blk_rq_is_passthrough(req) && !quiet) && !test_bit(GD_DEAD, &req->q->disk->state)) { blk_print_req_error(req, error); trace_block_rq_error(req, error, nr_bytes); } blk_account_io_completion(req, nr_bytes); total_bytes = 0; while (req->bio) { struct bio *bio = req->bio; unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); if (unlikely(error)) bio->bi_status = error; if (bio_bytes == bio->bi_iter.bi_size) { req->bio = bio->bi_next; } else if (bio_is_zone_append(bio) && error == BLK_STS_OK) { /* * Partial zone append completions cannot be supported * as the BIO fragments may end up not being written * sequentially. */ bio->bi_status = BLK_STS_IOERR; } /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); if (unlikely(quiet)) bio_set_flag(bio, BIO_QUIET); bio_advance(bio, bio_bytes); /* Don't actually finish bio if it's part of flush sequence */ if (!bio->bi_iter.bi_size) { if (blk_req_bio_is_zone_append(req, bio)) blk_zone_append_update_request_bio(req, bio); if (!is_flush) bio_endio(bio); } total_bytes += bio_bytes; nr_bytes -= bio_bytes; if (!nr_bytes) break; } /* * completely done */ if (!req->bio) { /* * Reset counters so that the request stacking driver * can find how many bytes remain in the request * later. */ req->__data_len = 0; return false; } req->__data_len -= total_bytes; /* update sector only for requests with clear definition of sector */ if (!blk_rq_is_passthrough(req)) req->__sector += total_bytes >> 9; /* mixed attributes always follow the first bio */ if (req->rq_flags & RQF_MIXED_MERGE) { req->cmd_flags &= ~REQ_FAILFAST_MASK; req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK; } if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) { /* * If total number of sectors is less than the first segment * size, something has gone terribly wrong. */ if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { blk_dump_rq_flags(req, "request botched"); req->__data_len = blk_rq_cur_bytes(req); } /* recalculate the number of segments */ req->nr_phys_segments = blk_recalc_rq_segments(req); } return true; } EXPORT_SYMBOL_GPL(blk_update_request); static inline void blk_account_io_done(struct request *req, u64 now) { trace_block_io_done(req); /* * Account IO completion. flush_rq isn't accounted as a * normal IO on queueing nor completion. Accounting the * containing request is enough. */ if ((req->rq_flags & (RQF_IO_STAT|RQF_FLUSH_SEQ)) == RQF_IO_STAT) { const int sgrp = op_stat_group(req_op(req)); part_stat_lock(); update_io_ticks(req->part, jiffies, true); part_stat_inc(req->part, ios[sgrp]); part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); part_stat_local_dec(req->part, in_flight[op_is_write(req_op(req))]); part_stat_unlock(); } } static inline bool blk_rq_passthrough_stats(struct request *req) { struct bio *bio = req->bio; if (!blk_queue_passthrough_stat(req->q)) return false; /* Requests without a bio do not transfer data. */ if (!bio) return false; /* * Stats are accumulated in the bdev, so must have one attached to a * bio to track stats. Most drivers do not set the bdev for passthrough * requests, but nvme is one that will set it. */ if (!bio->bi_bdev) return false; /* * We don't know what a passthrough command does, but we know the * payload size and data direction. Ensuring the size is aligned to the * block size filters out most commands with payloads that don't * represent sector access. */ if (blk_rq_bytes(req) & (bdev_logical_block_size(bio->bi_bdev) - 1)) return false; return true; } static inline void blk_account_io_start(struct request *req) { trace_block_io_start(req); if (!blk_queue_io_stat(req->q)) return; if (blk_rq_is_passthrough(req) && !blk_rq_passthrough_stats(req)) return; req->rq_flags |= RQF_IO_STAT; req->start_time_ns = blk_time_get_ns(); /* * All non-passthrough requests are created from a bio with one * exception: when a flush command that is part of a flush sequence * generated by the state machine in blk-flush.c is cloned onto the * lower device by dm-multipath we can get here without a bio. */ if (req->bio) req->part = req->bio->bi_bdev; else req->part = req->q->disk->part0; part_stat_lock(); update_io_ticks(req->part, jiffies, false); part_stat_local_inc(req->part, in_flight[op_is_write(req_op(req))]); part_stat_unlock(); } static inline void __blk_mq_end_request_acct(struct request *rq, u64 now) { if (rq->rq_flags & RQF_STATS) blk_stat_add(rq, now); blk_mq_sched_completed_request(rq, now); blk_account_io_done(rq, now); } inline void __blk_mq_end_request(struct request *rq, blk_status_t error) { if (blk_mq_need_time_stamp(rq)) __blk_mq_end_request_acct(rq, blk_time_get_ns()); blk_mq_finish_request(rq); if (rq->end_io) { rq_qos_done(rq->q, rq); if (rq->end_io(rq, error) == RQ_END_IO_FREE) blk_mq_free_request(rq); } else { blk_mq_free_request(rq); } } EXPORT_SYMBOL(__blk_mq_end_request); void blk_mq_end_request(struct request *rq, blk_status_t error) { if (blk_update_request(rq, error, blk_rq_bytes(rq))) BUG(); __blk_mq_end_request(rq, error); } EXPORT_SYMBOL(blk_mq_end_request); #define TAG_COMP_BATCH 32 static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx, int *tag_array, int nr_tags) { struct request_queue *q = hctx->queue; blk_mq_sub_active_requests(hctx, nr_tags); blk_mq_put_tags(hctx->tags, tag_array, nr_tags); percpu_ref_put_many(&q->q_usage_counter, nr_tags); } void blk_mq_end_request_batch(struct io_comp_batch *iob) { int tags[TAG_COMP_BATCH], nr_tags = 0; struct blk_mq_hw_ctx *cur_hctx = NULL; struct request *rq; u64 now = 0; if (iob->need_ts) now = blk_time_get_ns(); while ((rq = rq_list_pop(&iob->req_list)) != NULL) { prefetch(rq->bio); prefetch(rq->rq_next); blk_complete_request(rq); if (iob->need_ts) __blk_mq_end_request_acct(rq, now); blk_mq_finish_request(rq); rq_qos_done(rq->q, rq); /* * If end_io handler returns NONE, then it still has * ownership of the request. */ if (rq->end_io && rq->end_io(rq, 0) == RQ_END_IO_NONE) continue; WRITE_ONCE(rq->state, MQ_RQ_IDLE); if (!req_ref_put_and_test(rq)) continue; blk_crypto_free_request(rq); blk_pm_mark_last_busy(rq); if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) { if (cur_hctx) blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags); nr_tags = 0; cur_hctx = rq->mq_hctx; } tags[nr_tags++] = rq->tag; } if (nr_tags) blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags); } EXPORT_SYMBOL_GPL(blk_mq_end_request_batch); static void blk_complete_reqs(struct llist_head *list) { struct llist_node *entry = llist_reverse_order(llist_del_all(list)); struct request *rq, *next; llist_for_each_entry_safe(rq, next, entry, ipi_list) rq->q->mq_ops->complete(rq); } static __latent_entropy void blk_done_softirq(void) { blk_complete_reqs(this_cpu_ptr(&blk_cpu_done)); } static int blk_softirq_cpu_dead(unsigned int cpu) { blk_complete_reqs(&per_cpu(blk_cpu_done, cpu)); return 0; } static void __blk_mq_complete_request_remote(void *data) { __raise_softirq_irqoff(BLOCK_SOFTIRQ); } static inline bool blk_mq_complete_need_ipi(struct request *rq) { int cpu = raw_smp_processor_id(); if (!IS_ENABLED(CONFIG_SMP) || !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) return false; /* * With force threaded interrupts enabled, raising softirq from an SMP * function call will always result in waking the ksoftirqd thread. * This is probably worse than completing the request on a different * cache domain. */ if (force_irqthreads()) return false; /* same CPU or cache domain and capacity? Complete locally */ if (cpu == rq->mq_ctx->cpu || (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) && cpus_share_cache(cpu, rq->mq_ctx->cpu) && cpus_equal_capacity(cpu, rq->mq_ctx->cpu))) return false; /* don't try to IPI to an offline CPU */ return cpu_online(rq->mq_ctx->cpu); } static void blk_mq_complete_send_ipi(struct request *rq) { unsigned int cpu; cpu = rq->mq_ctx->cpu; if (llist_add(&rq->ipi_list, &per_cpu(blk_cpu_done, cpu))) smp_call_function_single_async(cpu, &per_cpu(blk_cpu_csd, cpu)); } static void blk_mq_raise_softirq(struct request *rq) { struct llist_head *list; preempt_disable(); list = this_cpu_ptr(&blk_cpu_done); if (llist_add(&rq->ipi_list, list)) raise_softirq(BLOCK_SOFTIRQ); preempt_enable(); } bool blk_mq_complete_request_remote(struct request *rq) { WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); /* * For request which hctx has only one ctx mapping, * or a polled request, always complete locally, * it's pointless to redirect the completion. */ if ((rq->mq_hctx->nr_ctx == 1 && rq->mq_ctx->cpu == raw_smp_processor_id()) || rq->cmd_flags & REQ_POLLED) return false; if (blk_mq_complete_need_ipi(rq)) { blk_mq_complete_send_ipi(rq); return true; } if (rq->q->nr_hw_queues == 1) { blk_mq_raise_softirq(rq); return true; } return false; } EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote); /** * blk_mq_complete_request - end I/O on a request * @rq: the request being processed * * Description: * Complete a request by scheduling the ->complete_rq operation. **/ void blk_mq_complete_request(struct request *rq) { if (!blk_mq_complete_request_remote(rq)) rq->q->mq_ops->complete(rq); } EXPORT_SYMBOL(blk_mq_complete_request); /** * blk_mq_start_request - Start processing a request * @rq: Pointer to request to be started * * Function used by device drivers to notify the block layer that a request * is going to be processed now, so blk layer can do proper initializations * such as starting the timeout timer. */ void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; trace_block_rq_issue(rq); if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) && !blk_rq_is_passthrough(rq)) { rq->io_start_time_ns = blk_time_get_ns(); rq->stats_sectors = blk_rq_sectors(rq); rq->rq_flags |= RQF_STATS; rq_qos_issue(q, rq); } WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE); blk_add_timer(rq); WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT); rq->mq_hctx->tags->rqs[rq->tag] = rq; if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) blk_integrity_prepare(rq); if (rq->bio && rq->bio->bi_opf & REQ_POLLED) WRITE_ONCE(rq->bio->bi_cookie, rq->mq_hctx->queue_num); } EXPORT_SYMBOL(blk_mq_start_request); /* * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple * queues. This is important for md arrays to benefit from merging * requests. */ static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) { if (plug->multiple_queues) return BLK_MAX_REQUEST_COUNT * 2; return BLK_MAX_REQUEST_COUNT; } static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) { struct request *last = rq_list_peek(&plug->mq_list); if (!plug->rq_count) { trace_block_plug(rq->q); } else if (plug->rq_count >= blk_plug_max_rq_count(plug) || (!blk_queue_nomerges(rq->q) && blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { blk_mq_flush_plug_list(plug, false); last = NULL; trace_block_plug(rq->q); } if (!plug->multiple_queues && last && last->q != rq->q) plug->multiple_queues = true; /* * Any request allocated from sched tags can't be issued to * ->queue_rqs() directly */ if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS)) plug->has_elevator = true; rq_list_add_tail(&plug->mq_list, rq); plug->rq_count++; } /** * blk_execute_rq_nowait - insert a request to I/O scheduler for execution * @rq: request to insert * @at_head: insert request at head or tail of queue * * Description: * Insert a fully prepared request at the back of the I/O scheduler queue * for execution. Don't wait for completion. * * Note: * This function will invoke @done directly if the queue is dead. */ void blk_execute_rq_nowait(struct request *rq, bool at_head) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; WARN_ON(irqs_disabled()); WARN_ON(!blk_rq_is_passthrough(rq)); blk_account_io_start(rq); if (current->plug && !at_head) { blk_add_rq_to_plug(current->plug, rq); return; } blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0); blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); } EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); struct blk_rq_wait { struct completion done; blk_status_t ret; }; static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret) { struct blk_rq_wait *wait = rq->end_io_data; wait->ret = ret; complete(&wait->done); return RQ_END_IO_NONE; } bool blk_rq_is_poll(struct request *rq) { if (!rq->mq_hctx) return false; if (rq->mq_hctx->type != HCTX_TYPE_POLL) return false; return true; } EXPORT_SYMBOL_GPL(blk_rq_is_poll); static void blk_rq_poll_completion(struct request *rq, struct completion *wait) { do { blk_hctx_poll(rq->q, rq->mq_hctx, NULL, 0); cond_resched(); } while (!completion_done(wait)); } /** * blk_execute_rq - insert a request into queue for execution * @rq: request to insert * @at_head: insert request at head or tail of queue * * Description: * Insert a fully prepared request at the back of the I/O scheduler queue * for execution and wait for completion. * Return: The blk_status_t result provided to blk_mq_end_request(). */ blk_status_t blk_execute_rq(struct request *rq, bool at_head) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; struct blk_rq_wait wait = { .done = COMPLETION_INITIALIZER_ONSTACK(wait.done), }; WARN_ON(irqs_disabled()); WARN_ON(!blk_rq_is_passthrough(rq)); rq->end_io_data = &wait; rq->end_io = blk_end_sync_rq; blk_account_io_start(rq); blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0); blk_mq_run_hw_queue(hctx, false); if (blk_rq_is_poll(rq)) blk_rq_poll_completion(rq, &wait.done); else blk_wait_io(&wait.done); return wait.ret; } EXPORT_SYMBOL(blk_execute_rq); static void __blk_mq_requeue_request(struct request *rq) { struct request_queue *q = rq->q; blk_mq_put_driver_tag(rq); trace_block_rq_requeue(rq); rq_qos_requeue(q, rq); if (blk_mq_request_started(rq)) { WRITE_ONCE(rq->state, MQ_RQ_IDLE); rq->rq_flags &= ~RQF_TIMED_OUT; } } void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) { struct request_queue *q = rq->q; unsigned long flags; __blk_mq_requeue_request(rq); /* this request will be re-inserted to io scheduler queue */ blk_mq_sched_requeue_request(rq); spin_lock_irqsave(&q->requeue_lock, flags); list_add_tail(&rq->queuelist, &q->requeue_list); spin_unlock_irqrestore(&q->requeue_lock, flags); if (kick_requeue_list) blk_mq_kick_requeue_list(q); } EXPORT_SYMBOL(blk_mq_requeue_request); static void blk_mq_requeue_work(struct work_struct *work) { struct request_queue *q = container_of(work, struct request_queue, requeue_work.work); LIST_HEAD(rq_list); LIST_HEAD(flush_list); struct request *rq; spin_lock_irq(&q->requeue_lock); list_splice_init(&q->requeue_list, &rq_list); list_splice_init(&q->flush_list, &flush_list); spin_unlock_irq(&q->requeue_lock); while (!list_empty(&rq_list)) { rq = list_entry(rq_list.next, struct request, queuelist); list_del_init(&rq->queuelist); /* * If RQF_DONTPREP is set, the request has been started by the * driver already and might have driver-specific data allocated * already. Insert it into the hctx dispatch list to avoid * block layer merges for the request. */ if (rq->rq_flags & RQF_DONTPREP) blk_mq_request_bypass_insert(rq, 0); else blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD); } while (!list_empty(&flush_list)) { rq = list_entry(flush_list.next, struct request, queuelist); list_del_init(&rq->queuelist); blk_mq_insert_request(rq, 0); } blk_mq_run_hw_queues(q, false); } void blk_mq_kick_requeue_list(struct request_queue *q) { kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0); } EXPORT_SYMBOL(blk_mq_kick_requeue_list); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs) { kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, msecs_to_jiffies(msecs)); } EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); static bool blk_is_flush_data_rq(struct request *rq) { return (rq->rq_flags & RQF_FLUSH_SEQ) && !is_flush_rq(rq); } static bool blk_mq_rq_inflight(struct request *rq, void *priv) { /* * If we find a request that isn't idle we know the queue is busy * as it's checked in the iter. * Return false to stop the iteration. * * In case of queue quiesce, if one flush data request is completed, * don't count it as inflight given the flush sequence is suspended, * and the original flush data request is invisible to driver, just * like other pending requests because of quiesce */ if (blk_mq_request_started(rq) && !(blk_queue_quiesced(rq->q) && blk_is_flush_data_rq(rq) && blk_mq_request_completed(rq))) { bool *busy = priv; *busy = true; return false; } return true; } bool blk_mq_queue_inflight(struct request_queue *q) { bool busy = false; blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy); return busy; } EXPORT_SYMBOL_GPL(blk_mq_queue_inflight); static void blk_mq_rq_timed_out(struct request *req) { req->rq_flags |= RQF_TIMED_OUT; if (req->q->mq_ops->timeout) { enum blk_eh_timer_return ret; ret = req->q->mq_ops->timeout(req); if (ret == BLK_EH_DONE) return; WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER); } blk_add_timer(req); } struct blk_expired_data { bool has_timedout_rq; unsigned long next; unsigned long timeout_start; }; static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expired) { unsigned long deadline; if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT) return false; if (rq->rq_flags & RQF_TIMED_OUT) return false; deadline = READ_ONCE(rq->deadline); if (time_after_eq(expired->timeout_start, deadline)) return true; if (expired->next == 0) expired->next = deadline; else if (time_after(expired->next, deadline)) expired->next = deadline; return false; } void blk_mq_put_rq_ref(struct request *rq) { if (is_flush_rq(rq)) { if (rq->end_io(rq, 0) == RQ_END_IO_FREE) blk_mq_free_request(rq); } else if (req_ref_put_and_test(rq)) { __blk_mq_free_request(rq); } } static bool blk_mq_check_expired(struct request *rq, void *priv) { struct blk_expired_data *expired = priv; /* * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot * be reallocated underneath the timeout handler's processing, then * the expire check is reliable. If the request is not expired, then * it was completed and reallocated as a new request after returning * from blk_mq_check_expired(). */ if (blk_mq_req_expired(rq, expired)) { expired->has_timedout_rq = true; return false; } return true; } static bool blk_mq_handle_expired(struct request *rq, void *priv) { struct blk_expired_data *expired = priv; if (blk_mq_req_expired(rq, expired)) blk_mq_rq_timed_out(rq); return true; } static void blk_mq_timeout_work(struct work_struct *work) { struct request_queue *q = container_of(work, struct request_queue, timeout_work); struct blk_expired_data expired = { .timeout_start = jiffies, }; struct blk_mq_hw_ctx *hctx; unsigned long i; /* A deadlock might occur if a request is stuck requiring a * timeout at the same time a queue freeze is waiting * completion, since the timeout code would not be able to * acquire the queue reference here. * * That's why we don't use blk_queue_enter here; instead, we use * percpu_ref_tryget directly, because we need to be able to * obtain a reference even in the short window between the queue * starting to freeze, by dropping the first reference in * blk_freeze_queue_start, and the moment the last request is * consumed, marked by the instant q_usage_counter reaches * zero. */ if (!percpu_ref_tryget(&q->q_usage_counter)) return; /* check if there is any timed-out request */ blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &expired); if (expired.has_timedout_rq) { /* * Before walking tags, we must ensure any submit started * before the current time has finished. Since the submit * uses srcu or rcu, wait for a synchronization point to * ensure all running submits have finished */ blk_mq_wait_quiesce_done(q->tag_set); expired.next = 0; blk_mq_queue_tag_busy_iter(q, blk_mq_handle_expired, &expired); } if (expired.next != 0) { mod_timer(&q->timeout, expired.next); } else { /* * Request timeouts are handled as a forward rolling timer. If * we end up here it means that no requests are pending and * also that no request has been pending for a while. Mark * each hctx as idle. */ queue_for_each_hw_ctx(q, hctx, i) { /* the hctx may be unmapped, so check it here */ if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_idle(hctx); } } blk_queue_exit(q); } struct flush_busy_ctx_data { struct blk_mq_hw_ctx *hctx; struct list_head *list; }; static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) { struct flush_busy_ctx_data *flush_data = data; struct blk_mq_hw_ctx *hctx = flush_data->hctx; struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; enum hctx_type type = hctx->type; spin_lock(&ctx->lock); list_splice_tail_init(&ctx->rq_lists[type], flush_data->list); sbitmap_clear_bit(sb, bitnr); spin_unlock(&ctx->lock); return true; } /* * Process software queues that have been marked busy, splicing them * to the for-dispatch */ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) { struct flush_busy_ctx_data data = { .hctx = hctx, .list = list, }; sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data); } struct dispatch_rq_data { struct blk_mq_hw_ctx *hctx; struct request *rq; }; static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) { struct dispatch_rq_data *dispatch_data = data; struct blk_mq_hw_ctx *hctx = dispatch_data->hctx; struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; enum hctx_type type = hctx->type; spin_lock(&ctx->lock); if (!list_empty(&ctx->rq_lists[type])) { dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next); list_del_init(&dispatch_data->rq->queuelist); if (list_empty(&ctx->rq_lists[type])) sbitmap_clear_bit(sb, bitnr); } spin_unlock(&ctx->lock); return !dispatch_data->rq; } struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start) { unsigned off = start ? start->index_hw[hctx->type] : 0; struct dispatch_rq_data data = { .hctx = hctx, .rq = NULL, }; __sbitmap_for_each_set(&hctx->ctx_map, off, dispatch_rq_from_ctx, &data); return data.rq; } bool __blk_mq_alloc_driver_tag(struct request *rq) { struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags; unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; int tag; blk_mq_tag_busy(rq->mq_hctx); if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) { bt = &rq->mq_hctx->tags->breserved_tags; tag_offset = 0; } else { if (!hctx_may_queue(rq->mq_hctx, bt)) return false; } tag = __sbitmap_queue_get(bt); if (tag == BLK_MQ_NO_TAG) return false; rq->tag = tag + tag_offset; blk_mq_inc_active_requests(rq->mq_hctx); return true; } static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, void *key) { struct blk_mq_hw_ctx *hctx; hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); spin_lock(&hctx->dispatch_wait_lock); if (!list_empty(&wait->entry)) { struct sbitmap_queue *sbq; list_del_init(&wait->entry); sbq = &hctx->tags->bitmap_tags; atomic_dec(&sbq->ws_active); } spin_unlock(&hctx->dispatch_wait_lock); blk_mq_run_hw_queue(hctx, true); return 1; } /* * Mark us waiting for a tag. For shared tags, this involves hooking us into * the tag wakeups. For non-shared tags, we can simply mark us needing a * restart. For both cases, take care to check the condition again after * marking us as waiting. */ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, struct request *rq) { struct sbitmap_queue *sbq; struct wait_queue_head *wq; wait_queue_entry_t *wait; bool ret; if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) && !(blk_mq_is_shared_tags(hctx->flags))) { blk_mq_sched_mark_restart_hctx(hctx); /* * It's possible that a tag was freed in the window between the * allocation failure and adding the hardware queue to the wait * queue. * * Don't clear RESTART here, someone else could have set it. * At most this will cost an extra queue run. */ return blk_mq_get_driver_tag(rq); } wait = &hctx->dispatch_wait; if (!list_empty_careful(&wait->entry)) return false; if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) sbq = &hctx->tags->breserved_tags; else sbq = &hctx->tags->bitmap_tags; wq = &bt_wait_ptr(sbq, hctx)->wait; spin_lock_irq(&wq->lock); spin_lock(&hctx->dispatch_wait_lock); if (!list_empty(&wait->entry)) { spin_unlock(&hctx->dispatch_wait_lock); spin_unlock_irq(&wq->lock); return false; } atomic_inc(&sbq->ws_active); wait->flags &= ~WQ_FLAG_EXCLUSIVE; __add_wait_queue(wq, wait); /* * Add one explicit barrier since blk_mq_get_driver_tag() may * not imply barrier in case of failure. * * Order adding us to wait queue and allocating driver tag. * * The pair is the one implied in sbitmap_queue_wake_up() which * orders clearing sbitmap tag bits and waitqueue_active() in * __sbitmap_queue_wake_up(), since waitqueue_active() is lockless * * Otherwise, re-order of adding wait queue and getting driver tag * may cause __sbitmap_queue_wake_up() to wake up nothing because * the waitqueue_active() may not observe us in wait queue. */ smp_mb(); /* * It's possible that a tag was freed in the window between the * allocation failure and adding the hardware queue to the wait * queue. */ ret = blk_mq_get_driver_tag(rq); if (!ret) { spin_unlock(&hctx->dispatch_wait_lock); spin_unlock_irq(&wq->lock); return false; } /* * We got a tag, remove ourselves from the wait queue to ensure * someone else gets the wakeup. */ list_del_init(&wait->entry); atomic_dec(&sbq->ws_active); spin_unlock(&hctx->dispatch_wait_lock); spin_unlock_irq(&wq->lock); return true; } #define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT 8 #define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR 4 /* * Update dispatch busy with the Exponential Weighted Moving Average(EWMA): * - EWMA is one simple way to compute running average value * - weight(7/8 and 1/8) is applied so that it can decrease exponentially * - take 4 as factor for avoiding to get too small(0) result, and this * factor doesn't matter because EWMA decreases exponentially */ static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy) { unsigned int ewma; ewma = hctx->dispatch_busy; if (!ewma && !busy) return; ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1; if (busy) ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR; ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT; hctx->dispatch_busy = ewma; } #define BLK_MQ_RESOURCE_DELAY 3 /* ms units */ static void blk_mq_handle_dev_resource(struct request *rq, struct list_head *list) { list_add(&rq->queuelist, list); __blk_mq_requeue_request(rq); } enum prep_dispatch { PREP_DISPATCH_OK, PREP_DISPATCH_NO_TAG, PREP_DISPATCH_NO_BUDGET, }; static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq, bool need_budget) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; int budget_token = -1; if (need_budget) { budget_token = blk_mq_get_dispatch_budget(rq->q); if (budget_token < 0) { blk_mq_put_driver_tag(rq); return PREP_DISPATCH_NO_BUDGET; } blk_mq_set_rq_budget_token(rq, budget_token); } if (!blk_mq_get_driver_tag(rq)) { /* * The initial allocation attempt failed, so we need to * rerun the hardware queue when a tag is freed. The * waitqueue takes care of that. If the queue is run * before we add this entry back on the dispatch list, * we'll re-run it below. */ if (!blk_mq_mark_tag_wait(hctx, rq)) { /* * All budgets not got from this function will be put * together during handling partial dispatch */ if (need_budget) blk_mq_put_dispatch_budget(rq->q, budget_token); return PREP_DISPATCH_NO_TAG; } } return PREP_DISPATCH_OK; } /* release all allocated budgets before calling to blk_mq_dispatch_rq_list */ static void blk_mq_release_budgets(struct request_queue *q, struct list_head *list) { struct request *rq; list_for_each_entry(rq, list, queuelist) { int budget_token = blk_mq_get_rq_budget_token(rq); if (budget_token >= 0) blk_mq_put_dispatch_budget(q, budget_token); } } /* * blk_mq_commit_rqs will notify driver using bd->last that there is no * more requests. (See comment in struct blk_mq_ops for commit_rqs for * details) * Attention, we should explicitly call this in unusual cases: * 1) did not queue everything initially scheduled to queue * 2) the last attempt to queue a request failed */ static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int queued, bool from_schedule) { if (hctx->queue->mq_ops->commit_rqs && queued) { trace_block_unplug(hctx->queue, queued, !from_schedule); hctx->queue->mq_ops->commit_rqs(hctx); } } /* * Returns true if we did some work AND can potentially do more. */ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, bool get_budget) { enum prep_dispatch prep; struct request_queue *q = hctx->queue; struct request *rq; int queued; blk_status_t ret = BLK_STS_OK; bool needs_resource = false; if (list_empty(list)) return false; /* * Now process all the entries, sending them to the driver. */ queued = 0; do { struct blk_mq_queue_data bd; rq = list_first_entry(list, struct request, queuelist); WARN_ON_ONCE(hctx != rq->mq_hctx); prep = blk_mq_prep_dispatch_rq(rq, get_budget); if (prep != PREP_DISPATCH_OK) break; list_del_init(&rq->queuelist); bd.rq = rq; bd.last = list_empty(list); ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { case BLK_STS_OK: queued++; break; case BLK_STS_RESOURCE: needs_resource = true; fallthrough; case BLK_STS_DEV_RESOURCE: blk_mq_handle_dev_resource(rq, list); goto out; default: blk_mq_end_request(rq, ret); } } while (!list_empty(list)); out: /* If we didn't flush the entire list, we could have told the driver * there was more coming, but that turned out to be a lie. */ if (!list_empty(list) || ret != BLK_STS_OK) blk_mq_commit_rqs(hctx, queued, false); /* * Any items that need requeuing? Stuff them into hctx->dispatch, * that is where we will continue on next queue run. */ if (!list_empty(list)) { bool needs_restart; /* For non-shared tags, the RESTART check will suffice */ bool no_tag = prep == PREP_DISPATCH_NO_TAG && ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) || blk_mq_is_shared_tags(hctx->flags)); /* * If the caller allocated budgets, free the budgets of the * requests that have not yet been passed to the block driver. */ if (!get_budget) blk_mq_release_budgets(q, list); spin_lock(&hctx->lock); list_splice_tail_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); /* * Order adding requests to hctx->dispatch and checking * SCHED_RESTART flag. The pair of this smp_mb() is the one * in blk_mq_sched_restart(). Avoid restart code path to * miss the new added requests to hctx->dispatch, meantime * SCHED_RESTART is observed here. */ smp_mb(); /* * If SCHED_RESTART was set by the caller of this function and * it is no longer set that means that it was cleared by another * thread and hence that a queue rerun is needed. * * If 'no_tag' is set, that means that we failed getting * a driver tag with an I/O scheduler attached. If our dispatch * waitqueue is no longer active, ensure that we run the queue * AFTER adding our entries back to the list. * * If no I/O scheduler has been configured it is possible that * the hardware queue got stopped and restarted before requests * were pushed back onto the dispatch list. Rerun the queue to * avoid starvation. Notes: * - blk_mq_run_hw_queue() checks whether or not a queue has * been stopped before rerunning a queue. * - Some but not all block drivers stop a queue before * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq * and dm-rq. * * If driver returns BLK_STS_RESOURCE and SCHED_RESTART * bit is set, run queue after a delay to avoid IO stalls * that could otherwise occur if the queue is idle. We'll do * similar if we couldn't get budget or couldn't lock a zone * and SCHED_RESTART is set. */ needs_restart = blk_mq_sched_needs_restart(hctx); if (prep == PREP_DISPATCH_NO_BUDGET) needs_resource = true; if (!needs_restart || (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) blk_mq_run_hw_queue(hctx, true); else if (needs_resource) blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); blk_mq_update_dispatch_busy(hctx, true); return false; } blk_mq_update_dispatch_busy(hctx, false); return true; } static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx) { int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask); if (cpu >= nr_cpu_ids) cpu = cpumask_first(hctx->cpumask); return cpu; } /* * ->next_cpu is always calculated from hctx->cpumask, so simply use * it for speeding up the check */ static bool blk_mq_hctx_empty_cpumask(struct blk_mq_hw_ctx *hctx) { return hctx->next_cpu >= nr_cpu_ids; } /* * It'd be great if the workqueue API had a way to pass * in a mask and had some smarts for more clever placement. * For now we just round-robin here, switching for every * BLK_MQ_CPU_WORK_BATCH queued items. */ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) { bool tried = false; int next_cpu = hctx->next_cpu; /* Switch to unbound if no allowable CPUs in this hctx */ if (hctx->queue->nr_hw_queues == 1 || blk_mq_hctx_empty_cpumask(hctx)) return WORK_CPU_UNBOUND; if (--hctx->next_cpu_batch <= 0) { select_cpu: next_cpu = cpumask_next_and(next_cpu, hctx->cpumask, cpu_online_mask); if (next_cpu >= nr_cpu_ids) next_cpu = blk_mq_first_mapped_cpu(hctx); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } /* * Do unbound schedule if we can't find a online CPU for this hctx, * and it should only happen in the path of handling CPU DEAD. */ if (!cpu_online(next_cpu)) { if (!tried) { tried = true; goto select_cpu; } /* * Make sure to re-select CPU next time once after CPUs * in hctx->cpumask become online again. */ hctx->next_cpu = next_cpu; hctx->next_cpu_batch = 1; return WORK_CPU_UNBOUND; } hctx->next_cpu = next_cpu; return next_cpu; } /** * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously. * @hctx: Pointer to the hardware queue to run. * @msecs: Milliseconds of delay to wait before running the queue. * * Run a hardware queue asynchronously with a delay of @msecs. */ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) { if (unlikely(blk_mq_hctx_stopped(hctx))) return; kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, msecs_to_jiffies(msecs)); } EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); static inline bool blk_mq_hw_queue_need_run(struct blk_mq_hw_ctx *hctx) { bool need_run; /* * When queue is quiesced, we may be switching io scheduler, or * updating nr_hw_queues, or other things, and we can't run queue * any more, even blk_mq_hctx_has_pending() can't be called safely. * * And queue will be rerun in blk_mq_unquiesce_queue() if it is * quiesced. */ __blk_mq_run_dispatch_ops(hctx->queue, false, need_run = !blk_queue_quiesced(hctx->queue) && blk_mq_hctx_has_pending(hctx)); return need_run; } /** * blk_mq_run_hw_queue - Start to run a hardware queue. * @hctx: Pointer to the hardware queue to run. * @async: If we want to run the queue asynchronously. * * Check if the request queue is not in a quiesced state and if there are * pending requests to be sent. If this is true, run the queue to send requests * to hardware. */ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { bool need_run; /* * We can't run the queue inline with interrupts disabled. */ WARN_ON_ONCE(!async && in_interrupt()); might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING); need_run = blk_mq_hw_queue_need_run(hctx); if (!need_run) { unsigned long flags; /* * Synchronize with blk_mq_unquiesce_queue(), because we check * if hw queue is quiesced locklessly above, we need the use * ->queue_lock to make sure we see the up-to-date status to * not miss rerunning the hw queue. */ spin_lock_irqsave(&hctx->queue->queue_lock, flags); need_run = blk_mq_hw_queue_need_run(hctx); spin_unlock_irqrestore(&hctx->queue->queue_lock, flags); if (!need_run) return; } if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { blk_mq_delay_run_hw_queue(hctx, 0); return; } blk_mq_run_dispatch_ops(hctx->queue, blk_mq_sched_dispatch_requests(hctx)); } EXPORT_SYMBOL(blk_mq_run_hw_queue); /* * Return prefered queue to dispatch from (if any) for non-mq aware IO * scheduler. */ static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q) { struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); /* * If the IO scheduler does not respect hardware queues when * dispatching, we just don't bother with multiple HW queues and * dispatch from hctx for the current CPU since running multiple queues * just causes lock contention inside the scheduler and pointless cache * bouncing. */ struct blk_mq_hw_ctx *hctx = ctx->hctxs[HCTX_TYPE_DEFAULT]; if (!blk_mq_hctx_stopped(hctx)) return hctx; return NULL; } /** * blk_mq_run_hw_queues - Run all hardware queues in a request queue. * @q: Pointer to the request queue to run. * @async: If we want to run the queue asynchronously. */ void blk_mq_run_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx, *sq_hctx; unsigned long i; sq_hctx = NULL; if (blk_queue_sq_sched(q)) sq_hctx = blk_mq_get_sq_hctx(q); queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) continue; /* * Dispatch from this hctx either if there's no hctx preferred * by IO scheduler or if it has requests that bypass the * scheduler. */ if (!sq_hctx || sq_hctx == hctx || !list_empty_careful(&hctx->dispatch)) blk_mq_run_hw_queue(hctx, async); } } EXPORT_SYMBOL(blk_mq_run_hw_queues); /** * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously. * @q: Pointer to the request queue to run. * @msecs: Milliseconds of delay to wait before running the queues. */ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) { struct blk_mq_hw_ctx *hctx, *sq_hctx; unsigned long i; sq_hctx = NULL; if (blk_queue_sq_sched(q)) sq_hctx = blk_mq_get_sq_hctx(q); queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) continue; /* * If there is already a run_work pending, leave the * pending delay untouched. Otherwise, a hctx can stall * if another hctx is re-delaying the other's work * before the work executes. */ if (delayed_work_pending(&hctx->run_work)) continue; /* * Dispatch from this hctx either if there's no hctx preferred * by IO scheduler or if it has requests that bypass the * scheduler. */ if (!sq_hctx || sq_hctx == hctx || !list_empty_careful(&hctx->dispatch)) blk_mq_delay_run_hw_queue(hctx, msecs); } } EXPORT_SYMBOL(blk_mq_delay_run_hw_queues); /* * This function is often used for pausing .queue_rq() by driver when * there isn't enough resource or some conditions aren't satisfied, and * BLK_STS_RESOURCE is usually returned. * * We do not guarantee that dispatch can be drained or blocked * after blk_mq_stop_hw_queue() returns. Please use * blk_mq_quiesce_queue() for that requirement. */ void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) { cancel_delayed_work(&hctx->run_work); set_bit(BLK_MQ_S_STOPPED, &hctx->state); } EXPORT_SYMBOL(blk_mq_stop_hw_queue); /* * This function is often used for pausing .queue_rq() by driver when * there isn't enough resource or some conditions aren't satisfied, and * BLK_STS_RESOURCE is usually returned. * * We do not guarantee that dispatch can be drained or blocked * after blk_mq_stop_hw_queues() returns. Please use * blk_mq_quiesce_queue() for that requirement. */ void blk_mq_stop_hw_queues(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_stop_hw_queue(hctx); } EXPORT_SYMBOL(blk_mq_stop_hw_queues); void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) { clear_bit(BLK_MQ_S_STOPPED, &hctx->state); blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); } EXPORT_SYMBOL(blk_mq_start_hw_queue); void blk_mq_start_hw_queues(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_start_hw_queue(hctx); } EXPORT_SYMBOL(blk_mq_start_hw_queues); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { if (!blk_mq_hctx_stopped(hctx)) return; clear_bit(BLK_MQ_S_STOPPED, &hctx->state); /* * Pairs with the smp_mb() in blk_mq_hctx_stopped() to order the * clearing of BLK_MQ_S_STOPPED above and the checking of dispatch * list in the subsequent routine. */ smp_mb__after_atomic(); blk_mq_run_hw_queue(hctx, async); } EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_start_stopped_hw_queue(hctx, async || (hctx->flags & BLK_MQ_F_BLOCKING)); } EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); static void blk_mq_run_work_fn(struct work_struct *work) { struct blk_mq_hw_ctx *hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); blk_mq_run_dispatch_ops(hctx->queue, blk_mq_sched_dispatch_requests(hctx)); } /** * blk_mq_request_bypass_insert - Insert a request at dispatch list. * @rq: Pointer to request to be inserted. * @flags: BLK_MQ_INSERT_* * * Should only be used carefully, when the caller knows we want to * bypass a potential IO scheduler on the target device. */ static void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; spin_lock(&hctx->lock); if (flags & BLK_MQ_INSERT_AT_HEAD) list_add(&rq->queuelist, &hctx->dispatch); else list_add_tail(&rq->queuelist, &hctx->dispatch); spin_unlock(&hctx->lock); } static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list, bool run_queue_async) { struct request *rq; enum hctx_type type = hctx->type; /* * Try to issue requests directly if the hw queue isn't busy to save an * extra enqueue & dequeue to the sw queue. */ if (!hctx->dispatch_busy && !run_queue_async) { blk_mq_run_dispatch_ops(hctx->queue, blk_mq_try_issue_list_directly(hctx, list)); if (list_empty(list)) goto out; } /* * preemption doesn't flush plug list, so it's possible ctx->cpu is * offline now */ list_for_each_entry(rq, list, queuelist) { BUG_ON(rq->mq_ctx != ctx); trace_block_rq_insert(rq); if (rq->cmd_flags & REQ_NOWAIT) run_queue_async = true; } spin_lock(&ctx->lock); list_splice_tail_init(list, &ctx->rq_lists[type]); blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); out: blk_mq_run_hw_queue(hctx, run_queue_async); } static void blk_mq_insert_request(struct request *rq, blk_insert_t flags) { struct request_queue *q = rq->q; struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; if (blk_rq_is_passthrough(rq)) { /* * Passthrough request have to be added to hctx->dispatch * directly. The device may be in a situation where it can't * handle FS request, and always returns BLK_STS_RESOURCE for * them, which gets them added to hctx->dispatch. * * If a passthrough request is required to unblock the queues, * and it is added to the scheduler queue, there is no chance to * dispatch it given we prioritize requests in hctx->dispatch. */ blk_mq_request_bypass_insert(rq, flags); } else if (req_op(rq) == REQ_OP_FLUSH) { /* * Firstly normal IO request is inserted to scheduler queue or * sw queue, meantime we add flush request to dispatch queue( * hctx->dispatch) directly and there is at most one in-flight * flush request for each hw queue, so it doesn't matter to add * flush request to tail or front of the dispatch queue. * * Secondly in case of NCQ, flush request belongs to non-NCQ * command, and queueing it will fail when there is any * in-flight normal IO request(NCQ command). When adding flush * rq to the front of hctx->dispatch, it is easier to introduce * extra time to flush rq's latency because of S_SCHED_RESTART * compared with adding to the tail of dispatch queue, then * chance of flush merge is increased, and less flush requests * will be issued to controller. It is observed that ~10% time * is saved in blktests block/004 on disk attached to AHCI/NCQ * drive when adding flush rq to the front of hctx->dispatch. * * Simply queue flush rq to the front of hctx->dispatch so that * intensive flush workloads can benefit in case of NCQ HW. */ blk_mq_request_bypass_insert(rq, BLK_MQ_INSERT_AT_HEAD); } else if (q->elevator) { LIST_HEAD(list); WARN_ON_ONCE(rq->tag != BLK_MQ_NO_TAG); list_add(&rq->queuelist, &list); q->elevator->type->ops.insert_requests(hctx, &list, flags); } else { trace_block_rq_insert(rq); spin_lock(&ctx->lock); if (flags & BLK_MQ_INSERT_AT_HEAD) list_add(&rq->queuelist, &ctx->rq_lists[hctx->type]); else list_add_tail(&rq->queuelist, &ctx->rq_lists[hctx->type]); blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); } } static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, unsigned int nr_segs) { int err; if (bio->bi_opf & REQ_RAHEAD) rq->cmd_flags |= REQ_FAILFAST_MASK; rq->bio = rq->biotail = bio; rq->__sector = bio->bi_iter.bi_sector; rq->__data_len = bio->bi_iter.bi_size; rq->nr_phys_segments = nr_segs; if (bio_integrity(bio)) rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q, bio); /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */ err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO); WARN_ON_ONCE(err); blk_account_io_start(rq); } static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, bool last) { struct request_queue *q = rq->q; struct blk_mq_queue_data bd = { .rq = rq, .last = last, }; blk_status_t ret; /* * For OK queue, we are done. For error, caller may kill it. * Any other error (busy), just add it to our list as we * previously would have done. */ ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { case BLK_STS_OK: blk_mq_update_dispatch_busy(hctx, false); break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: blk_mq_update_dispatch_busy(hctx, true); __blk_mq_requeue_request(rq); break; default: blk_mq_update_dispatch_busy(hctx, false); break; } return ret; } static bool blk_mq_get_budget_and_tag(struct request *rq) { int budget_token; budget_token = blk_mq_get_dispatch_budget(rq->q); if (budget_token < 0) return false; blk_mq_set_rq_budget_token(rq, budget_token); if (!blk_mq_get_driver_tag(rq)) { blk_mq_put_dispatch_budget(rq->q, budget_token); return false; } return true; } /** * blk_mq_try_issue_directly - Try to send a request directly to device driver. * @hctx: Pointer of the associated hardware queue. * @rq: Pointer to request to be sent. * * If the device has enough resources to accept a new request now, send the * request directly to device driver. Else, insert at hctx->dispatch queue, so * we can try send it another time in the future. Requests inserted at this * queue have higher priority. */ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq) { blk_status_t ret; if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { blk_mq_insert_request(rq, 0); blk_mq_run_hw_queue(hctx, false); return; } if ((rq->rq_flags & RQF_USE_SCHED) || !blk_mq_get_budget_and_tag(rq)) { blk_mq_insert_request(rq, 0); blk_mq_run_hw_queue(hctx, rq->cmd_flags & REQ_NOWAIT); return; } ret = __blk_mq_issue_directly(hctx, rq, true); switch (ret) { case BLK_STS_OK: break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: blk_mq_request_bypass_insert(rq, 0); blk_mq_run_hw_queue(hctx, false); break; default: blk_mq_end_request(rq, ret); break; } } static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { blk_mq_insert_request(rq, 0); blk_mq_run_hw_queue(hctx, false); return BLK_STS_OK; } if (!blk_mq_get_budget_and_tag(rq)) return BLK_STS_RESOURCE; return __blk_mq_issue_directly(hctx, rq, last); } static void blk_mq_issue_direct(struct rq_list *rqs) { struct blk_mq_hw_ctx *hctx = NULL; struct request *rq; int queued = 0; blk_status_t ret = BLK_STS_OK; while ((rq = rq_list_pop(rqs))) { bool last = rq_list_empty(rqs); if (hctx != rq->mq_hctx) { if (hctx) { blk_mq_commit_rqs(hctx, queued, false); queued = 0; } hctx = rq->mq_hctx; } ret = blk_mq_request_issue_directly(rq, last); switch (ret) { case BLK_STS_OK: queued++; break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: blk_mq_request_bypass_insert(rq, 0); blk_mq_run_hw_queue(hctx, false); goto out; default: blk_mq_end_request(rq, ret); break; } } out: if (ret != BLK_STS_OK) blk_mq_commit_rqs(hctx, queued, false); } static void __blk_mq_flush_list(struct request_queue *q, struct rq_list *rqs) { if (blk_queue_quiesced(q)) return; q->mq_ops->queue_rqs(rqs); } static unsigned blk_mq_extract_queue_requests(struct rq_list *rqs, struct rq_list *queue_rqs) { struct request *rq = rq_list_pop(rqs); struct request_queue *this_q = rq->q; struct request **prev = &rqs->head; struct rq_list matched_rqs = {}; struct request *last = NULL; unsigned depth = 1; rq_list_add_tail(&matched_rqs, rq); while ((rq = *prev)) { if (rq->q == this_q) { /* move rq from rqs to matched_rqs */ *prev = rq->rq_next; rq_list_add_tail(&matched_rqs, rq); depth++; } else { /* leave rq in rqs */ prev = &rq->rq_next; last = rq; } } rqs->tail = last; *queue_rqs = matched_rqs; return depth; } static void blk_mq_dispatch_queue_requests(struct rq_list *rqs, unsigned depth) { struct request_queue *q = rq_list_peek(rqs)->q; trace_block_unplug(q, depth, true); /* * Peek first request and see if we have a ->queue_rqs() hook. * If we do, we can dispatch the whole list in one go. * We already know at this point that all requests belong to the * same queue, caller must ensure that's the case. */ if (q->mq_ops->queue_rqs) { blk_mq_run_dispatch_ops(q, __blk_mq_flush_list(q, rqs)); if (rq_list_empty(rqs)) return; } blk_mq_run_dispatch_ops(q, blk_mq_issue_direct(rqs)); } static void blk_mq_dispatch_list(struct rq_list *rqs, bool from_sched) { struct blk_mq_hw_ctx *this_hctx = NULL; struct blk_mq_ctx *this_ctx = NULL; struct rq_list requeue_list = {}; unsigned int depth = 0; bool is_passthrough = false; LIST_HEAD(list); do { struct request *rq = rq_list_pop(rqs); if (!this_hctx) { this_hctx = rq->mq_hctx; this_ctx = rq->mq_ctx; is_passthrough = blk_rq_is_passthrough(rq); } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx || is_passthrough != blk_rq_is_passthrough(rq)) { rq_list_add_tail(&requeue_list, rq); continue; } list_add_tail(&rq->queuelist, &list); depth++; } while (!rq_list_empty(rqs)); *rqs = requeue_list; trace_block_unplug(this_hctx->queue, depth, !from_sched); percpu_ref_get(&this_hctx->queue->q_usage_counter); /* passthrough requests should never be issued to the I/O scheduler */ if (is_passthrough) { spin_lock(&this_hctx->lock); list_splice_tail_init(&list, &this_hctx->dispatch); spin_unlock(&this_hctx->lock); blk_mq_run_hw_queue(this_hctx, from_sched); } else if (this_hctx->queue->elevator) { this_hctx->queue->elevator->type->ops.insert_requests(this_hctx, &list, 0); blk_mq_run_hw_queue(this_hctx, from_sched); } else { blk_mq_insert_requests(this_hctx, this_ctx, &list, from_sched); } percpu_ref_put(&this_hctx->queue->q_usage_counter); } static void blk_mq_dispatch_multiple_queue_requests(struct rq_list *rqs) { do { struct rq_list queue_rqs; unsigned depth; depth = blk_mq_extract_queue_requests(rqs, &queue_rqs); blk_mq_dispatch_queue_requests(&queue_rqs, depth); while (!rq_list_empty(&queue_rqs)) blk_mq_dispatch_list(&queue_rqs, false); } while (!rq_list_empty(rqs)); } void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { unsigned int depth; /* * We may have been called recursively midway through handling * plug->mq_list via a schedule() in the driver's queue_rq() callback. * To avoid mq_list changing under our feet, clear rq_count early and * bail out specifically if rq_count is 0 rather than checking * whether the mq_list is empty. */ if (plug->rq_count == 0) return; depth = plug->rq_count; plug->rq_count = 0; if (!plug->has_elevator && !from_schedule) { if (plug->multiple_queues) { blk_mq_dispatch_multiple_queue_requests(&plug->mq_list); return; } blk_mq_dispatch_queue_requests(&plug->mq_list, depth); if (rq_list_empty(&plug->mq_list)) return; } do { blk_mq_dispatch_list(&plug->mq_list, from_schedule); } while (!rq_list_empty(&plug->mq_list)); } static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list) { int queued = 0; blk_status_t ret = BLK_STS_OK; while (!list_empty(list)) { struct request *rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); ret = blk_mq_request_issue_directly(rq, list_empty(list)); switch (ret) { case BLK_STS_OK: queued++; break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: blk_mq_request_bypass_insert(rq, 0); if (list_empty(list)) blk_mq_run_hw_queue(hctx, false); goto out; default: blk_mq_end_request(rq, ret); break; } } out: if (ret != BLK_STS_OK) blk_mq_commit_rqs(hctx, queued, false); } static bool blk_mq_attempt_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { if (!blk_queue_nomerges(q) && bio_mergeable(bio)) { if (blk_attempt_plug_merge(q, bio, nr_segs)) return true; if (blk_mq_sched_bio_merge(q, bio, nr_segs)) return true; } return false; } static struct request *blk_mq_get_new_requests(struct request_queue *q, struct blk_plug *plug, struct bio *bio) { struct blk_mq_alloc_data data = { .q = q, .flags = 0, .shallow_depth = 0, .cmd_flags = bio->bi_opf, .rq_flags = 0, .nr_tags = 1, .cached_rqs = NULL, .ctx = NULL, .hctx = NULL }; struct request *rq; rq_qos_throttle(q, bio); if (plug) { data.nr_tags = plug->nr_ios; plug->nr_ios = 1; data.cached_rqs = &plug->cached_rqs; } rq = __blk_mq_alloc_requests(&data); if (unlikely(!rq)) rq_qos_cleanup(q, bio); return rq; } /* * Check if there is a suitable cached request and return it. */ static struct request *blk_mq_peek_cached_request(struct blk_plug *plug, struct request_queue *q, blk_opf_t opf) { enum hctx_type type = blk_mq_get_hctx_type(opf); struct request *rq; if (!plug) return NULL; rq = rq_list_peek(&plug->cached_rqs); if (!rq || rq->q != q) return NULL; if (type != rq->mq_hctx->type && (type != HCTX_TYPE_READ || rq->mq_hctx->type != HCTX_TYPE_DEFAULT)) return NULL; if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) return NULL; return rq; } static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, struct bio *bio) { if (rq_list_pop(&plug->cached_rqs) != rq) WARN_ON_ONCE(1); /* * If any qos ->throttle() end up blocking, we will have flushed the * plug and hence killed the cached_rq list as well. Pop this entry * before we throttle. */ rq_qos_throttle(rq->q, bio); blk_mq_rq_time_init(rq, blk_time_get_ns()); rq->cmd_flags = bio->bi_opf; INIT_LIST_HEAD(&rq->queuelist); } static bool bio_unaligned(const struct bio *bio, struct request_queue *q) { unsigned int bs_mask = queue_logical_block_size(q) - 1; /* .bi_sector of any zero sized bio need to be initialized */ if ((bio->bi_iter.bi_size & bs_mask) || ((bio->bi_iter.bi_sector << SECTOR_SHIFT) & bs_mask)) return true; return false; } /** * blk_mq_submit_bio - Create and send a request to block device. * @bio: Bio pointer. * * Builds up a request structure from @q and @bio and send to the device. The * request may not be queued directly to hardware if: * * This request can be merged with another one * * We want to place request at plug queue for possible future merging * * There is an IO scheduler active at this queue * * It will not queue the request if there is an error with the bio, or at the * request creation. */ void blk_mq_submit_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct blk_plug *plug = current->plug; const int is_sync = op_is_sync(bio->bi_opf); struct blk_mq_hw_ctx *hctx; unsigned int nr_segs; struct request *rq; blk_status_t ret; /* * If the plug has a cached request for this queue, try to use it. */ rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf); /* * A BIO that was released from a zone write plug has already been * through the preparation in this function, already holds a reference * on the queue usage counter, and is the only write BIO in-flight for * the target zone. Go straight to preparing a request for it. */ if (bio_zone_write_plugging(bio)) { nr_segs = bio->__bi_nr_segments; if (rq) blk_queue_exit(q); goto new_request; } /* * The cached request already holds a q_usage_counter reference and we * don't have to acquire a new one if we use it. */ if (!rq) { if (unlikely(bio_queue_enter(bio))) return; } /* * Device reconfiguration may change logical block size or reduce the * number of poll queues, so the checks for alignment and poll support * have to be done with queue usage counter held. */ if (unlikely(bio_unaligned(bio, q))) { bio_io_error(bio); goto queue_exit; } if ((bio->bi_opf & REQ_POLLED) && !blk_mq_can_poll(q)) { bio->bi_status = BLK_STS_NOTSUPP; bio_endio(bio); goto queue_exit; } bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); if (!bio) goto queue_exit; if (!bio_integrity_prep(bio)) goto queue_exit; if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) goto queue_exit; if (bio_needs_zone_write_plugging(bio)) { if (blk_zone_plug_bio(bio, nr_segs)) goto queue_exit; } new_request: if (rq) { blk_mq_use_cached_rq(rq, plug, bio); } else { rq = blk_mq_get_new_requests(q, plug, bio); if (unlikely(!rq)) { if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); goto queue_exit; } } trace_block_getrq(bio); rq_qos_track(q, rq, bio); blk_mq_bio_to_request(rq, bio, nr_segs); ret = blk_crypto_rq_get_keyslot(rq); if (ret != BLK_STS_OK) { bio->bi_status = ret; bio_endio(bio); blk_mq_free_request(rq); return; } if (bio_zone_write_plugging(bio)) blk_zone_write_plug_init_request(rq); if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq)) return; if (plug) { blk_add_rq_to_plug(plug, rq); return; } hctx = rq->mq_hctx; if ((rq->rq_flags & RQF_USE_SCHED) || (hctx->dispatch_busy && (q->nr_hw_queues == 1 || !is_sync))) { blk_mq_insert_request(rq, 0); blk_mq_run_hw_queue(hctx, true); } else { blk_mq_run_dispatch_ops(q, blk_mq_try_issue_directly(hctx, rq)); } return; queue_exit: /* * Don't drop the queue reference if we were trying to use a cached * request and thus didn't acquire one. */ if (!rq) blk_queue_exit(q); } #ifdef CONFIG_BLK_MQ_STACKING /** * blk_insert_cloned_request - Helper for stacking drivers to submit a request * @rq: the request being queued */ blk_status_t blk_insert_cloned_request(struct request *rq) { struct request_queue *q = rq->q; unsigned int max_sectors = blk_queue_get_max_sectors(rq); unsigned int max_segments = blk_rq_get_max_segments(rq); blk_status_t ret; if (blk_rq_sectors(rq) > max_sectors) { /* * SCSI device does not have a good way to return if * Write Same/Zero is actually supported. If a device rejects * a non-read/write command (discard, write same,etc.) the * low-level device driver will set the relevant queue limit to * 0 to prevent blk-lib from issuing more of the offending * operations. Commands queued prior to the queue limit being * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O * errors being propagated to upper layers. */ if (max_sectors == 0) return BLK_STS_NOTSUPP; printk(KERN_ERR "%s: over max size limit. (%u > %u)\n", __func__, blk_rq_sectors(rq), max_sectors); return BLK_STS_IOERR; } /* * The queue settings related to segment counting may differ from the * original queue. */ rq->nr_phys_segments = blk_recalc_rq_segments(rq); if (rq->nr_phys_segments > max_segments) { printk(KERN_ERR "%s: over max segments limit. (%u > %u)\n", __func__, rq->nr_phys_segments, max_segments); return BLK_STS_IOERR; } if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; ret = blk_crypto_rq_get_keyslot(rq); if (ret != BLK_STS_OK) return ret; blk_account_io_start(rq); /* * Since we have a scheduler attached on the top device, * bypass a potential scheduler on the bottom device for * insert. */ blk_mq_run_dispatch_ops(q, ret = blk_mq_request_issue_directly(rq, true)); if (ret) blk_account_io_done(rq, blk_time_get_ns()); return ret; } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); /** * blk_rq_unprep_clone - Helper function to free all bios in a cloned request * @rq: the clone request to be cleaned up * * Description: * Free all bios in @rq for a cloned request. */ void blk_rq_unprep_clone(struct request *rq) { struct bio *bio; while ((bio = rq->bio) != NULL) { rq->bio = bio->bi_next; bio_put(bio); } } EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); /** * blk_rq_prep_clone - Helper function to setup clone request * @rq: the request to be setup * @rq_src: original request to be cloned * @bs: bio_set that bios for clone are allocated from * @gfp_mask: memory allocation mask for bio * @bio_ctr: setup function to be called for each clone bio. * Returns %0 for success, non %0 for failure. * @data: private data to be passed to @bio_ctr * * Description: * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. * Also, pages which the original bios are pointing to are not copied * and the cloned bios just point same pages. * So cloned bios must be completed before original bios, which means * the caller must complete @rq before @rq_src. */ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, struct bio_set *bs, gfp_t gfp_mask, int (*bio_ctr)(struct bio *, struct bio *, void *), void *data) { struct bio *bio_src; if (!bs) bs = &fs_bio_set; __rq_for_each_bio(bio_src, rq_src) { struct bio *bio = bio_alloc_clone(rq->q->disk->part0, bio_src, gfp_mask, bs); if (!bio) goto free_and_out; if (bio_ctr && bio_ctr(bio, bio_src, data)) { bio_put(bio); goto free_and_out; } if (rq->bio) { rq->biotail->bi_next = bio; rq->biotail = bio; } else { rq->bio = rq->biotail = bio; } } /* Copy attributes of the original request to the clone request. */ rq->__sector = blk_rq_pos(rq_src); rq->__data_len = blk_rq_bytes(rq_src); if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) { rq->rq_flags |= RQF_SPECIAL_PAYLOAD; rq->special_vec = rq_src->special_vec; } rq->nr_phys_segments = rq_src->nr_phys_segments; rq->nr_integrity_segments = rq_src->nr_integrity_segments; if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) goto free_and_out; return 0; free_and_out: blk_rq_unprep_clone(rq); return -ENOMEM; } EXPORT_SYMBOL_GPL(blk_rq_prep_clone); #endif /* CONFIG_BLK_MQ_STACKING */ /* * Steal bios from a request and add them to a bio list. * The request must not have been partially completed before. */ void blk_steal_bios(struct bio_list *list, struct request *rq) { if (rq->bio) { if (list->tail) list->tail->bi_next = rq->bio; else list->head = rq->bio; list->tail = rq->biotail; rq->bio = NULL; rq->biotail = NULL; } rq->__data_len = 0; } EXPORT_SYMBOL_GPL(blk_steal_bios); static size_t order_to_size(unsigned int order) { return (size_t)PAGE_SIZE << order; } /* called before freeing request pool in @tags */ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags, struct blk_mq_tags *tags) { struct page *page; unsigned long flags; /* * There is no need to clear mapping if driver tags is not initialized * or the mapping belongs to the driver tags. */ if (!drv_tags || drv_tags == tags) return; list_for_each_entry(page, &tags->page_list, lru) { unsigned long start = (unsigned long)page_address(page); unsigned long end = start + order_to_size(page->private); int i; for (i = 0; i < drv_tags->nr_tags; i++) { struct request *rq = drv_tags->rqs[i]; unsigned long rq_addr = (unsigned long)rq; if (rq_addr >= start && rq_addr < end) { WARN_ON_ONCE(req_ref_read(rq) != 0); cmpxchg(&drv_tags->rqs[i], rq, NULL); } } } /* * Wait until all pending iteration is done. * * Request reference is cleared and it is guaranteed to be observed * after the ->lock is released. */ spin_lock_irqsave(&drv_tags->lock, flags); spin_unlock_irqrestore(&drv_tags->lock, flags); } void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { struct blk_mq_tags *drv_tags; struct page *page; if (list_empty(&tags->page_list)) return; if (blk_mq_is_shared_tags(set->flags)) drv_tags = set->shared_tags; else drv_tags = set->tags[hctx_idx]; if (tags->static_rqs && set->ops->exit_request) { int i; for (i = 0; i < tags->nr_tags; i++) { struct request *rq = tags->static_rqs[i]; if (!rq) continue; set->ops->exit_request(set, rq, hctx_idx); tags->static_rqs[i] = NULL; } } blk_mq_clear_rq_mapping(drv_tags, tags); while (!list_empty(&tags->page_list)) { page = list_first_entry(&tags->page_list, struct page, lru); list_del_init(&page->lru); /* * Remove kmemleak object previously allocated in * blk_mq_alloc_rqs(). */ kmemleak_free(page_address(page)); __free_pages(page, page->private); } } void blk_mq_free_rq_map(struct blk_mq_tags *tags) { kfree(tags->rqs); tags->rqs = NULL; kfree(tags->static_rqs); tags->static_rqs = NULL; blk_mq_free_tags(tags); } static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set, unsigned int hctx_idx) { int i; for (i = 0; i < set->nr_maps; i++) { unsigned int start = set->map[i].queue_offset; unsigned int end = start + set->map[i].nr_queues; if (hctx_idx >= start && hctx_idx < end) break; } if (i >= set->nr_maps) i = HCTX_TYPE_DEFAULT; return i; } static int blk_mq_get_hctx_node(struct blk_mq_tag_set *set, unsigned int hctx_idx) { enum hctx_type type = hctx_idx_to_type(set, hctx_idx); return blk_mq_hw_queue_to_node(&set->map[type], hctx_idx); } static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int nr_tags, unsigned int reserved_tags) { int node = blk_mq_get_hctx_node(set, hctx_idx); struct blk_mq_tags *tags; if (node == NUMA_NO_NODE) node = set->numa_node; tags = blk_mq_init_tags(nr_tags, reserved_tags, set->flags, node); if (!tags) return NULL; tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); if (!tags->rqs) goto err_free_tags; tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); if (!tags->static_rqs) goto err_free_rqs; return tags; err_free_rqs: kfree(tags->rqs); err_free_tags: blk_mq_free_tags(tags); return NULL; } static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx, int node) { int ret; if (set->ops->init_request) { ret = set->ops->init_request(set, rq, hctx_idx, node); if (ret) return ret; } WRITE_ONCE(rq->state, MQ_RQ_IDLE); return 0; } static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx, unsigned int depth) { unsigned int i, j, entries_per_page, max_order = 4; int node = blk_mq_get_hctx_node(set, hctx_idx); size_t rq_size, left; if (node == NUMA_NO_NODE) node = set->numa_node; INIT_LIST_HEAD(&tags->page_list); /* * rq_size is the size of the request plus driver payload, rounded * to the cacheline size */ rq_size = round_up(sizeof(struct request) + set->cmd_size, cache_line_size()); left = rq_size * depth; for (i = 0; i < depth; ) { int this_order = max_order; struct page *page; int to_do; void *p; while (this_order && left < order_to_size(this_order - 1)) this_order--; do { page = alloc_pages_node(node, GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO, this_order); if (page) break; if (!this_order--) break; if (order_to_size(this_order) < rq_size) break; } while (1); if (!page) goto fail; page->private = this_order; list_add_tail(&page->lru, &tags->page_list); p = page_address(page); /* * Allow kmemleak to scan these pages as they contain pointers * to additional allocations like via ops->init_request(). */ kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO); entries_per_page = order_to_size(this_order) / rq_size; to_do = min(entries_per_page, depth - i); left -= to_do * rq_size; for (j = 0; j < to_do; j++) { struct request *rq = p; tags->static_rqs[i] = rq; if (blk_mq_init_request(set, rq, hctx_idx, node)) { tags->static_rqs[i] = NULL; goto fail; } p += rq_size; i++; } } return 0; fail: blk_mq_free_rqs(set, tags, hctx_idx); return -ENOMEM; } struct rq_iter_data { struct blk_mq_hw_ctx *hctx; bool has_rq; }; static bool blk_mq_has_request(struct request *rq, void *data) { struct rq_iter_data *iter_data = data; if (rq->mq_hctx != iter_data->hctx) return true; iter_data->has_rq = true; return false; } static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) { struct blk_mq_tags *tags = hctx->sched_tags ? hctx->sched_tags : hctx->tags; struct rq_iter_data data = { .hctx = hctx, }; blk_mq_all_tag_iter(tags, blk_mq_has_request, &data); return data.has_rq; } static bool blk_mq_hctx_has_online_cpu(struct blk_mq_hw_ctx *hctx, unsigned int this_cpu) { enum hctx_type type = hctx->type; int cpu; /* * hctx->cpumask has to rule out isolated CPUs, but userspace still * might submit IOs on these isolated CPUs, so use the queue map to * check if all CPUs mapped to this hctx are offline */ for_each_online_cpu(cpu) { struct blk_mq_hw_ctx *h = blk_mq_map_queue_type(hctx->queue, type, cpu); if (h != hctx) continue; /* this hctx has at least one online CPU */ if (this_cpu != cpu) return true; } return false; } static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node) { struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_online); if (blk_mq_hctx_has_online_cpu(hctx, cpu)) return 0; /* * Prevent new request from being allocated on the current hctx. * * The smp_mb__after_atomic() Pairs with the implied barrier in * test_and_set_bit_lock in sbitmap_get(). Ensures the inactive flag is * seen once we return from the tag allocator. */ set_bit(BLK_MQ_S_INACTIVE, &hctx->state); smp_mb__after_atomic(); /* * Try to grab a reference to the queue and wait for any outstanding * requests. If we could not grab a reference the queue has been * frozen and there are no requests. */ if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) { while (blk_mq_hctx_has_requests(hctx)) msleep(5); percpu_ref_put(&hctx->queue->q_usage_counter); } return 0; } /* * Check if one CPU is mapped to the specified hctx * * Isolated CPUs have been ruled out from hctx->cpumask, which is supposed * to be used for scheduling kworker only. For other usage, please call this * helper for checking if one CPU belongs to the specified hctx */ static bool blk_mq_cpu_mapped_to_hctx(unsigned int cpu, const struct blk_mq_hw_ctx *hctx) { struct blk_mq_hw_ctx *mapped_hctx = blk_mq_map_queue_type(hctx->queue, hctx->type, cpu); return mapped_hctx == hctx; } static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node) { struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_online); if (blk_mq_cpu_mapped_to_hctx(cpu, hctx)) clear_bit(BLK_MQ_S_INACTIVE, &hctx->state); return 0; } /* * 'cpu' is going away. splice any existing rq_list entries from this * software queue to the hw queue dispatch list, and ensure that it * gets run. */ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) { struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; LIST_HEAD(tmp); enum hctx_type type; hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); if (!blk_mq_cpu_mapped_to_hctx(cpu, hctx)) return 0; ctx = __blk_mq_get_ctx(hctx->queue, cpu); type = hctx->type; spin_lock(&ctx->lock); if (!list_empty(&ctx->rq_lists[type])) { list_splice_init(&ctx->rq_lists[type], &tmp); blk_mq_hctx_clear_pending(hctx, ctx); } spin_unlock(&ctx->lock); if (list_empty(&tmp)) return 0; spin_lock(&hctx->lock); list_splice_tail_init(&tmp, &hctx->dispatch); spin_unlock(&hctx->lock); blk_mq_run_hw_queue(hctx, true); return 0; } static void __blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) { lockdep_assert_held(&blk_mq_cpuhp_lock); if (!(hctx->flags & BLK_MQ_F_STACKING) && !hlist_unhashed(&hctx->cpuhp_online)) { cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, &hctx->cpuhp_online); INIT_HLIST_NODE(&hctx->cpuhp_online); } if (!hlist_unhashed(&hctx->cpuhp_dead)) { cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); INIT_HLIST_NODE(&hctx->cpuhp_dead); } } static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) { mutex_lock(&blk_mq_cpuhp_lock); __blk_mq_remove_cpuhp(hctx); mutex_unlock(&blk_mq_cpuhp_lock); } static void __blk_mq_add_cpuhp(struct blk_mq_hw_ctx *hctx) { lockdep_assert_held(&blk_mq_cpuhp_lock); if (!(hctx->flags & BLK_MQ_F_STACKING) && hlist_unhashed(&hctx->cpuhp_online)) cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, &hctx->cpuhp_online); if (hlist_unhashed(&hctx->cpuhp_dead)) cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); } static void __blk_mq_remove_cpuhp_list(struct list_head *head) { struct blk_mq_hw_ctx *hctx; lockdep_assert_held(&blk_mq_cpuhp_lock); list_for_each_entry(hctx, head, hctx_list) __blk_mq_remove_cpuhp(hctx); } /* * Unregister cpuhp callbacks from exited hw queues * * Safe to call if this `request_queue` is live */ static void blk_mq_remove_hw_queues_cpuhp(struct request_queue *q) { LIST_HEAD(hctx_list); spin_lock(&q->unused_hctx_lock); list_splice_init(&q->unused_hctx_list, &hctx_list); spin_unlock(&q->unused_hctx_lock); mutex_lock(&blk_mq_cpuhp_lock); __blk_mq_remove_cpuhp_list(&hctx_list); mutex_unlock(&blk_mq_cpuhp_lock); spin_lock(&q->unused_hctx_lock); list_splice(&hctx_list, &q->unused_hctx_list); spin_unlock(&q->unused_hctx_lock); } /* * Register cpuhp callbacks from all hw queues * * Safe to call if this `request_queue` is live */ static void blk_mq_add_hw_queues_cpuhp(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; mutex_lock(&blk_mq_cpuhp_lock); queue_for_each_hw_ctx(q, hctx, i) __blk_mq_add_cpuhp(hctx); mutex_unlock(&blk_mq_cpuhp_lock); } /* * Before freeing hw queue, clearing the flush request reference in * tags->rqs[] for avoiding potential UAF. */ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, unsigned int queue_depth, struct request *flush_rq) { int i; unsigned long flags; /* The hw queue may not be mapped yet */ if (!tags) return; WARN_ON_ONCE(req_ref_read(flush_rq) != 0); for (i = 0; i < queue_depth; i++) cmpxchg(&tags->rqs[i], flush_rq, NULL); /* * Wait until all pending iteration is done. * * Request reference is cleared and it is guaranteed to be observed * after the ->lock is released. */ spin_lock_irqsave(&tags->lock, flags); spin_unlock_irqrestore(&tags->lock, flags); } /* hctx->ctxs will be freed in queue's release handler */ static void blk_mq_exit_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { struct request *flush_rq = hctx->fq->flush_rq; if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_idle(hctx); if (blk_queue_init_done(q)) blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx], set->queue_depth, flush_rq); if (set->ops->exit_request) set->ops->exit_request(set, flush_rq, hctx_idx); if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); xa_erase(&q->hctx_table, hctx_idx); spin_lock(&q->unused_hctx_lock); list_add(&hctx->hctx_list, &q->unused_hctx_list); spin_unlock(&q->unused_hctx_lock); } static void blk_mq_exit_hw_queues(struct request_queue *q, struct blk_mq_tag_set *set, int nr_queue) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { if (i == nr_queue) break; blk_mq_remove_cpuhp(hctx); blk_mq_exit_hctx(q, set, hctx, i); } } static int blk_mq_init_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) { hctx->queue_num = hctx_idx; hctx->tags = set->tags[hctx_idx]; if (set->ops->init_hctx && set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) goto fail; if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, hctx->numa_node)) goto exit_hctx; if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL)) goto exit_flush_rq; return 0; exit_flush_rq: if (set->ops->exit_request) set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); exit_hctx: if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); fail: return -1; } static struct blk_mq_hw_ctx * blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, int node) { struct blk_mq_hw_ctx *hctx; gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node); if (!hctx) goto fail_alloc_hctx; if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node)) goto free_hctx; atomic_set(&hctx->nr_active, 0); if (node == NUMA_NO_NODE) node = set->numa_node; hctx->numa_node = node; INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); spin_lock_init(&hctx->lock); INIT_LIST_HEAD(&hctx->dispatch); INIT_HLIST_NODE(&hctx->cpuhp_dead); INIT_HLIST_NODE(&hctx->cpuhp_online); hctx->queue = q; hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED; INIT_LIST_HEAD(&hctx->hctx_list); /* * Allocate space for all possible cpus to avoid allocation at * runtime */ hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *), gfp, node); if (!hctx->ctxs) goto free_cpumask; if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), gfp, node, false, false)) goto free_ctxs; hctx->nr_ctx = 0; spin_lock_init(&hctx->dispatch_wait_lock); init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); INIT_LIST_HEAD(&hctx->dispatch_wait.entry); hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp); if (!hctx->fq) goto free_bitmap; blk_mq_hctx_kobj_init(hctx); return hctx; free_bitmap: sbitmap_free(&hctx->ctx_map); free_ctxs: kfree(hctx->ctxs); free_cpumask: free_cpumask_var(hctx->cpumask); free_hctx: kfree(hctx); fail_alloc_hctx: return NULL; } static void blk_mq_init_cpu_queues(struct request_queue *q, unsigned int nr_hw_queues) { struct blk_mq_tag_set *set = q->tag_set; unsigned int i, j; for_each_possible_cpu(i) { struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); struct blk_mq_hw_ctx *hctx; int k; __ctx->cpu = i; spin_lock_init(&__ctx->lock); for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++) INIT_LIST_HEAD(&__ctx->rq_lists[k]); __ctx->queue = q; /* * Set local node, IFF we have more than one hw queue. If * not, we remain on the home node of the device */ for (j = 0; j < set->nr_maps; j++) { hctx = blk_mq_map_queue_type(q, j, i); if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) hctx->numa_node = cpu_to_node(i); } } } struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int depth) { struct blk_mq_tags *tags; int ret; tags = blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags); if (!tags) return NULL; ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth); if (ret) { blk_mq_free_rq_map(tags); return NULL; } return tags; } static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, int hctx_idx) { if (blk_mq_is_shared_tags(set->flags)) { set->tags[hctx_idx] = set->shared_tags; return true; } set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs(set, hctx_idx, set->queue_depth); return set->tags[hctx_idx]; } void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { if (tags) { blk_mq_free_rqs(set, tags, hctx_idx); blk_mq_free_rq_map(tags); } } static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, unsigned int hctx_idx) { if (!blk_mq_is_shared_tags(set->flags)) blk_mq_free_map_and_rqs(set, set->tags[hctx_idx], hctx_idx); set->tags[hctx_idx] = NULL; } static void blk_mq_map_swqueue(struct request_queue *q) { unsigned int j, hctx_idx; unsigned long i; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; queue_for_each_hw_ctx(q, hctx, i) { cpumask_clear(hctx->cpumask); hctx->nr_ctx = 0; hctx->dispatch_from = NULL; } /* * Map software to hardware queues. * * If the cpu isn't present, the cpu is mapped to first hctx. */ for_each_possible_cpu(i) { ctx = per_cpu_ptr(q->queue_ctx, i); for (j = 0; j < set->nr_maps; j++) { if (!set->map[j].nr_queues) { ctx->hctxs[j] = blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT, i); continue; } hctx_idx = set->map[j].mq_map[i]; /* unmapped hw queue can be remapped after CPU topo changed */ if (!set->tags[hctx_idx] && !__blk_mq_alloc_map_and_rqs(set, hctx_idx)) { /* * If tags initialization fail for some hctx, * that hctx won't be brought online. In this * case, remap the current ctx to hctx[0] which * is guaranteed to always have tags allocated */ set->map[j].mq_map[i] = 0; } hctx = blk_mq_map_queue_type(q, j, i); ctx->hctxs[j] = hctx; /* * If the CPU is already set in the mask, then we've * mapped this one already. This can happen if * devices share queues across queue maps. */ if (cpumask_test_cpu(i, hctx->cpumask)) continue; cpumask_set_cpu(i, hctx->cpumask); hctx->type = j; ctx->index_hw[hctx->type] = hctx->nr_ctx; hctx->ctxs[hctx->nr_ctx++] = ctx; /* * If the nr_ctx type overflows, we have exceeded the * amount of sw queues we can support. */ BUG_ON(!hctx->nr_ctx); } for (; j < HCTX_MAX_TYPES; j++) ctx->hctxs[j] = blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT, i); } queue_for_each_hw_ctx(q, hctx, i) { int cpu; /* * If no software queues are mapped to this hardware queue, * disable it and free the request entries. */ if (!hctx->nr_ctx) { /* Never unmap queue 0. We need it as a * fallback in case of a new remap fails * allocation */ if (i) __blk_mq_free_map_and_rqs(set, i); hctx->tags = NULL; continue; } hctx->tags = set->tags[i]; WARN_ON(!hctx->tags); /* * Set the map size to the number of mapped software queues. * This is more accurate and more efficient than looping * over all possibly mapped software queues. */ sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx); /* * Rule out isolated CPUs from hctx->cpumask to avoid * running block kworker on isolated CPUs */ for_each_cpu(cpu, hctx->cpumask) { if (cpu_is_isolated(cpu)) cpumask_clear_cpu(cpu, hctx->cpumask); } /* * Initialize batch roundrobin counts */ hctx->next_cpu = blk_mq_first_mapped_cpu(hctx); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } } /* * Caller needs to ensure that we're either frozen/quiesced, or that * the queue isn't live yet. */ static void queue_set_hctx_shared(struct request_queue *q, bool shared) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { if (shared) { hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; } else { blk_mq_tag_idle(hctx); hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; } } } static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set, bool shared) { struct request_queue *q; unsigned int memflags; lockdep_assert_held(&set->tag_list_lock); list_for_each_entry(q, &set->tag_list, tag_set_list) { memflags = blk_mq_freeze_queue(q); queue_set_hctx_shared(q, shared); blk_mq_unfreeze_queue(q, memflags); } } static void blk_mq_del_queue_tag_set(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; mutex_lock(&set->tag_list_lock); list_del(&q->tag_set_list); if (list_is_singular(&set->tag_list)) { /* just transitioned to unshared */ set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; /* update existing queue */ blk_mq_update_tag_set_shared(set, false); } mutex_unlock(&set->tag_list_lock); INIT_LIST_HEAD(&q->tag_set_list); } static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, struct request_queue *q) { mutex_lock(&set->tag_list_lock); /* * Check to see if we're transitioning to shared (from 1 to 2 queues). */ if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; /* update existing queue */ blk_mq_update_tag_set_shared(set, true); } if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED) queue_set_hctx_shared(q, true); list_add_tail(&q->tag_set_list, &set->tag_list); mutex_unlock(&set->tag_list_lock); } /* All allocations will be freed in release handler of q->mq_kobj */ static int blk_mq_alloc_ctxs(struct request_queue *q) { struct blk_mq_ctxs *ctxs; int cpu; ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL); if (!ctxs) return -ENOMEM; ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx); if (!ctxs->queue_ctx) goto fail; for_each_possible_cpu(cpu) { struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu); ctx->ctxs = ctxs; } q->mq_kobj = &ctxs->kobj; q->queue_ctx = ctxs->queue_ctx; return 0; fail: kfree(ctxs); return -ENOMEM; } /* * It is the actual release handler for mq, but we do it from * request queue's release handler for avoiding use-after-free * and headache because q->mq_kobj shouldn't have been introduced, * but we can't group ctx/kctx kobj without it. */ void blk_mq_release(struct request_queue *q) { struct blk_mq_hw_ctx *hctx, *next; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list)); /* all hctx are in .unused_hctx_list now */ list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) { list_del_init(&hctx->hctx_list); kobject_put(&hctx->kobj); } xa_destroy(&q->hctx_table); /* * release .mq_kobj and sw queue's kobject now because * both share lifetime with request queue. */ blk_mq_sysfs_deinit(q); } struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, struct queue_limits *lim, void *queuedata) { struct queue_limits default_lim = { }; struct request_queue *q; int ret; if (!lim) lim = &default_lim; lim->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; if (set->nr_maps > HCTX_TYPE_POLL) lim->features |= BLK_FEAT_POLL; q = blk_alloc_queue(lim, set->numa_node); if (IS_ERR(q)) return q; q->queuedata = queuedata; ret = blk_mq_init_allocated_queue(set, q); if (ret) { blk_put_queue(q); return ERR_PTR(ret); } return q; } EXPORT_SYMBOL(blk_mq_alloc_queue); /** * blk_mq_destroy_queue - shutdown a request queue * @q: request queue to shutdown * * This shuts down a request queue allocated by blk_mq_alloc_queue(). All future * requests will be failed with -ENODEV. The caller is responsible for dropping * the reference from blk_mq_alloc_queue() by calling blk_put_queue(). * * Context: can sleep */ void blk_mq_destroy_queue(struct request_queue *q) { WARN_ON_ONCE(!queue_is_mq(q)); WARN_ON_ONCE(blk_queue_registered(q)); might_sleep(); blk_queue_flag_set(QUEUE_FLAG_DYING, q); blk_queue_start_drain(q); blk_mq_freeze_queue_wait(q); blk_sync_queue(q); blk_mq_cancel_work_sync(q); blk_mq_exit_queue(q); } EXPORT_SYMBOL(blk_mq_destroy_queue); struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, struct queue_limits *lim, void *queuedata, struct lock_class_key *lkclass) { struct request_queue *q; struct gendisk *disk; q = blk_mq_alloc_queue(set, lim, queuedata); if (IS_ERR(q)) return ERR_CAST(q); disk = __alloc_disk_node(q, set->numa_node, lkclass); if (!disk) { blk_mq_destroy_queue(q); blk_put_queue(q); return ERR_PTR(-ENOMEM); } set_bit(GD_OWNS_QUEUE, &disk->state); return disk; } EXPORT_SYMBOL(__blk_mq_alloc_disk); struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, struct lock_class_key *lkclass) { struct gendisk *disk; if (!blk_get_queue(q)) return NULL; disk = __alloc_disk_node(q, NUMA_NO_NODE, lkclass); if (!disk) blk_put_queue(q); return disk; } EXPORT_SYMBOL(blk_mq_alloc_disk_for_queue); /* * Only hctx removed from cpuhp list can be reused */ static bool blk_mq_hctx_is_reusable(struct blk_mq_hw_ctx *hctx) { return hlist_unhashed(&hctx->cpuhp_online) && hlist_unhashed(&hctx->cpuhp_dead); } static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( struct blk_mq_tag_set *set, struct request_queue *q, int hctx_idx, int node) { struct blk_mq_hw_ctx *hctx = NULL, *tmp; /* reuse dead hctx first */ spin_lock(&q->unused_hctx_lock); list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) { if (tmp->numa_node == node && blk_mq_hctx_is_reusable(tmp)) { hctx = tmp; break; } } if (hctx) list_del_init(&hctx->hctx_list); spin_unlock(&q->unused_hctx_lock); if (!hctx) hctx = blk_mq_alloc_hctx(q, set, node); if (!hctx) goto fail; if (blk_mq_init_hctx(q, set, hctx, hctx_idx)) goto free_hctx; return hctx; free_hctx: kobject_put(&hctx->kobj); fail: return NULL; } static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i, j; for (i = 0; i < set->nr_hw_queues; i++) { int old_node; int node = blk_mq_get_hctx_node(set, i); struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i); if (old_hctx) { old_node = old_hctx->numa_node; blk_mq_exit_hctx(q, set, old_hctx, i); } if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) { if (!old_hctx) break; pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n", node, old_node); hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node); WARN_ON_ONCE(!hctx); } } /* * Increasing nr_hw_queues fails. Free the newly allocated * hctxs and keep the previous q->nr_hw_queues. */ if (i != set->nr_hw_queues) { j = q->nr_hw_queues; } else { j = i; q->nr_hw_queues = set->nr_hw_queues; } xa_for_each_start(&q->hctx_table, j, hctx, j) blk_mq_exit_hctx(q, set, hctx, j); } static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct request_queue *q) { __blk_mq_realloc_hw_ctxs(set, q); /* unregister cpuhp callbacks for exited hctxs */ blk_mq_remove_hw_queues_cpuhp(q); /* register cpuhp for new initialized hctxs */ blk_mq_add_hw_queues_cpuhp(q); } int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q) { /* mark the queue as mq asap */ q->mq_ops = set->ops; /* * ->tag_set has to be setup before initialize hctx, which cpuphp * handler needs it for checking queue mapping */ q->tag_set = set; if (blk_mq_alloc_ctxs(q)) goto err_exit; /* init q->mq_kobj and sw queues' kobjects */ blk_mq_sysfs_init(q); INIT_LIST_HEAD(&q->unused_hctx_list); spin_lock_init(&q->unused_hctx_lock); xa_init(&q->hctx_table); blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) goto err_hctxs; INIT_WORK(&q->timeout_work, blk_mq_timeout_work); blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); INIT_LIST_HEAD(&q->flush_list); INIT_LIST_HEAD(&q->requeue_list); spin_lock_init(&q->requeue_lock); q->nr_requests = set->queue_depth; blk_mq_init_cpu_queues(q, set->nr_hw_queues); blk_mq_map_swqueue(q); blk_mq_add_queue_tag_set(set, q); return 0; err_hctxs: blk_mq_release(q); err_exit: q->mq_ops = NULL; return -ENOMEM; } EXPORT_SYMBOL(blk_mq_init_allocated_queue); /* tags can _not_ be used after returning from blk_mq_exit_queue */ void blk_mq_exit_queue(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; /* Checks hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED. */ blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); /* May clear BLK_MQ_F_TAG_QUEUE_SHARED in hctx->flags. */ blk_mq_del_queue_tag_set(q); } static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) { int i; if (blk_mq_is_shared_tags(set->flags)) { set->shared_tags = blk_mq_alloc_map_and_rqs(set, BLK_MQ_NO_HCTX_IDX, set->queue_depth); if (!set->shared_tags) return -ENOMEM; } for (i = 0; i < set->nr_hw_queues; i++) { if (!__blk_mq_alloc_map_and_rqs(set, i)) goto out_unwind; cond_resched(); } return 0; out_unwind: while (--i >= 0) __blk_mq_free_map_and_rqs(set, i); if (blk_mq_is_shared_tags(set->flags)) { blk_mq_free_map_and_rqs(set, set->shared_tags, BLK_MQ_NO_HCTX_IDX); } return -ENOMEM; } /* * Allocate the request maps associated with this tag_set. Note that this * may reduce the depth asked for, if memory is tight. set->queue_depth * will be updated to reflect the allocated depth. */ static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set) { unsigned int depth; int err; depth = set->queue_depth; do { err = __blk_mq_alloc_rq_maps(set); if (!err) break; set->queue_depth >>= 1; if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) { err = -ENOMEM; break; } } while (set->queue_depth); if (!set->queue_depth || err) { pr_err("blk-mq: failed to allocate request map\n"); return -ENOMEM; } if (depth != set->queue_depth) pr_info("blk-mq: reduced tag depth (%u -> %u)\n", depth, set->queue_depth); return 0; } static void blk_mq_update_queue_map(struct blk_mq_tag_set *set) { /* * blk_mq_map_queues() and multiple .map_queues() implementations * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the * number of hardware queues. */ if (set->nr_maps == 1) set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues; if (set->ops->map_queues) { int i; /* * transport .map_queues is usually done in the following * way: * * for (queue = 0; queue < set->nr_hw_queues; queue++) { * mask = get_cpu_mask(queue) * for_each_cpu(cpu, mask) * set->map[x].mq_map[cpu] = queue; * } * * When we need to remap, the table has to be cleared for * killing stale mapping since one CPU may not be mapped * to any hw queue. */ for (i = 0; i < set->nr_maps; i++) blk_mq_clear_mq_map(&set->map[i]); set->ops->map_queues(set); } else { BUG_ON(set->nr_maps > 1); blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); } } static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, int new_nr_hw_queues) { struct blk_mq_tags **new_tags; int i; if (set->nr_hw_queues >= new_nr_hw_queues) goto done; new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); if (!new_tags) return -ENOMEM; if (set->tags) memcpy(new_tags, set->tags, set->nr_hw_queues * sizeof(*set->tags)); kfree(set->tags); set->tags = new_tags; for (i = set->nr_hw_queues; i < new_nr_hw_queues; i++) { if (!__blk_mq_alloc_map_and_rqs(set, i)) { while (--i >= set->nr_hw_queues) __blk_mq_free_map_and_rqs(set, i); return -ENOMEM; } cond_resched(); } done: set->nr_hw_queues = new_nr_hw_queues; return 0; } /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. May adjust the * requested depth down, if it's too large. In that case, the set * value will be stored in set->queue_depth. */ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) { int i, ret; BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); if (!set->nr_hw_queues) return -EINVAL; if (!set->queue_depth) return -EINVAL; if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) return -EINVAL; if (!set->ops->queue_rq) return -EINVAL; if (!set->ops->get_budget ^ !set->ops->put_budget) return -EINVAL; if (set->queue_depth > BLK_MQ_MAX_DEPTH) { pr_info("blk-mq: reduced tag depth to %u\n", BLK_MQ_MAX_DEPTH); set->queue_depth = BLK_MQ_MAX_DEPTH; } if (!set->nr_maps) set->nr_maps = 1; else if (set->nr_maps > HCTX_MAX_TYPES) return -EINVAL; /* * If a crashdump is active, then we are potentially in a very * memory constrained environment. Limit us to 64 tags to prevent * using too much memory. */ if (is_kdump_kernel()) set->queue_depth = min(64U, set->queue_depth); /* * There is no use for more h/w queues than cpus if we just have * a single map */ if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) set->nr_hw_queues = nr_cpu_ids; if (set->flags & BLK_MQ_F_BLOCKING) { set->srcu = kmalloc(sizeof(*set->srcu), GFP_KERNEL); if (!set->srcu) return -ENOMEM; ret = init_srcu_struct(set->srcu); if (ret) goto out_free_srcu; } init_rwsem(&set->update_nr_hwq_lock); ret = -ENOMEM; set->tags = kcalloc_node(set->nr_hw_queues, sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); if (!set->tags) goto out_cleanup_srcu; for (i = 0; i < set->nr_maps; i++) { set->map[i].mq_map = kcalloc_node(nr_cpu_ids, sizeof(set->map[i].mq_map[0]), GFP_KERNEL, set->numa_node); if (!set->map[i].mq_map) goto out_free_mq_map; set->map[i].nr_queues = set->nr_hw_queues; } blk_mq_update_queue_map(set); ret = blk_mq_alloc_set_map_and_rqs(set); if (ret) goto out_free_mq_map; mutex_init(&set->tag_list_lock); INIT_LIST_HEAD(&set->tag_list); return 0; out_free_mq_map: for (i = 0; i < set->nr_maps; i++) { kfree(set->map[i].mq_map); set->map[i].mq_map = NULL; } kfree(set->tags); set->tags = NULL; out_cleanup_srcu: if (set->flags & BLK_MQ_F_BLOCKING) cleanup_srcu_struct(set->srcu); out_free_srcu: if (set->flags & BLK_MQ_F_BLOCKING) kfree(set->srcu); return ret; } EXPORT_SYMBOL(blk_mq_alloc_tag_set); /* allocate and initialize a tagset for a simple single-queue device */ int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set, const struct blk_mq_ops *ops, unsigned int queue_depth, unsigned int set_flags) { memset(set, 0, sizeof(*set)); set->ops = ops; set->nr_hw_queues = 1; set->nr_maps = 1; set->queue_depth = queue_depth; set->numa_node = NUMA_NO_NODE; set->flags = set_flags; return blk_mq_alloc_tag_set(set); } EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set); void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { int i, j; for (i = 0; i < set->nr_hw_queues; i++) __blk_mq_free_map_and_rqs(set, i); if (blk_mq_is_shared_tags(set->flags)) { blk_mq_free_map_and_rqs(set, set->shared_tags, BLK_MQ_NO_HCTX_IDX); } for (j = 0; j < set->nr_maps; j++) { kfree(set->map[j].mq_map); set->map[j].mq_map = NULL; } kfree(set->tags); set->tags = NULL; if (set->flags & BLK_MQ_F_BLOCKING) { cleanup_srcu_struct(set->srcu); kfree(set->srcu); } } EXPORT_SYMBOL(blk_mq_free_tag_set); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) { struct blk_mq_tag_set *set = q->tag_set; struct blk_mq_hw_ctx *hctx; int ret; unsigned long i; if (WARN_ON_ONCE(!q->mq_freeze_depth)) return -EINVAL; if (!set) return -EINVAL; if (q->nr_requests == nr) return 0; blk_mq_quiesce_queue(q); ret = 0; queue_for_each_hw_ctx(q, hctx, i) { if (!hctx->tags) continue; /* * If we're using an MQ scheduler, just update the scheduler * queue depth. This is similar to what the old code would do. */ if (hctx->sched_tags) { ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, nr, true); } else { ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, false); } if (ret) break; if (q->elevator && q->elevator->type->ops.depth_updated) q->elevator->type->ops.depth_updated(hctx); } if (!ret) { q->nr_requests = nr; if (blk_mq_is_shared_tags(set->flags)) { if (q->elevator) blk_mq_tag_update_sched_shared_tags(q); else blk_mq_tag_resize_shared_tags(set, nr); } } blk_mq_unquiesce_queue(q); return ret; } /* * Switch back to the elevator type stored in the xarray. */ static void blk_mq_elv_switch_back(struct request_queue *q, struct xarray *elv_tbl, struct xarray *et_tbl) { struct elevator_type *e = xa_load(elv_tbl, q->id); struct elevator_tags *t = xa_load(et_tbl, q->id); /* The elv_update_nr_hw_queues unfreezes the queue. */ elv_update_nr_hw_queues(q, e, t); /* Drop the reference acquired in blk_mq_elv_switch_none. */ if (e) elevator_put(e); } /* * Stores elevator type in xarray and set current elevator to none. It uses * q->id as an index to store the elevator type into the xarray. */ static int blk_mq_elv_switch_none(struct request_queue *q, struct xarray *elv_tbl) { int ret = 0; lockdep_assert_held_write(&q->tag_set->update_nr_hwq_lock); /* * Accessing q->elevator without holding q->elevator_lock is safe here * because we're called from nr_hw_queue update which is protected by * set->update_nr_hwq_lock in the writer context. So, scheduler update/ * switch code (which acquires the same lock in the reader context) * can't run concurrently. */ if (q->elevator) { ret = xa_insert(elv_tbl, q->id, q->elevator->type, GFP_KERNEL); if (WARN_ON_ONCE(ret)) return ret; /* * Before we switch elevator to 'none', take a reference to * the elevator module so that while nr_hw_queue update is * running, no one can remove elevator module. We'd put the * reference to elevator module later when we switch back * elevator. */ __elevator_get(q->elevator->type); elevator_set_none(q); } return ret; } static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) { struct request_queue *q; int prev_nr_hw_queues = set->nr_hw_queues; unsigned int memflags; int i; struct xarray elv_tbl, et_tbl; lockdep_assert_held(&set->tag_list_lock); if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids) nr_hw_queues = nr_cpu_ids; if (nr_hw_queues < 1) return; if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues) return; memflags = memalloc_noio_save(); xa_init(&et_tbl); if (blk_mq_alloc_sched_tags_batch(&et_tbl, set, nr_hw_queues) < 0) goto out_memalloc_restore; xa_init(&elv_tbl); list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_debugfs_unregister_hctxs(q); blk_mq_sysfs_unregister_hctxs(q); } list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_freeze_queue_nomemsave(q); /* * Switch IO scheduler to 'none', cleaning up the data associated * with the previous scheduler. We will switch back once we are done * updating the new sw to hw queue mappings. */ list_for_each_entry(q, &set->tag_list, tag_set_list) if (blk_mq_elv_switch_none(q, &elv_tbl)) goto switch_back; if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) goto switch_back; fallback: blk_mq_update_queue_map(set); list_for_each_entry(q, &set->tag_list, tag_set_list) { __blk_mq_realloc_hw_ctxs(set, q); if (q->nr_hw_queues != set->nr_hw_queues) { int i = prev_nr_hw_queues; pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n", nr_hw_queues, prev_nr_hw_queues); for (; i < set->nr_hw_queues; i++) __blk_mq_free_map_and_rqs(set, i); set->nr_hw_queues = prev_nr_hw_queues; goto fallback; } blk_mq_map_swqueue(q); } switch_back: /* The blk_mq_elv_switch_back unfreezes queue for us. */ list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_elv_switch_back(q, &elv_tbl, &et_tbl); list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_sysfs_register_hctxs(q); blk_mq_debugfs_register_hctxs(q); blk_mq_remove_hw_queues_cpuhp(q); blk_mq_add_hw_queues_cpuhp(q); } xa_destroy(&elv_tbl); xa_destroy(&et_tbl); out_memalloc_restore: memalloc_noio_restore(memflags); /* Free the excess tags when nr_hw_queues shrink. */ for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++) __blk_mq_free_map_and_rqs(set, i); } void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) { down_write(&set->update_nr_hwq_lock); mutex_lock(&set->tag_list_lock); __blk_mq_update_nr_hw_queues(set, nr_hw_queues); mutex_unlock(&set->tag_list_lock); up_write(&set->update_nr_hwq_lock); } EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob, unsigned int flags) { long state = get_current_state(); int ret; do { ret = q->mq_ops->poll(hctx, iob); if (ret > 0) { __set_current_state(TASK_RUNNING); return ret; } if (signal_pending_state(state, current)) __set_current_state(TASK_RUNNING); if (task_is_running(current)) return 1; if (ret < 0 || (flags & BLK_POLL_ONESHOT)) break; cpu_relax(); } while (!need_resched()); __set_current_state(TASK_RUNNING); return 0; } int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, unsigned int flags) { if (!blk_mq_can_poll(q)) return 0; return blk_hctx_poll(q, xa_load(&q->hctx_table, cookie), iob, flags); } int blk_rq_poll(struct request *rq, struct io_comp_batch *iob, unsigned int poll_flags) { struct request_queue *q = rq->q; int ret; if (!blk_rq_is_poll(rq)) return 0; if (!percpu_ref_tryget(&q->q_usage_counter)) return 0; ret = blk_hctx_poll(q, rq->mq_hctx, iob, poll_flags); blk_queue_exit(q); return ret; } EXPORT_SYMBOL_GPL(blk_rq_poll); unsigned int blk_mq_rq_cpu(struct request *rq) { return rq->mq_ctx->cpu; } EXPORT_SYMBOL(blk_mq_rq_cpu); void blk_mq_cancel_work_sync(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; cancel_delayed_work_sync(&q->requeue_work); queue_for_each_hw_ctx(q, hctx, i) cancel_delayed_work_sync(&hctx->run_work); } static int __init blk_mq_init(void) { int i; for_each_possible_cpu(i) init_llist_head(&per_cpu(blk_cpu_done, i)); for_each_possible_cpu(i) INIT_CSD(&per_cpu(blk_cpu_csd, i), __blk_mq_complete_request_remote, NULL); open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, "block/softirq:dead", NULL, blk_softirq_cpu_dead); cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, blk_mq_hctx_notify_dead); cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online", blk_mq_hctx_notify_online, blk_mq_hctx_notify_offline); return 0; } subsys_initcall(blk_mq_init); |
| 6 5 5 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/export.h> #include <linux/netfilter/ipset/pfxlen.h> /* Prefixlen maps for fast conversions, by Jan Engelhardt. */ #ifdef E #undef E #endif #define PREFIXES_MAP \ E(0x00000000, 0x00000000, 0x00000000, 0x00000000), \ E(0x80000000, 0x00000000, 0x00000000, 0x00000000), \ E(0xC0000000, 0x00000000, 0x00000000, 0x00000000), \ E(0xE0000000, 0x00000000, 0x00000000, 0x00000000), \ E(0xF0000000, 0x00000000, 0x00000000, 0x00000000), \ E(0xF8000000, 0x00000000, 0x00000000, 0x00000000), \ E(0xFC000000, 0x00000000, 0x00000000, 0x00000000), \ E(0xFE000000, 0x00000000, 0x00000000, 0x00000000), \ E(0xFF000000, 0x00000000, 0x00000000, 0x00000000), \ E(0xFF800000, 0x00000000, 0x00000000, 0x00000000), \ E(0xFFC00000, 0x00000000, 0x00000000, 0x00000000), \ E(0xFFE00000, 0x00000000, 0x00000000, 0x00000000), \ E(0xFFF00000, 0x00000000, 0x00000000, 0x00000000), \ E(0xFFF80000, 0x00000000, 0x00000000, 0x00000000), \ E(0xFFFC0000, 0x00000000, 0x00000000, 0x00000000), \ E(0xFFFE0000, 0x00000000, 0x00000000, 0x00000000), \ E(0xFFFF0000, 0x00000000, 0x00000000, 0x00000000), \ E(0xFFFF8000, 0x00000000, 0x00000000, 0x00000000), \ E(0xFFFFC000, 0x00000000, 0x00000000, 0x00000000), \ E(0xFFFFE000, 0x00000000, 0x00000000, 0x00000000), \ E(0xFFFFF000, 0x00000000, 0x00000000, 0x00000000), \ E(0xFFFFF800, 0x00000000, 0x00000000, 0x00000000), \ E(0xFFFFFC00, 0x00000000, 0x00000000, 0x00000000), \ E(0xFFFFFE00, 0x00000000, 0x00000000, 0x00000000), \ E(0xFFFFFF00, 0x00000000, 0x00000000, 0x00000000), \ E(0xFFFFFF80, 0x00000000, 0x00000000, 0x00000000), \ E(0xFFFFFFC0, 0x00000000, 0x00000000, 0x00000000), \ E(0xFFFFFFE0, 0x00000000, 0x00000000, 0x00000000), \ E(0xFFFFFFF0, 0x00000000, 0x00000000, 0x00000000), \ E(0xFFFFFFF8, 0x00000000, 0x00000000, 0x00000000), \ E(0xFFFFFFFC, 0x00000000, 0x00000000, 0x00000000), \ E(0xFFFFFFFE, 0x00000000, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0x80000000, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xC0000000, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xE0000000, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xF0000000, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xF8000000, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xFC000000, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xFE000000, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xFF000000, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xFF800000, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFC00000, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFE00000, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFF00000, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFF80000, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFC0000, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFE0000, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFF0000, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFF8000, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFC000, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFE000, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFF000, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFF800, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFC00, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFE00, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFF00, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFF80, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFC0, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFE0, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFF0, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFF8, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFC, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFE, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0x80000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x80000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE), \ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF), #define E(a, b, c, d) \ {.ip6 = { \ htonl(a), htonl(b), \ htonl(c), htonl(d), \ } } /* This table works for both IPv4 and IPv6; * just use prefixlen_netmask_map[prefixlength].ip. */ const union nf_inet_addr ip_set_netmask_map[] = { PREFIXES_MAP }; EXPORT_SYMBOL_GPL(ip_set_netmask_map); #undef E #define E(a, b, c, d) \ {.ip6 = { (__force __be32)a, (__force __be32)b, \ (__force __be32)c, (__force __be32)d, \ } } /* This table works for both IPv4 and IPv6; * just use prefixlen_hostmask_map[prefixlength].ip. */ const union nf_inet_addr ip_set_hostmask_map[] = { PREFIXES_MAP }; EXPORT_SYMBOL_GPL(ip_set_hostmask_map); /* Find the largest network which matches the range from left, in host order. */ u32 ip_set_range_to_cidr(u32 from, u32 to, u8 *cidr) { u32 last; u8 i; for (i = 1; i < 32; i++) { if ((from & ip_set_hostmask(i)) != from) continue; last = from | ~ip_set_hostmask(i); if (!after(last, to)) { *cidr = i; return last; } } *cidr = 32; return from; } EXPORT_SYMBOL_GPL(ip_set_range_to_cidr); |
| 7 7 7 10 10 10 7 7 7 6 1 7 7 6 7 7 7 7 7 7 7 6 7 7 6 7 7 7 7 7 7 7 7 3 3 3 3 28 27 27 28 28 13 14 13 14 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2016 Qualcomm Atheros, Inc * * Based on net/sched/sch_fq_codel.c */ #ifndef __NET_SCHED_FQ_IMPL_H #define __NET_SCHED_FQ_IMPL_H #include <net/fq.h> /* functions that are embedded into includer */ static void __fq_adjust_removal(struct fq *fq, struct fq_flow *flow, unsigned int packets, unsigned int bytes, unsigned int truesize) { struct fq_tin *tin = flow->tin; int idx; tin->backlog_bytes -= bytes; tin->backlog_packets -= packets; flow->backlog -= bytes; fq->backlog -= packets; fq->memory_usage -= truesize; if (flow->backlog) return; if (flow == &tin->default_flow) { list_del_init(&tin->tin_list); return; } idx = flow - fq->flows; __clear_bit(idx, fq->flows_bitmap); } static void fq_adjust_removal(struct fq *fq, struct fq_flow *flow, struct sk_buff *skb) { __fq_adjust_removal(fq, flow, 1, skb->len, skb->truesize); } static struct sk_buff *fq_flow_dequeue(struct fq *fq, struct fq_flow *flow) { struct sk_buff *skb; lockdep_assert_held(&fq->lock); skb = __skb_dequeue(&flow->queue); if (!skb) return NULL; fq_adjust_removal(fq, flow, skb); return skb; } static int fq_flow_drop(struct fq *fq, struct fq_flow *flow, fq_skb_free_t free_func) { unsigned int packets = 0, bytes = 0, truesize = 0; struct fq_tin *tin = flow->tin; struct sk_buff *skb; int pending; lockdep_assert_held(&fq->lock); pending = min_t(int, 32, skb_queue_len(&flow->queue) / 2); do { skb = __skb_dequeue(&flow->queue); if (!skb) break; packets++; bytes += skb->len; truesize += skb->truesize; free_func(fq, tin, flow, skb); } while (packets < pending); __fq_adjust_removal(fq, flow, packets, bytes, truesize); return packets; } static struct sk_buff *fq_tin_dequeue(struct fq *fq, struct fq_tin *tin, fq_tin_dequeue_t dequeue_func) { struct fq_flow *flow; struct list_head *head; struct sk_buff *skb; lockdep_assert_held(&fq->lock); begin: head = &tin->new_flows; if (list_empty(head)) { head = &tin->old_flows; if (list_empty(head)) return NULL; } flow = list_first_entry(head, struct fq_flow, flowchain); if (flow->deficit <= 0) { flow->deficit += fq->quantum; list_move_tail(&flow->flowchain, &tin->old_flows); goto begin; } skb = dequeue_func(fq, tin, flow); if (!skb) { /* force a pass through old_flows to prevent starvation */ if ((head == &tin->new_flows) && !list_empty(&tin->old_flows)) { list_move_tail(&flow->flowchain, &tin->old_flows); } else { list_del_init(&flow->flowchain); flow->tin = NULL; } goto begin; } flow->deficit -= skb->len; tin->tx_bytes += skb->len; tin->tx_packets++; return skb; } static u32 fq_flow_idx(struct fq *fq, struct sk_buff *skb) { u32 hash = skb_get_hash(skb); return reciprocal_scale(hash, fq->flows_cnt); } static struct fq_flow *fq_flow_classify(struct fq *fq, struct fq_tin *tin, u32 idx, struct sk_buff *skb) { struct fq_flow *flow; lockdep_assert_held(&fq->lock); flow = &fq->flows[idx]; if (flow->tin && flow->tin != tin) { flow = &tin->default_flow; tin->collisions++; fq->collisions++; } if (!flow->tin) tin->flows++; return flow; } static struct fq_flow *fq_find_fattest_flow(struct fq *fq) { struct fq_tin *tin; struct fq_flow *flow = NULL; u32 len = 0; int i; for_each_set_bit(i, fq->flows_bitmap, fq->flows_cnt) { struct fq_flow *cur = &fq->flows[i]; unsigned int cur_len; cur_len = cur->backlog; if (cur_len <= len) continue; flow = cur; len = cur_len; } list_for_each_entry(tin, &fq->tin_backlog, tin_list) { unsigned int cur_len = tin->default_flow.backlog; if (cur_len <= len) continue; flow = &tin->default_flow; len = cur_len; } return flow; } static void fq_tin_enqueue(struct fq *fq, struct fq_tin *tin, u32 idx, struct sk_buff *skb, fq_skb_free_t free_func) { struct fq_flow *flow; struct sk_buff *next; bool oom; lockdep_assert_held(&fq->lock); flow = fq_flow_classify(fq, tin, idx, skb); if (!flow->backlog) { if (flow != &tin->default_flow) __set_bit(idx, fq->flows_bitmap); else if (list_empty(&tin->tin_list)) list_add(&tin->tin_list, &fq->tin_backlog); } flow->tin = tin; skb_list_walk_safe(skb, skb, next) { skb_mark_not_on_list(skb); flow->backlog += skb->len; tin->backlog_bytes += skb->len; tin->backlog_packets++; fq->memory_usage += skb->truesize; fq->backlog++; __skb_queue_tail(&flow->queue, skb); } if (list_empty(&flow->flowchain)) { flow->deficit = fq->quantum; list_add_tail(&flow->flowchain, &tin->new_flows); } oom = (fq->memory_usage > fq->memory_limit); while (fq->backlog > fq->limit || oom) { flow = fq_find_fattest_flow(fq); if (!flow) return; if (!fq_flow_drop(fq, flow, free_func)) return; flow->tin->overlimit++; fq->overlimit++; if (oom) { fq->overmemory++; oom = (fq->memory_usage > fq->memory_limit); } } } static void fq_flow_filter(struct fq *fq, struct fq_flow *flow, fq_skb_filter_t filter_func, void *filter_data, fq_skb_free_t free_func) { struct fq_tin *tin = flow->tin; struct sk_buff *skb, *tmp; lockdep_assert_held(&fq->lock); skb_queue_walk_safe(&flow->queue, skb, tmp) { if (!filter_func(fq, tin, flow, skb, filter_data)) continue; __skb_unlink(skb, &flow->queue); fq_adjust_removal(fq, flow, skb); free_func(fq, tin, flow, skb); } } static void fq_tin_filter(struct fq *fq, struct fq_tin *tin, fq_skb_filter_t filter_func, void *filter_data, fq_skb_free_t free_func) { struct fq_flow *flow; lockdep_assert_held(&fq->lock); list_for_each_entry(flow, &tin->new_flows, flowchain) fq_flow_filter(fq, flow, filter_func, filter_data, free_func); list_for_each_entry(flow, &tin->old_flows, flowchain) fq_flow_filter(fq, flow, filter_func, filter_data, free_func); } static void fq_flow_reset(struct fq *fq, struct fq_flow *flow, fq_skb_free_t free_func) { struct fq_tin *tin = flow->tin; struct sk_buff *skb; while ((skb = fq_flow_dequeue(fq, flow))) free_func(fq, tin, flow, skb); if (!list_empty(&flow->flowchain)) { list_del_init(&flow->flowchain); if (list_empty(&tin->new_flows) && list_empty(&tin->old_flows)) list_del_init(&tin->tin_list); } flow->tin = NULL; WARN_ON_ONCE(flow->backlog); } static void fq_tin_reset(struct fq *fq, struct fq_tin *tin, fq_skb_free_t free_func) { struct list_head *head; struct fq_flow *flow; for (;;) { head = &tin->new_flows; if (list_empty(head)) { head = &tin->old_flows; if (list_empty(head)) break; } flow = list_first_entry(head, struct fq_flow, flowchain); fq_flow_reset(fq, flow, free_func); } WARN_ON_ONCE(!list_empty(&tin->tin_list)); WARN_ON_ONCE(tin->backlog_bytes); WARN_ON_ONCE(tin->backlog_packets); } static void fq_flow_init(struct fq_flow *flow) { INIT_LIST_HEAD(&flow->flowchain); __skb_queue_head_init(&flow->queue); } static void fq_tin_init(struct fq_tin *tin) { INIT_LIST_HEAD(&tin->new_flows); INIT_LIST_HEAD(&tin->old_flows); INIT_LIST_HEAD(&tin->tin_list); fq_flow_init(&tin->default_flow); } static int fq_init(struct fq *fq, int flows_cnt) { int i; memset(fq, 0, sizeof(fq[0])); spin_lock_init(&fq->lock); INIT_LIST_HEAD(&fq->tin_backlog); fq->flows_cnt = max_t(u32, flows_cnt, 1); fq->quantum = 300; fq->limit = 8192; fq->memory_limit = 16 << 20; /* 16 MBytes */ fq->flows = kvcalloc(fq->flows_cnt, sizeof(fq->flows[0]), GFP_KERNEL); if (!fq->flows) return -ENOMEM; fq->flows_bitmap = bitmap_zalloc(fq->flows_cnt, GFP_KERNEL); if (!fq->flows_bitmap) { kvfree(fq->flows); fq->flows = NULL; return -ENOMEM; } for (i = 0; i < fq->flows_cnt; i++) fq_flow_init(&fq->flows[i]); return 0; } static void fq_reset(struct fq *fq, fq_skb_free_t free_func) { int i; for (i = 0; i < fq->flows_cnt; i++) fq_flow_reset(fq, &fq->flows[i], free_func); kvfree(fq->flows); fq->flows = NULL; bitmap_free(fq->flows_bitmap); fq->flows_bitmap = NULL; } #endif |
| 1 4 4 1 1 2 1 4 1 2 3 3 3 2 3 1 2 2 3 2 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 | // SPDX-License-Identifier: GPL-2.0-only /* * File: datagram.c * * Datagram (ISI) Phonet sockets * * Copyright (C) 2008 Nokia Corporation. * * Authors: Sakari Ailus <sakari.ailus@nokia.com> * Rémi Denis-Courmont */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/socket.h> #include <asm/ioctls.h> #include <net/sock.h> #include <linux/phonet.h> #include <linux/export.h> #include <net/phonet/phonet.h> static int pn_backlog_rcv(struct sock *sk, struct sk_buff *skb); /* associated socket ceases to exist */ static void pn_sock_close(struct sock *sk, long timeout) { sk_common_release(sk); } static int pn_ioctl(struct sock *sk, int cmd, int *karg) { struct sk_buff *skb; switch (cmd) { case SIOCINQ: spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); *karg = skb ? skb->len : 0; spin_unlock_bh(&sk->sk_receive_queue.lock); return 0; case SIOCPNADDRESOURCE: case SIOCPNDELRESOURCE: { u32 res = *karg; if (res >= 256) return -EINVAL; if (cmd == SIOCPNADDRESOURCE) return pn_sock_bind_res(sk, res); else return pn_sock_unbind_res(sk, res); } } return -ENOIOCTLCMD; } /* Destroy socket. All references are gone. */ static void pn_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_receive_queue); } static int pn_init(struct sock *sk) { sk->sk_destruct = pn_destruct; return 0; } static int pn_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { DECLARE_SOCKADDR(struct sockaddr_pn *, target, msg->msg_name); struct sk_buff *skb; int err; if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_NOSIGNAL| MSG_CMSG_COMPAT)) return -EOPNOTSUPP; if (target == NULL) return -EDESTADDRREQ; if (msg->msg_namelen < sizeof(struct sockaddr_pn)) return -EINVAL; if (target->spn_family != AF_PHONET) return -EAFNOSUPPORT; skb = sock_alloc_send_skb(sk, MAX_PHONET_HEADER + len, msg->msg_flags & MSG_DONTWAIT, &err); if (skb == NULL) return err; skb_reserve(skb, MAX_PHONET_HEADER); err = memcpy_from_msg((void *)skb_put(skb, len), msg, len); if (err < 0) { kfree_skb(skb); return err; } /* * Fill in the Phonet header and * finally pass the packet forwards. */ err = pn_skb_send(sk, skb, target); /* If ok, return len. */ return (err >= 0) ? len : err; } static int pn_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct sk_buff *skb = NULL; struct sockaddr_pn sa; int rval = -EOPNOTSUPP; int copylen; if (flags & ~(MSG_PEEK|MSG_TRUNC|MSG_DONTWAIT|MSG_NOSIGNAL| MSG_CMSG_COMPAT)) goto out_nofree; skb = skb_recv_datagram(sk, flags, &rval); if (skb == NULL) goto out_nofree; pn_skb_get_src_sockaddr(skb, &sa); copylen = skb->len; if (len < copylen) { msg->msg_flags |= MSG_TRUNC; copylen = len; } rval = skb_copy_datagram_msg(skb, 0, msg, copylen); if (rval) { rval = -EFAULT; goto out; } rval = (flags & MSG_TRUNC) ? skb->len : copylen; if (msg->msg_name != NULL) { __sockaddr_check_size(sizeof(sa)); memcpy(msg->msg_name, &sa, sizeof(sa)); *addr_len = sizeof(sa); } out: skb_free_datagram(sk, skb); out_nofree: return rval; } /* Queue an skb for a sock. */ static int pn_backlog_rcv(struct sock *sk, struct sk_buff *skb) { int err = sock_queue_rcv_skb(sk, skb); if (err < 0) kfree_skb(skb); return err ? NET_RX_DROP : NET_RX_SUCCESS; } /* Module registration */ static struct proto pn_proto = { .close = pn_sock_close, .ioctl = pn_ioctl, .init = pn_init, .sendmsg = pn_sendmsg, .recvmsg = pn_recvmsg, .backlog_rcv = pn_backlog_rcv, .hash = pn_sock_hash, .unhash = pn_sock_unhash, .get_port = pn_sock_get_port, .obj_size = sizeof(struct pn_sock), .owner = THIS_MODULE, .name = "PHONET", }; static const struct phonet_protocol pn_dgram_proto = { .ops = &phonet_dgram_ops, .prot = &pn_proto, .sock_type = SOCK_DGRAM, }; int __init isi_register(void) { return phonet_proto_register(PN_PROTO_PHONET, &pn_dgram_proto); } void __exit isi_unregister(void) { phonet_proto_unregister(PN_PROTO_PHONET, &pn_dgram_proto); } |
| 1 1 1 1 1 1 1 6 7 7 6 6 9 3 6 1 8 6 3 1 2 2 2 1 1 1 10 10 9 7 7 3 10 1 9 9 5 9 9 9 1 13 13 12 12 12 10 1 10 2 1 1 2 2 2 2 3 3 3 3 3 3 3 2 2 2 5 5 5 5 5 5 4 5 5 1 1 1 5 5 5 3 5 1 5 5 5 3 3 2 1 1 1 3 5 5 6 6 1 5 5 4 4 5 4 5 5 5 5 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/ceph/ceph_debug.h> #include <linux/backing-dev.h> #include <linux/ctype.h> #include <linux/fs.h> #include <linux/inet.h> #include <linux/in6.h> #include <linux/module.h> #include <linux/mount.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/statfs.h> #include <linux/string.h> #include "super.h" #include "mds_client.h" #include "cache.h" #include "crypto.h" #include <linux/ceph/ceph_features.h> #include <linux/ceph/decode.h> #include <linux/ceph/mon_client.h> #include <linux/ceph/auth.h> #include <linux/ceph/debugfs.h> #include <uapi/linux/magic.h> static DEFINE_SPINLOCK(ceph_fsc_lock); static LIST_HEAD(ceph_fsc_list); /* * Ceph superblock operations * * Handle the basics of mounting, unmounting. */ /* * super ops */ static void ceph_put_super(struct super_block *s) { struct ceph_fs_client *fsc = ceph_sb_to_fs_client(s); doutc(fsc->client, "begin\n"); ceph_fscrypt_free_dummy_policy(fsc); ceph_mdsc_close_sessions(fsc->mdsc); doutc(fsc->client, "done\n"); } static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) { struct ceph_fs_client *fsc = ceph_inode_to_fs_client(d_inode(dentry)); struct ceph_mon_client *monc = &fsc->client->monc; struct ceph_statfs st; int i, err; u64 data_pool; doutc(fsc->client, "begin\n"); if (fsc->mdsc->mdsmap->m_num_data_pg_pools == 1) { data_pool = fsc->mdsc->mdsmap->m_data_pg_pools[0]; } else { data_pool = CEPH_NOPOOL; } err = ceph_monc_do_statfs(monc, data_pool, &st); if (err < 0) return err; /* fill in kstatfs */ buf->f_type = CEPH_SUPER_MAGIC; /* ?? */ /* * Express utilization in terms of large blocks to avoid * overflow on 32-bit machines. */ buf->f_frsize = 1 << CEPH_BLOCK_SHIFT; /* * By default use root quota for stats; fallback to overall filesystem * usage if using 'noquotadf' mount option or if the root dir doesn't * have max_bytes quota set. */ if (ceph_test_mount_opt(fsc, NOQUOTADF) || !ceph_quota_update_statfs(fsc, buf)) { buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10); buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); } /* * NOTE: for the time being, we make bsize == frsize to humor * not-yet-ancient versions of glibc that are broken. * Someday, we will probably want to report a real block * size... whatever that may mean for a network file system! */ buf->f_bsize = buf->f_frsize; buf->f_files = le64_to_cpu(st.num_objects); buf->f_ffree = -1; buf->f_namelen = NAME_MAX; /* Must convert the fsid, for consistent values across arches */ buf->f_fsid.val[0] = 0; mutex_lock(&monc->mutex); for (i = 0 ; i < sizeof(monc->monmap->fsid) / sizeof(__le32) ; ++i) buf->f_fsid.val[0] ^= le32_to_cpu(((__le32 *)&monc->monmap->fsid)[i]); mutex_unlock(&monc->mutex); /* fold the fs_cluster_id into the upper bits */ buf->f_fsid.val[1] = monc->fs_cluster_id; doutc(fsc->client, "done\n"); return 0; } static int ceph_sync_fs(struct super_block *sb, int wait) { struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); struct ceph_client *cl = fsc->client; if (!wait) { doutc(cl, "(non-blocking)\n"); ceph_flush_dirty_caps(fsc->mdsc); ceph_flush_cap_releases(fsc->mdsc); doutc(cl, "(non-blocking) done\n"); return 0; } doutc(cl, "(blocking)\n"); ceph_osdc_sync(&fsc->client->osdc); ceph_mdsc_sync(fsc->mdsc); doutc(cl, "(blocking) done\n"); return 0; } /* * mount options */ enum { Opt_wsize, Opt_rsize, Opt_rasize, Opt_caps_wanted_delay_min, Opt_caps_wanted_delay_max, Opt_caps_max, Opt_readdir_max_entries, Opt_readdir_max_bytes, Opt_congestion_kb, /* int args above */ Opt_snapdirname, Opt_mds_namespace, Opt_recover_session, Opt_source, Opt_mon_addr, Opt_test_dummy_encryption, /* string args above */ Opt_dirstat, Opt_rbytes, Opt_asyncreaddir, Opt_dcache, Opt_ino32, Opt_fscache, Opt_poolperm, Opt_require_active_mds, Opt_acl, Opt_quotadf, Opt_copyfrom, Opt_wsync, Opt_pagecache, Opt_sparseread, }; enum ceph_recover_session_mode { ceph_recover_session_no, ceph_recover_session_clean }; static const struct constant_table ceph_param_recover[] = { { "no", ceph_recover_session_no }, { "clean", ceph_recover_session_clean }, {} }; static const struct fs_parameter_spec ceph_mount_parameters[] = { fsparam_flag_no ("acl", Opt_acl), fsparam_flag_no ("asyncreaddir", Opt_asyncreaddir), fsparam_s32 ("caps_max", Opt_caps_max), fsparam_u32 ("caps_wanted_delay_max", Opt_caps_wanted_delay_max), fsparam_u32 ("caps_wanted_delay_min", Opt_caps_wanted_delay_min), fsparam_u32 ("write_congestion_kb", Opt_congestion_kb), fsparam_flag_no ("copyfrom", Opt_copyfrom), fsparam_flag_no ("dcache", Opt_dcache), fsparam_flag_no ("dirstat", Opt_dirstat), fsparam_flag_no ("fsc", Opt_fscache), // fsc|nofsc fsparam_string ("fsc", Opt_fscache), // fsc=... fsparam_flag_no ("ino32", Opt_ino32), fsparam_string ("mds_namespace", Opt_mds_namespace), fsparam_string ("mon_addr", Opt_mon_addr), fsparam_flag_no ("poolperm", Opt_poolperm), fsparam_flag_no ("quotadf", Opt_quotadf), fsparam_u32 ("rasize", Opt_rasize), fsparam_flag_no ("rbytes", Opt_rbytes), fsparam_u32 ("readdir_max_bytes", Opt_readdir_max_bytes), fsparam_u32 ("readdir_max_entries", Opt_readdir_max_entries), fsparam_enum ("recover_session", Opt_recover_session, ceph_param_recover), fsparam_flag_no ("require_active_mds", Opt_require_active_mds), fsparam_u32 ("rsize", Opt_rsize), fsparam_string ("snapdirname", Opt_snapdirname), fsparam_string ("source", Opt_source), fsparam_flag ("test_dummy_encryption", Opt_test_dummy_encryption), fsparam_string ("test_dummy_encryption", Opt_test_dummy_encryption), fsparam_u32 ("wsize", Opt_wsize), fsparam_flag_no ("wsync", Opt_wsync), fsparam_flag_no ("pagecache", Opt_pagecache), fsparam_flag_no ("sparseread", Opt_sparseread), {} }; struct ceph_parse_opts_ctx { struct ceph_options *copts; struct ceph_mount_options *opts; }; /* * Remove adjacent slashes and then the trailing slash, unless it is * the only remaining character. * * E.g. "//dir1////dir2///" --> "/dir1/dir2", "///" --> "/". */ static void canonicalize_path(char *path) { int i, j = 0; for (i = 0; path[i] != '\0'; i++) { if (path[i] != '/' || j < 1 || path[j - 1] != '/') path[j++] = path[i]; } if (j > 1 && path[j - 1] == '/') j--; path[j] = '\0'; } /* * Check if the mds namespace in ceph_mount_options matches * the passed in namespace string. First time match (when * ->mds_namespace is NULL) is treated specially, since * ->mds_namespace needs to be initialized by the caller. */ static int namespace_equals(struct ceph_mount_options *fsopt, const char *namespace, size_t len) { return !(fsopt->mds_namespace && (strlen(fsopt->mds_namespace) != len || strncmp(fsopt->mds_namespace, namespace, len))); } static int ceph_parse_old_source(const char *dev_name, const char *dev_name_end, struct fs_context *fc) { int r; struct ceph_parse_opts_ctx *pctx = fc->fs_private; struct ceph_mount_options *fsopt = pctx->opts; if (*dev_name_end != ':') return invalfc(fc, "separator ':' missing in source"); r = ceph_parse_mon_ips(dev_name, dev_name_end - dev_name, pctx->copts, fc->log.log, ','); if (r) return r; fsopt->new_dev_syntax = false; return 0; } static int ceph_parse_new_source(const char *dev_name, const char *dev_name_end, struct fs_context *fc) { size_t len; struct ceph_fsid fsid; struct ceph_parse_opts_ctx *pctx = fc->fs_private; struct ceph_options *opts = pctx->copts; struct ceph_mount_options *fsopt = pctx->opts; const char *name_start = dev_name; const char *fsid_start, *fs_name_start; if (*dev_name_end != '=') { dout("separator '=' missing in source"); return -EINVAL; } fsid_start = strchr(dev_name, '@'); if (!fsid_start) return invalfc(fc, "missing cluster fsid"); len = fsid_start - name_start; kfree(opts->name); opts->name = kstrndup(name_start, len, GFP_KERNEL); if (!opts->name) return -ENOMEM; dout("using %s entity name", opts->name); ++fsid_start; /* start of cluster fsid */ fs_name_start = strchr(fsid_start, '.'); if (!fs_name_start) return invalfc(fc, "missing file system name"); if (ceph_parse_fsid(fsid_start, &fsid)) return invalfc(fc, "Invalid FSID"); ++fs_name_start; /* start of file system name */ len = dev_name_end - fs_name_start; if (!namespace_equals(fsopt, fs_name_start, len)) return invalfc(fc, "Mismatching mds_namespace"); kfree(fsopt->mds_namespace); fsopt->mds_namespace = kstrndup(fs_name_start, len, GFP_KERNEL); if (!fsopt->mds_namespace) return -ENOMEM; dout("file system (mds namespace) '%s'\n", fsopt->mds_namespace); fsopt->new_dev_syntax = true; return 0; } /* * Parse the source parameter for new device format. Distinguish the device * spec from the path. Try parsing new device format and fallback to old * format if needed. * * New device syntax will looks like: * <device_spec>=/<path> * where * <device_spec> is name@fsid.fsname * <path> is optional, but if present must begin with '/' * (monitor addresses are passed via mount option) * * Old device syntax is: * <server_spec>[,<server_spec>...]:[<path>] * where * <server_spec> is <ip>[:<port>] * <path> is optional, but if present must begin with '/' */ static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc) { struct ceph_parse_opts_ctx *pctx = fc->fs_private; struct ceph_mount_options *fsopt = pctx->opts; char *dev_name = param->string, *dev_name_end; int ret; dout("'%s'\n", dev_name); if (!dev_name || !*dev_name) return invalfc(fc, "Empty source"); dev_name_end = strchr(dev_name, '/'); if (dev_name_end) { /* * The server_path will include the whole chars from userland * including the leading '/'. */ kfree(fsopt->server_path); fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL); if (!fsopt->server_path) return -ENOMEM; canonicalize_path(fsopt->server_path); } else { dev_name_end = dev_name + strlen(dev_name); } dev_name_end--; /* back up to separator */ if (dev_name_end < dev_name) return invalfc(fc, "Path missing in source"); dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); if (fsopt->server_path) dout("server path '%s'\n", fsopt->server_path); dout("trying new device syntax"); ret = ceph_parse_new_source(dev_name, dev_name_end, fc); if (ret) { if (ret != -EINVAL) return ret; dout("trying old device syntax"); ret = ceph_parse_old_source(dev_name, dev_name_end, fc); if (ret) return ret; } fc->source = param->string; param->string = NULL; return 0; } static int ceph_parse_mon_addr(struct fs_parameter *param, struct fs_context *fc) { struct ceph_parse_opts_ctx *pctx = fc->fs_private; struct ceph_mount_options *fsopt = pctx->opts; kfree(fsopt->mon_addr); fsopt->mon_addr = param->string; param->string = NULL; return ceph_parse_mon_ips(fsopt->mon_addr, strlen(fsopt->mon_addr), pctx->copts, fc->log.log, '/'); } static int ceph_parse_mount_param(struct fs_context *fc, struct fs_parameter *param) { struct ceph_parse_opts_ctx *pctx = fc->fs_private; struct ceph_mount_options *fsopt = pctx->opts; struct fs_parse_result result; unsigned int mode; int token, ret; ret = ceph_parse_param(param, pctx->copts, fc->log.log); if (ret != -ENOPARAM) return ret; token = fs_parse(fc, ceph_mount_parameters, param, &result); dout("%s: fs_parse '%s' token %d\n",__func__, param->key, token); if (token < 0) return token; switch (token) { case Opt_snapdirname: if (strlen(param->string) > NAME_MAX) return invalfc(fc, "snapdirname too long"); kfree(fsopt->snapdir_name); fsopt->snapdir_name = param->string; param->string = NULL; break; case Opt_mds_namespace: if (!namespace_equals(fsopt, param->string, strlen(param->string))) return invalfc(fc, "Mismatching mds_namespace"); kfree(fsopt->mds_namespace); fsopt->mds_namespace = param->string; param->string = NULL; break; case Opt_recover_session: mode = result.uint_32; if (mode == ceph_recover_session_no) fsopt->flags &= ~CEPH_MOUNT_OPT_CLEANRECOVER; else if (mode == ceph_recover_session_clean) fsopt->flags |= CEPH_MOUNT_OPT_CLEANRECOVER; else BUG(); break; case Opt_source: if (fc->source) return invalfc(fc, "Multiple sources specified"); return ceph_parse_source(param, fc); case Opt_mon_addr: return ceph_parse_mon_addr(param, fc); case Opt_wsize: if (result.uint_32 < PAGE_SIZE || result.uint_32 > CEPH_MAX_WRITE_SIZE) goto out_of_range; fsopt->wsize = ALIGN(result.uint_32, PAGE_SIZE); break; case Opt_rsize: if (result.uint_32 < PAGE_SIZE || result.uint_32 > CEPH_MAX_READ_SIZE) goto out_of_range; fsopt->rsize = ALIGN(result.uint_32, PAGE_SIZE); break; case Opt_rasize: fsopt->rasize = ALIGN(result.uint_32, PAGE_SIZE); break; case Opt_caps_wanted_delay_min: if (result.uint_32 < 1) goto out_of_range; fsopt->caps_wanted_delay_min = result.uint_32; break; case Opt_caps_wanted_delay_max: if (result.uint_32 < 1) goto out_of_range; fsopt->caps_wanted_delay_max = result.uint_32; break; case Opt_caps_max: if (result.int_32 < 0) goto out_of_range; fsopt->caps_max = result.int_32; break; case Opt_readdir_max_entries: if (result.uint_32 < 1) goto out_of_range; fsopt->max_readdir = result.uint_32; break; case Opt_readdir_max_bytes: if (result.uint_32 < PAGE_SIZE && result.uint_32 != 0) goto out_of_range; fsopt->max_readdir_bytes = result.uint_32; break; case Opt_congestion_kb: if (result.uint_32 < 1024) /* at least 1M */ goto out_of_range; fsopt->congestion_kb = result.uint_32; break; case Opt_dirstat: if (!result.negated) fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT; else fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT; break; case Opt_rbytes: if (!result.negated) fsopt->flags |= CEPH_MOUNT_OPT_RBYTES; else fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES; break; case Opt_asyncreaddir: if (!result.negated) fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR; else fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; break; case Opt_dcache: if (!result.negated) fsopt->flags |= CEPH_MOUNT_OPT_DCACHE; else fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE; break; case Opt_ino32: if (!result.negated) fsopt->flags |= CEPH_MOUNT_OPT_INO32; else fsopt->flags &= ~CEPH_MOUNT_OPT_INO32; break; case Opt_fscache: #ifdef CONFIG_CEPH_FSCACHE kfree(fsopt->fscache_uniq); fsopt->fscache_uniq = NULL; if (result.negated) { fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; } else { fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE; fsopt->fscache_uniq = param->string; param->string = NULL; } break; #else return invalfc(fc, "fscache support is disabled"); #endif case Opt_poolperm: if (!result.negated) fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM; else fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM; break; case Opt_require_active_mds: if (!result.negated) fsopt->flags &= ~CEPH_MOUNT_OPT_MOUNTWAIT; else fsopt->flags |= CEPH_MOUNT_OPT_MOUNTWAIT; break; case Opt_quotadf: if (!result.negated) fsopt->flags &= ~CEPH_MOUNT_OPT_NOQUOTADF; else fsopt->flags |= CEPH_MOUNT_OPT_NOQUOTADF; break; case Opt_copyfrom: if (!result.negated) fsopt->flags &= ~CEPH_MOUNT_OPT_NOCOPYFROM; else fsopt->flags |= CEPH_MOUNT_OPT_NOCOPYFROM; break; case Opt_acl: if (!result.negated) { #ifdef CONFIG_CEPH_FS_POSIX_ACL fc->sb_flags |= SB_POSIXACL; #else return invalfc(fc, "POSIX ACL support is disabled"); #endif } else { fc->sb_flags &= ~SB_POSIXACL; } break; case Opt_wsync: if (!result.negated) fsopt->flags &= ~CEPH_MOUNT_OPT_ASYNC_DIROPS; else fsopt->flags |= CEPH_MOUNT_OPT_ASYNC_DIROPS; break; case Opt_pagecache: if (result.negated) fsopt->flags |= CEPH_MOUNT_OPT_NOPAGECACHE; else fsopt->flags &= ~CEPH_MOUNT_OPT_NOPAGECACHE; break; case Opt_sparseread: if (result.negated) fsopt->flags &= ~CEPH_MOUNT_OPT_SPARSEREAD; else fsopt->flags |= CEPH_MOUNT_OPT_SPARSEREAD; break; case Opt_test_dummy_encryption: #ifdef CONFIG_FS_ENCRYPTION fscrypt_free_dummy_policy(&fsopt->dummy_enc_policy); ret = fscrypt_parse_test_dummy_encryption(param, &fsopt->dummy_enc_policy); if (ret == -EINVAL) { warnfc(fc, "Value of option \"%s\" is unrecognized", param->key); } else if (ret == -EEXIST) { warnfc(fc, "Conflicting test_dummy_encryption options"); ret = -EINVAL; } #else warnfc(fc, "FS encryption not supported: test_dummy_encryption mount option ignored"); #endif break; default: BUG(); } return 0; out_of_range: return invalfc(fc, "%s out of range", param->key); } static void destroy_mount_options(struct ceph_mount_options *args) { dout("destroy_mount_options %p\n", args); if (!args) return; kfree(args->snapdir_name); kfree(args->mds_namespace); kfree(args->server_path); kfree(args->fscache_uniq); kfree(args->mon_addr); fscrypt_free_dummy_policy(&args->dummy_enc_policy); kfree(args); } static int strcmp_null(const char *s1, const char *s2) { if (!s1 && !s2) return 0; if (s1 && !s2) return -1; if (!s1 && s2) return 1; return strcmp(s1, s2); } static int compare_mount_options(struct ceph_mount_options *new_fsopt, struct ceph_options *new_opt, struct ceph_fs_client *fsc) { struct ceph_mount_options *fsopt1 = new_fsopt; struct ceph_mount_options *fsopt2 = fsc->mount_options; int ofs = offsetof(struct ceph_mount_options, snapdir_name); int ret; ret = memcmp(fsopt1, fsopt2, ofs); if (ret) return ret; ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name); if (ret) return ret; ret = strcmp_null(fsopt1->mds_namespace, fsopt2->mds_namespace); if (ret) return ret; ret = strcmp_null(fsopt1->server_path, fsopt2->server_path); if (ret) return ret; ret = strcmp_null(fsopt1->fscache_uniq, fsopt2->fscache_uniq); if (ret) return ret; ret = strcmp_null(fsopt1->mon_addr, fsopt2->mon_addr); if (ret) return ret; return ceph_compare_options(new_opt, fsc->client); } /** * ceph_show_options - Show mount options in /proc/mounts * @m: seq_file to write to * @root: root of that (sub)tree */ static int ceph_show_options(struct seq_file *m, struct dentry *root) { struct ceph_fs_client *fsc = ceph_sb_to_fs_client(root->d_sb); struct ceph_mount_options *fsopt = fsc->mount_options; size_t pos; int ret; /* a comma between MNT/MS and client options */ seq_putc(m, ','); pos = m->count; ret = ceph_print_client_options(m, fsc->client, false); if (ret) return ret; /* retract our comma if no client options */ if (m->count == pos) m->count--; if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT) seq_puts(m, ",dirstat"); if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES)) seq_puts(m, ",rbytes"); if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR) seq_puts(m, ",noasyncreaddir"); if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0) seq_puts(m, ",nodcache"); if (fsopt->flags & CEPH_MOUNT_OPT_INO32) seq_puts(m, ",ino32"); if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) { seq_show_option(m, "fsc", fsopt->fscache_uniq); } if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM) seq_puts(m, ",nopoolperm"); if (fsopt->flags & CEPH_MOUNT_OPT_NOQUOTADF) seq_puts(m, ",noquotadf"); #ifdef CONFIG_CEPH_FS_POSIX_ACL if (root->d_sb->s_flags & SB_POSIXACL) seq_puts(m, ",acl"); else seq_puts(m, ",noacl"); #endif if ((fsopt->flags & CEPH_MOUNT_OPT_NOCOPYFROM) == 0) seq_puts(m, ",copyfrom"); /* dump mds_namespace when old device syntax is in use */ if (fsopt->mds_namespace && !fsopt->new_dev_syntax) seq_show_option(m, "mds_namespace", fsopt->mds_namespace); if (fsopt->mon_addr) seq_printf(m, ",mon_addr=%s", fsopt->mon_addr); if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER) seq_show_option(m, "recover_session", "clean"); if (!(fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)) seq_puts(m, ",wsync"); if (fsopt->flags & CEPH_MOUNT_OPT_NOPAGECACHE) seq_puts(m, ",nopagecache"); if (fsopt->flags & CEPH_MOUNT_OPT_SPARSEREAD) seq_puts(m, ",sparseread"); fscrypt_show_test_dummy_encryption(m, ',', root->d_sb); if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) seq_printf(m, ",wsize=%u", fsopt->wsize); if (fsopt->rsize != CEPH_MAX_READ_SIZE) seq_printf(m, ",rsize=%u", fsopt->rsize); if (fsopt->rasize != CEPH_RASIZE_DEFAULT) seq_printf(m, ",rasize=%u", fsopt->rasize); if (fsopt->congestion_kb != default_congestion_kb()) seq_printf(m, ",write_congestion_kb=%u", fsopt->congestion_kb); if (fsopt->caps_max) seq_printf(m, ",caps_max=%d", fsopt->caps_max); if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) seq_printf(m, ",caps_wanted_delay_min=%u", fsopt->caps_wanted_delay_min); if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT) seq_printf(m, ",caps_wanted_delay_max=%u", fsopt->caps_wanted_delay_max); if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT) seq_printf(m, ",readdir_max_entries=%u", fsopt->max_readdir); if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) seq_printf(m, ",readdir_max_bytes=%u", fsopt->max_readdir_bytes); if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) seq_show_option(m, "snapdirname", fsopt->snapdir_name); return 0; } /* * handle any mon messages the standard library doesn't understand. * return error if we don't either. */ static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg) { struct ceph_fs_client *fsc = client->private; int type = le16_to_cpu(msg->hdr.type); switch (type) { case CEPH_MSG_MDS_MAP: ceph_mdsc_handle_mdsmap(fsc->mdsc, msg); return 0; case CEPH_MSG_FS_MAP_USER: ceph_mdsc_handle_fsmap(fsc->mdsc, msg); return 0; default: return -1; } } /* * create a new fs client * * Success or not, this function consumes @fsopt and @opt. */ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, struct ceph_options *opt) { struct ceph_fs_client *fsc; int err; fsc = kzalloc(sizeof(*fsc), GFP_KERNEL); if (!fsc) { err = -ENOMEM; goto fail; } fsc->client = ceph_create_client(opt, fsc); if (IS_ERR(fsc->client)) { err = PTR_ERR(fsc->client); goto fail; } opt = NULL; /* fsc->client now owns this */ fsc->client->extra_mon_dispatch = extra_mon_dispatch; ceph_set_opt(fsc->client, ABORT_ON_FULL); if (!fsopt->mds_namespace) { ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true); } else { ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_FSMAP, 0, false); } fsc->mount_options = fsopt; fsc->sb = NULL; fsc->mount_state = CEPH_MOUNT_MOUNTING; fsc->filp_gen = 1; fsc->have_copy_from2 = true; atomic_long_set(&fsc->writeback_count, 0); fsc->write_congested = false; err = -ENOMEM; /* * The number of concurrent works can be high but they don't need * to be processed in parallel, limit concurrency. */ fsc->inode_wq = alloc_workqueue("ceph-inode", WQ_UNBOUND, 0); if (!fsc->inode_wq) goto fail_client; fsc->cap_wq = alloc_workqueue("ceph-cap", 0, 1); if (!fsc->cap_wq) goto fail_inode_wq; hash_init(fsc->async_unlink_conflict); spin_lock_init(&fsc->async_unlink_conflict_lock); spin_lock(&ceph_fsc_lock); list_add_tail(&fsc->metric_wakeup, &ceph_fsc_list); spin_unlock(&ceph_fsc_lock); return fsc; fail_inode_wq: destroy_workqueue(fsc->inode_wq); fail_client: ceph_destroy_client(fsc->client); fail: kfree(fsc); if (opt) ceph_destroy_options(opt); destroy_mount_options(fsopt); return ERR_PTR(err); } static void flush_fs_workqueues(struct ceph_fs_client *fsc) { flush_workqueue(fsc->inode_wq); flush_workqueue(fsc->cap_wq); } static void destroy_fs_client(struct ceph_fs_client *fsc) { doutc(fsc->client, "%p\n", fsc); spin_lock(&ceph_fsc_lock); list_del(&fsc->metric_wakeup); spin_unlock(&ceph_fsc_lock); ceph_mdsc_destroy(fsc); destroy_workqueue(fsc->inode_wq); destroy_workqueue(fsc->cap_wq); destroy_mount_options(fsc->mount_options); ceph_destroy_client(fsc->client); kfree(fsc); dout("%s: %p done\n", __func__, fsc); } /* * caches */ struct kmem_cache *ceph_inode_cachep; struct kmem_cache *ceph_cap_cachep; struct kmem_cache *ceph_cap_snap_cachep; struct kmem_cache *ceph_cap_flush_cachep; struct kmem_cache *ceph_dentry_cachep; struct kmem_cache *ceph_file_cachep; struct kmem_cache *ceph_dir_file_cachep; struct kmem_cache *ceph_mds_request_cachep; mempool_t *ceph_wb_pagevec_pool; static void ceph_inode_init_once(void *foo) { struct ceph_inode_info *ci = foo; inode_init_once(&ci->netfs.inode); } static int __init init_caches(void) { int error = -ENOMEM; ceph_inode_cachep = kmem_cache_create("ceph_inode_info", sizeof(struct ceph_inode_info), __alignof__(struct ceph_inode_info), SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, ceph_inode_init_once); if (!ceph_inode_cachep) return -ENOMEM; ceph_cap_cachep = KMEM_CACHE(ceph_cap, 0); if (!ceph_cap_cachep) goto bad_cap; ceph_cap_snap_cachep = KMEM_CACHE(ceph_cap_snap, 0); if (!ceph_cap_snap_cachep) goto bad_cap_snap; ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush, SLAB_RECLAIM_ACCOUNT); if (!ceph_cap_flush_cachep) goto bad_cap_flush; ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info, SLAB_RECLAIM_ACCOUNT); if (!ceph_dentry_cachep) goto bad_dentry; ceph_file_cachep = KMEM_CACHE(ceph_file_info, 0); if (!ceph_file_cachep) goto bad_file; ceph_dir_file_cachep = KMEM_CACHE(ceph_dir_file_info, 0); if (!ceph_dir_file_cachep) goto bad_dir_file; ceph_mds_request_cachep = KMEM_CACHE(ceph_mds_request, 0); if (!ceph_mds_request_cachep) goto bad_mds_req; ceph_wb_pagevec_pool = mempool_create_kmalloc_pool(10, (CEPH_MAX_WRITE_SIZE >> PAGE_SHIFT) * sizeof(struct page *)); if (!ceph_wb_pagevec_pool) goto bad_pagevec_pool; return 0; bad_pagevec_pool: kmem_cache_destroy(ceph_mds_request_cachep); bad_mds_req: kmem_cache_destroy(ceph_dir_file_cachep); bad_dir_file: kmem_cache_destroy(ceph_file_cachep); bad_file: kmem_cache_destroy(ceph_dentry_cachep); bad_dentry: kmem_cache_destroy(ceph_cap_flush_cachep); bad_cap_flush: kmem_cache_destroy(ceph_cap_snap_cachep); bad_cap_snap: kmem_cache_destroy(ceph_cap_cachep); bad_cap: kmem_cache_destroy(ceph_inode_cachep); return error; } static void destroy_caches(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(ceph_inode_cachep); kmem_cache_destroy(ceph_cap_cachep); kmem_cache_destroy(ceph_cap_snap_cachep); kmem_cache_destroy(ceph_cap_flush_cachep); kmem_cache_destroy(ceph_dentry_cachep); kmem_cache_destroy(ceph_file_cachep); kmem_cache_destroy(ceph_dir_file_cachep); kmem_cache_destroy(ceph_mds_request_cachep); mempool_destroy(ceph_wb_pagevec_pool); } static void __ceph_umount_begin(struct ceph_fs_client *fsc) { ceph_osdc_abort_requests(&fsc->client->osdc, -EIO); ceph_mdsc_force_umount(fsc->mdsc); fsc->filp_gen++; // invalidate open files } /* * ceph_umount_begin - initiate forced umount. Tear down the * mount, skipping steps that may hang while waiting for server(s). */ void ceph_umount_begin(struct super_block *sb) { struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); doutc(fsc->client, "starting forced umount\n"); fsc->mount_state = CEPH_MOUNT_SHUTDOWN; __ceph_umount_begin(fsc); } static const struct super_operations ceph_super_ops = { .alloc_inode = ceph_alloc_inode, .free_inode = ceph_free_inode, .write_inode = ceph_write_inode, .drop_inode = generic_delete_inode, .evict_inode = ceph_evict_inode, .sync_fs = ceph_sync_fs, .put_super = ceph_put_super, .show_options = ceph_show_options, .statfs = ceph_statfs, .umount_begin = ceph_umount_begin, }; /* * Bootstrap mount by opening the root directory. Note the mount * @started time from caller, and time out if this takes too long. */ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, const char *path, unsigned long started) { struct ceph_client *cl = fsc->client; struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req = NULL; int err; struct dentry *root; /* open dir */ doutc(cl, "opening '%s'\n", path); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); if (IS_ERR(req)) return ERR_CAST(req); req->r_path1 = kstrdup(path, GFP_NOFS); if (!req->r_path1) { root = ERR_PTR(-ENOMEM); goto out; } req->r_ino1.ino = CEPH_INO_ROOT; req->r_ino1.snap = CEPH_NOSNAP; req->r_started = started; req->r_timeout = fsc->client->options->mount_timeout; req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); if (err == 0) { struct inode *inode = req->r_target_inode; req->r_target_inode = NULL; doutc(cl, "success\n"); root = d_make_root(inode); if (!root) { root = ERR_PTR(-ENOMEM); goto out; } doutc(cl, "success, root dentry is %p\n", root); } else { root = ERR_PTR(err); } out: ceph_mdsc_put_request(req); return root; } #ifdef CONFIG_FS_ENCRYPTION static int ceph_apply_test_dummy_encryption(struct super_block *sb, struct fs_context *fc, struct ceph_mount_options *fsopt) { struct ceph_fs_client *fsc = sb->s_fs_info; if (!fscrypt_is_dummy_policy_set(&fsopt->dummy_enc_policy)) return 0; /* No changing encryption context on remount. */ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE && !fscrypt_is_dummy_policy_set(&fsc->fsc_dummy_enc_policy)) { if (fscrypt_dummy_policies_equal(&fsopt->dummy_enc_policy, &fsc->fsc_dummy_enc_policy)) return 0; errorfc(fc, "Can't set test_dummy_encryption on remount"); return -EINVAL; } /* Also make sure fsopt doesn't contain a conflicting value. */ if (fscrypt_is_dummy_policy_set(&fsc->fsc_dummy_enc_policy)) { if (fscrypt_dummy_policies_equal(&fsopt->dummy_enc_policy, &fsc->fsc_dummy_enc_policy)) return 0; errorfc(fc, "Conflicting test_dummy_encryption options"); return -EINVAL; } fsc->fsc_dummy_enc_policy = fsopt->dummy_enc_policy; memset(&fsopt->dummy_enc_policy, 0, sizeof(fsopt->dummy_enc_policy)); warnfc(fc, "test_dummy_encryption mode enabled"); return 0; } #else static int ceph_apply_test_dummy_encryption(struct super_block *sb, struct fs_context *fc, struct ceph_mount_options *fsopt) { return 0; } #endif /* * mount: join the ceph cluster, and open root directory. */ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, struct fs_context *fc) { struct ceph_client *cl = fsc->client; int err; unsigned long started = jiffies; /* note the start time */ struct dentry *root; doutc(cl, "mount start %p\n", fsc); mutex_lock(&fsc->client->mount_mutex); if (!fsc->sb->s_root) { const char *path = fsc->mount_options->server_path ? fsc->mount_options->server_path + 1 : ""; err = __ceph_open_session(fsc->client, started); if (err < 0) goto out; /* setup fscache */ if (fsc->mount_options->flags & CEPH_MOUNT_OPT_FSCACHE) { err = ceph_fscache_register_fs(fsc, fc); if (err < 0) goto out; } err = ceph_apply_test_dummy_encryption(fsc->sb, fc, fsc->mount_options); if (err) goto out; doutc(cl, "mount opening path '%s'\n", path); ceph_fs_debugfs_init(fsc); root = open_root_dentry(fsc, path, started); if (IS_ERR(root)) { err = PTR_ERR(root); goto out; } fsc->sb->s_root = dget(root); } else { root = dget(fsc->sb->s_root); } fsc->mount_state = CEPH_MOUNT_MOUNTED; doutc(cl, "mount success\n"); mutex_unlock(&fsc->client->mount_mutex); return root; out: mutex_unlock(&fsc->client->mount_mutex); ceph_fscrypt_free_dummy_policy(fsc); return ERR_PTR(err); } static int ceph_set_super(struct super_block *s, struct fs_context *fc) { struct ceph_fs_client *fsc = s->s_fs_info; struct ceph_client *cl = fsc->client; int ret; doutc(cl, "%p\n", s); s->s_maxbytes = MAX_LFS_FILESIZE; s->s_xattr = ceph_xattr_handlers; fsc->sb = s; fsc->max_file_size = 1ULL << 40; /* temp value until we get mdsmap */ s->s_op = &ceph_super_ops; set_default_d_op(s, &ceph_dentry_ops); s->s_export_op = &ceph_export_ops; s->s_time_gran = 1; s->s_time_min = 0; s->s_time_max = U32_MAX; s->s_flags |= SB_NODIRATIME | SB_NOATIME; s->s_magic = CEPH_SUPER_MAGIC; ceph_fscrypt_set_ops(s); ret = set_anon_super_fc(s, fc); if (ret != 0) fsc->sb = NULL; return ret; } /* * share superblock if same fs AND options */ static int ceph_compare_super(struct super_block *sb, struct fs_context *fc) { struct ceph_fs_client *new = fc->s_fs_info; struct ceph_mount_options *fsopt = new->mount_options; struct ceph_options *opt = new->client->options; struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); struct ceph_client *cl = fsc->client; doutc(cl, "%p\n", sb); if (compare_mount_options(fsopt, opt, fsc)) { doutc(cl, "monitor(s)/mount options don't match\n"); return 0; } if ((opt->flags & CEPH_OPT_FSID) && ceph_fsid_compare(&opt->fsid, &fsc->client->fsid)) { doutc(cl, "fsid doesn't match\n"); return 0; } if (fc->sb_flags != (sb->s_flags & ~SB_BORN)) { doutc(cl, "flags differ\n"); return 0; } if (fsc->blocklisted && !ceph_test_mount_opt(fsc, CLEANRECOVER)) { doutc(cl, "client is blocklisted (and CLEANRECOVER is not set)\n"); return 0; } if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { doutc(cl, "client has been forcibly unmounted\n"); return 0; } return 1; } /* * construct our own bdi so we can control readahead, etc. */ static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc) { int err; err = super_setup_bdi_name(sb, "ceph-%ld", atomic_long_inc_return(&bdi_seq)); if (err) return err; /* set ra_pages based on rasize mount option? */ sb->s_bdi->ra_pages = fsc->mount_options->rasize >> PAGE_SHIFT; /* set io_pages based on max osd read size */ sb->s_bdi->io_pages = fsc->mount_options->rsize >> PAGE_SHIFT; return 0; } static int ceph_get_tree(struct fs_context *fc) { struct ceph_parse_opts_ctx *pctx = fc->fs_private; struct ceph_mount_options *fsopt = pctx->opts; struct super_block *sb; struct ceph_fs_client *fsc; struct dentry *res; int (*compare_super)(struct super_block *, struct fs_context *) = ceph_compare_super; int err; dout("ceph_get_tree\n"); if (!fc->source) return invalfc(fc, "No source"); if (fsopt->new_dev_syntax && !fsopt->mon_addr) return invalfc(fc, "No monitor address"); /* create client (which we may/may not use) */ fsc = create_fs_client(pctx->opts, pctx->copts); pctx->opts = NULL; pctx->copts = NULL; if (IS_ERR(fsc)) { err = PTR_ERR(fsc); goto out_final; } err = ceph_mdsc_init(fsc); if (err < 0) goto out; if (ceph_test_opt(fsc->client, NOSHARE)) compare_super = NULL; fc->s_fs_info = fsc; sb = sget_fc(fc, compare_super, ceph_set_super); fc->s_fs_info = NULL; if (IS_ERR(sb)) { err = PTR_ERR(sb); goto out; } if (ceph_sb_to_fs_client(sb) != fsc) { destroy_fs_client(fsc); fsc = ceph_sb_to_fs_client(sb); dout("get_sb got existing client %p\n", fsc); } else { dout("get_sb using new client %p\n", fsc); err = ceph_setup_bdi(sb, fsc); if (err < 0) goto out_splat; } res = ceph_real_mount(fsc, fc); if (IS_ERR(res)) { err = PTR_ERR(res); goto out_splat; } doutc(fsc->client, "root %p inode %p ino %llx.%llx\n", res, d_inode(res), ceph_vinop(d_inode(res))); fc->root = fsc->sb->s_root; return 0; out_splat: if (!ceph_mdsmap_is_cluster_available(fsc->mdsc->mdsmap)) { pr_info("No mds server is up or the cluster is laggy\n"); err = -EHOSTUNREACH; } ceph_mdsc_close_sessions(fsc->mdsc); deactivate_locked_super(sb); goto out_final; out: destroy_fs_client(fsc); out_final: dout("ceph_get_tree fail %d\n", err); return err; } static void ceph_free_fc(struct fs_context *fc) { struct ceph_parse_opts_ctx *pctx = fc->fs_private; if (pctx) { destroy_mount_options(pctx->opts); ceph_destroy_options(pctx->copts); kfree(pctx); } } static int ceph_reconfigure_fc(struct fs_context *fc) { int err; struct ceph_parse_opts_ctx *pctx = fc->fs_private; struct ceph_mount_options *fsopt = pctx->opts; struct super_block *sb = fc->root->d_sb; struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); err = ceph_apply_test_dummy_encryption(sb, fc, fsopt); if (err) return err; if (fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS) ceph_set_mount_opt(fsc, ASYNC_DIROPS); else ceph_clear_mount_opt(fsc, ASYNC_DIROPS); if (fsopt->flags & CEPH_MOUNT_OPT_SPARSEREAD) ceph_set_mount_opt(fsc, SPARSEREAD); else ceph_clear_mount_opt(fsc, SPARSEREAD); if (strcmp_null(fsc->mount_options->mon_addr, fsopt->mon_addr)) { kfree(fsc->mount_options->mon_addr); fsc->mount_options->mon_addr = fsopt->mon_addr; fsopt->mon_addr = NULL; pr_notice_client(fsc->client, "monitor addresses recorded, but not used for reconnection"); } sync_filesystem(sb); return 0; } static const struct fs_context_operations ceph_context_ops = { .free = ceph_free_fc, .parse_param = ceph_parse_mount_param, .get_tree = ceph_get_tree, .reconfigure = ceph_reconfigure_fc, }; /* * Set up the filesystem mount context. */ static int ceph_init_fs_context(struct fs_context *fc) { struct ceph_parse_opts_ctx *pctx; struct ceph_mount_options *fsopt; pctx = kzalloc(sizeof(*pctx), GFP_KERNEL); if (!pctx) return -ENOMEM; pctx->copts = ceph_alloc_options(); if (!pctx->copts) goto nomem; pctx->opts = kzalloc(sizeof(*pctx->opts), GFP_KERNEL); if (!pctx->opts) goto nomem; fsopt = pctx->opts; fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; fsopt->wsize = CEPH_MAX_WRITE_SIZE; fsopt->rsize = CEPH_MAX_READ_SIZE; fsopt->rasize = CEPH_RASIZE_DEFAULT; fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); if (!fsopt->snapdir_name) goto nomem; fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; fsopt->congestion_kb = default_congestion_kb(); #ifdef CONFIG_CEPH_FS_POSIX_ACL fc->sb_flags |= SB_POSIXACL; #endif fc->fs_private = pctx; fc->ops = &ceph_context_ops; return 0; nomem: destroy_mount_options(pctx->opts); ceph_destroy_options(pctx->copts); kfree(pctx); return -ENOMEM; } /* * Return true if it successfully increases the blocker counter, * or false if the mdsc is in stopping and flushed state. */ static bool __inc_stopping_blocker(struct ceph_mds_client *mdsc) { spin_lock(&mdsc->stopping_lock); if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHING) { spin_unlock(&mdsc->stopping_lock); return false; } atomic_inc(&mdsc->stopping_blockers); spin_unlock(&mdsc->stopping_lock); return true; } static void __dec_stopping_blocker(struct ceph_mds_client *mdsc) { spin_lock(&mdsc->stopping_lock); if (!atomic_dec_return(&mdsc->stopping_blockers) && mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHING) complete_all(&mdsc->stopping_waiter); spin_unlock(&mdsc->stopping_lock); } /* For metadata IO requests */ bool ceph_inc_mds_stopping_blocker(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { mutex_lock(&session->s_mutex); inc_session_sequence(session); mutex_unlock(&session->s_mutex); return __inc_stopping_blocker(mdsc); } void ceph_dec_mds_stopping_blocker(struct ceph_mds_client *mdsc) { __dec_stopping_blocker(mdsc); } /* For data IO requests */ bool ceph_inc_osd_stopping_blocker(struct ceph_mds_client *mdsc) { return __inc_stopping_blocker(mdsc); } void ceph_dec_osd_stopping_blocker(struct ceph_mds_client *mdsc) { __dec_stopping_blocker(mdsc); } static void ceph_kill_sb(struct super_block *s) { struct ceph_fs_client *fsc = ceph_sb_to_fs_client(s); struct ceph_client *cl = fsc->client; struct ceph_mds_client *mdsc = fsc->mdsc; bool wait; doutc(cl, "%p\n", s); ceph_mdsc_pre_umount(mdsc); flush_fs_workqueues(fsc); /* * Though the kill_anon_super() will finally trigger the * sync_filesystem() anyway, we still need to do it here and * then bump the stage of shutdown. This will allow us to * drop any further message, which will increase the inodes' * i_count reference counters but makes no sense any more, * from MDSs. * * Without this when evicting the inodes it may fail in the * kill_anon_super(), which will trigger a warning when * destroying the fscrypt keyring and then possibly trigger * a further crash in ceph module when the iput() tries to * evict the inodes later. */ sync_filesystem(s); if (atomic64_read(&mdsc->dirty_folios) > 0) { wait_queue_head_t *wq = &mdsc->flush_end_wq; long timeleft = wait_event_killable_timeout(*wq, atomic64_read(&mdsc->dirty_folios) <= 0, fsc->client->options->mount_timeout); if (!timeleft) /* timed out */ pr_warn_client(cl, "umount timed out, %ld\n", timeleft); else if (timeleft < 0) /* killed */ pr_warn_client(cl, "umount was killed, %ld\n", timeleft); } spin_lock(&mdsc->stopping_lock); mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHING; wait = !!atomic_read(&mdsc->stopping_blockers); spin_unlock(&mdsc->stopping_lock); if (wait && atomic_read(&mdsc->stopping_blockers)) { long timeleft = wait_for_completion_killable_timeout( &mdsc->stopping_waiter, fsc->client->options->mount_timeout); if (!timeleft) /* timed out */ pr_warn_client(cl, "umount timed out, %ld\n", timeleft); else if (timeleft < 0) /* killed */ pr_warn_client(cl, "umount was killed, %ld\n", timeleft); } mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED; kill_anon_super(s); fsc->client->extra_mon_dispatch = NULL; ceph_fs_debugfs_cleanup(fsc); ceph_fscache_unregister_fs(fsc); destroy_fs_client(fsc); } static struct file_system_type ceph_fs_type = { .owner = THIS_MODULE, .name = "ceph", .init_fs_context = ceph_init_fs_context, .kill_sb = ceph_kill_sb, .fs_flags = FS_RENAME_DOES_D_MOVE | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("ceph"); int ceph_force_reconnect(struct super_block *sb) { struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); int err = 0; fsc->mount_state = CEPH_MOUNT_RECOVER; __ceph_umount_begin(fsc); /* Make sure all page caches get invalidated. * see remove_session_caps_cb() */ flush_workqueue(fsc->inode_wq); /* In case that we were blocklisted. This also reset * all mon/osd connections */ ceph_reset_client_addr(fsc->client); ceph_osdc_clear_abort_err(&fsc->client->osdc); fsc->blocklisted = false; fsc->mount_state = CEPH_MOUNT_MOUNTED; if (sb->s_root) { err = __ceph_do_getattr(d_inode(sb->s_root), NULL, CEPH_STAT_CAP_INODE, true); } return err; } static int __init init_ceph(void) { int ret = init_caches(); if (ret) goto out; ceph_flock_init(); ret = register_filesystem(&ceph_fs_type); if (ret) goto out_caches; pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL); return 0; out_caches: destroy_caches(); out: return ret; } static void __exit exit_ceph(void) { dout("exit_ceph\n"); unregister_filesystem(&ceph_fs_type); destroy_caches(); } static int param_set_metrics(const char *val, const struct kernel_param *kp) { struct ceph_fs_client *fsc; int ret; ret = param_set_bool(val, kp); if (ret) { pr_err("Failed to parse sending metrics switch value '%s'\n", val); return ret; } else if (!disable_send_metrics) { // wake up all the mds clients spin_lock(&ceph_fsc_lock); list_for_each_entry(fsc, &ceph_fsc_list, metric_wakeup) { metric_schedule_delayed(&fsc->mdsc->metric); } spin_unlock(&ceph_fsc_lock); } return 0; } static const struct kernel_param_ops param_ops_metrics = { .set = param_set_metrics, .get = param_get_bool, }; bool disable_send_metrics = false; module_param_cb(disable_send_metrics, ¶m_ops_metrics, &disable_send_metrics, 0644); MODULE_PARM_DESC(disable_send_metrics, "Enable sending perf metrics to ceph cluster (default: on)"); /* for both v1 and v2 syntax */ static bool mount_support = true; static const struct kernel_param_ops param_ops_mount_syntax = { .get = param_get_bool, }; module_param_cb(mount_syntax_v1, ¶m_ops_mount_syntax, &mount_support, 0444); module_param_cb(mount_syntax_v2, ¶m_ops_mount_syntax, &mount_support, 0444); bool enable_unsafe_idmap = false; module_param(enable_unsafe_idmap, bool, 0644); MODULE_PARM_DESC(enable_unsafe_idmap, "Allow to use idmapped mounts with MDS without CEPHFS_FEATURE_HAS_OWNER_UIDGID"); module_init(init_ceph); module_exit(exit_ceph); MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); MODULE_AUTHOR("Patience Warnick <patience@newdream.net>"); MODULE_DESCRIPTION("Ceph filesystem for Linux"); MODULE_LICENSE("GPL"); |
| 2 1 2 2 2 3 3 3 3 3 3 3 3 2 3 3 2 2 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 | // SPDX-License-Identifier: GPL-2.0-or-later /* * em_canid.c Ematch rule to match CAN frames according to their CAN IDs * * Idea: Oliver Hartkopp <oliver.hartkopp@volkswagen.de> * Copyright: (c) 2011 Czech Technical University in Prague * (c) 2011 Volkswagen Group Research * Authors: Michal Sojka <sojkam1@fel.cvut.cz> * Pavel Pisa <pisa@cmp.felk.cvut.cz> * Rostislav Lisovy <lisovy@gmail.cz> * Funded by: Volkswagen Group Research */ #include <linux/slab.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/skbuff.h> #include <net/pkt_cls.h> #include <linux/can.h> #define EM_CAN_RULES_MAX 500 struct canid_match { /* For each SFF CAN ID (11 bit) there is one record in this bitfield */ DECLARE_BITMAP(match_sff, (1 << CAN_SFF_ID_BITS)); int rules_count; int sff_rules_count; int eff_rules_count; /* * Raw rules copied from netlink message; Used for sending * information to userspace (when 'tc filter show' is invoked) * AND when matching EFF frames */ struct can_filter rules_raw[]; }; /** * em_canid_get_id() - Extracts Can ID out of the sk_buff structure. * @skb: buffer to extract Can ID from */ static canid_t em_canid_get_id(struct sk_buff *skb) { /* CAN ID is stored within the data field */ struct can_frame *cf = (struct can_frame *)skb->data; return cf->can_id; } static void em_canid_sff_match_add(struct canid_match *cm, u32 can_id, u32 can_mask) { int i; /* * Limit can_mask and can_id to SFF range to * protect against write after end of array */ can_mask &= CAN_SFF_MASK; can_id &= can_mask; /* Single frame */ if (can_mask == CAN_SFF_MASK) { set_bit(can_id, cm->match_sff); return; } /* All frames */ if (can_mask == 0) { bitmap_fill(cm->match_sff, (1 << CAN_SFF_ID_BITS)); return; } /* * Individual frame filter. * Add record (set bit to 1) for each ID that * conforms particular rule */ for (i = 0; i < (1 << CAN_SFF_ID_BITS); i++) { if ((i & can_mask) == can_id) set_bit(i, cm->match_sff); } } static inline struct canid_match *em_canid_priv(struct tcf_ematch *m) { return (struct canid_match *)m->data; } static int em_canid_match(struct sk_buff *skb, struct tcf_ematch *m, struct tcf_pkt_info *info) { struct canid_match *cm = em_canid_priv(m); canid_t can_id; int match = 0; int i; const struct can_filter *lp; can_id = em_canid_get_id(skb); if (can_id & CAN_EFF_FLAG) { for (i = 0, lp = cm->rules_raw; i < cm->eff_rules_count; i++, lp++) { if (!(((lp->can_id ^ can_id) & lp->can_mask))) { match = 1; break; } } } else { /* SFF */ can_id &= CAN_SFF_MASK; match = (test_bit(can_id, cm->match_sff) ? 1 : 0); } return match; } static int em_canid_change(struct net *net, void *data, int len, struct tcf_ematch *m) { struct can_filter *conf = data; /* Array with rules */ struct canid_match *cm; int i; if (!len) return -EINVAL; if (len % sizeof(struct can_filter)) return -EINVAL; if (len > sizeof(struct can_filter) * EM_CAN_RULES_MAX) return -EINVAL; cm = kzalloc(sizeof(struct canid_match) + len, GFP_KERNEL); if (!cm) return -ENOMEM; cm->rules_count = len / sizeof(struct can_filter); /* * We need two for() loops for copying rules into two contiguous * areas in rules_raw to process all eff rules with a simple loop. * NB: The configuration interface supports sff and eff rules. * We do not support filters here that match for the same can_id * provided in a SFF and EFF frame (e.g. 0x123 / 0x80000123). * For this (unusual case) two filters have to be specified. The * SFF/EFF separation is done with the CAN_EFF_FLAG in the can_id. */ /* Fill rules_raw with EFF rules first */ for (i = 0; i < cm->rules_count; i++) { if (conf[i].can_id & CAN_EFF_FLAG) { memcpy(cm->rules_raw + cm->eff_rules_count, &conf[i], sizeof(struct can_filter)); cm->eff_rules_count++; } } /* append SFF frame rules */ for (i = 0; i < cm->rules_count; i++) { if (!(conf[i].can_id & CAN_EFF_FLAG)) { memcpy(cm->rules_raw + cm->eff_rules_count + cm->sff_rules_count, &conf[i], sizeof(struct can_filter)); cm->sff_rules_count++; em_canid_sff_match_add(cm, conf[i].can_id, conf[i].can_mask); } } m->datalen = sizeof(struct canid_match) + len; m->data = (unsigned long)cm; return 0; } static void em_canid_destroy(struct tcf_ematch *m) { struct canid_match *cm = em_canid_priv(m); kfree(cm); } static int em_canid_dump(struct sk_buff *skb, struct tcf_ematch *m) { struct canid_match *cm = em_canid_priv(m); /* * When configuring this ematch 'rules_count' is set not to exceed * 'rules_raw' array size */ if (nla_put_nohdr(skb, sizeof(struct can_filter) * cm->rules_count, &cm->rules_raw) < 0) return -EMSGSIZE; return 0; } static struct tcf_ematch_ops em_canid_ops = { .kind = TCF_EM_CANID, .change = em_canid_change, .match = em_canid_match, .destroy = em_canid_destroy, .dump = em_canid_dump, .owner = THIS_MODULE, .link = LIST_HEAD_INIT(em_canid_ops.link) }; static int __init init_em_canid(void) { return tcf_em_register(&em_canid_ops); } static void __exit exit_em_canid(void) { tcf_em_unregister(&em_canid_ops); } MODULE_DESCRIPTION("ematch classifier to match CAN IDs embedded in skb CAN frames"); MODULE_LICENSE("GPL"); module_init(init_em_canid); module_exit(exit_em_canid); MODULE_ALIAS_TCF_EMATCH(TCF_EM_CANID); |
| 29 1 1 1 1 1 3 2 3 3 3 24 1 7 7 7 5 2 1 1 1 3 3 2 2 3 3 1 1 3 3 3 1 1 24 1 24 25 25 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 | // SPDX-License-Identifier: GPL-2.0 #include <linux/compat.h> #include <linux/console.h> #include <linux/fb.h> #include <linux/fbcon.h> #include <linux/major.h> #include "fb_internal.h" /* * We hold a reference to the fb_info in file->private_data, * but if the current registered fb has changed, we don't * actually want to use it. * * So look up the fb_info using the inode minor number, * and just verify it against the reference we have. */ static struct fb_info *file_fb_info(struct file *file) { struct inode *inode = file_inode(file); int fbidx = iminor(inode); struct fb_info *info = registered_fb[fbidx]; if (info != file->private_data) info = NULL; return info; } static ssize_t fb_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct fb_info *info = file_fb_info(file); if (!info) return -ENODEV; if (fb_WARN_ON_ONCE(info, !info->fbops->fb_read)) return -EINVAL; if (info->state != FBINFO_STATE_RUNNING) return -EPERM; return info->fbops->fb_read(info, buf, count, ppos); } static ssize_t fb_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct fb_info *info = file_fb_info(file); if (!info) return -ENODEV; if (fb_WARN_ON_ONCE(info, !info->fbops->fb_write)) return -EINVAL; if (info->state != FBINFO_STATE_RUNNING) return -EPERM; return info->fbops->fb_write(info, buf, count, ppos); } static long do_fb_ioctl(struct fb_info *info, unsigned int cmd, unsigned long arg) { const struct fb_ops *fb; struct fb_var_screeninfo var; struct fb_fix_screeninfo fix; struct fb_cmap cmap_from; struct fb_cmap_user cmap; void __user *argp = (void __user *)arg; long ret = 0; switch (cmd) { case FBIOGET_VSCREENINFO: lock_fb_info(info); var = info->var; unlock_fb_info(info); ret = copy_to_user(argp, &var, sizeof(var)) ? -EFAULT : 0; break; case FBIOPUT_VSCREENINFO: if (copy_from_user(&var, argp, sizeof(var))) return -EFAULT; /* only for kernel-internal use */ var.activate &= ~FB_ACTIVATE_KD_TEXT; console_lock(); lock_fb_info(info); ret = fbcon_modechange_possible(info, &var); if (!ret) ret = fb_set_var(info, &var); if (!ret) fbcon_update_vcs(info, var.activate & FB_ACTIVATE_ALL); unlock_fb_info(info); console_unlock(); if (!ret && copy_to_user(argp, &var, sizeof(var))) ret = -EFAULT; break; case FBIOGET_FSCREENINFO: lock_fb_info(info); memcpy(&fix, &info->fix, sizeof(fix)); if (info->flags & FBINFO_HIDE_SMEM_START) fix.smem_start = 0; unlock_fb_info(info); ret = copy_to_user(argp, &fix, sizeof(fix)) ? -EFAULT : 0; break; case FBIOPUTCMAP: if (copy_from_user(&cmap, argp, sizeof(cmap))) return -EFAULT; ret = fb_set_user_cmap(&cmap, info); break; case FBIOGETCMAP: if (copy_from_user(&cmap, argp, sizeof(cmap))) return -EFAULT; lock_fb_info(info); cmap_from = info->cmap; unlock_fb_info(info); ret = fb_cmap_to_user(&cmap_from, &cmap); break; case FBIOPAN_DISPLAY: if (copy_from_user(&var, argp, sizeof(var))) return -EFAULT; console_lock(); lock_fb_info(info); ret = fb_pan_display(info, &var); unlock_fb_info(info); console_unlock(); if (ret == 0 && copy_to_user(argp, &var, sizeof(var))) return -EFAULT; break; case FBIO_CURSOR: ret = -EINVAL; break; case FBIOGET_CON2FBMAP: ret = fbcon_get_con2fb_map_ioctl(argp); break; case FBIOPUT_CON2FBMAP: ret = fbcon_set_con2fb_map_ioctl(argp); break; case FBIOBLANK: if (arg > FB_BLANK_POWERDOWN) return -EINVAL; console_lock(); lock_fb_info(info); ret = fb_blank(info, arg); /* might again call into fb_blank */ fbcon_fb_blanked(info, arg); unlock_fb_info(info); console_unlock(); break; default: lock_fb_info(info); fb = info->fbops; if (fb->fb_ioctl) ret = fb->fb_ioctl(info, cmd, arg); else ret = -ENOTTY; unlock_fb_info(info); } return ret; } static long fb_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fb_info *info = file_fb_info(file); if (!info) return -ENODEV; return do_fb_ioctl(info, cmd, arg); } #ifdef CONFIG_COMPAT struct fb_fix_screeninfo32 { char id[16]; compat_caddr_t smem_start; u32 smem_len; u32 type; u32 type_aux; u32 visual; u16 xpanstep; u16 ypanstep; u16 ywrapstep; u32 line_length; compat_caddr_t mmio_start; u32 mmio_len; u32 accel; u16 reserved[3]; }; struct fb_cmap32 { u32 start; u32 len; compat_caddr_t red; compat_caddr_t green; compat_caddr_t blue; compat_caddr_t transp; }; static int fb_getput_cmap(struct fb_info *info, unsigned int cmd, unsigned long arg) { struct fb_cmap32 cmap32; struct fb_cmap cmap_from; struct fb_cmap_user cmap; if (copy_from_user(&cmap32, compat_ptr(arg), sizeof(cmap32))) return -EFAULT; cmap = (struct fb_cmap_user) { .start = cmap32.start, .len = cmap32.len, .red = compat_ptr(cmap32.red), .green = compat_ptr(cmap32.green), .blue = compat_ptr(cmap32.blue), .transp = compat_ptr(cmap32.transp), }; if (cmd == FBIOPUTCMAP) return fb_set_user_cmap(&cmap, info); lock_fb_info(info); cmap_from = info->cmap; unlock_fb_info(info); return fb_cmap_to_user(&cmap_from, &cmap); } static int do_fscreeninfo_to_user(struct fb_fix_screeninfo *fix, struct fb_fix_screeninfo32 __user *fix32) { __u32 data; int err; err = copy_to_user(&fix32->id, &fix->id, sizeof(fix32->id)); data = (__u32) (unsigned long) fix->smem_start; err |= put_user(data, &fix32->smem_start); err |= put_user(fix->smem_len, &fix32->smem_len); err |= put_user(fix->type, &fix32->type); err |= put_user(fix->type_aux, &fix32->type_aux); err |= put_user(fix->visual, &fix32->visual); err |= put_user(fix->xpanstep, &fix32->xpanstep); err |= put_user(fix->ypanstep, &fix32->ypanstep); err |= put_user(fix->ywrapstep, &fix32->ywrapstep); err |= put_user(fix->line_length, &fix32->line_length); data = (__u32) (unsigned long) fix->mmio_start; err |= put_user(data, &fix32->mmio_start); err |= put_user(fix->mmio_len, &fix32->mmio_len); err |= put_user(fix->accel, &fix32->accel); err |= copy_to_user(fix32->reserved, fix->reserved, sizeof(fix->reserved)); if (err) return -EFAULT; return 0; } static int fb_get_fscreeninfo(struct fb_info *info, unsigned int cmd, unsigned long arg) { struct fb_fix_screeninfo fix; lock_fb_info(info); fix = info->fix; if (info->flags & FBINFO_HIDE_SMEM_START) fix.smem_start = 0; unlock_fb_info(info); return do_fscreeninfo_to_user(&fix, compat_ptr(arg)); } static long fb_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fb_info *info = file_fb_info(file); const struct fb_ops *fb; long ret = -ENOIOCTLCMD; if (!info) return -ENODEV; fb = info->fbops; switch (cmd) { case FBIOGET_VSCREENINFO: case FBIOPUT_VSCREENINFO: case FBIOPAN_DISPLAY: case FBIOGET_CON2FBMAP: case FBIOPUT_CON2FBMAP: arg = (unsigned long) compat_ptr(arg); fallthrough; case FBIOBLANK: ret = do_fb_ioctl(info, cmd, arg); break; case FBIOGET_FSCREENINFO: ret = fb_get_fscreeninfo(info, cmd, arg); break; case FBIOGETCMAP: case FBIOPUTCMAP: ret = fb_getput_cmap(info, cmd, arg); break; default: if (fb->fb_compat_ioctl) ret = fb->fb_compat_ioctl(info, cmd, arg); break; } return ret; } #endif static int fb_mmap(struct file *file, struct vm_area_struct *vma) { struct fb_info *info = file_fb_info(file); int res; if (!info) return -ENODEV; if (fb_WARN_ON_ONCE(info, !info->fbops->fb_mmap)) return -ENODEV; mutex_lock(&info->mm_lock); res = info->fbops->fb_mmap(info, vma); mutex_unlock(&info->mm_lock); return res; } static int fb_open(struct inode *inode, struct file *file) __acquires(&info->lock) __releases(&info->lock) { int fbidx = iminor(inode); struct fb_info *info; int res = 0; info = get_fb_info(fbidx); if (!info) { request_module("fb%d", fbidx); info = get_fb_info(fbidx); if (!info) return -ENODEV; } if (IS_ERR(info)) return PTR_ERR(info); lock_fb_info(info); if (!try_module_get(info->fbops->owner)) { res = -ENODEV; goto out; } file->private_data = info; if (info->fbops->fb_open) { res = info->fbops->fb_open(info, 1); if (res) module_put(info->fbops->owner); } #ifdef CONFIG_FB_DEFERRED_IO if (info->fbdefio) fb_deferred_io_open(info, inode, file); #endif out: unlock_fb_info(info); if (res) put_fb_info(info); return res; } static int fb_release(struct inode *inode, struct file *file) __acquires(&info->lock) __releases(&info->lock) { struct fb_info * const info = file->private_data; lock_fb_info(info); #if IS_ENABLED(CONFIG_FB_DEFERRED_IO) if (info->fbdefio) fb_deferred_io_release(info); #endif if (info->fbops->fb_release) info->fbops->fb_release(info, 1); module_put(info->fbops->owner); unlock_fb_info(info); put_fb_info(info); return 0; } #if defined(CONFIG_FB_PROVIDE_GET_FB_UNMAPPED_AREA) && !defined(CONFIG_MMU) static unsigned long get_fb_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct fb_info * const info = filp->private_data; unsigned long fb_size = PAGE_ALIGN(info->fix.smem_len); if (pgoff > fb_size || len > fb_size - pgoff) return -EINVAL; return (unsigned long)info->screen_base + pgoff; } #endif static const struct file_operations fb_fops = { .owner = THIS_MODULE, .read = fb_read, .write = fb_write, .unlocked_ioctl = fb_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = fb_compat_ioctl, #endif .mmap = fb_mmap, .open = fb_open, .release = fb_release, #if defined(HAVE_ARCH_FB_UNMAPPED_AREA) || \ (defined(CONFIG_FB_PROVIDE_GET_FB_UNMAPPED_AREA) && \ !defined(CONFIG_MMU)) .get_unmapped_area = get_fb_unmapped_area, #endif #ifdef CONFIG_FB_DEFERRED_IO .fsync = fb_deferred_io_fsync, #endif .llseek = default_llseek, }; int fb_register_chrdev(void) { int ret; ret = register_chrdev(FB_MAJOR, "fb", &fb_fops); if (ret) { pr_err("Unable to get major %d for fb devs\n", FB_MAJOR); return ret; } return ret; } void fb_unregister_chrdev(void) { unregister_chrdev(FB_MAJOR, "fb"); } |
| 7 6 2 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 | // SPDX-License-Identifier: GPL-2.0-only /* * Detect Hung Task * * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state * */ #include <linux/mm.h> #include <linux/cpu.h> #include <linux/nmi.h> #include <linux/init.h> #include <linux/delay.h> #include <linux/freezer.h> #include <linux/kthread.h> #include <linux/lockdep.h> #include <linux/export.h> #include <linux/panic_notifier.h> #include <linux/sysctl.h> #include <linux/suspend.h> #include <linux/utsname.h> #include <linux/sched/signal.h> #include <linux/sched/debug.h> #include <linux/sched/sysctl.h> #include <linux/hung_task.h> #include <linux/rwsem.h> #include <trace/events/sched.h> /* * The number of tasks checked: */ static int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; /* * Total number of tasks detected as hung since boot: */ static unsigned long __read_mostly sysctl_hung_task_detect_count; /* * Limit number of tasks checked in a batch. * * This value controls the preemptibility of khungtaskd since preemption * is disabled during the critical section. It also controls the size of * the RCU grace period. So it needs to be upper-bound. */ #define HUNG_TASK_LOCK_BREAK (HZ / 10) /* * Zero means infinite timeout - no checking done: */ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; EXPORT_SYMBOL_GPL(sysctl_hung_task_timeout_secs); /* * Zero (default value) means use sysctl_hung_task_timeout_secs: */ static unsigned long __read_mostly sysctl_hung_task_check_interval_secs; static int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; static bool hung_task_show_lock; static bool hung_task_call_panic; static bool hung_task_show_all_bt; static struct task_struct *watchdog_task; #ifdef CONFIG_SMP /* * Should we dump all CPUs backtraces in a hung task event? * Defaults to 0, can be changed via sysctl. */ static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; #else #define sysctl_hung_task_all_cpu_backtrace 0 #endif /* CONFIG_SMP */ /* * Should we panic (and reboot, if panic_timeout= is set) when a * hung task is detected: */ static unsigned int __read_mostly sysctl_hung_task_panic = IS_ENABLED(CONFIG_BOOTPARAM_HUNG_TASK_PANIC); static int hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) { did_panic = 1; return NOTIFY_DONE; } static struct notifier_block panic_block = { .notifier_call = hung_task_panic, }; #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER static void debug_show_blocker(struct task_struct *task) { struct task_struct *g, *t; unsigned long owner, blocker, blocker_type; const char *rwsem_blocked_by, *rwsem_blocked_as; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held"); blocker = READ_ONCE(task->blocker); if (!blocker) return; blocker_type = hung_task_get_blocker_type(blocker); switch (blocker_type) { case BLOCKER_TYPE_MUTEX: owner = mutex_get_owner(hung_task_blocker_to_lock(blocker)); break; case BLOCKER_TYPE_SEM: owner = sem_last_holder(hung_task_blocker_to_lock(blocker)); break; case BLOCKER_TYPE_RWSEM_READER: case BLOCKER_TYPE_RWSEM_WRITER: owner = (unsigned long)rwsem_owner( hung_task_blocker_to_lock(blocker)); rwsem_blocked_as = (blocker_type == BLOCKER_TYPE_RWSEM_READER) ? "reader" : "writer"; rwsem_blocked_by = is_rwsem_reader_owned( hung_task_blocker_to_lock(blocker)) ? "reader" : "writer"; break; default: WARN_ON_ONCE(1); return; } if (unlikely(!owner)) { switch (blocker_type) { case BLOCKER_TYPE_MUTEX: pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n", task->comm, task->pid); break; case BLOCKER_TYPE_SEM: pr_err("INFO: task %s:%d is blocked on a semaphore, but the last holder is not found.\n", task->comm, task->pid); break; case BLOCKER_TYPE_RWSEM_READER: case BLOCKER_TYPE_RWSEM_WRITER: pr_err("INFO: task %s:%d is blocked on an rw-semaphore, but the owner is not found.\n", task->comm, task->pid); break; } return; } /* Ensure the owner information is correct. */ for_each_process_thread(g, t) { if ((unsigned long)t != owner) continue; switch (blocker_type) { case BLOCKER_TYPE_MUTEX: pr_err("INFO: task %s:%d is blocked on a mutex likely owned by task %s:%d.\n", task->comm, task->pid, t->comm, t->pid); break; case BLOCKER_TYPE_SEM: pr_err("INFO: task %s:%d blocked on a semaphore likely last held by task %s:%d\n", task->comm, task->pid, t->comm, t->pid); break; case BLOCKER_TYPE_RWSEM_READER: case BLOCKER_TYPE_RWSEM_WRITER: pr_err("INFO: task %s:%d <%s> blocked on an rw-semaphore likely owned by task %s:%d <%s>\n", task->comm, task->pid, rwsem_blocked_as, t->comm, t->pid, rwsem_blocked_by); break; } sched_show_task(t); return; } } #else static inline void debug_show_blocker(struct task_struct *task) { } #endif static void check_hung_task(struct task_struct *t, unsigned long timeout) { unsigned long switch_count = t->nvcsw + t->nivcsw; /* * Ensure the task is not frozen. * Also, skip vfork and any other user process that freezer should skip. */ if (unlikely(READ_ONCE(t->__state) & TASK_FROZEN)) return; /* * When a freshly created task is scheduled once, changes its state to * TASK_UNINTERRUPTIBLE without having ever been switched out once, it * musn't be checked. */ if (unlikely(!switch_count)) return; if (switch_count != t->last_switch_count) { t->last_switch_count = switch_count; t->last_switch_time = jiffies; return; } if (time_is_after_jiffies(t->last_switch_time + timeout * HZ)) return; /* * This counter tracks the total number of tasks detected as hung * since boot. */ sysctl_hung_task_detect_count++; trace_sched_process_hang(t); if (sysctl_hung_task_panic) { console_verbose(); hung_task_show_lock = true; hung_task_call_panic = true; } /* * Ok, the task did not get scheduled for more than 2 minutes, * complain: */ if (sysctl_hung_task_warnings || hung_task_call_panic) { if (sysctl_hung_task_warnings > 0) sysctl_hung_task_warnings--; pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", t->comm, t->pid, (jiffies - t->last_switch_time) / HZ); pr_err(" %s %s %.*s\n", print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); if (t->flags & PF_POSTCOREDUMP) pr_err(" Blocked by coredump.\n"); pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); debug_show_blocker(t); hung_task_show_lock = true; if (sysctl_hung_task_all_cpu_backtrace) hung_task_show_all_bt = true; if (!sysctl_hung_task_warnings) pr_info("Future hung task reports are suppressed, see sysctl kernel.hung_task_warnings\n"); } touch_nmi_watchdog(); } /* * To avoid extending the RCU grace period for an unbounded amount of time, * periodically exit the critical section and enter a new one. * * For preemptible RCU it is sufficient to call rcu_read_unlock in order * to exit the grace period. For classic RCU, a reschedule is required. */ static bool rcu_lock_break(struct task_struct *g, struct task_struct *t) { bool can_cont; get_task_struct(g); get_task_struct(t); rcu_read_unlock(); cond_resched(); rcu_read_lock(); can_cont = pid_alive(g) && pid_alive(t); put_task_struct(t); put_task_struct(g); return can_cont; } /* * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for * a really long time (120 seconds). If that happens, print out * a warning. */ static void check_hung_uninterruptible_tasks(unsigned long timeout) { int max_count = sysctl_hung_task_check_count; unsigned long last_break = jiffies; struct task_struct *g, *t; /* * If the system crashed already then all bets are off, * do not report extra hung tasks: */ if (test_taint(TAINT_DIE) || did_panic) return; hung_task_show_lock = false; rcu_read_lock(); for_each_process_thread(g, t) { unsigned int state; if (!max_count--) goto unlock; if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) { if (!rcu_lock_break(g, t)) goto unlock; last_break = jiffies; } /* * skip the TASK_KILLABLE tasks -- these can be killed * skip the TASK_IDLE tasks -- those are genuinely idle */ state = READ_ONCE(t->__state); if ((state & TASK_UNINTERRUPTIBLE) && !(state & TASK_WAKEKILL) && !(state & TASK_NOLOAD)) check_hung_task(t, timeout); } unlock: rcu_read_unlock(); if (hung_task_show_lock) debug_show_all_locks(); if (hung_task_show_all_bt) { hung_task_show_all_bt = false; trigger_all_cpu_backtrace(); } if (hung_task_call_panic) panic("hung_task: blocked tasks"); } static long hung_timeout_jiffies(unsigned long last_checked, unsigned long timeout) { /* timeout of 0 will disable the watchdog */ return timeout ? last_checked - jiffies + timeout * HZ : MAX_SCHEDULE_TIMEOUT; } #ifdef CONFIG_SYSCTL /* * Process updating of timeout sysctl */ static int proc_dohung_task_timeout_secs(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) goto out; wake_up_process(watchdog_task); out: return ret; } /* * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs * and hung_task_check_interval_secs */ static const unsigned long hung_task_timeout_max = (LONG_MAX / HZ); static const struct ctl_table hung_task_sysctls[] = { #ifdef CONFIG_SMP { .procname = "hung_task_all_cpu_backtrace", .data = &sysctl_hung_task_all_cpu_backtrace, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif /* CONFIG_SMP */ { .procname = "hung_task_panic", .data = &sysctl_hung_task_panic, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "hung_task_check_count", .data = &sysctl_hung_task_check_count, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { .procname = "hung_task_timeout_secs", .data = &sysctl_hung_task_timeout_secs, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_dohung_task_timeout_secs, .extra2 = (void *)&hung_task_timeout_max, }, { .procname = "hung_task_check_interval_secs", .data = &sysctl_hung_task_check_interval_secs, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_dohung_task_timeout_secs, .extra2 = (void *)&hung_task_timeout_max, }, { .procname = "hung_task_warnings", .data = &sysctl_hung_task_warnings, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_NEG_ONE, }, { .procname = "hung_task_detect_count", .data = &sysctl_hung_task_detect_count, .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = proc_doulongvec_minmax, }, }; static void __init hung_task_sysctl_init(void) { register_sysctl_init("kernel", hung_task_sysctls); } #else #define hung_task_sysctl_init() do { } while (0) #endif /* CONFIG_SYSCTL */ static atomic_t reset_hung_task = ATOMIC_INIT(0); void reset_hung_task_detector(void) { atomic_set(&reset_hung_task, 1); } EXPORT_SYMBOL_GPL(reset_hung_task_detector); static bool hung_detector_suspended; static int hungtask_pm_notify(struct notifier_block *self, unsigned long action, void *hcpu) { switch (action) { case PM_SUSPEND_PREPARE: case PM_HIBERNATION_PREPARE: case PM_RESTORE_PREPARE: hung_detector_suspended = true; break; case PM_POST_SUSPEND: case PM_POST_HIBERNATION: case PM_POST_RESTORE: hung_detector_suspended = false; break; default: break; } return NOTIFY_OK; } /* * kthread which checks for tasks stuck in D state */ static int watchdog(void *dummy) { unsigned long hung_last_checked = jiffies; set_user_nice(current, 0); for ( ; ; ) { unsigned long timeout = sysctl_hung_task_timeout_secs; unsigned long interval = sysctl_hung_task_check_interval_secs; long t; if (interval == 0) interval = timeout; interval = min_t(unsigned long, interval, timeout); t = hung_timeout_jiffies(hung_last_checked, interval); if (t <= 0) { if (!atomic_xchg(&reset_hung_task, 0) && !hung_detector_suspended) check_hung_uninterruptible_tasks(timeout); hung_last_checked = jiffies; continue; } schedule_timeout_interruptible(t); } return 0; } static int __init hung_task_init(void) { atomic_notifier_chain_register(&panic_notifier_list, &panic_block); /* Disable hung task detector on suspend */ pm_notifier(hungtask_pm_notify, 0); watchdog_task = kthread_run(watchdog, NULL, "khungtaskd"); hung_task_sysctl_init(); return 0; } subsys_initcall(hung_task_init); |
| 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 | // SPDX-License-Identifier: GPL-2.0-or-later /* * * Bluetooth HCI UART driver for marvell devices * * Copyright (C) 2016 Marvell International Ltd. * Copyright (C) 2016 Intel Corporation */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/firmware.h> #include <linux/module.h> #include <linux/tty.h> #include <linux/of.h> #include <linux/serdev.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include "hci_uart.h" #define HCI_FW_REQ_PKT 0xA5 #define HCI_CHIP_VER_PKT 0xAA #define MRVL_ACK 0x5A #define MRVL_NAK 0xBF #define MRVL_RAW_DATA 0x1F #define MRVL_SET_BAUDRATE 0xFC09 enum { STATE_CHIP_VER_PENDING, STATE_FW_REQ_PENDING, STATE_FW_LOADED, }; struct mrvl_data { struct sk_buff *rx_skb; struct sk_buff_head txq; struct sk_buff_head rawq; unsigned long flags; unsigned int tx_len; u8 id, rev; }; struct mrvl_serdev { struct hci_uart hu; }; struct hci_mrvl_pkt { __le16 lhs; __le16 rhs; } __packed; #define HCI_MRVL_PKT_SIZE 4 static int mrvl_open(struct hci_uart *hu) { struct mrvl_data *mrvl; int ret; BT_DBG("hu %p", hu); if (!hci_uart_has_flow_control(hu)) return -EOPNOTSUPP; mrvl = kzalloc(sizeof(*mrvl), GFP_KERNEL); if (!mrvl) return -ENOMEM; skb_queue_head_init(&mrvl->txq); skb_queue_head_init(&mrvl->rawq); set_bit(STATE_CHIP_VER_PENDING, &mrvl->flags); hu->priv = mrvl; if (hu->serdev) { ret = serdev_device_open(hu->serdev); if (ret) goto err; } return 0; err: kfree(mrvl); return ret; } static int mrvl_close(struct hci_uart *hu) { struct mrvl_data *mrvl = hu->priv; BT_DBG("hu %p", hu); if (hu->serdev) serdev_device_close(hu->serdev); skb_queue_purge(&mrvl->txq); skb_queue_purge(&mrvl->rawq); kfree_skb(mrvl->rx_skb); kfree(mrvl); hu->priv = NULL; return 0; } static int mrvl_flush(struct hci_uart *hu) { struct mrvl_data *mrvl = hu->priv; BT_DBG("hu %p", hu); skb_queue_purge(&mrvl->txq); skb_queue_purge(&mrvl->rawq); return 0; } static struct sk_buff *mrvl_dequeue(struct hci_uart *hu) { struct mrvl_data *mrvl = hu->priv; struct sk_buff *skb; skb = skb_dequeue(&mrvl->txq); if (!skb) { /* Any raw data ? */ skb = skb_dequeue(&mrvl->rawq); } else { /* Prepend skb with frame type */ memcpy(skb_push(skb, 1), &bt_cb(skb)->pkt_type, 1); } return skb; } static int mrvl_enqueue(struct hci_uart *hu, struct sk_buff *skb) { struct mrvl_data *mrvl = hu->priv; skb_queue_tail(&mrvl->txq, skb); return 0; } static void mrvl_send_ack(struct hci_uart *hu, unsigned char type) { struct mrvl_data *mrvl = hu->priv; struct sk_buff *skb; /* No H4 payload, only 1 byte header */ skb = bt_skb_alloc(0, GFP_ATOMIC); if (!skb) { bt_dev_err(hu->hdev, "Unable to alloc ack/nak packet"); return; } hci_skb_pkt_type(skb) = type; skb_queue_tail(&mrvl->txq, skb); hci_uart_tx_wakeup(hu); } static int mrvl_recv_fw_req(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_mrvl_pkt *pkt = (void *)skb->data; struct hci_uart *hu = hci_get_drvdata(hdev); struct mrvl_data *mrvl = hu->priv; int ret = 0; if ((pkt->lhs ^ pkt->rhs) != 0xffff) { bt_dev_err(hdev, "Corrupted mrvl header"); mrvl_send_ack(hu, MRVL_NAK); ret = -EINVAL; goto done; } mrvl_send_ack(hu, MRVL_ACK); if (!test_bit(STATE_FW_REQ_PENDING, &mrvl->flags)) { bt_dev_err(hdev, "Received unexpected firmware request"); ret = -EINVAL; goto done; } mrvl->tx_len = le16_to_cpu(pkt->lhs); clear_bit(STATE_FW_REQ_PENDING, &mrvl->flags); smp_mb__after_atomic(); wake_up_bit(&mrvl->flags, STATE_FW_REQ_PENDING); done: kfree_skb(skb); return ret; } static int mrvl_recv_chip_ver(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_mrvl_pkt *pkt = (void *)skb->data; struct hci_uart *hu = hci_get_drvdata(hdev); struct mrvl_data *mrvl = hu->priv; u16 version = le16_to_cpu(pkt->lhs); int ret = 0; if ((pkt->lhs ^ pkt->rhs) != 0xffff) { bt_dev_err(hdev, "Corrupted mrvl header"); mrvl_send_ack(hu, MRVL_NAK); ret = -EINVAL; goto done; } mrvl_send_ack(hu, MRVL_ACK); if (!test_bit(STATE_CHIP_VER_PENDING, &mrvl->flags)) { bt_dev_err(hdev, "Received unexpected chip version"); goto done; } mrvl->id = version; mrvl->rev = version >> 8; bt_dev_info(hdev, "Controller id = %x, rev = %x", mrvl->id, mrvl->rev); clear_bit(STATE_CHIP_VER_PENDING, &mrvl->flags); smp_mb__after_atomic(); wake_up_bit(&mrvl->flags, STATE_CHIP_VER_PENDING); done: kfree_skb(skb); return ret; } #define HCI_RECV_CHIP_VER \ .type = HCI_CHIP_VER_PKT, \ .hlen = HCI_MRVL_PKT_SIZE, \ .loff = 0, \ .lsize = 0, \ .maxlen = HCI_MRVL_PKT_SIZE #define HCI_RECV_FW_REQ \ .type = HCI_FW_REQ_PKT, \ .hlen = HCI_MRVL_PKT_SIZE, \ .loff = 0, \ .lsize = 0, \ .maxlen = HCI_MRVL_PKT_SIZE static const struct h4_recv_pkt mrvl_recv_pkts[] = { { H4_RECV_ACL, .recv = hci_recv_frame }, { H4_RECV_SCO, .recv = hci_recv_frame }, { H4_RECV_EVENT, .recv = hci_recv_frame }, { HCI_RECV_FW_REQ, .recv = mrvl_recv_fw_req }, { HCI_RECV_CHIP_VER, .recv = mrvl_recv_chip_ver }, }; static int mrvl_recv(struct hci_uart *hu, const void *data, int count) { struct mrvl_data *mrvl = hu->priv; if (!test_bit(HCI_UART_REGISTERED, &hu->flags)) return -EUNATCH; /* We might receive some noise when there is no firmware loaded. Therefore, * we drop data if the firmware is not loaded yet and if there is no fw load * request pending. */ if (!test_bit(STATE_FW_REQ_PENDING, &mrvl->flags) && !test_bit(STATE_FW_LOADED, &mrvl->flags)) return count; mrvl->rx_skb = h4_recv_buf(hu->hdev, mrvl->rx_skb, data, count, mrvl_recv_pkts, ARRAY_SIZE(mrvl_recv_pkts)); if (IS_ERR(mrvl->rx_skb)) { int err = PTR_ERR(mrvl->rx_skb); bt_dev_err(hu->hdev, "Frame reassembly failed (%d)", err); mrvl->rx_skb = NULL; return err; } return count; } static int mrvl_load_firmware(struct hci_dev *hdev, const char *name) { struct hci_uart *hu = hci_get_drvdata(hdev); struct mrvl_data *mrvl = hu->priv; const struct firmware *fw = NULL; const u8 *fw_ptr, *fw_max; int err; err = request_firmware(&fw, name, &hdev->dev); if (err < 0) { bt_dev_err(hdev, "Failed to load firmware file %s", name); return err; } fw_ptr = fw->data; fw_max = fw->data + fw->size; bt_dev_info(hdev, "Loading %s", name); set_bit(STATE_FW_REQ_PENDING, &mrvl->flags); while (fw_ptr <= fw_max) { struct sk_buff *skb; /* Controller drives the firmware load by sending firmware * request packets containing the expected fragment size. */ err = wait_on_bit_timeout(&mrvl->flags, STATE_FW_REQ_PENDING, TASK_INTERRUPTIBLE, msecs_to_jiffies(2000)); if (err == 1) { bt_dev_err(hdev, "Firmware load interrupted"); err = -EINTR; break; } else if (err) { bt_dev_err(hdev, "Firmware request timeout"); err = -ETIMEDOUT; break; } bt_dev_dbg(hdev, "Firmware request, expecting %d bytes", mrvl->tx_len); if (fw_ptr == fw_max) { /* Controller requests a null size once firmware is * fully loaded. If controller expects more data, there * is an issue. */ if (!mrvl->tx_len) { bt_dev_info(hdev, "Firmware loading complete"); } else { bt_dev_err(hdev, "Firmware loading failure"); err = -EINVAL; } break; } if (fw_ptr + mrvl->tx_len > fw_max) { mrvl->tx_len = fw_max - fw_ptr; bt_dev_dbg(hdev, "Adjusting tx_len to %d", mrvl->tx_len); } skb = bt_skb_alloc(mrvl->tx_len, GFP_KERNEL); if (!skb) { bt_dev_err(hdev, "Failed to alloc mem for FW packet"); err = -ENOMEM; break; } bt_cb(skb)->pkt_type = MRVL_RAW_DATA; skb_put_data(skb, fw_ptr, mrvl->tx_len); fw_ptr += mrvl->tx_len; set_bit(STATE_FW_REQ_PENDING, &mrvl->flags); skb_queue_tail(&mrvl->rawq, skb); hci_uart_tx_wakeup(hu); } release_firmware(fw); return err; } static int mrvl_setup(struct hci_uart *hu) { int err; struct mrvl_data *mrvl = hu->priv; hci_uart_set_flow_control(hu, true); err = mrvl_load_firmware(hu->hdev, "mrvl/helper_uart_3000000.bin"); if (err) { bt_dev_err(hu->hdev, "Unable to download firmware helper"); return -EINVAL; } /* Let the final ack go out before switching the baudrate */ hci_uart_wait_until_sent(hu); if (hu->serdev) serdev_device_set_baudrate(hu->serdev, hu->oper_speed); else hci_uart_set_baudrate(hu, hu->oper_speed); hci_uart_set_flow_control(hu, false); err = mrvl_load_firmware(hu->hdev, "mrvl/uart8897_bt.bin"); if (err) return err; set_bit(STATE_FW_LOADED, &mrvl->flags); return 0; } static int mrvl_set_baudrate(struct hci_uart *hu, unsigned int speed) { int err; struct mrvl_data *mrvl = hu->priv; __le32 speed_le = cpu_to_le32(speed); /* The firmware might be loaded by the Wifi driver over SDIO. We wait * up to 10s for the CTS to go up. Afterward, we know that the firmware * is ready. */ err = serdev_device_wait_for_cts(hu->serdev, true, 10000); if (err) { bt_dev_err(hu->hdev, "Wait for CTS failed with %d\n", err); return err; } set_bit(STATE_FW_LOADED, &mrvl->flags); err = __hci_cmd_sync_status(hu->hdev, MRVL_SET_BAUDRATE, sizeof(speed_le), &speed_le, HCI_INIT_TIMEOUT); if (err) { bt_dev_err(hu->hdev, "send command failed: %d", err); return err; } serdev_device_set_baudrate(hu->serdev, speed); /* We forcefully have to send a command to the bluetooth module so that * the driver detects it after a baudrate change. This is foreseen by * hci_serdev by setting HCI_UART_VND_DETECT which then causes a dummy * local version read. */ set_bit(HCI_UART_VND_DETECT, &hu->hdev_flags); return 0; } static const struct hci_uart_proto mrvl_proto_8897 = { .id = HCI_UART_MRVL, .name = "Marvell", .init_speed = 115200, .oper_speed = 3000000, .open = mrvl_open, .close = mrvl_close, .flush = mrvl_flush, .setup = mrvl_setup, .recv = mrvl_recv, .enqueue = mrvl_enqueue, .dequeue = mrvl_dequeue, }; static const struct hci_uart_proto mrvl_proto_8997 = { .id = HCI_UART_MRVL, .name = "Marvell 8997", .init_speed = 115200, .oper_speed = 3000000, .open = mrvl_open, .close = mrvl_close, .flush = mrvl_flush, .set_baudrate = mrvl_set_baudrate, .recv = mrvl_recv, .enqueue = mrvl_enqueue, .dequeue = mrvl_dequeue, }; static int mrvl_serdev_probe(struct serdev_device *serdev) { struct mrvl_serdev *mrvldev; const struct hci_uart_proto *mrvl_proto = device_get_match_data(&serdev->dev); mrvldev = devm_kzalloc(&serdev->dev, sizeof(*mrvldev), GFP_KERNEL); if (!mrvldev) return -ENOMEM; mrvldev->hu.oper_speed = mrvl_proto->oper_speed; if (mrvl_proto->set_baudrate) of_property_read_u32(serdev->dev.of_node, "max-speed", &mrvldev->hu.oper_speed); mrvldev->hu.serdev = serdev; serdev_device_set_drvdata(serdev, mrvldev); return hci_uart_register_device(&mrvldev->hu, mrvl_proto); } static void mrvl_serdev_remove(struct serdev_device *serdev) { struct mrvl_serdev *mrvldev = serdev_device_get_drvdata(serdev); hci_uart_unregister_device(&mrvldev->hu); } static const struct of_device_id __maybe_unused mrvl_bluetooth_of_match[] = { { .compatible = "mrvl,88w8897", .data = &mrvl_proto_8897}, { .compatible = "mrvl,88w8997", .data = &mrvl_proto_8997}, { }, }; MODULE_DEVICE_TABLE(of, mrvl_bluetooth_of_match); static struct serdev_device_driver mrvl_serdev_driver = { .probe = mrvl_serdev_probe, .remove = mrvl_serdev_remove, .driver = { .name = "hci_uart_mrvl", .of_match_table = of_match_ptr(mrvl_bluetooth_of_match), }, }; int __init mrvl_init(void) { serdev_device_driver_register(&mrvl_serdev_driver); return hci_uart_register_proto(&mrvl_proto_8897); } int __exit mrvl_deinit(void) { serdev_device_driver_unregister(&mrvl_serdev_driver); return hci_uart_unregister_proto(&mrvl_proto_8897); } |
| 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PART_STAT_H #define _LINUX_PART_STAT_H #include <linux/blkdev.h> #include <asm/local.h> struct disk_stats { u64 nsecs[NR_STAT_GROUPS]; unsigned long sectors[NR_STAT_GROUPS]; unsigned long ios[NR_STAT_GROUPS]; unsigned long merges[NR_STAT_GROUPS]; unsigned long io_ticks; local_t in_flight[2]; }; /* * Macros to operate on percpu disk statistics: * * {disk|part|all}_stat_{add|sub|inc|dec}() modify the stat counters and should * be called between disk_stat_lock() and disk_stat_unlock(). * * part_stat_read() can be called at any time. */ #define part_stat_lock() preempt_disable() #define part_stat_unlock() preempt_enable() #define part_stat_get_cpu(part, field, cpu) \ (per_cpu_ptr((part)->bd_stats, (cpu))->field) #define part_stat_get(part, field) \ part_stat_get_cpu(part, field, smp_processor_id()) #define part_stat_read(part, field) \ ({ \ TYPEOF_UNQUAL((part)->bd_stats->field) res = 0; \ unsigned int _cpu; \ for_each_possible_cpu(_cpu) \ res += per_cpu_ptr((part)->bd_stats, _cpu)->field; \ res; \ }) static inline void part_stat_set_all(struct block_device *part, int value) { int i; for_each_possible_cpu(i) memset(per_cpu_ptr(part->bd_stats, i), value, sizeof(struct disk_stats)); } #define part_stat_read_accum(part, field) \ (part_stat_read(part, field[STAT_READ]) + \ part_stat_read(part, field[STAT_WRITE]) + \ part_stat_read(part, field[STAT_DISCARD])) #define __part_stat_add(part, field, addnd) \ __this_cpu_add((part)->bd_stats->field, addnd) #define part_stat_add(part, field, addnd) do { \ __part_stat_add((part), field, addnd); \ if (bdev_is_partition(part)) \ __part_stat_add(bdev_whole(part), field, addnd); \ } while (0) #define part_stat_dec(part, field) \ part_stat_add(part, field, -1) #define part_stat_inc(part, field) \ part_stat_add(part, field, 1) #define part_stat_sub(part, field, subnd) \ part_stat_add(part, field, -subnd) #define part_stat_local_dec(part, field) \ local_dec(&(part_stat_get(part, field))) #define part_stat_local_inc(part, field) \ local_inc(&(part_stat_get(part, field))) #define part_stat_local_read(part, field) \ local_read(&(part_stat_get(part, field))) #define part_stat_local_read_cpu(part, field, cpu) \ local_read(&(part_stat_get_cpu(part, field, cpu))) unsigned int bdev_count_inflight(struct block_device *part); #endif /* _LINUX_PART_STAT_H */ |
| 11 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 | // SPDX-License-Identifier: GPL-2.0-only /* * Input Multitouch Library * * Copyright (c) 2008-2010 Henrik Rydberg */ #include <linux/input/mt.h> #include <linux/export.h> #include <linux/slab.h> #include "input-core-private.h" #define TRKID_SGN ((TRKID_MAX + 1) >> 1) static void copy_abs(struct input_dev *dev, unsigned int dst, unsigned int src) { if (dev->absinfo && test_bit(src, dev->absbit)) { dev->absinfo[dst] = dev->absinfo[src]; dev->absinfo[dst].fuzz = 0; __set_bit(dst, dev->absbit); } } /** * input_mt_init_slots() - initialize MT input slots * @dev: input device supporting MT events and finger tracking * @num_slots: number of slots used by the device * @flags: mt tasks to handle in core * * This function allocates all necessary memory for MT slot handling * in the input device, prepares the ABS_MT_SLOT and * ABS_MT_TRACKING_ID events for use and sets up appropriate buffers. * Depending on the flags set, it also performs pointer emulation and * frame synchronization. * * May be called repeatedly. Returns -EINVAL if attempting to * reinitialize with a different number of slots. */ int input_mt_init_slots(struct input_dev *dev, unsigned int num_slots, unsigned int flags) { if (!num_slots) return 0; if (dev->mt) return dev->mt->num_slots != num_slots ? -EINVAL : 0; /* Arbitrary limit for avoiding too large memory allocation. */ if (num_slots > 1024) return -EINVAL; struct input_mt *mt __free(kfree) = kzalloc(struct_size(mt, slots, num_slots), GFP_KERNEL); if (!mt) return -ENOMEM; mt->num_slots = num_slots; mt->flags = flags; input_set_abs_params(dev, ABS_MT_SLOT, 0, num_slots - 1, 0, 0); input_set_abs_params(dev, ABS_MT_TRACKING_ID, 0, TRKID_MAX, 0, 0); if (flags & (INPUT_MT_POINTER | INPUT_MT_DIRECT)) { __set_bit(EV_KEY, dev->evbit); __set_bit(BTN_TOUCH, dev->keybit); copy_abs(dev, ABS_X, ABS_MT_POSITION_X); copy_abs(dev, ABS_Y, ABS_MT_POSITION_Y); copy_abs(dev, ABS_PRESSURE, ABS_MT_PRESSURE); } if (flags & INPUT_MT_POINTER) { __set_bit(BTN_TOOL_FINGER, dev->keybit); __set_bit(BTN_TOOL_DOUBLETAP, dev->keybit); if (num_slots >= 3) __set_bit(BTN_TOOL_TRIPLETAP, dev->keybit); if (num_slots >= 4) __set_bit(BTN_TOOL_QUADTAP, dev->keybit); if (num_slots >= 5) __set_bit(BTN_TOOL_QUINTTAP, dev->keybit); __set_bit(INPUT_PROP_POINTER, dev->propbit); } if (flags & INPUT_MT_DIRECT) __set_bit(INPUT_PROP_DIRECT, dev->propbit); if (flags & INPUT_MT_SEMI_MT) __set_bit(INPUT_PROP_SEMI_MT, dev->propbit); if (flags & INPUT_MT_TRACK) { unsigned int n2 = num_slots * num_slots; mt->red = kcalloc(n2, sizeof(*mt->red), GFP_KERNEL); if (!mt->red) return -ENOMEM; } /* Mark slots as 'inactive' */ for (unsigned int i = 0; i < num_slots; i++) input_mt_set_value(&mt->slots[i], ABS_MT_TRACKING_ID, -1); /* Mark slots as 'unused' */ mt->frame = 1; dev->mt = no_free_ptr(mt); return 0; } EXPORT_SYMBOL(input_mt_init_slots); /** * input_mt_destroy_slots() - frees the MT slots of the input device * @dev: input device with allocated MT slots * * This function is only needed in error path as the input core will * automatically free the MT slots when the device is destroyed. */ void input_mt_destroy_slots(struct input_dev *dev) { if (dev->mt) { kfree(dev->mt->red); kfree(dev->mt); } dev->mt = NULL; } EXPORT_SYMBOL(input_mt_destroy_slots); /** * input_mt_report_slot_state() - report contact state * @dev: input device with allocated MT slots * @tool_type: the tool type to use in this slot * @active: true if contact is active, false otherwise * * Reports a contact via ABS_MT_TRACKING_ID, and optionally * ABS_MT_TOOL_TYPE. If active is true and the slot is currently * inactive, or if the tool type is changed, a new tracking id is * assigned to the slot. The tool type is only reported if the * corresponding absbit field is set. * * Returns true if contact is active. */ bool input_mt_report_slot_state(struct input_dev *dev, unsigned int tool_type, bool active) { struct input_mt *mt = dev->mt; struct input_mt_slot *slot; int id; if (!mt) return false; slot = &mt->slots[mt->slot]; slot->frame = mt->frame; if (!active) { input_event(dev, EV_ABS, ABS_MT_TRACKING_ID, -1); return false; } id = input_mt_get_value(slot, ABS_MT_TRACKING_ID); if (id < 0) id = input_mt_new_trkid(mt); input_event(dev, EV_ABS, ABS_MT_TRACKING_ID, id); input_event(dev, EV_ABS, ABS_MT_TOOL_TYPE, tool_type); return true; } EXPORT_SYMBOL(input_mt_report_slot_state); /** * input_mt_report_finger_count() - report contact count * @dev: input device with allocated MT slots * @count: the number of contacts * * Reports the contact count via BTN_TOOL_FINGER, BTN_TOOL_DOUBLETAP, * BTN_TOOL_TRIPLETAP and BTN_TOOL_QUADTAP. * * The input core ensures only the KEY events already setup for * this device will produce output. */ void input_mt_report_finger_count(struct input_dev *dev, int count) { input_event(dev, EV_KEY, BTN_TOOL_FINGER, count == 1); input_event(dev, EV_KEY, BTN_TOOL_DOUBLETAP, count == 2); input_event(dev, EV_KEY, BTN_TOOL_TRIPLETAP, count == 3); input_event(dev, EV_KEY, BTN_TOOL_QUADTAP, count == 4); input_event(dev, EV_KEY, BTN_TOOL_QUINTTAP, count == 5); } EXPORT_SYMBOL(input_mt_report_finger_count); /** * input_mt_report_pointer_emulation() - common pointer emulation * @dev: input device with allocated MT slots * @use_count: report number of active contacts as finger count * * Performs legacy pointer emulation via BTN_TOUCH, ABS_X, ABS_Y and * ABS_PRESSURE. Touchpad finger count is emulated if use_count is true. * * The input core ensures only the KEY and ABS axes already setup for * this device will produce output. */ void input_mt_report_pointer_emulation(struct input_dev *dev, bool use_count) { struct input_mt *mt = dev->mt; struct input_mt_slot *oldest; int oldid, count, i; if (!mt) return; oldest = NULL; oldid = mt->trkid; count = 0; for (i = 0; i < mt->num_slots; ++i) { struct input_mt_slot *ps = &mt->slots[i]; int id = input_mt_get_value(ps, ABS_MT_TRACKING_ID); if (id < 0) continue; if ((id - oldid) & TRKID_SGN) { oldest = ps; oldid = id; } count++; } input_event(dev, EV_KEY, BTN_TOUCH, count > 0); if (use_count) { if (count == 0 && !test_bit(ABS_MT_DISTANCE, dev->absbit) && test_bit(ABS_DISTANCE, dev->absbit) && input_abs_get_val(dev, ABS_DISTANCE) != 0) { /* * Force reporting BTN_TOOL_FINGER for devices that * only report general hover (and not per-contact * distance) when contact is in proximity but not * on the surface. */ count = 1; } input_mt_report_finger_count(dev, count); } if (oldest) { int x = input_mt_get_value(oldest, ABS_MT_POSITION_X); int y = input_mt_get_value(oldest, ABS_MT_POSITION_Y); input_event(dev, EV_ABS, ABS_X, x); input_event(dev, EV_ABS, ABS_Y, y); if (test_bit(ABS_MT_PRESSURE, dev->absbit)) { int p = input_mt_get_value(oldest, ABS_MT_PRESSURE); input_event(dev, EV_ABS, ABS_PRESSURE, p); } } else { if (test_bit(ABS_MT_PRESSURE, dev->absbit)) input_event(dev, EV_ABS, ABS_PRESSURE, 0); } } EXPORT_SYMBOL(input_mt_report_pointer_emulation); static void __input_mt_drop_unused(struct input_dev *dev, struct input_mt *mt) { int i; lockdep_assert_held(&dev->event_lock); for (i = 0; i < mt->num_slots; i++) { if (input_mt_is_active(&mt->slots[i]) && !input_mt_is_used(mt, &mt->slots[i])) { input_handle_event(dev, EV_ABS, ABS_MT_SLOT, i); input_handle_event(dev, EV_ABS, ABS_MT_TRACKING_ID, -1); } } } /** * input_mt_drop_unused() - Inactivate slots not seen in this frame * @dev: input device with allocated MT slots * * Lift all slots not seen since the last call to this function. */ void input_mt_drop_unused(struct input_dev *dev) { struct input_mt *mt = dev->mt; if (mt) { guard(spinlock_irqsave)(&dev->event_lock); __input_mt_drop_unused(dev, mt); mt->frame++; } } EXPORT_SYMBOL(input_mt_drop_unused); /** * input_mt_release_slots() - Deactivate all slots * @dev: input device with allocated MT slots * * Lift all active slots. */ void input_mt_release_slots(struct input_dev *dev) { struct input_mt *mt = dev->mt; lockdep_assert_held(&dev->event_lock); if (mt) { /* This will effectively mark all slots unused. */ mt->frame++; __input_mt_drop_unused(dev, mt); if (test_bit(ABS_PRESSURE, dev->absbit)) input_handle_event(dev, EV_ABS, ABS_PRESSURE, 0); mt->frame++; } } /** * input_mt_sync_frame() - synchronize mt frame * @dev: input device with allocated MT slots * * Close the frame and prepare the internal state for a new one. * Depending on the flags, marks unused slots as inactive and performs * pointer emulation. */ void input_mt_sync_frame(struct input_dev *dev) { struct input_mt *mt = dev->mt; bool use_count = false; if (!mt) return; if (mt->flags & INPUT_MT_DROP_UNUSED) { guard(spinlock_irqsave)(&dev->event_lock); __input_mt_drop_unused(dev, mt); } if ((mt->flags & INPUT_MT_POINTER) && !(mt->flags & INPUT_MT_SEMI_MT)) use_count = true; input_mt_report_pointer_emulation(dev, use_count); mt->frame++; } EXPORT_SYMBOL(input_mt_sync_frame); static int adjust_dual(int *begin, int step, int *end, int eq, int mu) { int f, *p, s, c; if (begin == end) return 0; f = *begin; p = begin + step; s = p == end ? f + 1 : *p; for (; p != end; p += step) { if (*p < f) { s = f; f = *p; } else if (*p < s) { s = *p; } } c = (f + s + 1) / 2; if (c == 0 || (c > mu && (!eq || mu > 0))) return 0; /* Improve convergence for positive matrices by penalizing overcovers */ if (s < 0 && mu <= 0) c *= 2; for (p = begin; p != end; p += step) *p -= c; return (c < s && s <= 0) || (f >= 0 && f < c); } static void find_reduced_matrix(int *w, int nr, int nc, int nrc, int mu) { int i, k, sum; for (k = 0; k < nrc; k++) { for (i = 0; i < nr; i++) adjust_dual(w + i, nr, w + i + nrc, nr <= nc, mu); sum = 0; for (i = 0; i < nrc; i += nr) sum += adjust_dual(w + i, 1, w + i + nr, nc <= nr, mu); if (!sum) break; } } static int input_mt_set_matrix(struct input_mt *mt, const struct input_mt_pos *pos, int num_pos, int mu) { const struct input_mt_pos *p; struct input_mt_slot *s; int *w = mt->red; int x, y; for (s = mt->slots; s != mt->slots + mt->num_slots; s++) { if (!input_mt_is_active(s)) continue; x = input_mt_get_value(s, ABS_MT_POSITION_X); y = input_mt_get_value(s, ABS_MT_POSITION_Y); for (p = pos; p != pos + num_pos; p++) { int dx = x - p->x, dy = y - p->y; *w++ = dx * dx + dy * dy - mu; } } return w - mt->red; } static void input_mt_set_slots(struct input_mt *mt, int *slots, int num_pos) { struct input_mt_slot *s; int *w = mt->red, j; for (j = 0; j != num_pos; j++) slots[j] = -1; for (s = mt->slots; s != mt->slots + mt->num_slots; s++) { if (!input_mt_is_active(s)) continue; for (j = 0; j != num_pos; j++) { if (w[j] < 0) { slots[j] = s - mt->slots; break; } } w += num_pos; } for (s = mt->slots; s != mt->slots + mt->num_slots; s++) { if (input_mt_is_active(s)) continue; for (j = 0; j != num_pos; j++) { if (slots[j] < 0) { slots[j] = s - mt->slots; break; } } } } /** * input_mt_assign_slots() - perform a best-match assignment * @dev: input device with allocated MT slots * @slots: the slot assignment to be filled * @pos: the position array to match * @num_pos: number of positions * @dmax: maximum ABS_MT_POSITION displacement (zero for infinite) * * Performs a best match against the current contacts and returns * the slot assignment list. New contacts are assigned to unused * slots. * * The assignments are balanced so that all coordinate displacements are * below the euclidian distance dmax. If no such assignment can be found, * some contacts are assigned to unused slots. * * Returns zero on success, or negative error in case of failure. */ int input_mt_assign_slots(struct input_dev *dev, int *slots, const struct input_mt_pos *pos, int num_pos, int dmax) { struct input_mt *mt = dev->mt; int mu = 2 * dmax * dmax; int nrc; if (!mt || !mt->red) return -ENXIO; if (num_pos > mt->num_slots) return -EINVAL; if (num_pos < 1) return 0; nrc = input_mt_set_matrix(mt, pos, num_pos, mu); find_reduced_matrix(mt->red, num_pos, nrc / num_pos, nrc, mu); input_mt_set_slots(mt, slots, num_pos); return 0; } EXPORT_SYMBOL(input_mt_assign_slots); /** * input_mt_get_slot_by_key() - return slot matching key * @dev: input device with allocated MT slots * @key: the key of the sought slot * * Returns the slot of the given key, if it exists, otherwise * set the key on the first unused slot and return. * * If no available slot can be found, -1 is returned. * Note that for this function to work properly, input_mt_sync_frame() has * to be called at each frame. */ int input_mt_get_slot_by_key(struct input_dev *dev, int key) { struct input_mt *mt = dev->mt; struct input_mt_slot *s; if (!mt) return -1; for (s = mt->slots; s != mt->slots + mt->num_slots; s++) if (input_mt_is_active(s) && s->key == key) return s - mt->slots; for (s = mt->slots; s != mt->slots + mt->num_slots; s++) if (!input_mt_is_active(s) && !input_mt_is_used(mt, s)) { s->key = key; return s - mt->slots; } return -1; } EXPORT_SYMBOL(input_mt_get_slot_by_key); |
| 1008 18 778 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 | /* SPDX-License-Identifier: GPL-2.0 OR MIT */ #ifndef __LINUX_OVERFLOW_H #define __LINUX_OVERFLOW_H #include <linux/compiler.h> #include <linux/limits.h> #include <linux/const.h> /* * We need to compute the minimum and maximum values representable in a given * type. These macros may also be useful elsewhere. It would seem more obvious * to do something like: * * #define type_min(T) (T)(is_signed_type(T) ? (T)1 << (8*sizeof(T)-1) : 0) * #define type_max(T) (T)(is_signed_type(T) ? ((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0) * * Unfortunately, the middle expressions, strictly speaking, have * undefined behaviour, and at least some versions of gcc warn about * the type_max expression (but not if -fsanitize=undefined is in * effect; in that case, the warning is deferred to runtime...). * * The slightly excessive casting in type_min is to make sure the * macros also produce sensible values for the exotic type _Bool. [The * overflow checkers only almost work for _Bool, but that's * a-feature-not-a-bug, since people shouldn't be doing arithmetic on * _Bools. Besides, the gcc builtins don't allow _Bool* as third * argument.] * * Idea stolen from * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html - * credit to Christian Biere. */ #define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type))) #define __type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T))) #define type_max(t) __type_max(typeof(t)) #define __type_min(T) ((T)((T)-type_max(T)-(T)1)) #define type_min(t) __type_min(typeof(t)) /* * Avoids triggering -Wtype-limits compilation warning, * while using unsigned data types to check a < 0. */ #define is_non_negative(a) ((a) > 0 || (a) == 0) #define is_negative(a) (!(is_non_negative(a))) /* * Allows for effectively applying __must_check to a macro so we can have * both the type-agnostic benefits of the macros while also being able to * enforce that the return value is, in fact, checked. */ static inline bool __must_check __must_check_overflow(bool overflow) { return unlikely(overflow); } /** * check_add_overflow() - Calculate addition with overflow checking * @a: first addend * @b: second addend * @d: pointer to store sum * * Returns true on wrap-around, false otherwise. * * *@d holds the results of the attempted addition, regardless of whether * wrap-around occurred. */ #define check_add_overflow(a, b, d) \ __must_check_overflow(__builtin_add_overflow(a, b, d)) /** * wrapping_add() - Intentionally perform a wrapping addition * @type: type for result of calculation * @a: first addend * @b: second addend * * Return the potentially wrapped-around addition without * tripping any wrap-around sanitizers that may be enabled. */ #define wrapping_add(type, a, b) \ ({ \ type __val; \ __builtin_add_overflow(a, b, &__val); \ __val; \ }) /** * wrapping_assign_add() - Intentionally perform a wrapping increment assignment * @var: variable to be incremented * @offset: amount to add * * Increments @var by @offset with wrap-around. Returns the resulting * value of @var. Will not trip any wrap-around sanitizers. * * Returns the new value of @var. */ #define wrapping_assign_add(var, offset) \ ({ \ typeof(var) *__ptr = &(var); \ *__ptr = wrapping_add(typeof(var), *__ptr, offset); \ }) /** * check_sub_overflow() - Calculate subtraction with overflow checking * @a: minuend; value to subtract from * @b: subtrahend; value to subtract from @a * @d: pointer to store difference * * Returns true on wrap-around, false otherwise. * * *@d holds the results of the attempted subtraction, regardless of whether * wrap-around occurred. */ #define check_sub_overflow(a, b, d) \ __must_check_overflow(__builtin_sub_overflow(a, b, d)) /** * wrapping_sub() - Intentionally perform a wrapping subtraction * @type: type for result of calculation * @a: minuend; value to subtract from * @b: subtrahend; value to subtract from @a * * Return the potentially wrapped-around subtraction without * tripping any wrap-around sanitizers that may be enabled. */ #define wrapping_sub(type, a, b) \ ({ \ type __val; \ __builtin_sub_overflow(a, b, &__val); \ __val; \ }) /** * wrapping_assign_sub() - Intentionally perform a wrapping decrement assign * @var: variable to be decremented * @offset: amount to subtract * * Decrements @var by @offset with wrap-around. Returns the resulting * value of @var. Will not trip any wrap-around sanitizers. * * Returns the new value of @var. */ #define wrapping_assign_sub(var, offset) \ ({ \ typeof(var) *__ptr = &(var); \ *__ptr = wrapping_sub(typeof(var), *__ptr, offset); \ }) /** * check_mul_overflow() - Calculate multiplication with overflow checking * @a: first factor * @b: second factor * @d: pointer to store product * * Returns true on wrap-around, false otherwise. * * *@d holds the results of the attempted multiplication, regardless of whether * wrap-around occurred. */ #define check_mul_overflow(a, b, d) \ __must_check_overflow(__builtin_mul_overflow(a, b, d)) /** * wrapping_mul() - Intentionally perform a wrapping multiplication * @type: type for result of calculation * @a: first factor * @b: second factor * * Return the potentially wrapped-around multiplication without * tripping any wrap-around sanitizers that may be enabled. */ #define wrapping_mul(type, a, b) \ ({ \ type __val; \ __builtin_mul_overflow(a, b, &__val); \ __val; \ }) /** * check_shl_overflow() - Calculate a left-shifted value and check overflow * @a: Value to be shifted * @s: How many bits left to shift * @d: Pointer to where to store the result * * Computes *@d = (@a << @s) * * Returns true if '*@d' cannot hold the result or when '@a << @s' doesn't * make sense. Example conditions: * * - '@a << @s' causes bits to be lost when stored in *@d. * - '@s' is garbage (e.g. negative) or so large that the result of * '@a << @s' is guaranteed to be 0. * - '@a' is negative. * - '@a << @s' sets the sign bit, if any, in '*@d'. * * '*@d' will hold the results of the attempted shift, but is not * considered "safe for use" if true is returned. */ #define check_shl_overflow(a, s, d) __must_check_overflow(({ \ typeof(a) _a = a; \ typeof(s) _s = s; \ typeof(d) _d = d; \ unsigned long long _a_full = _a; \ unsigned int _to_shift = \ is_non_negative(_s) && _s < 8 * sizeof(*d) ? _s : 0; \ *_d = (_a_full << _to_shift); \ (_to_shift != _s || is_negative(*_d) || is_negative(_a) || \ (*_d >> _to_shift) != _a); \ })) #define __overflows_type_constexpr(x, T) ( \ is_unsigned_type(typeof(x)) ? \ (x) > type_max(T) : \ is_unsigned_type(typeof(T)) ? \ (x) < 0 || (x) > type_max(T) : \ (x) < type_min(T) || (x) > type_max(T)) #define __overflows_type(x, T) ({ \ typeof(T) v = 0; \ check_add_overflow((x), v, &v); \ }) /** * overflows_type - helper for checking the overflows between value, variables, * or data type * * @n: source constant value or variable to be checked * @T: destination variable or data type proposed to store @x * * Compares the @x expression for whether or not it can safely fit in * the storage of the type in @T. @x and @T can have different types. * If @x is a constant expression, this will also resolve to a constant * expression. * * Returns: true if overflow can occur, false otherwise. */ #define overflows_type(n, T) \ __builtin_choose_expr(__is_constexpr(n), \ __overflows_type_constexpr(n, T), \ __overflows_type(n, T)) /** * castable_to_type - like __same_type(), but also allows for casted literals * * @n: variable or constant value * @T: variable or data type * * Unlike the __same_type() macro, this allows a constant value as the * first argument. If this value would not overflow into an assignment * of the second argument's type, it returns true. Otherwise, this falls * back to __same_type(). */ #define castable_to_type(n, T) \ __builtin_choose_expr(__is_constexpr(n), \ !__overflows_type_constexpr(n, T), \ __same_type(n, T)) /** * size_mul() - Calculate size_t multiplication with saturation at SIZE_MAX * @factor1: first factor * @factor2: second factor * * Returns: calculate @factor1 * @factor2, both promoted to size_t, * with any overflow causing the return value to be SIZE_MAX. The * lvalue must be size_t to avoid implicit type conversion. */ static inline size_t __must_check size_mul(size_t factor1, size_t factor2) { size_t bytes; if (check_mul_overflow(factor1, factor2, &bytes)) return SIZE_MAX; return bytes; } /** * size_add() - Calculate size_t addition with saturation at SIZE_MAX * @addend1: first addend * @addend2: second addend * * Returns: calculate @addend1 + @addend2, both promoted to size_t, * with any overflow causing the return value to be SIZE_MAX. The * lvalue must be size_t to avoid implicit type conversion. */ static inline size_t __must_check size_add(size_t addend1, size_t addend2) { size_t bytes; if (check_add_overflow(addend1, addend2, &bytes)) return SIZE_MAX; return bytes; } /** * size_sub() - Calculate size_t subtraction with saturation at SIZE_MAX * @minuend: value to subtract from * @subtrahend: value to subtract from @minuend * * Returns: calculate @minuend - @subtrahend, both promoted to size_t, * with any overflow causing the return value to be SIZE_MAX. For * composition with the size_add() and size_mul() helpers, neither * argument may be SIZE_MAX (or the result with be forced to SIZE_MAX). * The lvalue must be size_t to avoid implicit type conversion. */ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) { size_t bytes; if (minuend == SIZE_MAX || subtrahend == SIZE_MAX || check_sub_overflow(minuend, subtrahend, &bytes)) return SIZE_MAX; return bytes; } /** * array_size() - Calculate size of 2-dimensional array. * @a: dimension one * @b: dimension two * * Calculates size of 2-dimensional array: @a * @b. * * Returns: number of bytes needed to represent the array or SIZE_MAX on * overflow. */ #define array_size(a, b) size_mul(a, b) /** * array3_size() - Calculate size of 3-dimensional array. * @a: dimension one * @b: dimension two * @c: dimension three * * Calculates size of 3-dimensional array: @a * @b * @c. * * Returns: number of bytes needed to represent the array or SIZE_MAX on * overflow. */ #define array3_size(a, b, c) size_mul(size_mul(a, b), c) /** * flex_array_size() - Calculate size of a flexible array member * within an enclosing structure. * @p: Pointer to the structure. * @member: Name of the flexible array member. * @count: Number of elements in the array. * * Calculates size of a flexible array of @count number of @member * elements, at the end of structure @p. * * Return: number of bytes needed or SIZE_MAX on overflow. */ #define flex_array_size(p, member, count) \ __builtin_choose_expr(__is_constexpr(count), \ (count) * sizeof(*(p)->member) + __must_be_array((p)->member), \ size_mul(count, sizeof(*(p)->member) + __must_be_array((p)->member))) /** * struct_size() - Calculate size of structure with trailing flexible array. * @p: Pointer to the structure. * @member: Name of the array member. * @count: Number of elements in the array. * * Calculates size of memory needed for structure of @p followed by an * array of @count number of @member elements. * * Return: number of bytes needed or SIZE_MAX on overflow. */ #define struct_size(p, member, count) \ __builtin_choose_expr(__is_constexpr(count), \ sizeof(*(p)) + flex_array_size(p, member, count), \ size_add(sizeof(*(p)), flex_array_size(p, member, count))) /** * struct_size_t() - Calculate size of structure with trailing flexible array * @type: structure type name. * @member: Name of the array member. * @count: Number of elements in the array. * * Calculates size of memory needed for structure @type followed by an * array of @count number of @member elements. Prefer using struct_size() * when possible instead, to keep calculations associated with a specific * instance variable of type @type. * * Return: number of bytes needed or SIZE_MAX on overflow. */ #define struct_size_t(type, member, count) \ struct_size((type *)NULL, member, count) /** * __DEFINE_FLEX() - helper macro for DEFINE_FLEX() family. * Enables caller macro to pass arbitrary trailing expressions * * @type: structure type name, including "struct" keyword. * @name: Name for a variable to define. * @member: Name of the array member. * @count: Number of elements in the array; must be compile-time const. * @trailer: Trailing expressions for attributes and/or initializers. */ #define __DEFINE_FLEX(type, name, member, count, trailer...) \ _Static_assert(__builtin_constant_p(count), \ "onstack flex array members require compile-time const count"); \ union { \ u8 bytes[struct_size_t(type, member, count)]; \ type obj; \ } name##_u trailer; \ type *name = (type *)&name##_u /** * _DEFINE_FLEX() - helper macro for DEFINE_FLEX() family. * Enables caller macro to pass (different) initializer. * * @type: structure type name, including "struct" keyword. * @name: Name for a variable to define. * @member: Name of the array member. * @count: Number of elements in the array; must be compile-time const. * @initializer: Initializer expression (e.g., pass `= { }` at minimum). */ #define _DEFINE_FLEX(type, name, member, count, initializer...) \ __DEFINE_FLEX(type, name, member, count, = { .obj initializer }) /** * DEFINE_RAW_FLEX() - Define an on-stack instance of structure with a trailing * flexible array member, when it does not have a __counted_by annotation. * * @type: structure type name, including "struct" keyword. * @name: Name for a variable to define. * @member: Name of the array member. * @count: Number of elements in the array; must be compile-time const. * * Define a zeroed, on-stack, instance of @type structure with a trailing * flexible array member. * Use __struct_size(@name) to get compile-time size of it afterwards. * Use __member_size(@name->member) to get compile-time size of @name members. * Use STACK_FLEX_ARRAY_SIZE(@name, @member) to get compile-time number of * elements in array @member. */ #define DEFINE_RAW_FLEX(type, name, member, count) \ __DEFINE_FLEX(type, name, member, count, = { }) /** * DEFINE_FLEX() - Define an on-stack instance of structure with a trailing * flexible array member. * * @TYPE: structure type name, including "struct" keyword. * @NAME: Name for a variable to define. * @MEMBER: Name of the array member. * @COUNTER: Name of the __counted_by member. * @COUNT: Number of elements in the array; must be compile-time const. * * Define a zeroed, on-stack, instance of @TYPE structure with a trailing * flexible array member. * Use __struct_size(@NAME) to get compile-time size of it afterwards. * Use __member_size(@NAME->member) to get compile-time size of @NAME members. * Use STACK_FLEX_ARRAY_SIZE(@name, @member) to get compile-time number of * elements in array @member. */ #define DEFINE_FLEX(TYPE, NAME, MEMBER, COUNTER, COUNT) \ _DEFINE_FLEX(TYPE, NAME, MEMBER, COUNT, = { .COUNTER = COUNT, }) /** * STACK_FLEX_ARRAY_SIZE() - helper macro for DEFINE_FLEX() family. * Returns the number of elements in @array. * * @name: Name for a variable defined in DEFINE_RAW_FLEX()/DEFINE_FLEX(). * @array: Name of the array member. */ #define STACK_FLEX_ARRAY_SIZE(name, array) \ (__member_size((name)->array) / sizeof(*(name)->array) + \ __must_be_array((name)->array)) #endif /* __LINUX_OVERFLOW_H */ |
| 98 12 69 342 210 333 210 392 210 210 12 107 210 42 14 14 322 333 218 11 9 20 59 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* include/asm-generic/tlb.h * * Generic TLB shootdown code * * Copyright 2001 Red Hat, Inc. * Based on code from mm/memory.c Copyright Linus Torvalds and others. * * Copyright 2011 Red Hat, Inc., Peter Zijlstra */ #ifndef _ASM_GENERIC__TLB_H #define _ASM_GENERIC__TLB_H #include <linux/mmu_notifier.h> #include <linux/swap.h> #include <linux/hugetlb_inline.h> #include <asm/tlbflush.h> #include <asm/cacheflush.h> /* * Blindly accessing user memory from NMI context can be dangerous * if we're in the middle of switching the current user task or switching * the loaded mm. */ #ifndef nmi_uaccess_okay # define nmi_uaccess_okay() true #endif #ifdef CONFIG_MMU /* * Generic MMU-gather implementation. * * The mmu_gather data structure is used by the mm code to implement the * correct and efficient ordering of freeing pages and TLB invalidations. * * This correct ordering is: * * 1) unhook page * 2) TLB invalidate page * 3) free page * * That is, we must never free a page before we have ensured there are no live * translations left to it. Otherwise it might be possible to observe (or * worse, change) the page content after it has been reused. * * The mmu_gather API consists of: * * - tlb_gather_mmu() / tlb_gather_mmu_fullmm() / tlb_finish_mmu() * * start and finish a mmu_gather * * Finish in particular will issue a (final) TLB invalidate and free * all (remaining) queued pages. * * - tlb_start_vma() / tlb_end_vma(); marks the start / end of a VMA * * Defaults to flushing at tlb_end_vma() to reset the range; helps when * there's large holes between the VMAs. * * - tlb_free_vmas() * * tlb_free_vmas() marks the start of unlinking of one or more vmas * and freeing page-tables. * * - tlb_remove_table() * * tlb_remove_table() is the basic primitive to free page-table directories * (__p*_free_tlb()). In it's most primitive form it is an alias for * tlb_remove_page() below, for when page directories are pages and have no * additional constraints. * * See also MMU_GATHER_TABLE_FREE and MMU_GATHER_RCU_TABLE_FREE. * * - tlb_remove_page() / tlb_remove_page_size() * - __tlb_remove_folio_pages() / __tlb_remove_page_size() * - __tlb_remove_folio_pages_size() * * __tlb_remove_folio_pages_size() is the basic primitive that queues pages * for freeing. It will return a boolean indicating if the queue is (now) * full and a call to tlb_flush_mmu() is required. * * tlb_remove_page() and tlb_remove_page_size() imply the call to * tlb_flush_mmu() when required and has no return value. * * __tlb_remove_folio_pages() is similar to __tlb_remove_page_size(), * however, instead of removing a single page, assume PAGE_SIZE and remove * the given number of consecutive pages that are all part of the * same (large) folio. * * - tlb_change_page_size() * * call before __tlb_remove_page*() to set the current page-size; implies a * possible tlb_flush_mmu() call. * * - tlb_flush_mmu() / tlb_flush_mmu_tlbonly() * * tlb_flush_mmu_tlbonly() - does the TLB invalidate (and resets * related state, like the range) * * tlb_flush_mmu() - in addition to the above TLB invalidate, also frees * whatever pages are still batched. * * - mmu_gather::fullmm * * A flag set by tlb_gather_mmu_fullmm() to indicate we're going to free * the entire mm; this allows a number of optimizations. * * - We can ignore tlb_{start,end}_vma(); because we don't * care about ranges. Everything will be shot down. * * - (RISC) architectures that use ASIDs can cycle to a new ASID * and delay the invalidation until ASID space runs out. * * - mmu_gather::need_flush_all * * A flag that can be set by the arch code if it wants to force * flush the entire TLB irrespective of the range. For instance * x86-PAE needs this when changing top-level entries. * * And allows the architecture to provide and implement tlb_flush(): * * tlb_flush() may, in addition to the above mentioned mmu_gather fields, make * use of: * * - mmu_gather::start / mmu_gather::end * * which provides the range that needs to be flushed to cover the pages to * be freed. * * - mmu_gather::freed_tables * * set when we freed page table pages * * - tlb_get_unmap_shift() / tlb_get_unmap_size() * * returns the smallest TLB entry size unmapped in this range. * * If an architecture does not provide tlb_flush() a default implementation * based on flush_tlb_range() will be used, unless MMU_GATHER_NO_RANGE is * specified, in which case we'll default to flush_tlb_mm(). * * Additionally there are a few opt-in features: * * MMU_GATHER_PAGE_SIZE * * This ensures we call tlb_flush() every time tlb_change_page_size() actually * changes the size and provides mmu_gather::page_size to tlb_flush(). * * This might be useful if your architecture has size specific TLB * invalidation instructions. * * MMU_GATHER_TABLE_FREE * * This provides tlb_remove_table(), to be used instead of tlb_remove_page() * for page directores (__p*_free_tlb()). * * Useful if your architecture has non-page page directories. * * When used, an architecture is expected to provide __tlb_remove_table() or * use the generic __tlb_remove_table(), which does the actual freeing of these * pages. * * MMU_GATHER_RCU_TABLE_FREE * * Like MMU_GATHER_TABLE_FREE, and adds semi-RCU semantics to the free (see * comment below). * * Useful if your architecture doesn't use IPIs for remote TLB invalidates * and therefore doesn't naturally serialize with software page-table walkers. * * MMU_GATHER_NO_FLUSH_CACHE * * Indicates the architecture has flush_cache_range() but it needs *NOT* be called * before unmapping a VMA. * * NOTE: strictly speaking we shouldn't have this knob and instead rely on * flush_cache_range() being a NOP, except Sparc64 seems to be * different here. * * MMU_GATHER_MERGE_VMAS * * Indicates the architecture wants to merge ranges over VMAs; typical when * multiple range invalidates are more expensive than a full invalidate. * * MMU_GATHER_NO_RANGE * * Use this if your architecture lacks an efficient flush_tlb_range(). This * option implies MMU_GATHER_MERGE_VMAS above. * * MMU_GATHER_NO_GATHER * * If the option is set the mmu_gather will not track individual pages for * delayed page free anymore. A platform that enables the option needs to * provide its own implementation of the __tlb_remove_page_size() function to * free pages. * * This is useful if your architecture already flushes TLB entries in the * various ptep_get_and_clear() functions. */ #ifdef CONFIG_MMU_GATHER_TABLE_FREE struct mmu_table_batch { #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE struct rcu_head rcu; #endif unsigned int nr; void *tables[]; }; #define MAX_TABLE_BATCH \ ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *)) #ifndef __HAVE_ARCH_TLB_REMOVE_TABLE static inline void __tlb_remove_table(void *table) { struct ptdesc *ptdesc = (struct ptdesc *)table; pagetable_dtor_free(ptdesc); } #endif extern void tlb_remove_table(struct mmu_gather *tlb, void *table); #else /* !CONFIG_MMU_GATHER_TABLE_FREE */ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page); /* * Without MMU_GATHER_TABLE_FREE the architecture is assumed to have page based * page directories and we can use the normal page batching to free them. */ static inline void tlb_remove_table(struct mmu_gather *tlb, void *table) { struct ptdesc *ptdesc = (struct ptdesc *)table; pagetable_dtor(ptdesc); tlb_remove_page(tlb, ptdesc_page(ptdesc)); } #endif /* CONFIG_MMU_GATHER_TABLE_FREE */ #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE /* * This allows an architecture that does not use the linux page-tables for * hardware to skip the TLBI when freeing page tables. */ #ifndef tlb_needs_table_invalidate #define tlb_needs_table_invalidate() (true) #endif void tlb_remove_table_sync_one(void); #else #ifdef tlb_needs_table_invalidate #error tlb_needs_table_invalidate() requires MMU_GATHER_RCU_TABLE_FREE #endif static inline void tlb_remove_table_sync_one(void) { } #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */ #ifndef CONFIG_MMU_GATHER_NO_GATHER /* * If we can't allocate a page to make a big batch of page pointers * to work on, then just handle a few from the on-stack structure. */ #define MMU_GATHER_BUNDLE 8 struct mmu_gather_batch { struct mmu_gather_batch *next; unsigned int nr; unsigned int max; struct encoded_page *encoded_pages[]; }; #define MAX_GATHER_BATCH \ ((PAGE_SIZE - sizeof(struct mmu_gather_batch)) / sizeof(void *)) /* * Limit the maximum number of mmu_gather batches to reduce a risk of soft * lockups for non-preemptible kernels on huge machines when a lot of memory * is zapped during unmapping. * 10K pages freed at once should be safe even without a preemption point. */ #define MAX_GATHER_BATCH_COUNT (10000UL/MAX_GATHER_BATCH) extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, bool delay_rmap, int page_size); bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, unsigned int nr_pages, bool delay_rmap); #ifdef CONFIG_SMP /* * This both sets 'delayed_rmap', and returns true. It would be an inline * function, except we define it before the 'struct mmu_gather'. */ #define tlb_delay_rmap(tlb) (((tlb)->delayed_rmap = 1), true) extern void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma); #endif #endif /* * We have a no-op version of the rmap removal that doesn't * delay anything. That is used on S390, which flushes remote * TLBs synchronously, and on UP, which doesn't have any * remote TLBs to flush and is not preemptible due to this * all happening under the page table lock. */ #ifndef tlb_delay_rmap #define tlb_delay_rmap(tlb) (false) static inline void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) { } #endif /* * struct mmu_gather is an opaque type used by the mm code for passing around * any data needed by arch specific code for tlb_remove_page. */ struct mmu_gather { struct mm_struct *mm; #ifdef CONFIG_MMU_GATHER_TABLE_FREE struct mmu_table_batch *batch; #endif unsigned long start; unsigned long end; /* * we are in the middle of an operation to clear * a full mm and can make some optimizations */ unsigned int fullmm : 1; /* * we have performed an operation which * requires a complete flush of the tlb */ unsigned int need_flush_all : 1; /* * we have removed page directories */ unsigned int freed_tables : 1; /* * Do we have pending delayed rmap removals? */ unsigned int delayed_rmap : 1; /* * at which levels have we cleared entries? */ unsigned int cleared_ptes : 1; unsigned int cleared_pmds : 1; unsigned int cleared_puds : 1; unsigned int cleared_p4ds : 1; /* * tracks VM_EXEC | VM_HUGETLB in tlb_start_vma */ unsigned int vma_exec : 1; unsigned int vma_huge : 1; unsigned int vma_pfn : 1; unsigned int batch_count; #ifndef CONFIG_MMU_GATHER_NO_GATHER struct mmu_gather_batch *active; struct mmu_gather_batch local; struct page *__pages[MMU_GATHER_BUNDLE]; #ifdef CONFIG_MMU_GATHER_PAGE_SIZE unsigned int page_size; #endif #endif }; void tlb_flush_mmu(struct mmu_gather *tlb); static inline void __tlb_adjust_range(struct mmu_gather *tlb, unsigned long address, unsigned int range_size) { tlb->start = min(tlb->start, address); tlb->end = max(tlb->end, address + range_size); } static inline void __tlb_reset_range(struct mmu_gather *tlb) { if (tlb->fullmm) { tlb->start = tlb->end = ~0; } else { tlb->start = TASK_SIZE; tlb->end = 0; } tlb->freed_tables = 0; tlb->cleared_ptes = 0; tlb->cleared_pmds = 0; tlb->cleared_puds = 0; tlb->cleared_p4ds = 0; /* * Do not reset mmu_gather::vma_* fields here, we do not * call into tlb_start_vma() again to set them if there is an * intermediate flush. */ } #ifdef CONFIG_MMU_GATHER_NO_RANGE #if defined(tlb_flush) #error MMU_GATHER_NO_RANGE relies on default tlb_flush() #endif /* * When an architecture does not have efficient means of range flushing TLBs * there is no point in doing intermediate flushes on tlb_end_vma() to keep the * range small. We equally don't have to worry about page granularity or other * things. * * All we need to do is issue a full flush for any !0 range. */ static inline void tlb_flush(struct mmu_gather *tlb) { if (tlb->end) flush_tlb_mm(tlb->mm); } #else /* CONFIG_MMU_GATHER_NO_RANGE */ #ifndef tlb_flush /* * When an architecture does not provide its own tlb_flush() implementation * but does have a reasonably efficient flush_vma_range() implementation * use that. */ static inline void tlb_flush(struct mmu_gather *tlb) { if (tlb->fullmm || tlb->need_flush_all) { flush_tlb_mm(tlb->mm); } else if (tlb->end) { struct vm_area_struct vma = { .vm_mm = tlb->mm, .vm_flags = (tlb->vma_exec ? VM_EXEC : 0) | (tlb->vma_huge ? VM_HUGETLB : 0), }; flush_tlb_range(&vma, tlb->start, tlb->end); } } #endif #endif /* CONFIG_MMU_GATHER_NO_RANGE */ static inline void tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { /* * flush_tlb_range() implementations that look at VM_HUGETLB (tile, * mips-4k) flush only large pages. * * flush_tlb_range() implementations that flush I-TLB also flush D-TLB * (tile, xtensa, arm), so it's ok to just add VM_EXEC to an existing * range. * * We rely on tlb_end_vma() to issue a flush, such that when we reset * these values the batch is empty. */ tlb->vma_huge = is_vm_hugetlb_page(vma); tlb->vma_exec = !!(vma->vm_flags & VM_EXEC); /* * Track if there's at least one VM_PFNMAP/VM_MIXEDMAP vma * in the tracked range, see tlb_free_vmas(). */ tlb->vma_pfn |= !!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)); } static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) { /* * Anything calling __tlb_adjust_range() also sets at least one of * these bits. */ if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds || tlb->cleared_puds || tlb->cleared_p4ds)) return; tlb_flush(tlb); __tlb_reset_range(tlb); } static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { if (__tlb_remove_page_size(tlb, page, false, page_size)) tlb_flush_mmu(tlb); } static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) { return tlb_remove_page_size(tlb, page, PAGE_SIZE); } static inline void tlb_remove_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt) { tlb_remove_table(tlb, pt); } static inline void tlb_change_page_size(struct mmu_gather *tlb, unsigned int page_size) { #ifdef CONFIG_MMU_GATHER_PAGE_SIZE if (tlb->page_size && tlb->page_size != page_size) { if (!tlb->fullmm && !tlb->need_flush_all) tlb_flush_mmu(tlb); } tlb->page_size = page_size; #endif } static inline unsigned long tlb_get_unmap_shift(struct mmu_gather *tlb) { if (tlb->cleared_ptes) return PAGE_SHIFT; if (tlb->cleared_pmds) return PMD_SHIFT; if (tlb->cleared_puds) return PUD_SHIFT; if (tlb->cleared_p4ds) return P4D_SHIFT; return PAGE_SHIFT; } static inline unsigned long tlb_get_unmap_size(struct mmu_gather *tlb) { return 1UL << tlb_get_unmap_shift(tlb); } /* * In the case of tlb vma handling, we can optimise these away in the * case where we're doing a full MM flush. When we're doing a munmap, * the vmas are adjusted to only cover the region to be torn down. */ static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { if (tlb->fullmm) return; tlb_update_vma_flags(tlb, vma); #ifndef CONFIG_MMU_GATHER_NO_FLUSH_CACHE flush_cache_range(vma, vma->vm_start, vma->vm_end); #endif } static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS)) return; /* * Do a TLB flush and reset the range at VMA boundaries; this avoids * the ranges growing with the unused space between consecutive VMAs, * but also the mmu_gather::vma_* flags from tlb_start_vma() rely on * this. */ tlb_flush_mmu_tlbonly(tlb); } static inline void tlb_free_vmas(struct mmu_gather *tlb) { if (tlb->fullmm) return; /* * VM_PFNMAP is more fragile because the core mm will not track the * page mapcount -- there might not be page-frames for these PFNs * after all. * * Specifically() there is a race between munmap() and * unmap_mapping_range(), where munmap() will unlink the VMA, such * that unmap_mapping_range() will no longer observe the VMA and * no-op, without observing the TLBI, returning prematurely. * * So if we're about to unlink such a VMA, and we have pending * TLBI for such a vma, flush things now. */ if (tlb->vma_pfn) tlb_flush_mmu_tlbonly(tlb); } /* * tlb_flush_{pte|pmd|pud|p4d}_range() adjust the tlb->start and tlb->end, * and set corresponding cleared_*. */ static inline void tlb_flush_pte_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_ptes = 1; } static inline void tlb_flush_pmd_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_pmds = 1; } static inline void tlb_flush_pud_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_puds = 1; } static inline void tlb_flush_p4d_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_p4ds = 1; } #ifndef __tlb_remove_tlb_entry static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long address) { } #endif /** * tlb_remove_tlb_entry - remember a pte unmapping for later tlb invalidation. * * Record the fact that pte's were really unmapped by updating the range, * so we can later optimise away the tlb invalidate. This helps when * userspace is unmapping already-unmapped pages, which happens quite a lot. */ #define tlb_remove_tlb_entry(tlb, ptep, address) \ do { \ tlb_flush_pte_range(tlb, address, PAGE_SIZE); \ __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) /** * tlb_remove_tlb_entries - remember unmapping of multiple consecutive ptes for * later tlb invalidation. * * Similar to tlb_remove_tlb_entry(), but remember unmapping of multiple * consecutive ptes instead of only a single one. */ static inline void tlb_remove_tlb_entries(struct mmu_gather *tlb, pte_t *ptep, unsigned int nr, unsigned long address) { tlb_flush_pte_range(tlb, address, PAGE_SIZE * nr); for (;;) { __tlb_remove_tlb_entry(tlb, ptep, address); if (--nr == 0) break; ptep++; address += PAGE_SIZE; } } #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ do { \ unsigned long _sz = huge_page_size(h); \ if (_sz >= P4D_SIZE) \ tlb_flush_p4d_range(tlb, address, _sz); \ else if (_sz >= PUD_SIZE) \ tlb_flush_pud_range(tlb, address, _sz); \ else if (_sz >= PMD_SIZE) \ tlb_flush_pmd_range(tlb, address, _sz); \ else \ tlb_flush_pte_range(tlb, address, _sz); \ __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) /** * tlb_remove_pmd_tlb_entry - remember a pmd mapping for later tlb invalidation * This is a nop so far, because only x86 needs it. */ #ifndef __tlb_remove_pmd_tlb_entry #define __tlb_remove_pmd_tlb_entry(tlb, pmdp, address) do {} while (0) #endif #define tlb_remove_pmd_tlb_entry(tlb, pmdp, address) \ do { \ tlb_flush_pmd_range(tlb, address, HPAGE_PMD_SIZE); \ __tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \ } while (0) /** * tlb_remove_pud_tlb_entry - remember a pud mapping for later tlb * invalidation. This is a nop so far, because only x86 needs it. */ #ifndef __tlb_remove_pud_tlb_entry #define __tlb_remove_pud_tlb_entry(tlb, pudp, address) do {} while (0) #endif #define tlb_remove_pud_tlb_entry(tlb, pudp, address) \ do { \ tlb_flush_pud_range(tlb, address, HPAGE_PUD_SIZE); \ __tlb_remove_pud_tlb_entry(tlb, pudp, address); \ } while (0) /* * For things like page tables caches (ie caching addresses "inside" the * page tables, like x86 does), for legacy reasons, flushing an * individual page had better flush the page table caches behind it. This * is definitely how x86 works, for example. And if you have an * architected non-legacy page table cache (which I'm not aware of * anybody actually doing), you're going to have some architecturally * explicit flushing for that, likely *separate* from a regular TLB entry * flush, and thus you'd need more than just some range expansion.. * * So if we ever find an architecture * that would want something that odd, I think it is up to that * architecture to do its own odd thing, not cause pain for others * http://lkml.kernel.org/r/CA+55aFzBggoXtNXQeng5d_mRoDnaMBE5Y+URs+PHR67nUpMtaw@mail.gmail.com * * For now w.r.t page table cache, mark the range_size as PAGE_SIZE */ #ifndef pte_free_tlb #define pte_free_tlb(tlb, ptep, address) \ do { \ tlb_flush_pmd_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __pte_free_tlb(tlb, ptep, address); \ } while (0) #endif #ifndef pmd_free_tlb #define pmd_free_tlb(tlb, pmdp, address) \ do { \ tlb_flush_pud_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __pmd_free_tlb(tlb, pmdp, address); \ } while (0) #endif #ifndef pud_free_tlb #define pud_free_tlb(tlb, pudp, address) \ do { \ tlb_flush_p4d_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __pud_free_tlb(tlb, pudp, address); \ } while (0) #endif #ifndef p4d_free_tlb #define p4d_free_tlb(tlb, pudp, address) \ do { \ __tlb_adjust_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __p4d_free_tlb(tlb, pudp, address); \ } while (0) #endif #ifndef pte_needs_flush static inline bool pte_needs_flush(pte_t oldpte, pte_t newpte) { return true; } #endif #ifndef huge_pmd_needs_flush static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd) { return true; } #endif #endif /* CONFIG_MMU */ #endif /* _ASM_GENERIC__TLB_H */ |
| 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 | // SPDX-License-Identifier: GPL-2.0 #include <linux/build_bug.h> #include <linux/errno.h> #include <linux/errname.h> #include <linux/kernel.h> #include <linux/math.h> /* * Ensure these tables do not accidentally become gigantic if some * huge errno makes it in. On most architectures, the first table will * only have about 140 entries, but mips and parisc have more sparsely * allocated errnos (with EHWPOISON = 257 on parisc, and EDQUOT = 1133 * on mips), so this wastes a bit of space on those - though we * special case the EDQUOT case. */ #define E(err) [err + BUILD_BUG_ON_ZERO(err <= 0 || err > 300)] = "-" #err static const char *names_0[] = { E(E2BIG), E(EACCES), E(EADDRINUSE), E(EADDRNOTAVAIL), E(EADV), E(EAFNOSUPPORT), E(EAGAIN), /* EWOULDBLOCK */ E(EALREADY), E(EBADE), E(EBADF), E(EBADFD), E(EBADMSG), E(EBADR), E(EBADRQC), E(EBADSLT), E(EBFONT), E(EBUSY), E(ECANCELED), /* ECANCELLED */ E(ECHILD), E(ECHRNG), E(ECOMM), E(ECONNABORTED), E(ECONNREFUSED), /* EREFUSED */ E(ECONNRESET), E(EDEADLK), /* EDEADLOCK */ #if EDEADLK != EDEADLOCK /* mips, sparc, powerpc */ E(EDEADLOCK), #endif E(EDESTADDRREQ), E(EDOM), E(EDOTDOT), #ifndef CONFIG_MIPS E(EDQUOT), #endif E(EEXIST), E(EFAULT), E(EFBIG), E(EHOSTDOWN), E(EHOSTUNREACH), E(EHWPOISON), E(EIDRM), E(EILSEQ), #ifdef EINIT E(EINIT), #endif E(EINPROGRESS), E(EINTR), E(EINVAL), E(EIO), E(EISCONN), E(EISDIR), E(EISNAM), E(EKEYEXPIRED), E(EKEYREJECTED), E(EKEYREVOKED), E(EL2HLT), E(EL2NSYNC), E(EL3HLT), E(EL3RST), E(ELIBACC), E(ELIBBAD), E(ELIBEXEC), E(ELIBMAX), E(ELIBSCN), E(ELNRNG), E(ELOOP), E(EMEDIUMTYPE), E(EMFILE), E(EMLINK), E(EMSGSIZE), E(EMULTIHOP), E(ENAMETOOLONG), E(ENAVAIL), E(ENETDOWN), E(ENETRESET), E(ENETUNREACH), E(ENFILE), E(ENOANO), E(ENOBUFS), E(ENOCSI), E(ENODATA), E(ENODEV), E(ENOENT), E(ENOEXEC), E(ENOKEY), E(ENOLCK), E(ENOLINK), E(ENOMEDIUM), E(ENOMEM), E(ENOMSG), E(ENONET), E(ENOPKG), E(ENOPROTOOPT), E(ENOSPC), E(ENOSR), E(ENOSTR), E(ENOSYS), E(ENOTBLK), E(ENOTCONN), E(ENOTDIR), E(ENOTEMPTY), E(ENOTNAM), E(ENOTRECOVERABLE), E(ENOTSOCK), E(ENOTTY), E(ENOTUNIQ), E(ENXIO), E(EOPNOTSUPP), E(EOVERFLOW), E(EOWNERDEAD), E(EPERM), E(EPFNOSUPPORT), E(EPIPE), #ifdef EPROCLIM E(EPROCLIM), #endif E(EPROTO), E(EPROTONOSUPPORT), E(EPROTOTYPE), E(ERANGE), E(EREMCHG), #ifdef EREMDEV E(EREMDEV), #endif E(EREMOTE), E(EREMOTEIO), E(ERESTART), E(ERFKILL), E(EROFS), #ifdef ERREMOTE E(ERREMOTE), #endif E(ESHUTDOWN), E(ESOCKTNOSUPPORT), E(ESPIPE), E(ESRCH), E(ESRMNT), E(ESTALE), E(ESTRPIPE), E(ETIME), E(ETIMEDOUT), E(ETOOMANYREFS), E(ETXTBSY), E(EUCLEAN), E(EUNATCH), E(EUSERS), E(EXDEV), E(EXFULL), }; #undef E #ifdef EREFUSED /* parisc */ static_assert(EREFUSED == ECONNREFUSED); #endif #ifdef ECANCELLED /* parisc */ static_assert(ECANCELLED == ECANCELED); #endif static_assert(EAGAIN == EWOULDBLOCK); /* everywhere */ #define E(err) [err - 512 + BUILD_BUG_ON_ZERO(err < 512 || err > 550)] = "-" #err static const char *names_512[] = { E(ERESTARTSYS), E(ERESTARTNOINTR), E(ERESTARTNOHAND), E(ENOIOCTLCMD), E(ERESTART_RESTARTBLOCK), E(EPROBE_DEFER), E(EOPENSTALE), E(ENOPARAM), E(EBADHANDLE), E(ENOTSYNC), E(EBADCOOKIE), E(ENOTSUPP), E(ETOOSMALL), E(ESERVERFAULT), E(EBADTYPE), E(EJUKEBOX), E(EIOCBQUEUED), E(ERECALLCONFLICT), }; #undef E static const char *__errname(unsigned err) { if (err < ARRAY_SIZE(names_0)) return names_0[err]; if (err >= 512 && err - 512 < ARRAY_SIZE(names_512)) return names_512[err - 512]; /* But why? */ if (IS_ENABLED(CONFIG_MIPS) && err == EDQUOT) /* 1133 */ return "-EDQUOT"; return NULL; } /* * errname(EIO) -> "EIO" * errname(-EIO) -> "-EIO" */ const char *errname(int err) { const char *name = __errname(abs(err)); if (!name) return NULL; return err > 0 ? name + 1 : name; } EXPORT_SYMBOL(errname); |
| 41 42 42 2 42 42 42 42 42 42 42 2 42 2 42 42 42 42 42 2 2 1 1 2 1 2 2 2 2 2 1 2 1 1 2 42 6 5 1 1 6 6 7 1 1 5 1 1 1 1 2 2 11 10 11 11 10 10 9 10 10 10 10 10 4 10 4 9 5 6 5 5 6 6 6 6 5 5 1 5 4 4 3 3 4 4 4 4 4 4 7 7 7 7 7 1 7 1 11 3 3 3 3 3 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 3 2 11 11 4 4 2 1 1 3 3 3 3 1 3 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 | // SPDX-License-Identifier: GPL-2.0-only /* * binfmt_misc.c * * Copyright (C) 1997 Richard Günther * * binfmt_misc detects binaries via a magic or filename extension and invokes * a specified wrapper. See Documentation/admin-guide/binfmt-misc.rst for more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <linux/sched/mm.h> #include <linux/magic.h> #include <linux/binfmts.h> #include <linux/slab.h> #include <linux/ctype.h> #include <linux/string_helpers.h> #include <linux/file.h> #include <linux/pagemap.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/fs_context.h> #include <linux/syscalls.h> #include <linux/fs.h> #include <linux/uaccess.h> #include "internal.h" #ifdef DEBUG # define USE_DEBUG 1 #else # define USE_DEBUG 0 #endif enum { VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */ }; enum {Enabled, Magic}; #define MISC_FMT_PRESERVE_ARGV0 (1UL << 31) #define MISC_FMT_OPEN_BINARY (1UL << 30) #define MISC_FMT_CREDENTIALS (1UL << 29) #define MISC_FMT_OPEN_FILE (1UL << 28) typedef struct { struct list_head list; unsigned long flags; /* type, status, etc. */ int offset; /* offset of magic */ int size; /* size of magic/mask */ char *magic; /* magic or filename extension */ char *mask; /* mask, NULL for exact match */ const char *interpreter; /* filename of interpreter */ char *name; struct dentry *dentry; struct file *interp_file; refcount_t users; /* sync removal with load_misc_binary() */ } Node; static struct file_system_type bm_fs_type; /* * Max length of the register string. Determined by: * - 7 delimiters * - name: ~50 bytes * - type: 1 byte * - offset: 3 bytes (has to be smaller than BINPRM_BUF_SIZE) * - magic: 128 bytes (512 in escaped form) * - mask: 128 bytes (512 in escaped form) * - interp: ~50 bytes * - flags: 5 bytes * Round that up a bit, and then back off to hold the internal data * (like struct Node). */ #define MAX_REGISTER_LENGTH 1920 /** * search_binfmt_handler - search for a binary handler for @bprm * @misc: handle to binfmt_misc instance * @bprm: binary for which we are looking for a handler * * Search for a binary type handler for @bprm in the list of registered binary * type handlers. * * Return: binary type list entry on success, NULL on failure */ static Node *search_binfmt_handler(struct binfmt_misc *misc, struct linux_binprm *bprm) { char *p = strrchr(bprm->interp, '.'); Node *e; /* Walk all the registered handlers. */ list_for_each_entry(e, &misc->entries, list) { char *s; int j; /* Make sure this one is currently enabled. */ if (!test_bit(Enabled, &e->flags)) continue; /* Do matching based on extension if applicable. */ if (!test_bit(Magic, &e->flags)) { if (p && !strcmp(e->magic, p + 1)) return e; continue; } /* Do matching based on magic & mask. */ s = bprm->buf + e->offset; if (e->mask) { for (j = 0; j < e->size; j++) if ((*s++ ^ e->magic[j]) & e->mask[j]) break; } else { for (j = 0; j < e->size; j++) if ((*s++ ^ e->magic[j])) break; } if (j == e->size) return e; } return NULL; } /** * get_binfmt_handler - try to find a binary type handler * @misc: handle to binfmt_misc instance * @bprm: binary for which we are looking for a handler * * Try to find a binfmt handler for the binary type. If one is found take a * reference to protect against removal via bm_{entry,status}_write(). * * Return: binary type list entry on success, NULL on failure */ static Node *get_binfmt_handler(struct binfmt_misc *misc, struct linux_binprm *bprm) { Node *e; read_lock(&misc->entries_lock); e = search_binfmt_handler(misc, bprm); if (e) refcount_inc(&e->users); read_unlock(&misc->entries_lock); return e; } /** * put_binfmt_handler - put binary handler node * @e: node to put * * Free node syncing with load_misc_binary() and defer final free to * load_misc_binary() in case it is using the binary type handler we were * requested to remove. */ static void put_binfmt_handler(Node *e) { if (refcount_dec_and_test(&e->users)) { if (e->flags & MISC_FMT_OPEN_FILE) filp_close(e->interp_file, NULL); kfree(e); } } /** * load_binfmt_misc - load the binfmt_misc of the caller's user namespace * * To be called in load_misc_binary() to load the relevant struct binfmt_misc. * If a user namespace doesn't have its own binfmt_misc mount it can make use * of its ancestor's binfmt_misc handlers. This mimicks the behavior of * pre-namespaced binfmt_misc where all registered binfmt_misc handlers where * available to all user and user namespaces on the system. * * Return: the binfmt_misc instance of the caller's user namespace */ static struct binfmt_misc *load_binfmt_misc(void) { const struct user_namespace *user_ns; struct binfmt_misc *misc; user_ns = current_user_ns(); while (user_ns) { /* Pairs with smp_store_release() in bm_fill_super(). */ misc = smp_load_acquire(&user_ns->binfmt_misc); if (misc) return misc; user_ns = user_ns->parent; } return &init_binfmt_misc; } /* * the loader itself */ static int load_misc_binary(struct linux_binprm *bprm) { Node *fmt; struct file *interp_file = NULL; int retval = -ENOEXEC; struct binfmt_misc *misc; misc = load_binfmt_misc(); if (!misc->enabled) return retval; fmt = get_binfmt_handler(misc, bprm); if (!fmt) return retval; /* Need to be able to load the file after exec */ retval = -ENOENT; if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) goto ret; if (fmt->flags & MISC_FMT_PRESERVE_ARGV0) { bprm->interp_flags |= BINPRM_FLAGS_PRESERVE_ARGV0; } else { retval = remove_arg_zero(bprm); if (retval) goto ret; } if (fmt->flags & MISC_FMT_OPEN_BINARY) bprm->have_execfd = 1; /* make argv[1] be the path to the binary */ retval = copy_string_kernel(bprm->interp, bprm); if (retval < 0) goto ret; bprm->argc++; /* add the interp as argv[0] */ retval = copy_string_kernel(fmt->interpreter, bprm); if (retval < 0) goto ret; bprm->argc++; /* Update interp in case binfmt_script needs it. */ retval = bprm_change_interp(fmt->interpreter, bprm); if (retval < 0) goto ret; if (fmt->flags & MISC_FMT_OPEN_FILE) { interp_file = file_clone_open(fmt->interp_file); if (!IS_ERR(interp_file)) deny_write_access(interp_file); } else { interp_file = open_exec(fmt->interpreter); } retval = PTR_ERR(interp_file); if (IS_ERR(interp_file)) goto ret; bprm->interpreter = interp_file; if (fmt->flags & MISC_FMT_CREDENTIALS) bprm->execfd_creds = 1; retval = 0; ret: /* * If we actually put the node here all concurrent calls to * load_misc_binary() will have finished. We also know * that for the refcount to be zero someone must have concurently * removed the binary type handler from the list and it's our job to * free it. */ put_binfmt_handler(fmt); return retval; } /* Command parsers */ /* * parses and copies one argument enclosed in del from *sp to *dp, * recognising the \x special. * returns pointer to the copied argument or NULL in case of an * error (and sets err) or null argument length. */ static char *scanarg(char *s, char del) { char c; while ((c = *s++) != del) { if (c == '\\' && *s == 'x') { s++; if (!isxdigit(*s++)) return NULL; if (!isxdigit(*s++)) return NULL; } } s[-1] ='\0'; return s; } static char *check_special_flags(char *sfs, Node *e) { char *p = sfs; int cont = 1; /* special flags */ while (cont) { switch (*p) { case 'P': pr_debug("register: flag: P (preserve argv0)\n"); p++; e->flags |= MISC_FMT_PRESERVE_ARGV0; break; case 'O': pr_debug("register: flag: O (open binary)\n"); p++; e->flags |= MISC_FMT_OPEN_BINARY; break; case 'C': pr_debug("register: flag: C (preserve creds)\n"); p++; /* this flags also implies the open-binary flag */ e->flags |= (MISC_FMT_CREDENTIALS | MISC_FMT_OPEN_BINARY); break; case 'F': pr_debug("register: flag: F: open interpreter file now\n"); p++; e->flags |= MISC_FMT_OPEN_FILE; break; default: cont = 0; } } return p; } /* * This registers a new binary format, it recognises the syntax * ':name:type:offset:magic:mask:interpreter:flags' * where the ':' is the IFS, that can be chosen with the first char */ static Node *create_entry(const char __user *buffer, size_t count) { Node *e; int memsize, err; char *buf, *p; char del; pr_debug("register: received %zu bytes\n", count); /* some sanity checks */ err = -EINVAL; if ((count < 11) || (count > MAX_REGISTER_LENGTH)) goto out; err = -ENOMEM; memsize = sizeof(Node) + count + 8; e = kmalloc(memsize, GFP_KERNEL_ACCOUNT); if (!e) goto out; p = buf = (char *)e + sizeof(Node); memset(e, 0, sizeof(Node)); if (copy_from_user(buf, buffer, count)) goto efault; del = *p++; /* delimeter */ pr_debug("register: delim: %#x {%c}\n", del, del); /* Pad the buffer with the delim to simplify parsing below. */ memset(buf + count, del, 8); /* Parse the 'name' field. */ e->name = p; p = strchr(p, del); if (!p) goto einval; *p++ = '\0'; if (!e->name[0] || !strcmp(e->name, ".") || !strcmp(e->name, "..") || strchr(e->name, '/')) goto einval; pr_debug("register: name: {%s}\n", e->name); /* Parse the 'type' field. */ switch (*p++) { case 'E': pr_debug("register: type: E (extension)\n"); e->flags = 1 << Enabled; break; case 'M': pr_debug("register: type: M (magic)\n"); e->flags = (1 << Enabled) | (1 << Magic); break; default: goto einval; } if (*p++ != del) goto einval; if (test_bit(Magic, &e->flags)) { /* Handle the 'M' (magic) format. */ char *s; /* Parse the 'offset' field. */ s = strchr(p, del); if (!s) goto einval; *s = '\0'; if (p != s) { int r = kstrtoint(p, 10, &e->offset); if (r != 0 || e->offset < 0) goto einval; } p = s; if (*p++) goto einval; pr_debug("register: offset: %#x\n", e->offset); /* Parse the 'magic' field. */ e->magic = p; p = scanarg(p, del); if (!p) goto einval; if (!e->magic[0]) goto einval; if (USE_DEBUG) print_hex_dump_bytes( KBUILD_MODNAME ": register: magic[raw]: ", DUMP_PREFIX_NONE, e->magic, p - e->magic); /* Parse the 'mask' field. */ e->mask = p; p = scanarg(p, del); if (!p) goto einval; if (!e->mask[0]) { e->mask = NULL; pr_debug("register: mask[raw]: none\n"); } else if (USE_DEBUG) print_hex_dump_bytes( KBUILD_MODNAME ": register: mask[raw]: ", DUMP_PREFIX_NONE, e->mask, p - e->mask); /* * Decode the magic & mask fields. * Note: while we might have accepted embedded NUL bytes from * above, the unescape helpers here will stop at the first one * it encounters. */ e->size = string_unescape_inplace(e->magic, UNESCAPE_HEX); if (e->mask && string_unescape_inplace(e->mask, UNESCAPE_HEX) != e->size) goto einval; if (e->size > BINPRM_BUF_SIZE || BINPRM_BUF_SIZE - e->size < e->offset) goto einval; pr_debug("register: magic/mask length: %i\n", e->size); if (USE_DEBUG) { print_hex_dump_bytes( KBUILD_MODNAME ": register: magic[decoded]: ", DUMP_PREFIX_NONE, e->magic, e->size); if (e->mask) { int i; char *masked = kmalloc(e->size, GFP_KERNEL_ACCOUNT); print_hex_dump_bytes( KBUILD_MODNAME ": register: mask[decoded]: ", DUMP_PREFIX_NONE, e->mask, e->size); if (masked) { for (i = 0; i < e->size; ++i) masked[i] = e->magic[i] & e->mask[i]; print_hex_dump_bytes( KBUILD_MODNAME ": register: magic[masked]: ", DUMP_PREFIX_NONE, masked, e->size); kfree(masked); } } } } else { /* Handle the 'E' (extension) format. */ /* Skip the 'offset' field. */ p = strchr(p, del); if (!p) goto einval; *p++ = '\0'; /* Parse the 'magic' field. */ e->magic = p; p = strchr(p, del); if (!p) goto einval; *p++ = '\0'; if (!e->magic[0] || strchr(e->magic, '/')) goto einval; pr_debug("register: extension: {%s}\n", e->magic); /* Skip the 'mask' field. */ p = strchr(p, del); if (!p) goto einval; *p++ = '\0'; } /* Parse the 'interpreter' field. */ e->interpreter = p; p = strchr(p, del); if (!p) goto einval; *p++ = '\0'; if (!e->interpreter[0]) goto einval; pr_debug("register: interpreter: {%s}\n", e->interpreter); /* Parse the 'flags' field. */ p = check_special_flags(p, e); if (*p == '\n') p++; if (p != buf + count) goto einval; return e; out: return ERR_PTR(err); efault: kfree(e); return ERR_PTR(-EFAULT); einval: kfree(e); return ERR_PTR(-EINVAL); } /* * Set status of entry/binfmt_misc: * '1' enables, '0' disables and '-1' clears entry/binfmt_misc */ static int parse_command(const char __user *buffer, size_t count) { char s[4]; if (count > 3) return -EINVAL; if (copy_from_user(s, buffer, count)) return -EFAULT; if (!count) return 0; if (s[count - 1] == '\n') count--; if (count == 1 && s[0] == '0') return 1; if (count == 1 && s[0] == '1') return 2; if (count == 2 && s[0] == '-' && s[1] == '1') return 3; return -EINVAL; } /* generic stuff */ static void entry_status(Node *e, char *page) { char *dp = page; const char *status = "disabled"; if (test_bit(Enabled, &e->flags)) status = "enabled"; if (!VERBOSE_STATUS) { sprintf(page, "%s\n", status); return; } dp += sprintf(dp, "%s\ninterpreter %s\n", status, e->interpreter); /* print the special flags */ dp += sprintf(dp, "flags: "); if (e->flags & MISC_FMT_PRESERVE_ARGV0) *dp++ = 'P'; if (e->flags & MISC_FMT_OPEN_BINARY) *dp++ = 'O'; if (e->flags & MISC_FMT_CREDENTIALS) *dp++ = 'C'; if (e->flags & MISC_FMT_OPEN_FILE) *dp++ = 'F'; *dp++ = '\n'; if (!test_bit(Magic, &e->flags)) { sprintf(dp, "extension .%s\n", e->magic); } else { dp += sprintf(dp, "offset %i\nmagic ", e->offset); dp = bin2hex(dp, e->magic, e->size); if (e->mask) { dp += sprintf(dp, "\nmask "); dp = bin2hex(dp, e->mask, e->size); } *dp++ = '\n'; *dp = '\0'; } } static struct inode *bm_get_inode(struct super_block *sb, int mode) { struct inode *inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); inode->i_mode = mode; simple_inode_init_ts(inode); } return inode; } /** * i_binfmt_misc - retrieve struct binfmt_misc from a binfmt_misc inode * @inode: inode of the relevant binfmt_misc instance * * This helper retrieves struct binfmt_misc from a binfmt_misc inode. This can * be done without any memory barriers because we are guaranteed that * user_ns->binfmt_misc is fully initialized. It was fully initialized when the * binfmt_misc mount was first created. * * Return: struct binfmt_misc of the relevant binfmt_misc instance */ static struct binfmt_misc *i_binfmt_misc(struct inode *inode) { return inode->i_sb->s_user_ns->binfmt_misc; } /** * bm_evict_inode - cleanup data associated with @inode * @inode: inode to which the data is attached * * Cleanup the binary type handler data associated with @inode if a binary type * entry is removed or the filesystem is unmounted and the super block is * shutdown. * * If the ->evict call was not caused by a super block shutdown but by a write * to remove the entry or all entries via bm_{entry,status}_write() the entry * will have already been removed from the list. We keep the list_empty() check * to make that explicit. */ static void bm_evict_inode(struct inode *inode) { Node *e = inode->i_private; clear_inode(inode); if (e) { struct binfmt_misc *misc; misc = i_binfmt_misc(inode); write_lock(&misc->entries_lock); if (!list_empty(&e->list)) list_del_init(&e->list); write_unlock(&misc->entries_lock); put_binfmt_handler(e); } } /** * remove_binfmt_handler - remove a binary type handler * @misc: handle to binfmt_misc instance * @e: binary type handler to remove * * Remove a binary type handler from the list of binary type handlers and * remove its associated dentry. This is called from * binfmt_{entry,status}_write(). In the future, we might want to think about * adding a proper ->unlink() method to binfmt_misc instead of forcing caller's * to use writes to files in order to delete binary type handlers. But it has * worked for so long that it's not a pressing issue. */ static void remove_binfmt_handler(struct binfmt_misc *misc, Node *e) { write_lock(&misc->entries_lock); list_del_init(&e->list); write_unlock(&misc->entries_lock); locked_recursive_removal(e->dentry, NULL); } /* /<entry> */ static ssize_t bm_entry_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { Node *e = file_inode(file)->i_private; ssize_t res; char *page; page = (char *) __get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; entry_status(e, page); res = simple_read_from_buffer(buf, nbytes, ppos, page, strlen(page)); free_page((unsigned long) page); return res; } static ssize_t bm_entry_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { struct inode *inode = file_inode(file); Node *e = inode->i_private; int res = parse_command(buffer, count); switch (res) { case 1: /* Disable this handler. */ clear_bit(Enabled, &e->flags); break; case 2: /* Enable this handler. */ set_bit(Enabled, &e->flags); break; case 3: /* Delete this handler. */ inode = d_inode(inode->i_sb->s_root); inode_lock_nested(inode, I_MUTEX_PARENT); /* * In order to add new element or remove elements from the list * via bm_{entry,register,status}_write() inode_lock() on the * root inode must be held. * The lock is exclusive ensuring that the list can't be * modified. Only load_misc_binary() can access but does so * read-only. So we only need to take the write lock when we * actually remove the entry from the list. */ if (!list_empty(&e->list)) remove_binfmt_handler(i_binfmt_misc(inode), e); inode_unlock(inode); break; default: return res; } return count; } static const struct file_operations bm_entry_operations = { .read = bm_entry_read, .write = bm_entry_write, .llseek = default_llseek, }; /* /register */ static ssize_t bm_register_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { Node *e; struct inode *inode; struct super_block *sb = file_inode(file)->i_sb; struct dentry *root = sb->s_root, *dentry; struct binfmt_misc *misc; int err = 0; struct file *f = NULL; e = create_entry(buffer, count); if (IS_ERR(e)) return PTR_ERR(e); if (e->flags & MISC_FMT_OPEN_FILE) { const struct cred *old_cred; /* * Now that we support unprivileged binfmt_misc mounts make * sure we use the credentials that the register @file was * opened with to also open the interpreter. Before that this * didn't matter much as only a privileged process could open * the register file. */ old_cred = override_creds(file->f_cred); f = open_exec(e->interpreter); revert_creds(old_cred); if (IS_ERR(f)) { pr_notice("register: failed to install interpreter file %s\n", e->interpreter); kfree(e); return PTR_ERR(f); } e->interp_file = f; } inode_lock(d_inode(root)); dentry = lookup_noperm(&QSTR(e->name), root); err = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out; err = -EEXIST; if (d_really_is_positive(dentry)) goto out2; inode = bm_get_inode(sb, S_IFREG | 0644); err = -ENOMEM; if (!inode) goto out2; refcount_set(&e->users, 1); e->dentry = dget(dentry); inode->i_private = e; inode->i_fop = &bm_entry_operations; d_instantiate(dentry, inode); misc = i_binfmt_misc(inode); write_lock(&misc->entries_lock); list_add(&e->list, &misc->entries); write_unlock(&misc->entries_lock); err = 0; out2: dput(dentry); out: inode_unlock(d_inode(root)); if (err) { if (f) filp_close(f, NULL); kfree(e); return err; } return count; } static const struct file_operations bm_register_operations = { .write = bm_register_write, .llseek = noop_llseek, }; /* /status */ static ssize_t bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { struct binfmt_misc *misc; char *s; misc = i_binfmt_misc(file_inode(file)); s = misc->enabled ? "enabled\n" : "disabled\n"; return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s)); } static ssize_t bm_status_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { struct binfmt_misc *misc; int res = parse_command(buffer, count); Node *e, *next; struct inode *inode; misc = i_binfmt_misc(file_inode(file)); switch (res) { case 1: /* Disable all handlers. */ misc->enabled = false; break; case 2: /* Enable all handlers. */ misc->enabled = true; break; case 3: /* Delete all handlers. */ inode = d_inode(file_inode(file)->i_sb->s_root); inode_lock_nested(inode, I_MUTEX_PARENT); /* * In order to add new element or remove elements from the list * via bm_{entry,register,status}_write() inode_lock() on the * root inode must be held. * The lock is exclusive ensuring that the list can't be * modified. Only load_misc_binary() can access but does so * read-only. So we only need to take the write lock when we * actually remove the entry from the list. */ list_for_each_entry_safe(e, next, &misc->entries, list) remove_binfmt_handler(misc, e); inode_unlock(inode); break; default: return res; } return count; } static const struct file_operations bm_status_operations = { .read = bm_status_read, .write = bm_status_write, .llseek = default_llseek, }; /* Superblock handling */ static void bm_put_super(struct super_block *sb) { struct user_namespace *user_ns = sb->s_fs_info; sb->s_fs_info = NULL; put_user_ns(user_ns); } static const struct super_operations s_ops = { .statfs = simple_statfs, .evict_inode = bm_evict_inode, .put_super = bm_put_super, }; static int bm_fill_super(struct super_block *sb, struct fs_context *fc) { int err; struct user_namespace *user_ns = sb->s_user_ns; struct binfmt_misc *misc; static const struct tree_descr bm_files[] = { [2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO}, [3] = {"register", &bm_register_operations, S_IWUSR}, /* last one */ {""} }; if (WARN_ON(user_ns != current_user_ns())) return -EINVAL; /* * Lazily allocate a new binfmt_misc instance for this namespace, i.e. * do it here during the first mount of binfmt_misc. We don't need to * waste memory for every user namespace allocation. It's likely much * more common to not mount a separate binfmt_misc instance than it is * to mount one. * * While multiple superblocks can exist they are keyed by userns in * s_fs_info for binfmt_misc. Hence, the vfs guarantees that * bm_fill_super() is called exactly once whenever a binfmt_misc * superblock for a userns is created. This in turn lets us conclude * that when a binfmt_misc superblock is created for the first time for * a userns there's no one racing us. Therefore we don't need any * barriers when we dereference binfmt_misc. */ misc = user_ns->binfmt_misc; if (!misc) { /* * If it turns out that most user namespaces actually want to * register their own binary type handler and therefore all * create their own separate binfmt_misc mounts we should * consider turning this into a kmem cache. */ misc = kzalloc(sizeof(struct binfmt_misc), GFP_KERNEL); if (!misc) return -ENOMEM; INIT_LIST_HEAD(&misc->entries); rwlock_init(&misc->entries_lock); /* Pairs with smp_load_acquire() in load_binfmt_misc(). */ smp_store_release(&user_ns->binfmt_misc, misc); } /* * When the binfmt_misc superblock for this userns is shutdown * ->enabled might have been set to false and we don't reinitialize * ->enabled again in put_super() as someone might already be mounting * binfmt_misc again. It also would be pointless since by the time * ->put_super() is called we know that the binary type list for this * bintfmt_misc mount is empty making load_misc_binary() return * -ENOEXEC independent of whether ->enabled is true. Instead, if * someone mounts binfmt_misc for the first time or again we simply * reset ->enabled to true. */ misc->enabled = true; err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files); if (!err) sb->s_op = &s_ops; return err; } static void bm_free(struct fs_context *fc) { if (fc->s_fs_info) put_user_ns(fc->s_fs_info); } static int bm_get_tree(struct fs_context *fc) { return get_tree_keyed(fc, bm_fill_super, get_user_ns(fc->user_ns)); } static const struct fs_context_operations bm_context_ops = { .free = bm_free, .get_tree = bm_get_tree, }; static int bm_init_fs_context(struct fs_context *fc) { fc->ops = &bm_context_ops; return 0; } static struct linux_binfmt misc_format = { .module = THIS_MODULE, .load_binary = load_misc_binary, }; static struct file_system_type bm_fs_type = { .owner = THIS_MODULE, .name = "binfmt_misc", .init_fs_context = bm_init_fs_context, .fs_flags = FS_USERNS_MOUNT, .kill_sb = kill_litter_super, }; MODULE_ALIAS_FS("binfmt_misc"); static int __init init_misc_binfmt(void) { int err = register_filesystem(&bm_fs_type); if (!err) insert_binfmt(&misc_format); return err; } static void __exit exit_misc_binfmt(void) { unregister_binfmt(&misc_format); unregister_filesystem(&bm_fs_type); } core_initcall(init_misc_binfmt); module_exit(exit_misc_binfmt); MODULE_DESCRIPTION("Kernel support for miscellaneous binaries"); MODULE_LICENSE("GPL"); |
| 12 13 12 18 18 18 18 19 30 18 19 19 30 7 7 7 7 214 217 211 214 215 24 14 1 15 15 14 14 217 213 217 8 8 6 2 2 6 2 5 5 6 3 2 6 4 4 4 2 6 42 41 42 42 42 12 8 42 14 15 15 1 14 15 15 1 10 9 8 8 8 8 1 8 2 8 7 6 5 4 5 5 5 4 1 6 6 1 6 2 2 2 7 2 7 13 7 7 4 7 10 13 39 38 39 39 36 36 2 2 36 4 4 39 12 2 10 36 32 32 31 2 32 9 7 2 1 35 34 10 6 10 5 25 4 22 34 34 24 35 24 16 7 33 10 33 32 29 11 19 5 19 16 1 18 15 1 19 29 7 35 28 11 3 3 4 4 5 5 5 1 5 5 5 3 5 5 5 5 1 4 3 2 3 3 3 3 3 1 3 3 5 5 2 1 2 1 1 1 1 1 1 1 1 1 50 50 1 1 1 1 15 15 1 7 1 2 1 1 5 1 3 3 3 3 3 3 1 1 1 1 1 1 1 3 3 3 3 3 1 3 3 2 2 1 2 7 7 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 | // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * RAW - implementation of IP "raw" sockets. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * * Fixes: * Alan Cox : verify_area() fixed up * Alan Cox : ICMP error handling * Alan Cox : EMSGSIZE if you send too big a packet * Alan Cox : Now uses generic datagrams and shared * skbuff library. No more peek crashes, * no more backlogs * Alan Cox : Checks sk->broadcast. * Alan Cox : Uses skb_free_datagram/skb_copy_datagram * Alan Cox : Raw passes ip options too * Alan Cox : Setsocketopt added * Alan Cox : Fixed error return for broadcasts * Alan Cox : Removed wake_up calls * Alan Cox : Use ttl/tos * Alan Cox : Cleaned up old debugging * Alan Cox : Use new kernel side addresses * Arnt Gulbrandsen : Fixed MSG_DONTROUTE in raw sockets. * Alan Cox : BSD style RAW socket demultiplexing. * Alan Cox : Beginnings of mrouted support. * Alan Cox : Added IP_HDRINCL option. * Alan Cox : Skip broadcast check if BSDism set. * David S. Miller : New socket lookup architecture. */ #include <linux/types.h> #include <linux/atomic.h> #include <asm/byteorder.h> #include <asm/current.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include <linux/stddef.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/spinlock.h> #include <linux/sockios.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/mroute.h> #include <linux/netdevice.h> #include <linux/in_route.h> #include <linux/route.h> #include <linux/skbuff.h> #include <linux/igmp.h> #include <net/net_namespace.h> #include <net/dst.h> #include <net/sock.h> #include <linux/ip.h> #include <linux/net.h> #include <net/ip.h> #include <net/icmp.h> #include <net/udp.h> #include <net/raw.h> #include <net/snmp.h> #include <net/tcp_states.h> #include <net/inet_common.h> #include <net/checksum.h> #include <net/xfrm.h> #include <linux/rtnetlink.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/compat.h> #include <linux/uio.h> struct raw_frag_vec { struct msghdr *msg; union { struct icmphdr icmph; char c[1]; } hdr; int hlen; }; struct raw_hashinfo raw_v4_hashinfo; EXPORT_SYMBOL_GPL(raw_v4_hashinfo); int raw_hash_sk(struct sock *sk) { struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; struct hlist_head *hlist; hlist = &h->ht[raw_hashfunc(sock_net(sk), inet_sk(sk)->inet_num)]; spin_lock(&h->lock); sk_add_node_rcu(sk, hlist); sock_set_flag(sk, SOCK_RCU_FREE); spin_unlock(&h->lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); return 0; } EXPORT_SYMBOL_GPL(raw_hash_sk); void raw_unhash_sk(struct sock *sk) { struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; spin_lock(&h->lock); if (sk_del_node_init_rcu(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock(&h->lock); } EXPORT_SYMBOL_GPL(raw_unhash_sk); bool raw_v4_match(struct net *net, const struct sock *sk, unsigned short num, __be32 raddr, __be32 laddr, int dif, int sdif) { const struct inet_sock *inet = inet_sk(sk); if (net_eq(sock_net(sk), net) && inet->inet_num == num && !(inet->inet_daddr && inet->inet_daddr != raddr) && !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return true; return false; } EXPORT_SYMBOL_GPL(raw_v4_match); /* * 0 - deliver * 1 - block */ static int icmp_filter(const struct sock *sk, const struct sk_buff *skb) { struct icmphdr _hdr; const struct icmphdr *hdr; hdr = skb_header_pointer(skb, skb_transport_offset(skb), sizeof(_hdr), &_hdr); if (!hdr) return 1; if (hdr->type < 32) { __u32 data = raw_sk(sk)->filter.data; return ((1U << hdr->type) & data) != 0; } /* Do not block unknown ICMP types */ return 0; } /* IP input processing comes here for RAW socket delivery. * Caller owns SKB, so we must make clones. * * RFC 1122: SHOULD pass TOS value up to the transport layer. * -> It does. And not only TOS, but all IP header. */ static int raw_v4_input(struct net *net, struct sk_buff *skb, const struct iphdr *iph, int hash) { int sdif = inet_sdif(skb); struct hlist_head *hlist; int dif = inet_iif(skb); int delivered = 0; struct sock *sk; hlist = &raw_v4_hashinfo.ht[hash]; rcu_read_lock(); sk_for_each_rcu(sk, hlist) { if (!raw_v4_match(net, sk, iph->protocol, iph->saddr, iph->daddr, dif, sdif)) continue; if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { atomic_inc(&sk->sk_drops); continue; } delivered = 1; if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) && ip_mc_sf_allow(sk, iph->daddr, iph->saddr, skb->dev->ifindex, sdif)) { struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); /* Not releasing hash table! */ if (clone) raw_rcv(sk, clone); } } rcu_read_unlock(); return delivered; } int raw_local_deliver(struct sk_buff *skb, int protocol) { struct net *net = dev_net(skb->dev); return raw_v4_input(net, skb, ip_hdr(skb), raw_hashfunc(net, protocol)); } static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) { struct inet_sock *inet = inet_sk(sk); const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; int harderr = 0; bool recverr; int err = 0; if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) ipv4_sk_update_pmtu(skb, sk, info); else if (type == ICMP_REDIRECT) { ipv4_sk_redirect(skb, sk); return; } /* Report error on raw socket, if: 1. User requested ip_recverr. 2. Socket is connected (otherwise the error indication is useless without ip_recverr and error is hard. */ recverr = inet_test_bit(RECVERR, sk); if (!recverr && sk->sk_state != TCP_ESTABLISHED) return; switch (type) { default: case ICMP_TIME_EXCEEDED: err = EHOSTUNREACH; break; case ICMP_SOURCE_QUENCH: return; case ICMP_PARAMETERPROB: err = EPROTO; harderr = 1; break; case ICMP_DEST_UNREACH: err = EHOSTUNREACH; if (code > NR_ICMP_UNREACH) break; if (code == ICMP_FRAG_NEEDED) { harderr = READ_ONCE(inet->pmtudisc) != IP_PMTUDISC_DONT; err = EMSGSIZE; } else { err = icmp_err_convert[code].errno; harderr = icmp_err_convert[code].fatal; } } if (recverr) { const struct iphdr *iph = (const struct iphdr *)skb->data; u8 *payload = skb->data + (iph->ihl << 2); if (inet_test_bit(HDRINCL, sk)) payload = skb->data; ip_icmp_error(sk, skb, err, 0, info, payload); } if (recverr || harderr) { sk->sk_err = err; sk_error_report(sk); } } void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) { struct net *net = dev_net(skb->dev); int dif = skb->dev->ifindex; int sdif = inet_sdif(skb); struct hlist_head *hlist; const struct iphdr *iph; struct sock *sk; int hash; hash = raw_hashfunc(net, protocol); hlist = &raw_v4_hashinfo.ht[hash]; rcu_read_lock(); sk_for_each_rcu(sk, hlist) { iph = (const struct iphdr *)skb->data; if (!raw_v4_match(net, sk, iph->protocol, iph->daddr, iph->saddr, dif, sdif)) continue; raw_err(sk, skb, info); } rcu_read_unlock(); } static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason reason; /* Charge it to the socket. */ ipv4_pktinfo_prepare(sk, skb, true); if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) { sk_skb_reason_drop(sk, skb, reason); return NET_RX_DROP; } return NET_RX_SUCCESS; } int raw_rcv(struct sock *sk, struct sk_buff *skb) { if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { atomic_inc(&sk->sk_drops); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } nf_reset_ct(skb); skb_push(skb, -skb_network_offset(skb)); raw_rcv_skb(sk, skb); return 0; } static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, struct msghdr *msg, size_t length, struct rtable **rtp, unsigned int flags, const struct sockcm_cookie *sockc) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); struct iphdr *iph; struct sk_buff *skb; unsigned int iphlen; int err; struct rtable *rt = *rtp; int hlen, tlen; if (length > rt->dst.dev->mtu) { ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, rt->dst.dev->mtu); return -EMSGSIZE; } if (length < sizeof(struct iphdr)) return -EINVAL; if (flags&MSG_PROBE) goto out; hlen = LL_RESERVED_SPACE(rt->dst.dev); tlen = rt->dst.dev->needed_tailroom; skb = sock_alloc_send_skb(sk, length + hlen + tlen + 15, flags & MSG_DONTWAIT, &err); if (!skb) goto error; skb_reserve(skb, hlen); skb->protocol = htons(ETH_P_IP); skb->priority = sockc->priority; skb->mark = sockc->mark; skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid); skb_dst_set(skb, &rt->dst); *rtp = NULL; skb_reset_network_header(skb); iph = ip_hdr(skb); skb_put(skb, length); skb->ip_summed = CHECKSUM_NONE; skb_setup_tx_timestamp(skb, sockc); if (flags & MSG_CONFIRM) skb_set_dst_pending_confirm(skb, 1); skb->transport_header = skb->network_header; err = -EFAULT; if (memcpy_from_msg(iph, msg, length)) goto error_free; iphlen = iph->ihl * 4; /* * We don't want to modify the ip header, but we do need to * be sure that it won't cause problems later along the network * stack. Specifically we want to make sure that iph->ihl is a * sane value. If ihl points beyond the length of the buffer passed * in, reject the frame as invalid */ err = -EINVAL; if (iphlen > length) goto error_free; if (iphlen >= sizeof(*iph)) { if (!iph->saddr) iph->saddr = fl4->saddr; iph->check = 0; iph->tot_len = htons(length); if (!iph->id) ip_select_ident(net, skb, NULL); iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); skb->transport_header += iphlen; if (iph->protocol == IPPROTO_ICMP && length >= iphlen + sizeof(struct icmphdr)) icmp_out_count(net, ((struct icmphdr *) skb_transport_header(skb))->type); } err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, rt->dst.dev, dst_output); if (err > 0) err = net_xmit_errno(err); if (err) goto error; out: return 0; error_free: kfree_skb(skb); error: IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); if (err == -ENOBUFS && !inet_test_bit(RECVERR, sk)) err = 0; return err; } static int raw_probe_proto_opt(struct raw_frag_vec *rfv, struct flowi4 *fl4) { int err; if (fl4->flowi4_proto != IPPROTO_ICMP) return 0; /* We only need the first two bytes. */ rfv->hlen = 2; err = memcpy_from_msg(rfv->hdr.c, rfv->msg, rfv->hlen); if (err) return err; fl4->fl4_icmp_type = rfv->hdr.icmph.type; fl4->fl4_icmp_code = rfv->hdr.icmph.code; return 0; } static int raw_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { struct raw_frag_vec *rfv = from; if (offset < rfv->hlen) { int copy = min(rfv->hlen - offset, len); if (skb->ip_summed == CHECKSUM_PARTIAL) memcpy(to, rfv->hdr.c + offset, copy); else skb->csum = csum_block_add( skb->csum, csum_partial_copy_nocheck(rfv->hdr.c + offset, to, copy), odd); odd = 0; offset += copy; to += copy; len -= copy; if (!len) return 0; } offset -= rfv->hlen; return ip_generic_getfrag(rfv->msg, to, offset, len, odd, skb); } static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); struct ipcm_cookie ipc; struct rtable *rt = NULL; struct flowi4 fl4; u8 scope; int free = 0; __be32 daddr; __be32 saddr; int uc_index, err; struct ip_options_data opt_copy; struct raw_frag_vec rfv; int hdrincl; err = -EMSGSIZE; if (len > 0xFFFF) goto out; hdrincl = inet_test_bit(HDRINCL, sk); /* * Check the flags. */ err = -EOPNOTSUPP; if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message */ goto out; /* compatibility */ /* * Get and verify the address. */ if (msg->msg_namelen) { DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); err = -EINVAL; if (msg->msg_namelen < sizeof(*usin)) goto out; if (usin->sin_family != AF_INET) { pr_info_once("%s: %s forgot to set AF_INET. Fix it!\n", __func__, current->comm); err = -EAFNOSUPPORT; if (usin->sin_family) goto out; } daddr = usin->sin_addr.s_addr; /* ANK: I did not forget to get protocol from port field. * I just do not know, who uses this weirdness. * IP_HDRINCL is much more convenient. */ } else { err = -EDESTADDRREQ; if (sk->sk_state != TCP_ESTABLISHED) goto out; daddr = inet->inet_daddr; } ipcm_init_sk(&ipc, inet); /* Keep backward compat */ if (hdrincl) ipc.protocol = IPPROTO_RAW; if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, false); if (unlikely(err)) { kfree(ipc.opt); goto out; } if (ipc.opt) free = 1; } saddr = ipc.addr; ipc.addr = daddr; if (!ipc.opt) { struct ip_options_rcu *inet_opt; rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { memcpy(&opt_copy, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen); ipc.opt = &opt_copy.opt; } rcu_read_unlock(); } if (ipc.opt) { err = -EINVAL; /* Linux does not mangle headers on raw sockets, * so that IP options + IP_HDRINCL is non-sense. */ if (hdrincl) goto done; if (ipc.opt->opt.srr) { if (!daddr) goto done; daddr = ipc.opt->opt.faddr; } } scope = ip_sendmsg_scope(inet, &ipc, msg); uc_index = READ_ONCE(inet->uc_index); if (ipv4_is_multicast(daddr)) { if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = READ_ONCE(inet->mc_index); if (!saddr) saddr = READ_ONCE(inet->mc_addr); } else if (!ipc.oif) { ipc.oif = uc_index; } else if (ipv4_is_lbcast(daddr) && uc_index) { /* oif is set, packet is to local broadcast * and uc_index is set. oif is most likely set * by sk_bound_dev_if. If uc_index != oif check if the * oif is an L3 master and uc_index is an L3 slave. * If so, we want to allow the send using the uc_index. */ if (ipc.oif != uc_index && ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk), uc_index)) { ipc.oif = uc_index; } } flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, ipc.tos & INET_DSCP_MASK, scope, hdrincl ? ipc.protocol : sk->sk_protocol, inet_sk_flowi_flags(sk) | (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0, sk_uid(sk)); fl4.fl4_icmp_type = 0; fl4.fl4_icmp_code = 0; if (!hdrincl) { rfv.msg = msg; rfv.hlen = 0; err = raw_probe_proto_opt(&rfv, &fl4); if (err) goto done; } security_sk_classify_flow(sk, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; goto done; } err = -EACCES; if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST)) goto done; if (msg->msg_flags & MSG_CONFIRM) goto do_confirm; back_from_confirm: if (hdrincl) err = raw_send_hdrinc(sk, &fl4, msg, len, &rt, msg->msg_flags, &ipc.sockc); else { if (!ipc.addr) ipc.addr = fl4.daddr; lock_sock(sk); err = ip_append_data(sk, &fl4, raw_getfrag, &rfv, len, 0, &ipc, &rt, msg->msg_flags); if (err) ip_flush_pending_frames(sk); else if (!(msg->msg_flags & MSG_MORE)) { err = ip_push_pending_frames(sk, &fl4); if (err == -ENOBUFS && !inet_test_bit(RECVERR, sk)) err = 0; } release_sock(sk); } done: if (free) kfree(ipc.opt); ip_rt_put(rt); out: if (err < 0) return err; return len; do_confirm: if (msg->msg_flags & MSG_PROBE) dst_confirm_neigh(&rt->dst, &fl4.daddr); if (!(msg->msg_flags & MSG_PROBE) || len) goto back_from_confirm; err = 0; goto done; } static void raw_close(struct sock *sk, long timeout) { /* * Raw sockets may have direct kernel references. Kill them. */ ip_ra_control(sk, 0, NULL); sk_common_release(sk); } static void raw_destroy(struct sock *sk) { lock_sock(sk); ip_flush_pending_frames(sk); release_sock(sk); } /* This gets rid of all the nasties in af_inet. -DaveM */ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; struct net *net = sock_net(sk); u32 tb_id = RT_TABLE_LOCAL; int ret = -EINVAL; int chk_addr_ret; lock_sock(sk); if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in)) goto out; if (sk->sk_bound_dev_if) tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id; chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id); ret = -EADDRNOTAVAIL; if (!inet_addr_valid_or_nonlocal(net, inet, addr->sin_addr.s_addr, chk_addr_ret)) goto out; inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr; if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) inet->inet_saddr = 0; /* Use device */ sk_dst_reset(sk); ret = 0; out: release_sock(sk); return ret; } /* * This should be easy, if there is something there * we return it, otherwise we block. */ static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct inet_sock *inet = inet_sk(sk); size_t copied = 0; int err = -EOPNOTSUPP; DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct sk_buff *skb; if (flags & MSG_OOB) goto out; if (flags & MSG_ERRQUEUE) { err = ip_recv_error(sk, msg, len, addr_len); goto out; } skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; sock_recv_cmsgs(msg, sk, skb); /* Copy the address. */ if (sin) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; sin->sin_port = 0; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); } if (inet_cmsg_flags(inet)) ip_cmsg_recv(msg, skb); if (flags & MSG_TRUNC) copied = skb->len; done: skb_free_datagram(sk, skb); out: if (err) return err; return copied; } static int raw_sk_init(struct sock *sk) { struct raw_sock *rp = raw_sk(sk); if (inet_sk(sk)->inet_num == IPPROTO_ICMP) memset(&rp->filter, 0, sizeof(rp->filter)); return 0; } static int raw_seticmpfilter(struct sock *sk, sockptr_t optval, int optlen) { if (optlen > sizeof(struct icmp_filter)) optlen = sizeof(struct icmp_filter); if (copy_from_sockptr(&raw_sk(sk)->filter, optval, optlen)) return -EFAULT; return 0; } static int raw_geticmpfilter(struct sock *sk, char __user *optval, int __user *optlen) { int len, ret = -EFAULT; if (get_user(len, optlen)) goto out; ret = -EINVAL; if (len < 0) goto out; if (len > sizeof(struct icmp_filter)) len = sizeof(struct icmp_filter); ret = -EFAULT; if (put_user(len, optlen) || copy_to_user(optval, &raw_sk(sk)->filter, len)) goto out; ret = 0; out: return ret; } static int do_raw_setsockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { if (optname == ICMP_FILTER) { if (inet_sk(sk)->inet_num != IPPROTO_ICMP) return -EOPNOTSUPP; else return raw_seticmpfilter(sk, optval, optlen); } return -ENOPROTOOPT; } static int raw_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { if (level != SOL_RAW) return ip_setsockopt(sk, level, optname, optval, optlen); return do_raw_setsockopt(sk, optname, optval, optlen); } static int do_raw_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen) { if (optname == ICMP_FILTER) { if (inet_sk(sk)->inet_num != IPPROTO_ICMP) return -EOPNOTSUPP; else return raw_geticmpfilter(sk, optval, optlen); } return -ENOPROTOOPT; } static int raw_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { if (level != SOL_RAW) return ip_getsockopt(sk, level, optname, optval, optlen); return do_raw_getsockopt(sk, optname, optval, optlen); } static int raw_ioctl(struct sock *sk, int cmd, int *karg) { switch (cmd) { case SIOCOUTQ: { *karg = sk_wmem_alloc_get(sk); return 0; } case SIOCINQ: { struct sk_buff *skb; spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb) *karg = skb->len; else *karg = 0; spin_unlock_bh(&sk->sk_receive_queue.lock); return 0; } default: #ifdef CONFIG_IP_MROUTE return ipmr_ioctl(sk, cmd, karg); #else return -ENOIOCTLCMD; #endif } } #ifdef CONFIG_COMPAT static int compat_raw_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg) { switch (cmd) { case SIOCOUTQ: case SIOCINQ: return -ENOIOCTLCMD; default: #ifdef CONFIG_IP_MROUTE return ipmr_compat_ioctl(sk, cmd, compat_ptr(arg)); #else return -ENOIOCTLCMD; #endif } } #endif int raw_abort(struct sock *sk, int err) { lock_sock(sk); sk->sk_err = err; sk_error_report(sk); __udp_disconnect(sk, 0); release_sock(sk); return 0; } EXPORT_SYMBOL_GPL(raw_abort); struct proto raw_prot = { .name = "RAW", .owner = THIS_MODULE, .close = raw_close, .destroy = raw_destroy, .connect = ip4_datagram_connect, .disconnect = __udp_disconnect, .ioctl = raw_ioctl, .init = raw_sk_init, .setsockopt = raw_setsockopt, .getsockopt = raw_getsockopt, .sendmsg = raw_sendmsg, .recvmsg = raw_recvmsg, .bind = raw_bind, .backlog_rcv = raw_rcv_skb, .release_cb = ip4_datagram_release_cb, .hash = raw_hash_sk, .unhash = raw_unhash_sk, .obj_size = sizeof(struct raw_sock), .useroffset = offsetof(struct raw_sock, filter), .usersize = sizeof_field(struct raw_sock, filter), .h.raw_hash = &raw_v4_hashinfo, #ifdef CONFIG_COMPAT .compat_ioctl = compat_raw_ioctl, #endif .diag_destroy = raw_abort, }; #ifdef CONFIG_PROC_FS static struct sock *raw_get_first(struct seq_file *seq, int bucket) { struct raw_hashinfo *h = pde_data(file_inode(seq->file)); struct raw_iter_state *state = raw_seq_private(seq); struct hlist_head *hlist; struct sock *sk; for (state->bucket = bucket; state->bucket < RAW_HTABLE_SIZE; ++state->bucket) { hlist = &h->ht[state->bucket]; sk_for_each(sk, hlist) { if (sock_net(sk) == seq_file_net(seq)) return sk; } } return NULL; } static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk) { struct raw_iter_state *state = raw_seq_private(seq); do { sk = sk_next(sk); } while (sk && sock_net(sk) != seq_file_net(seq)); if (!sk) return raw_get_first(seq, state->bucket + 1); return sk; } static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos) { struct sock *sk = raw_get_first(seq, 0); if (sk) while (pos && (sk = raw_get_next(seq, sk)) != NULL) --pos; return pos ? NULL : sk; } void *raw_seq_start(struct seq_file *seq, loff_t *pos) __acquires(&h->lock) { struct raw_hashinfo *h = pde_data(file_inode(seq->file)); spin_lock(&h->lock); return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } EXPORT_SYMBOL_GPL(raw_seq_start); void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sock *sk; if (v == SEQ_START_TOKEN) sk = raw_get_first(seq, 0); else sk = raw_get_next(seq, v); ++*pos; return sk; } EXPORT_SYMBOL_GPL(raw_seq_next); void raw_seq_stop(struct seq_file *seq, void *v) __releases(&h->lock) { struct raw_hashinfo *h = pde_data(file_inode(seq->file)); spin_unlock(&h->lock); } EXPORT_SYMBOL_GPL(raw_seq_stop); static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) { struct inet_sock *inet = inet_sk(sp); __be32 dest = inet->inet_daddr, src = inet->inet_rcv_saddr; __u16 destp = 0, srcp = inet->inet_num; seq_printf(seq, "%4d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u\n", i, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), 0, 0L, 0, from_kuid_munged(seq_user_ns(seq), sk_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); } static int raw_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_printf(seq, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " "inode ref pointer drops\n"); else raw_sock_seq_show(seq, v, raw_seq_private(seq)->bucket); return 0; } static const struct seq_operations raw_seq_ops = { .start = raw_seq_start, .next = raw_seq_next, .stop = raw_seq_stop, .show = raw_seq_show, }; static __net_init int raw_init_net(struct net *net) { if (!proc_create_net_data("raw", 0444, net->proc_net, &raw_seq_ops, sizeof(struct raw_iter_state), &raw_v4_hashinfo)) return -ENOMEM; return 0; } static __net_exit void raw_exit_net(struct net *net) { remove_proc_entry("raw", net->proc_net); } static __net_initdata struct pernet_operations raw_net_ops = { .init = raw_init_net, .exit = raw_exit_net, }; int __init raw_proc_init(void) { return register_pernet_subsys(&raw_net_ops); } void __init raw_proc_exit(void) { unregister_pernet_subsys(&raw_net_ops); } #endif /* CONFIG_PROC_FS */ static void raw_sysctl_init_net(struct net *net) { #ifdef CONFIG_NET_L3_MASTER_DEV net->ipv4.sysctl_raw_l3mdev_accept = 1; #endif } static int __net_init raw_sysctl_init(struct net *net) { raw_sysctl_init_net(net); return 0; } static struct pernet_operations __net_initdata raw_sysctl_ops = { .init = raw_sysctl_init, }; void __init raw_init(void) { raw_sysctl_init_net(&init_net); if (register_pernet_subsys(&raw_sysctl_ops)) panic("RAW: failed to init sysctl parameters.\n"); } |
| 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 | // SPDX-License-Identifier: GPL-2.0+ /* * pcl711.c * Comedi driver for PC-LabCard PCL-711 and AdSys ACL-8112 and compatibles * Copyright (C) 1998 David A. Schleef <ds@schleef.org> * Janne Jalkanen <jalkanen@cs.hut.fi> * Eric Bunn <ebu@cs.hut.fi> * * COMEDI - Linux Control and Measurement Device Interface * Copyright (C) 1998 David A. Schleef <ds@schleef.org> */ /* * Driver: pcl711 * Description: Advantech PCL-711 and 711b, ADLink ACL-8112 * Devices: [Advantech] PCL-711 (pcl711), PCL-711B (pcl711b), * [ADLink] ACL-8112HG (acl8112hg), ACL-8112DG (acl8112dg) * Author: David A. Schleef <ds@schleef.org> * Janne Jalkanen <jalkanen@cs.hut.fi> * Eric Bunn <ebu@cs.hut.fi> * Updated: * Status: mostly complete * * Configuration Options: * [0] - I/O port base * [1] - IRQ, optional */ #include <linux/module.h> #include <linux/delay.h> #include <linux/interrupt.h> #include <linux/comedi/comedidev.h> #include <linux/comedi/comedi_8254.h> /* * I/O port register map */ #define PCL711_TIMER_BASE 0x00 #define PCL711_AI_LSB_REG 0x04 #define PCL711_AI_MSB_REG 0x05 #define PCL711_AI_MSB_DRDY BIT(4) #define PCL711_AO_LSB_REG(x) (0x04 + ((x) * 2)) #define PCL711_AO_MSB_REG(x) (0x05 + ((x) * 2)) #define PCL711_DI_LSB_REG 0x06 #define PCL711_DI_MSB_REG 0x07 #define PCL711_INT_STAT_REG 0x08 #define PCL711_INT_STAT_CLR (0 << 0) /* any value will work */ #define PCL711_AI_GAIN_REG 0x09 #define PCL711_AI_GAIN(x) (((x) & 0xf) << 0) #define PCL711_MUX_REG 0x0a #define PCL711_MUX_CHAN(x) (((x) & 0xf) << 0) #define PCL711_MUX_CS0 BIT(4) #define PCL711_MUX_CS1 BIT(5) #define PCL711_MUX_DIFF (PCL711_MUX_CS0 | PCL711_MUX_CS1) #define PCL711_MODE_REG 0x0b #define PCL711_MODE(x) (((x) & 0x7) << 0) #define PCL711_MODE_DEFAULT PCL711_MODE(0) #define PCL711_MODE_SOFTTRIG PCL711_MODE(1) #define PCL711_MODE_EXT PCL711_MODE(2) #define PCL711_MODE_EXT_IRQ PCL711_MODE(3) #define PCL711_MODE_PACER PCL711_MODE(4) #define PCL711_MODE_PACER_IRQ PCL711_MODE(6) #define PCL711_MODE_IRQ(x) (((x) & 0x7) << 4) #define PCL711_SOFTTRIG_REG 0x0c #define PCL711_SOFTTRIG (0 << 0) /* any value will work */ #define PCL711_DO_LSB_REG 0x0d #define PCL711_DO_MSB_REG 0x0e static const struct comedi_lrange range_pcl711b_ai = { 5, { BIP_RANGE(5), BIP_RANGE(2.5), BIP_RANGE(1.25), BIP_RANGE(0.625), BIP_RANGE(0.3125) } }; static const struct comedi_lrange range_acl8112hg_ai = { 12, { BIP_RANGE(5), BIP_RANGE(0.5), BIP_RANGE(0.05), BIP_RANGE(0.005), UNI_RANGE(10), UNI_RANGE(1), UNI_RANGE(0.1), UNI_RANGE(0.01), BIP_RANGE(10), BIP_RANGE(1), BIP_RANGE(0.1), BIP_RANGE(0.01) } }; static const struct comedi_lrange range_acl8112dg_ai = { 9, { BIP_RANGE(5), BIP_RANGE(2.5), BIP_RANGE(1.25), BIP_RANGE(0.625), UNI_RANGE(10), UNI_RANGE(5), UNI_RANGE(2.5), UNI_RANGE(1.25), BIP_RANGE(10) } }; struct pcl711_board { const char *name; int n_aichan; int n_aochan; int maxirq; const struct comedi_lrange *ai_range_type; }; static const struct pcl711_board boardtypes[] = { { .name = "pcl711", .n_aichan = 8, .n_aochan = 1, .ai_range_type = &range_bipolar5, }, { .name = "pcl711b", .n_aichan = 8, .n_aochan = 1, .maxirq = 7, .ai_range_type = &range_pcl711b_ai, }, { .name = "acl8112hg", .n_aichan = 16, .n_aochan = 2, .maxirq = 15, .ai_range_type = &range_acl8112hg_ai, }, { .name = "acl8112dg", .n_aichan = 16, .n_aochan = 2, .maxirq = 15, .ai_range_type = &range_acl8112dg_ai, }, }; static void pcl711_ai_set_mode(struct comedi_device *dev, unsigned int mode) { /* * The pcl711b board uses bits in the mode register to select the * interrupt. The other boards supported by this driver all use * jumpers on the board. * * Enables the interrupt when needed on the pcl711b board. These * bits do nothing on the other boards. */ if (mode == PCL711_MODE_EXT_IRQ || mode == PCL711_MODE_PACER_IRQ) mode |= PCL711_MODE_IRQ(dev->irq); outb(mode, dev->iobase + PCL711_MODE_REG); } static unsigned int pcl711_ai_get_sample(struct comedi_device *dev, struct comedi_subdevice *s) { unsigned int val; val = inb(dev->iobase + PCL711_AI_MSB_REG) << 8; val |= inb(dev->iobase + PCL711_AI_LSB_REG); return val & s->maxdata; } static int pcl711_ai_cancel(struct comedi_device *dev, struct comedi_subdevice *s) { outb(PCL711_INT_STAT_CLR, dev->iobase + PCL711_INT_STAT_REG); pcl711_ai_set_mode(dev, PCL711_MODE_SOFTTRIG); return 0; } static irqreturn_t pcl711_interrupt(int irq, void *d) { struct comedi_device *dev = d; struct comedi_subdevice *s = dev->read_subdev; struct comedi_cmd *cmd = &s->async->cmd; unsigned short data; if (!dev->attached) { dev_err(dev->class_dev, "spurious interrupt\n"); return IRQ_HANDLED; } data = pcl711_ai_get_sample(dev, s); outb(PCL711_INT_STAT_CLR, dev->iobase + PCL711_INT_STAT_REG); comedi_buf_write_samples(s, &data, 1); if (cmd->stop_src == TRIG_COUNT && s->async->scans_done >= cmd->stop_arg) s->async->events |= COMEDI_CB_EOA; comedi_handle_events(dev, s); return IRQ_HANDLED; } static void pcl711_set_changain(struct comedi_device *dev, struct comedi_subdevice *s, unsigned int chanspec) { unsigned int chan = CR_CHAN(chanspec); unsigned int range = CR_RANGE(chanspec); unsigned int aref = CR_AREF(chanspec); unsigned int mux = 0; outb(PCL711_AI_GAIN(range), dev->iobase + PCL711_AI_GAIN_REG); if (s->n_chan > 8) { /* Select the correct MPC508A chip */ if (aref == AREF_DIFF) { chan &= 0x7; mux |= PCL711_MUX_DIFF; } else { if (chan < 8) mux |= PCL711_MUX_CS0; else mux |= PCL711_MUX_CS1; } } outb(mux | PCL711_MUX_CHAN(chan), dev->iobase + PCL711_MUX_REG); } static int pcl711_ai_eoc(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned long context) { unsigned int status; status = inb(dev->iobase + PCL711_AI_MSB_REG); if ((status & PCL711_AI_MSB_DRDY) == 0) return 0; return -EBUSY; } static int pcl711_ai_insn_read(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned int *data) { int ret; int i; pcl711_set_changain(dev, s, insn->chanspec); pcl711_ai_set_mode(dev, PCL711_MODE_SOFTTRIG); for (i = 0; i < insn->n; i++) { outb(PCL711_SOFTTRIG, dev->iobase + PCL711_SOFTTRIG_REG); ret = comedi_timeout(dev, s, insn, pcl711_ai_eoc, 0); if (ret) return ret; data[i] = pcl711_ai_get_sample(dev, s); } return insn->n; } static int pcl711_ai_cmdtest(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_cmd *cmd) { int err = 0; /* Step 1 : check if triggers are trivially valid */ err |= comedi_check_trigger_src(&cmd->start_src, TRIG_NOW); err |= comedi_check_trigger_src(&cmd->scan_begin_src, TRIG_TIMER | TRIG_EXT); err |= comedi_check_trigger_src(&cmd->convert_src, TRIG_NOW); err |= comedi_check_trigger_src(&cmd->scan_end_src, TRIG_COUNT); err |= comedi_check_trigger_src(&cmd->stop_src, TRIG_COUNT | TRIG_NONE); if (err) return 1; /* Step 2a : make sure trigger sources are unique */ err |= comedi_check_trigger_is_unique(cmd->scan_begin_src); err |= comedi_check_trigger_is_unique(cmd->stop_src); /* Step 2b : and mutually compatible */ if (err) return 2; /* Step 3: check if arguments are trivially valid */ err |= comedi_check_trigger_arg_is(&cmd->start_arg, 0); if (cmd->scan_begin_src == TRIG_EXT) { err |= comedi_check_trigger_arg_is(&cmd->scan_begin_arg, 0); } else { #define MAX_SPEED 1000 err |= comedi_check_trigger_arg_min(&cmd->scan_begin_arg, MAX_SPEED); } err |= comedi_check_trigger_arg_is(&cmd->convert_arg, 0); err |= comedi_check_trigger_arg_is(&cmd->scan_end_arg, cmd->chanlist_len); if (cmd->stop_src == TRIG_COUNT) err |= comedi_check_trigger_arg_min(&cmd->stop_arg, 1); else /* TRIG_NONE */ err |= comedi_check_trigger_arg_is(&cmd->stop_arg, 0); if (err) return 3; /* step 4 */ if (cmd->scan_begin_src == TRIG_TIMER) { unsigned int arg = cmd->scan_begin_arg; comedi_8254_cascade_ns_to_timer(dev->pacer, &arg, cmd->flags); err |= comedi_check_trigger_arg_is(&cmd->scan_begin_arg, arg); } if (err) return 4; return 0; } static int pcl711_ai_cmd(struct comedi_device *dev, struct comedi_subdevice *s) { struct comedi_cmd *cmd = &s->async->cmd; pcl711_set_changain(dev, s, cmd->chanlist[0]); if (cmd->scan_begin_src == TRIG_TIMER) { comedi_8254_update_divisors(dev->pacer); comedi_8254_pacer_enable(dev->pacer, 1, 2, true); outb(PCL711_INT_STAT_CLR, dev->iobase + PCL711_INT_STAT_REG); pcl711_ai_set_mode(dev, PCL711_MODE_PACER_IRQ); } else { pcl711_ai_set_mode(dev, PCL711_MODE_EXT_IRQ); } return 0; } static void pcl711_ao_write(struct comedi_device *dev, unsigned int chan, unsigned int val) { outb(val & 0xff, dev->iobase + PCL711_AO_LSB_REG(chan)); outb((val >> 8) & 0xff, dev->iobase + PCL711_AO_MSB_REG(chan)); } static int pcl711_ao_insn_write(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned int *data) { unsigned int chan = CR_CHAN(insn->chanspec); unsigned int val = s->readback[chan]; int i; for (i = 0; i < insn->n; i++) { val = data[i]; pcl711_ao_write(dev, chan, val); } s->readback[chan] = val; return insn->n; } static int pcl711_di_insn_bits(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned int *data) { unsigned int val; val = inb(dev->iobase + PCL711_DI_LSB_REG); val |= (inb(dev->iobase + PCL711_DI_MSB_REG) << 8); data[1] = val; return insn->n; } static int pcl711_do_insn_bits(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned int *data) { unsigned int mask; mask = comedi_dio_update_state(s, data); if (mask) { if (mask & 0x00ff) outb(s->state & 0xff, dev->iobase + PCL711_DO_LSB_REG); if (mask & 0xff00) outb((s->state >> 8), dev->iobase + PCL711_DO_MSB_REG); } data[1] = s->state; return insn->n; } static int pcl711_attach(struct comedi_device *dev, struct comedi_devconfig *it) { const struct pcl711_board *board = dev->board_ptr; struct comedi_subdevice *s; int ret; ret = comedi_request_region(dev, it->options[0], 0x10); if (ret) return ret; if (it->options[1] && it->options[1] <= board->maxirq) { ret = request_irq(it->options[1], pcl711_interrupt, 0, dev->board_name, dev); if (ret == 0) dev->irq = it->options[1]; } dev->pacer = comedi_8254_io_alloc(dev->iobase + PCL711_TIMER_BASE, I8254_OSC_BASE_2MHZ, I8254_IO8, 0); if (IS_ERR(dev->pacer)) return PTR_ERR(dev->pacer); ret = comedi_alloc_subdevices(dev, 4); if (ret) return ret; /* Analog Input subdevice */ s = &dev->subdevices[0]; s->type = COMEDI_SUBD_AI; s->subdev_flags = SDF_READABLE | SDF_GROUND; if (board->n_aichan > 8) s->subdev_flags |= SDF_DIFF; s->n_chan = board->n_aichan; s->maxdata = 0xfff; s->range_table = board->ai_range_type; s->insn_read = pcl711_ai_insn_read; if (dev->irq) { dev->read_subdev = s; s->subdev_flags |= SDF_CMD_READ; s->len_chanlist = 1; s->do_cmdtest = pcl711_ai_cmdtest; s->do_cmd = pcl711_ai_cmd; s->cancel = pcl711_ai_cancel; } /* Analog Output subdevice */ s = &dev->subdevices[1]; s->type = COMEDI_SUBD_AO; s->subdev_flags = SDF_WRITABLE; s->n_chan = board->n_aochan; s->maxdata = 0xfff; s->range_table = &range_bipolar5; s->insn_write = pcl711_ao_insn_write; ret = comedi_alloc_subdev_readback(s); if (ret) return ret; /* Digital Input subdevice */ s = &dev->subdevices[2]; s->type = COMEDI_SUBD_DI; s->subdev_flags = SDF_READABLE; s->n_chan = 16; s->maxdata = 1; s->range_table = &range_digital; s->insn_bits = pcl711_di_insn_bits; /* Digital Output subdevice */ s = &dev->subdevices[3]; s->type = COMEDI_SUBD_DO; s->subdev_flags = SDF_WRITABLE; s->n_chan = 16; s->maxdata = 1; s->range_table = &range_digital; s->insn_bits = pcl711_do_insn_bits; /* clear DAC */ pcl711_ao_write(dev, 0, 0x0); pcl711_ao_write(dev, 1, 0x0); return 0; } static struct comedi_driver pcl711_driver = { .driver_name = "pcl711", .module = THIS_MODULE, .attach = pcl711_attach, .detach = comedi_legacy_detach, .board_name = &boardtypes[0].name, .num_names = ARRAY_SIZE(boardtypes), .offset = sizeof(struct pcl711_board), }; module_comedi_driver(pcl711_driver); MODULE_AUTHOR("Comedi https://www.comedi.org"); MODULE_DESCRIPTION("Comedi driver for PCL-711 compatible boards"); MODULE_LICENSE("GPL"); |
| 4 4 2 4 4 4 4 29 29 29 29 1 28 2 23 24 24 24 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2003-2005 Devicescape Software, Inc. * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz> * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2015 Intel Deutschland GmbH * Copyright (C) 2021-2023 Intel Corporation */ #include <linux/kobject.h> #include <linux/slab.h> #include "ieee80211_i.h" #include "key.h" #include "debugfs.h" #include "debugfs_key.h" #define KEY_READ(name, prop, format_string) \ static ssize_t key_##name##_read(struct file *file, \ char __user *userbuf, \ size_t count, loff_t *ppos) \ { \ struct ieee80211_key *key = file->private_data; \ return mac80211_format_buffer(userbuf, count, ppos, \ format_string, key->prop); \ } #define KEY_READ_X(name) KEY_READ(name, name, "0x%x\n") #define KEY_OPS(name) \ static const struct debugfs_short_fops key_ ##name## _ops = { \ .read = key_##name##_read, \ .llseek = generic_file_llseek, \ } #define KEY_OPS_W(name) \ static const struct debugfs_short_fops key_ ##name## _ops = { \ .read = key_##name##_read, \ .write = key_##name##_write, \ .llseek = generic_file_llseek, \ } #define KEY_FILE(name, format) \ KEY_READ_##format(name) \ KEY_OPS(name) #define KEY_CONF_READ(name, format_string) \ KEY_READ(conf_##name, conf.name, format_string) #define KEY_CONF_READ_D(name) KEY_CONF_READ(name, "%d\n") #define KEY_CONF_OPS(name) \ static const struct debugfs_short_fops key_ ##name## _ops = { \ .read = key_conf_##name##_read, \ .llseek = generic_file_llseek, \ } #define KEY_CONF_FILE(name, format) \ KEY_CONF_READ_##format(name) \ KEY_CONF_OPS(name) KEY_CONF_FILE(keylen, D); KEY_CONF_FILE(keyidx, D); KEY_CONF_FILE(hw_key_idx, D); KEY_FILE(flags, X); KEY_READ(ifindex, sdata->name, "%s\n"); KEY_OPS(ifindex); static ssize_t key_algorithm_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { char buf[15]; struct ieee80211_key *key = file->private_data; u32 c = key->conf.cipher; sprintf(buf, "%.2x-%.2x-%.2x:%d\n", c >> 24, (c >> 16) & 0xff, (c >> 8) & 0xff, c & 0xff); return simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf)); } KEY_OPS(algorithm); static ssize_t key_tx_spec_write(struct file *file, const char __user *userbuf, size_t count, loff_t *ppos) { struct ieee80211_key *key = file->private_data; u64 pn; int ret; switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: return -EINVAL; case WLAN_CIPHER_SUITE_TKIP: /* not supported yet */ return -EOPNOTSUPP; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: ret = kstrtou64_from_user(userbuf, count, 16, &pn); if (ret) return ret; /* PN is a 48-bit counter */ if (pn >= (1ULL << 48)) return -ERANGE; atomic64_set(&key->conf.tx_pn, pn); return count; default: return 0; } } static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { u64 pn; char buf[20]; int len; struct ieee80211_key *key = file->private_data; switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: len = scnprintf(buf, sizeof(buf), "\n"); break; case WLAN_CIPHER_SUITE_TKIP: pn = atomic64_read(&key->conf.tx_pn); len = scnprintf(buf, sizeof(buf), "%08x %04x\n", TKIP_PN_TO_IV32(pn), TKIP_PN_TO_IV16(pn)); break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: pn = atomic64_read(&key->conf.tx_pn); len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n", (u8)(pn >> 40), (u8)(pn >> 32), (u8)(pn >> 24), (u8)(pn >> 16), (u8)(pn >> 8), (u8)pn); break; default: return 0; } return simple_read_from_buffer(userbuf, count, ppos, buf, len); } KEY_OPS_W(tx_spec); static ssize_t key_rx_spec_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct ieee80211_key *key = file->private_data; char buf[14*IEEE80211_NUM_TIDS+1], *p = buf; int i, len; const u8 *rpn; switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: len = scnprintf(buf, sizeof(buf), "\n"); break; case WLAN_CIPHER_SUITE_TKIP: for (i = 0; i < IEEE80211_NUM_TIDS; i++) p += scnprintf(p, sizeof(buf)+buf-p, "%08x %04x\n", key->u.tkip.rx[i].iv32, key->u.tkip.rx[i].iv16); len = p - buf; break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) { rpn = key->u.ccmp.rx_pn[i]; p += scnprintf(p, sizeof(buf)+buf-p, "%02x%02x%02x%02x%02x%02x\n", rpn[0], rpn[1], rpn[2], rpn[3], rpn[4], rpn[5]); } len = p - buf; break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: rpn = key->u.aes_cmac.rx_pn; p += scnprintf(p, sizeof(buf)+buf-p, "%02x%02x%02x%02x%02x%02x\n", rpn[0], rpn[1], rpn[2], rpn[3], rpn[4], rpn[5]); len = p - buf; break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: rpn = key->u.aes_gmac.rx_pn; p += scnprintf(p, sizeof(buf)+buf-p, "%02x%02x%02x%02x%02x%02x\n", rpn[0], rpn[1], rpn[2], rpn[3], rpn[4], rpn[5]); len = p - buf; break; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) { rpn = key->u.gcmp.rx_pn[i]; p += scnprintf(p, sizeof(buf)+buf-p, "%02x%02x%02x%02x%02x%02x\n", rpn[0], rpn[1], rpn[2], rpn[3], rpn[4], rpn[5]); } len = p - buf; break; default: return 0; } return simple_read_from_buffer(userbuf, count, ppos, buf, len); } KEY_OPS(rx_spec); static ssize_t key_replays_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct ieee80211_key *key = file->private_data; char buf[20]; int len; switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.ccmp.replays); break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.aes_cmac.replays); break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.aes_gmac.replays); break; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.gcmp.replays); break; default: return 0; } return simple_read_from_buffer(userbuf, count, ppos, buf, len); } KEY_OPS(replays); static ssize_t key_icverrors_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct ieee80211_key *key = file->private_data; char buf[20]; int len; switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.aes_cmac.icverrors); break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.aes_gmac.icverrors); break; default: return 0; } return simple_read_from_buffer(userbuf, count, ppos, buf, len); } KEY_OPS(icverrors); static ssize_t key_mic_failures_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct ieee80211_key *key = file->private_data; char buf[20]; int len; if (key->conf.cipher != WLAN_CIPHER_SUITE_TKIP) return -EINVAL; len = scnprintf(buf, sizeof(buf), "%u\n", key->u.tkip.mic_failures); return simple_read_from_buffer(userbuf, count, ppos, buf, len); } KEY_OPS(mic_failures); static ssize_t key_key_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct ieee80211_key *key = file->private_data; int i, bufsize = 2 * key->conf.keylen + 2; char *buf = kmalloc(bufsize, GFP_KERNEL); char *p = buf; ssize_t res; if (!buf) return -ENOMEM; for (i = 0; i < key->conf.keylen; i++) p += scnprintf(p, bufsize + buf - p, "%02x", key->conf.key[i]); p += scnprintf(p, bufsize+buf-p, "\n"); res = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); kfree(buf); return res; } KEY_OPS(key); #define DEBUGFS_ADD(name) \ debugfs_create_file(#name, 0400, key->debugfs.dir, \ key, &key_##name##_ops) #define DEBUGFS_ADD_W(name) \ debugfs_create_file(#name, 0600, key->debugfs.dir, \ key, &key_##name##_ops); void ieee80211_debugfs_key_add(struct ieee80211_key *key) { static int keycount; char buf[100]; struct sta_info *sta; if (!key->local->debugfs.keys) return; sprintf(buf, "%d", keycount); key->debugfs.cnt = keycount; keycount++; key->debugfs.dir = debugfs_create_dir(buf, key->local->debugfs.keys); sta = key->sta; if (sta) { sprintf(buf, "../../netdev:%s/stations/%pM", sta->sdata->name, sta->sta.addr); key->debugfs.stalink = debugfs_create_symlink("station", key->debugfs.dir, buf); } DEBUGFS_ADD(keylen); DEBUGFS_ADD(flags); DEBUGFS_ADD(keyidx); DEBUGFS_ADD(hw_key_idx); DEBUGFS_ADD(algorithm); DEBUGFS_ADD_W(tx_spec); DEBUGFS_ADD(rx_spec); DEBUGFS_ADD(replays); DEBUGFS_ADD(icverrors); DEBUGFS_ADD(mic_failures); DEBUGFS_ADD(key); DEBUGFS_ADD(ifindex); }; void ieee80211_debugfs_key_remove(struct ieee80211_key *key) { if (!key) return; debugfs_remove_recursive(key->debugfs.dir); key->debugfs.dir = NULL; } void ieee80211_debugfs_key_update_default(struct ieee80211_sub_if_data *sdata) { char buf[50]; struct ieee80211_key *key; if (!sdata->vif.debugfs_dir) return; lockdep_assert_wiphy(sdata->local->hw.wiphy); debugfs_remove(sdata->debugfs.default_unicast_key); sdata->debugfs.default_unicast_key = NULL; if (sdata->default_unicast_key) { key = wiphy_dereference(sdata->local->hw.wiphy, sdata->default_unicast_key); sprintf(buf, "../keys/%d", key->debugfs.cnt); sdata->debugfs.default_unicast_key = debugfs_create_symlink("default_unicast_key", sdata->vif.debugfs_dir, buf); } debugfs_remove(sdata->debugfs.default_multicast_key); sdata->debugfs.default_multicast_key = NULL; if (sdata->deflink.default_multicast_key) { key = wiphy_dereference(sdata->local->hw.wiphy, sdata->deflink.default_multicast_key); sprintf(buf, "../keys/%d", key->debugfs.cnt); sdata->debugfs.default_multicast_key = debugfs_create_symlink("default_multicast_key", sdata->vif.debugfs_dir, buf); } } void ieee80211_debugfs_key_remove_mgmt_default(struct ieee80211_sub_if_data *sdata) { if (!sdata) return; debugfs_remove(sdata->debugfs.default_mgmt_key); sdata->debugfs.default_mgmt_key = NULL; } void ieee80211_debugfs_key_remove_beacon_default(struct ieee80211_sub_if_data *sdata) { if (!sdata) return; debugfs_remove(sdata->debugfs.default_beacon_key); sdata->debugfs.default_beacon_key = NULL; } |
| 9 17 17 16 17 17 16 9 17 8 5 3 16 16 16 15 17 17 17 4 4 1 1 4 1 4 4 3 4 3 3 2 3 3 3 4 4 4 4 4 17 17 17 17 17 15 14 14 14 14 15 14 6 4 4 4 4 4 3 3 3 3 3 3 1 4 3 1 3 3 2 2 10 10 10 8 2 8 8 8 8 7 3 10 8 8 1 1 1 8 8 8 8 7 3 3 8 8 19 18 3 6 6 6 6 2 6 5 1 5 3 3 3 3 2 1 4 4 4 1 3 2 2 2 4 19 5 5 5 5 3 3 3 3 3 3 3 3 3 3 3 3 14 5 13 13 13 13 14 6 5 6 6 14 1 14 14 13 14 14 1 14 4 4 4 4 4 4 3 3 3 3 3 3 3 1 1 1 1 1 1 1 6 6 6 6 3 5 6 6 6 3 6 6 6 6 6 5 6 6 6 6 6 6 4 4 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ #include <linux/skmsg.h> #include <linux/skbuff.h> #include <linux/scatterlist.h> #include <net/sock.h> #include <net/tcp.h> #include <net/tls.h> #include <trace/events/sock.h> static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce) { if (msg->sg.end > msg->sg.start && elem_first_coalesce < msg->sg.end) return true; if (msg->sg.end < msg->sg.start && (elem_first_coalesce > msg->sg.start || elem_first_coalesce < msg->sg.end)) return true; return false; } int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, int elem_first_coalesce) { struct page_frag *pfrag = sk_page_frag(sk); u32 osize = msg->sg.size; int ret = 0; len -= msg->sg.size; while (len > 0) { struct scatterlist *sge; u32 orig_offset; int use, i; if (!sk_page_frag_refill(sk, pfrag)) { ret = -ENOMEM; goto msg_trim; } orig_offset = pfrag->offset; use = min_t(int, len, pfrag->size - orig_offset); if (!sk_wmem_schedule(sk, use)) { ret = -ENOMEM; goto msg_trim; } i = msg->sg.end; sk_msg_iter_var_prev(i); sge = &msg->sg.data[i]; if (sk_msg_try_coalesce_ok(msg, elem_first_coalesce) && sg_page(sge) == pfrag->page && sge->offset + sge->length == orig_offset) { sge->length += use; } else { if (sk_msg_full(msg)) { ret = -ENOSPC; break; } sge = &msg->sg.data[msg->sg.end]; sg_unmark_end(sge); sg_set_page(sge, pfrag->page, use, orig_offset); get_page(pfrag->page); sk_msg_iter_next(msg, end); } sk_mem_charge(sk, use); msg->sg.size += use; pfrag->offset += use; len -= use; } return ret; msg_trim: sk_msg_trim(sk, msg, osize); return ret; } EXPORT_SYMBOL_GPL(sk_msg_alloc); int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src, u32 off, u32 len) { int i = src->sg.start; struct scatterlist *sge = sk_msg_elem(src, i); struct scatterlist *sgd = NULL; u32 sge_len, sge_off; while (off) { if (sge->length > off) break; off -= sge->length; sk_msg_iter_var_next(i); if (i == src->sg.end && off) return -ENOSPC; sge = sk_msg_elem(src, i); } while (len) { sge_len = sge->length - off; if (sge_len > len) sge_len = len; if (dst->sg.end) sgd = sk_msg_elem(dst, dst->sg.end - 1); if (sgd && (sg_page(sge) == sg_page(sgd)) && (sg_virt(sge) + off == sg_virt(sgd) + sgd->length)) { sgd->length += sge_len; dst->sg.size += sge_len; } else if (!sk_msg_full(dst)) { sge_off = sge->offset + off; sk_msg_page_add(dst, sg_page(sge), sge_len, sge_off); } else { return -ENOSPC; } off = 0; len -= sge_len; sk_mem_charge(sk, sge_len); sk_msg_iter_var_next(i); if (i == src->sg.end && len) return -ENOSPC; sge = sk_msg_elem(src, i); } return 0; } EXPORT_SYMBOL_GPL(sk_msg_clone); void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes) { int i = msg->sg.start; do { struct scatterlist *sge = sk_msg_elem(msg, i); if (bytes < sge->length) { sge->length -= bytes; sge->offset += bytes; sk_mem_uncharge(sk, bytes); break; } sk_mem_uncharge(sk, sge->length); bytes -= sge->length; sge->length = 0; sge->offset = 0; sk_msg_iter_var_next(i); } while (bytes && i != msg->sg.end); msg->sg.start = i; } EXPORT_SYMBOL_GPL(sk_msg_return_zero); void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes) { int i = msg->sg.start; do { struct scatterlist *sge = &msg->sg.data[i]; int uncharge = (bytes < sge->length) ? bytes : sge->length; sk_mem_uncharge(sk, uncharge); bytes -= uncharge; sk_msg_iter_var_next(i); } while (i != msg->sg.end); } EXPORT_SYMBOL_GPL(sk_msg_return); static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i, bool charge) { struct scatterlist *sge = sk_msg_elem(msg, i); u32 len = sge->length; /* When the skb owns the memory we free it from consume_skb path. */ if (!msg->skb) { if (charge) sk_mem_uncharge(sk, len); put_page(sg_page(sge)); } memset(sge, 0, sizeof(*sge)); return len; } static int __sk_msg_free(struct sock *sk, struct sk_msg *msg, u32 i, bool charge) { struct scatterlist *sge = sk_msg_elem(msg, i); int freed = 0; while (msg->sg.size) { msg->sg.size -= sge->length; freed += sk_msg_free_elem(sk, msg, i, charge); sk_msg_iter_var_next(i); sk_msg_check_to_free(msg, i, msg->sg.size); sge = sk_msg_elem(msg, i); } consume_skb(msg->skb); sk_msg_init(msg); return freed; } int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg) { return __sk_msg_free(sk, msg, msg->sg.start, false); } EXPORT_SYMBOL_GPL(sk_msg_free_nocharge); int sk_msg_free(struct sock *sk, struct sk_msg *msg) { return __sk_msg_free(sk, msg, msg->sg.start, true); } EXPORT_SYMBOL_GPL(sk_msg_free); static void __sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes, bool charge) { struct scatterlist *sge; u32 i = msg->sg.start; while (bytes) { sge = sk_msg_elem(msg, i); if (!sge->length) break; if (bytes < sge->length) { if (charge) sk_mem_uncharge(sk, bytes); sge->length -= bytes; sge->offset += bytes; msg->sg.size -= bytes; break; } msg->sg.size -= sge->length; bytes -= sge->length; sk_msg_free_elem(sk, msg, i, charge); sk_msg_iter_var_next(i); sk_msg_check_to_free(msg, i, bytes); } msg->sg.start = i; } void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes) { __sk_msg_free_partial(sk, msg, bytes, true); } EXPORT_SYMBOL_GPL(sk_msg_free_partial); void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg, u32 bytes) { __sk_msg_free_partial(sk, msg, bytes, false); } void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len) { int trim = msg->sg.size - len; u32 i = msg->sg.end; if (trim <= 0) { WARN_ON(trim < 0); return; } sk_msg_iter_var_prev(i); msg->sg.size = len; while (msg->sg.data[i].length && trim >= msg->sg.data[i].length) { trim -= msg->sg.data[i].length; sk_msg_free_elem(sk, msg, i, true); sk_msg_iter_var_prev(i); if (!trim) goto out; } msg->sg.data[i].length -= trim; sk_mem_uncharge(sk, trim); /* Adjust copybreak if it falls into the trimmed part of last buf */ if (msg->sg.curr == i && msg->sg.copybreak > msg->sg.data[i].length) msg->sg.copybreak = msg->sg.data[i].length; out: sk_msg_iter_var_next(i); msg->sg.end = i; /* If we trim data a full sg elem before curr pointer update * copybreak and current so that any future copy operations * start at new copy location. * However trimmed data that has not yet been used in a copy op * does not require an update. */ if (!msg->sg.size) { msg->sg.curr = msg->sg.start; msg->sg.copybreak = 0; } else if (sk_msg_iter_dist(msg->sg.start, msg->sg.curr) >= sk_msg_iter_dist(msg->sg.start, msg->sg.end)) { sk_msg_iter_var_prev(i); msg->sg.curr = i; msg->sg.copybreak = msg->sg.data[i].length; } } EXPORT_SYMBOL_GPL(sk_msg_trim); int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes) { int i, maxpages, ret = 0, num_elems = sk_msg_elem_used(msg); const int to_max_pages = MAX_MSG_FRAGS; struct page *pages[MAX_MSG_FRAGS]; ssize_t orig, copied, use, offset; orig = msg->sg.size; while (bytes > 0) { i = 0; maxpages = to_max_pages - num_elems; if (maxpages == 0) { ret = -EFAULT; goto out; } copied = iov_iter_get_pages2(from, pages, bytes, maxpages, &offset); if (copied <= 0) { ret = -EFAULT; goto out; } bytes -= copied; msg->sg.size += copied; while (copied) { use = min_t(int, copied, PAGE_SIZE - offset); sg_set_page(&msg->sg.data[msg->sg.end], pages[i], use, offset); sg_unmark_end(&msg->sg.data[msg->sg.end]); sk_mem_charge(sk, use); offset = 0; copied -= use; sk_msg_iter_next(msg, end); num_elems++; i++; } /* When zerocopy is mixed with sk_msg_*copy* operations we * may have a copybreak set in this case clear and prefer * zerocopy remainder when possible. */ msg->sg.copybreak = 0; msg->sg.curr = msg->sg.end; } out: /* Revert iov_iter updates, msg will need to use 'trim' later if it * also needs to be cleared. */ if (ret) iov_iter_revert(from, msg->sg.size - orig); return ret; } EXPORT_SYMBOL_GPL(sk_msg_zerocopy_from_iter); int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes) { int ret = -ENOSPC, i = msg->sg.curr; u32 copy, buf_size, copied = 0; struct scatterlist *sge; void *to; do { sge = sk_msg_elem(msg, i); /* This is possible if a trim operation shrunk the buffer */ if (msg->sg.copybreak >= sge->length) { msg->sg.copybreak = 0; sk_msg_iter_var_next(i); if (i == msg->sg.end) break; sge = sk_msg_elem(msg, i); } buf_size = sge->length - msg->sg.copybreak; copy = (buf_size > bytes) ? bytes : buf_size; to = sg_virt(sge) + msg->sg.copybreak; msg->sg.copybreak += copy; if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) ret = copy_from_iter_nocache(to, copy, from); else ret = copy_from_iter(to, copy, from); if (ret != copy) { ret = -EFAULT; goto out; } bytes -= copy; copied += copy; if (!bytes) break; msg->sg.copybreak = 0; sk_msg_iter_var_next(i); } while (i != msg->sg.end); out: msg->sg.curr = i; return (ret < 0) ? ret : copied; } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); /* Receive sk_msg from psock->ingress_msg to @msg. */ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags) { struct iov_iter *iter = &msg->msg_iter; int peek = flags & MSG_PEEK; struct sk_msg *msg_rx; int i, copied = 0; msg_rx = sk_psock_peek_msg(psock); while (copied != len) { struct scatterlist *sge; if (unlikely(!msg_rx)) break; i = msg_rx->sg.start; do { struct page *page; int copy; sge = sk_msg_elem(msg_rx, i); copy = sge->length; page = sg_page(sge); if (copied + copy > len) copy = len - copied; if (copy) copy = copy_page_to_iter(page, sge->offset, copy, iter); if (!copy) { copied = copied ? copied : -EFAULT; goto out; } copied += copy; if (likely(!peek)) { sge->offset += copy; sge->length -= copy; if (!msg_rx->skb) { sk_mem_uncharge(sk, copy); atomic_sub(copy, &sk->sk_rmem_alloc); } msg_rx->sg.size -= copy; if (!sge->length) { sk_msg_iter_var_next(i); if (!msg_rx->skb) put_page(page); } } else { /* Lets not optimize peek case if copy_page_to_iter * didn't copy the entire length lets just break. */ if (copy != sge->length) goto out; sk_msg_iter_var_next(i); } if (copied == len) break; } while ((i != msg_rx->sg.end) && !sg_is_last(sge)); if (unlikely(peek)) { msg_rx = sk_psock_next_msg(psock, msg_rx); if (!msg_rx) break; continue; } msg_rx->sg.start = i; if (!sge->length && (i == msg_rx->sg.end || sg_is_last(sge))) { msg_rx = sk_psock_dequeue_msg(psock); kfree_sk_msg(msg_rx); } msg_rx = sk_psock_peek_msg(psock); } out: return copied; } EXPORT_SYMBOL_GPL(sk_msg_recvmsg); bool sk_msg_is_readable(struct sock *sk) { struct sk_psock *psock; bool empty = true; rcu_read_lock(); psock = sk_psock(sk); if (likely(psock)) empty = list_empty(&psock->ingress_msg); rcu_read_unlock(); return !empty; } EXPORT_SYMBOL_GPL(sk_msg_is_readable); static struct sk_msg *alloc_sk_msg(gfp_t gfp) { struct sk_msg *msg; msg = kzalloc(sizeof(*msg), gfp | __GFP_NOWARN); if (unlikely(!msg)) return NULL; sg_init_marker(msg->sg.data, NR_MSG_FRAG_IDS); return msg; } static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, struct sk_buff *skb) { if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) return NULL; if (!sk_rmem_schedule(sk, skb, skb->truesize)) return NULL; return alloc_sk_msg(GFP_KERNEL); } static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, u32 off, u32 len, struct sk_psock *psock, struct sock *sk, struct sk_msg *msg, bool take_ref) { int num_sge, copied; /* skb_to_sgvec will fail when the total number of fragments in * frag_list and frags exceeds MAX_MSG_FRAGS. For example, the * caller may aggregate multiple skbs. */ num_sge = skb_to_sgvec(skb, msg->sg.data, off, len); if (num_sge < 0) { /* skb linearize may fail with ENOMEM, but lets simply try again * later if this happens. Under memory pressure we don't want to * drop the skb. We need to linearize the skb so that the mapping * in skb_to_sgvec can not error. * Note that skb_linearize requires the skb not to be shared. */ if (skb_linearize(skb)) return -EAGAIN; num_sge = skb_to_sgvec(skb, msg->sg.data, off, len); if (unlikely(num_sge < 0)) return num_sge; } #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) psock->ingress_bytes += len; #endif copied = len; msg->sg.start = 0; msg->sg.size = copied; msg->sg.end = num_sge; msg->skb = take_ref ? skb_get(skb) : skb; sk_psock_queue_msg(psock, msg); sk_psock_data_ready(sk, psock); return copied; } static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len, bool take_ref); static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len) { struct sock *sk = psock->sk; struct sk_msg *msg; int err; /* If we are receiving on the same sock skb->sk is already assigned, * skip memory accounting and owner transition seeing it already set * correctly. */ if (unlikely(skb->sk == sk)) return sk_psock_skb_ingress_self(psock, skb, off, len, true); msg = sk_psock_create_ingress_msg(sk, skb); if (!msg) return -EAGAIN; /* This will transition ownership of the data from the socket where * the BPF program was run initiating the redirect to the socket * we will eventually receive this data on. The data will be released * from skb_consume found in __tcp_bpf_recvmsg() after its been copied * into user buffers. */ skb_set_owner_r(skb, sk); err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, true); if (err < 0) kfree(msg); return err; } /* Puts an skb on the ingress queue of the socket already assigned to the * skb. In this case we do not need to check memory limits or skb_set_owner_r * because the skb is already accounted for here. */ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len, bool take_ref) { struct sk_msg *msg = alloc_sk_msg(GFP_ATOMIC); struct sock *sk = psock->sk; int err; if (unlikely(!msg)) return -EAGAIN; skb_set_owner_r(skb, sk); err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, take_ref); if (err < 0) kfree(msg); return err; } static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len, bool ingress) { if (!ingress) { if (!sock_writeable(psock->sk)) return -EAGAIN; return skb_send_sock(psock->sk, skb, off, len); } return sk_psock_skb_ingress(psock, skb, off, len); } static void sk_psock_skb_state(struct sk_psock *psock, struct sk_psock_work_state *state, int len, int off) { spin_lock_bh(&psock->ingress_lock); if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { state->len = len; state->off = off; } spin_unlock_bh(&psock->ingress_lock); } static void sk_psock_backlog(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct sk_psock *psock = container_of(dwork, struct sk_psock, work); struct sk_psock_work_state *state = &psock->work_state; struct sk_buff *skb = NULL; u32 len = 0, off = 0; bool ingress; int ret; /* If sk is quickly removed from the map and then added back, the old * psock should not be scheduled, because there are now two psocks * pointing to the same sk. */ if (!sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) return; /* Increment the psock refcnt to synchronize with close(fd) path in * sock_map_close(), ensuring we wait for backlog thread completion * before sk_socket freed. If refcnt increment fails, it indicates * sock_map_close() completed with sk_socket potentially already freed. */ if (!sk_psock_get(psock->sk)) return; mutex_lock(&psock->work_mutex); while ((skb = skb_peek(&psock->ingress_skb))) { len = skb->len; off = 0; if (skb_bpf_strparser(skb)) { struct strp_msg *stm = strp_msg(skb); off = stm->offset; len = stm->full_len; } /* Resume processing from previous partial state */ if (unlikely(state->len)) { len = state->len; off = state->off; } ingress = skb_bpf_ingress(skb); skb_bpf_redirect_clear(skb); do { ret = -EIO; if (!sock_flag(psock->sk, SOCK_DEAD)) ret = sk_psock_handle_skb(psock, skb, off, len, ingress); if (ret <= 0) { if (ret == -EAGAIN) { sk_psock_skb_state(psock, state, len, off); /* Restore redir info we cleared before */ skb_bpf_set_redir(skb, psock->sk, ingress); /* Delay slightly to prioritize any * other work that might be here. */ if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) schedule_delayed_work(&psock->work, 1); goto end; } /* Hard errors break pipe and stop xmit. */ sk_psock_report_error(psock, ret ? -ret : EPIPE); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); goto end; } off += ret; len -= ret; } while (len); /* The entire skb sent, clear state */ sk_psock_skb_state(psock, state, 0, 0); skb = skb_dequeue(&psock->ingress_skb); kfree_skb(skb); } end: mutex_unlock(&psock->work_mutex); sk_psock_put(psock->sk, psock); } struct sk_psock *sk_psock_init(struct sock *sk, int node) { struct sk_psock *psock; struct proto *prot; write_lock_bh(&sk->sk_callback_lock); if (sk_is_inet(sk) && inet_csk_has_ulp(sk)) { psock = ERR_PTR(-EINVAL); goto out; } if (sk->sk_user_data) { psock = ERR_PTR(-EBUSY); goto out; } psock = kzalloc_node(sizeof(*psock), GFP_ATOMIC | __GFP_NOWARN, node); if (!psock) { psock = ERR_PTR(-ENOMEM); goto out; } prot = READ_ONCE(sk->sk_prot); psock->sk = sk; psock->eval = __SK_NONE; psock->sk_proto = prot; psock->saved_unhash = prot->unhash; psock->saved_destroy = prot->destroy; psock->saved_close = prot->close; psock->saved_write_space = sk->sk_write_space; INIT_LIST_HEAD(&psock->link); spin_lock_init(&psock->link_lock); INIT_DELAYED_WORK(&psock->work, sk_psock_backlog); mutex_init(&psock->work_mutex); INIT_LIST_HEAD(&psock->ingress_msg); spin_lock_init(&psock->ingress_lock); skb_queue_head_init(&psock->ingress_skb); sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED); refcount_set(&psock->refcnt, 1); __rcu_assign_sk_user_data_with_flags(sk, psock, SK_USER_DATA_NOCOPY | SK_USER_DATA_PSOCK); sock_hold(sk); out: write_unlock_bh(&sk->sk_callback_lock); return psock; } EXPORT_SYMBOL_GPL(sk_psock_init); struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock) { struct sk_psock_link *link; spin_lock_bh(&psock->link_lock); link = list_first_entry_or_null(&psock->link, struct sk_psock_link, list); if (link) list_del(&link->list); spin_unlock_bh(&psock->link_lock); return link; } static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) { struct sk_msg *msg, *tmp; list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) { list_del(&msg->list); if (!msg->skb) atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc); sk_msg_free(psock->sk, msg); kfree(msg); } } static void __sk_psock_zap_ingress(struct sk_psock *psock) { struct sk_buff *skb; while ((skb = skb_dequeue(&psock->ingress_skb)) != NULL) { skb_bpf_redirect_clear(skb); sock_drop(psock->sk, skb); } __sk_psock_purge_ingress_msg(psock); } static void sk_psock_link_destroy(struct sk_psock *psock) { struct sk_psock_link *link, *tmp; list_for_each_entry_safe(link, tmp, &psock->link, list) { list_del(&link->list); sk_psock_free_link(link); } } void sk_psock_stop(struct sk_psock *psock) { spin_lock_bh(&psock->ingress_lock); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); sk_psock_cork_free(psock); spin_unlock_bh(&psock->ingress_lock); } static void sk_psock_done_strp(struct sk_psock *psock); static void sk_psock_destroy(struct work_struct *work) { struct sk_psock *psock = container_of(to_rcu_work(work), struct sk_psock, rwork); /* No sk_callback_lock since already detached. */ sk_psock_done_strp(psock); cancel_delayed_work_sync(&psock->work); __sk_psock_zap_ingress(psock); mutex_destroy(&psock->work_mutex); psock_progs_drop(&psock->progs); sk_psock_link_destroy(psock); sk_psock_cork_free(psock); if (psock->sk_redir) sock_put(psock->sk_redir); if (psock->sk_pair) sock_put(psock->sk_pair); sock_put(psock->sk); kfree(psock); } void sk_psock_drop(struct sock *sk, struct sk_psock *psock) { write_lock_bh(&sk->sk_callback_lock); sk_psock_restore_proto(sk, psock); rcu_assign_sk_user_data(sk, NULL); if (psock->progs.stream_parser) sk_psock_stop_strp(sk, psock); else if (psock->progs.stream_verdict || psock->progs.skb_verdict) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); sk_psock_stop(psock); INIT_RCU_WORK(&psock->rwork, sk_psock_destroy); queue_rcu_work(system_wq, &psock->rwork); } EXPORT_SYMBOL_GPL(sk_psock_drop); static int sk_psock_map_verd(int verdict, bool redir) { switch (verdict) { case SK_PASS: return redir ? __SK_REDIRECT : __SK_PASS; case SK_DROP: default: break; } return __SK_DROP; } int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg) { struct bpf_prog *prog; int ret; rcu_read_lock(); prog = READ_ONCE(psock->progs.msg_parser); if (unlikely(!prog)) { ret = __SK_PASS; goto out; } sk_msg_compute_data_pointers(msg); msg->sk = sk; ret = bpf_prog_run_pin_on_cpu(prog, msg); ret = sk_psock_map_verd(ret, msg->sk_redir); psock->apply_bytes = msg->apply_bytes; if (ret == __SK_REDIRECT) { if (psock->sk_redir) { sock_put(psock->sk_redir); psock->sk_redir = NULL; } if (!msg->sk_redir) { ret = __SK_DROP; goto out; } psock->redir_ingress = sk_msg_to_ingress(msg); psock->sk_redir = msg->sk_redir; sock_hold(psock->sk_redir); } out: rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb) { struct sk_psock *psock_other; struct sock *sk_other; sk_other = skb_bpf_redirect_fetch(skb); /* This error is a buggy BPF program, it returned a redirect * return code, but then didn't set a redirect interface. */ if (unlikely(!sk_other)) { skb_bpf_redirect_clear(skb); sock_drop(from->sk, skb); return -EIO; } psock_other = sk_psock(sk_other); /* This error indicates the socket is being torn down or had another * error that caused the pipe to break. We can't send a packet on * a socket that is in this state so we drop the skb. */ if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) { skb_bpf_redirect_clear(skb); sock_drop(from->sk, skb); return -EIO; } spin_lock_bh(&psock_other->ingress_lock); if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { spin_unlock_bh(&psock_other->ingress_lock); skb_bpf_redirect_clear(skb); sock_drop(from->sk, skb); return -EIO; } skb_queue_tail(&psock_other->ingress_skb, skb); schedule_delayed_work(&psock_other->work, 0); spin_unlock_bh(&psock_other->ingress_lock); return 0; } static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sk_psock *from, int verdict) { switch (verdict) { case __SK_REDIRECT: sk_psock_skb_redirect(from, skb); break; case __SK_PASS: case __SK_DROP: default: break; } } int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) { struct bpf_prog *prog; int ret = __SK_PASS; rcu_read_lock(); prog = READ_ONCE(psock->progs.stream_verdict); if (likely(prog)) { skb->sk = psock->sk; skb_dst_drop(skb); skb_bpf_redirect_clear(skb); ret = bpf_prog_run_pin_on_cpu(prog, skb); ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } sk_psock_tls_verdict_apply(skb, psock, ret); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, int verdict) { struct sock *sk_other; int err = 0; u32 len, off; switch (verdict) { case __SK_PASS: err = -EIO; sk_other = psock->sk; if (sock_flag(sk_other, SOCK_DEAD) || !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) goto out_free; skb_bpf_set_ingress(skb); /* If the queue is empty then we can submit directly * into the msg queue. If its not empty we have to * queue work otherwise we may get OOO data. Otherwise, * if sk_psock_skb_ingress errors will be handled by * retrying later from workqueue. */ if (skb_queue_empty(&psock->ingress_skb)) { len = skb->len; off = 0; if (skb_bpf_strparser(skb)) { struct strp_msg *stm = strp_msg(skb); off = stm->offset; len = stm->full_len; } err = sk_psock_skb_ingress_self(psock, skb, off, len, false); } if (err < 0) { spin_lock_bh(&psock->ingress_lock); if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { skb_queue_tail(&psock->ingress_skb, skb); schedule_delayed_work(&psock->work, 0); err = 0; } spin_unlock_bh(&psock->ingress_lock); if (err < 0) goto out_free; } break; case __SK_REDIRECT: tcp_eat_skb(psock->sk, skb); err = sk_psock_skb_redirect(psock, skb); break; case __SK_DROP: default: out_free: skb_bpf_redirect_clear(skb); tcp_eat_skb(psock->sk, skb); sock_drop(psock->sk, skb); } return err; } static void sk_psock_write_space(struct sock *sk) { struct sk_psock *psock; void (*write_space)(struct sock *sk) = NULL; rcu_read_lock(); psock = sk_psock(sk); if (likely(psock)) { if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) schedule_delayed_work(&psock->work, 0); write_space = psock->saved_write_space; } rcu_read_unlock(); if (write_space) write_space(sk); } #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) { struct sk_psock *psock; struct bpf_prog *prog; int ret = __SK_DROP; struct sock *sk; rcu_read_lock(); sk = strp->sk; psock = sk_psock(sk); if (unlikely(!psock)) { sock_drop(sk, skb); goto out; } prog = READ_ONCE(psock->progs.stream_verdict); if (likely(prog)) { skb->sk = sk; skb_dst_drop(skb); skb_bpf_redirect_clear(skb); ret = bpf_prog_run_pin_on_cpu(prog, skb); skb_bpf_set_strparser(skb); ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } sk_psock_verdict_apply(psock, skb, ret); out: rcu_read_unlock(); } static int sk_psock_strp_read_done(struct strparser *strp, int err) { return err; } static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) { struct sk_psock *psock = container_of(strp, struct sk_psock, strp); struct bpf_prog *prog; int ret = skb->len; rcu_read_lock(); prog = READ_ONCE(psock->progs.stream_parser); if (likely(prog)) { skb->sk = psock->sk; ret = bpf_prog_run_pin_on_cpu(prog, skb); skb->sk = NULL; } rcu_read_unlock(); return ret; } /* Called with socket lock held. */ static void sk_psock_strp_data_ready(struct sock *sk) { struct sk_psock *psock; trace_sk_data_ready(sk); rcu_read_lock(); psock = sk_psock(sk); if (likely(psock)) { if (tls_sw_has_ctx_rx(sk)) { psock->saved_data_ready(sk); } else { read_lock_bh(&sk->sk_callback_lock); strp_data_ready(&psock->strp); read_unlock_bh(&sk->sk_callback_lock); } } rcu_read_unlock(); } int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) { int ret; static const struct strp_callbacks cb = { .rcv_msg = sk_psock_strp_read, .read_sock_done = sk_psock_strp_read_done, .parse_msg = sk_psock_strp_parse, }; ret = strp_init(&psock->strp, sk, &cb); if (!ret) sk_psock_set_state(psock, SK_PSOCK_RX_STRP_ENABLED); if (sk_is_tcp(sk)) { psock->strp.cb.read_sock = tcp_bpf_strp_read_sock; psock->copied_seq = tcp_sk(sk)->copied_seq; } return ret; } void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) { if (psock->saved_data_ready) return; psock->saved_data_ready = sk->sk_data_ready; sk->sk_data_ready = sk_psock_strp_data_ready; sk->sk_write_space = sk_psock_write_space; } void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) { psock_set_prog(&psock->progs.stream_parser, NULL); if (!psock->saved_data_ready) return; sk->sk_data_ready = psock->saved_data_ready; psock->saved_data_ready = NULL; strp_stop(&psock->strp); } static void sk_psock_done_strp(struct sk_psock *psock) { /* Parser has been stopped */ if (sk_psock_test_state(psock, SK_PSOCK_RX_STRP_ENABLED)) strp_done(&psock->strp); } #else static void sk_psock_done_strp(struct sk_psock *psock) { } #endif /* CONFIG_BPF_STREAM_PARSER */ static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb) { struct sk_psock *psock; struct bpf_prog *prog; int ret = __SK_DROP; int len = skb->len; rcu_read_lock(); psock = sk_psock(sk); if (unlikely(!psock)) { len = 0; tcp_eat_skb(sk, skb); sock_drop(sk, skb); goto out; } prog = READ_ONCE(psock->progs.stream_verdict); if (!prog) prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { skb_dst_drop(skb); skb_bpf_redirect_clear(skb); ret = bpf_prog_run_pin_on_cpu(prog, skb); ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); } ret = sk_psock_verdict_apply(psock, skb, ret); if (ret < 0) len = ret; out: rcu_read_unlock(); return len; } static void sk_psock_verdict_data_ready(struct sock *sk) { struct socket *sock = sk->sk_socket; const struct proto_ops *ops; int copied; trace_sk_data_ready(sk); if (unlikely(!sock)) return; ops = READ_ONCE(sock->ops); if (!ops || !ops->read_skb) return; copied = ops->read_skb(sk, sk_psock_verdict_recv); if (copied >= 0) { struct sk_psock *psock; rcu_read_lock(); psock = sk_psock(sk); if (psock) sk_psock_data_ready(sk, psock); rcu_read_unlock(); } } void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) { if (psock->saved_data_ready) return; psock->saved_data_ready = sk->sk_data_ready; sk->sk_data_ready = sk_psock_verdict_data_ready; sk->sk_write_space = sk_psock_write_space; } void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) { psock_set_prog(&psock->progs.stream_verdict, NULL); psock_set_prog(&psock->progs.skb_verdict, NULL); if (!psock->saved_data_ready) return; sk->sk_data_ready = psock->saved_data_ready; psock->saved_data_ready = NULL; } |
| 633 107 60 60 129 428 129 129 130 129 129 129 428 420 130 428 130 130 130 130 128 420 428 428 21 17 17 21 48 48 9 47 4 4 4 5 1 1 1 1 1 5 5 5 24 25 25 24 826 827 825 833 15 16 19 18 19 1211 1207 1206 39 36 39 161 161 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 | // SPDX-License-Identifier: GPL-2.0 #include <linux/mm.h> #include <linux/gfp.h> #include <linux/hugetlb.h> #include <asm/pgalloc.h> #include <asm/tlb.h> #include <asm/fixmap.h> #include <asm/mtrr.h> #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; EXPORT_SYMBOL(physical_mask); SYM_PIC_ALIAS(physical_mask); #endif pgtable_t pte_alloc_one(struct mm_struct *mm) { return __pte_alloc_one(mm, GFP_PGTABLE_USER); } void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { paravirt_release_pte(page_to_pfn(pte)); tlb_remove_ptdesc(tlb, page_ptdesc(pte)); } #if CONFIG_PGTABLE_LEVELS > 2 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); /* * NOTE! For PAE, any changes to the top page-directory-pointer-table * entries need a full cr3 reload to flush. */ #ifdef CONFIG_X86_PAE tlb->need_flush_all = 1; #endif tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd)); } #if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud)); } #if CONFIG_PGTABLE_LEVELS > 4 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) { paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); tlb_remove_ptdesc(tlb, virt_to_ptdesc(p4d)); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #endif /* CONFIG_PGTABLE_LEVELS > 2 */ static inline void pgd_list_add(pgd_t *pgd) { struct ptdesc *ptdesc = virt_to_ptdesc(pgd); list_add(&ptdesc->pt_list, &pgd_list); } static inline void pgd_list_del(pgd_t *pgd) { struct ptdesc *ptdesc = virt_to_ptdesc(pgd); list_del(&ptdesc->pt_list); } static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) { virt_to_ptdesc(pgd)->pt_mm = mm; } struct mm_struct *pgd_page_get_mm(struct page *page) { return page_ptdesc(page)->pt_mm; } static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) { /* PAE preallocates all its PMDs. No cloning needed. */ if (!IS_ENABLED(CONFIG_X86_PAE)) clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); /* List used to sync kernel mapping updates */ pgd_set_mm(pgd, mm); pgd_list_add(pgd); } static void pgd_dtor(pgd_t *pgd) { spin_lock(&pgd_lock); pgd_list_del(pgd); spin_unlock(&pgd_lock); } /* * List of all pgd's needed for non-PAE so it can invalidate entries * in both cached and uncached pgd's; not needed for PAE since the * kernel pmd is shared. If PAE were not to share the pmd a similar * tactic would be needed. This is essentially codepath-based locking * against pageattr.c; it is the unique case in which a valid change * of kernel pagetables can't be lazily synchronized by vmalloc faults. * vmalloc faults work because attached pagetables are never freed. * -- nyc */ #ifdef CONFIG_X86_PAE /* * In PAE mode, we need to do a cr3 reload (=tlb flush) when * updating the top-level pagetable entries to guarantee the * processor notices the update. Since this is expensive, and * all 4 top-level entries are used almost immediately in a * new process's life, we just pre-populate them here. */ #define PREALLOCATED_PMDS PTRS_PER_PGD /* * "USER_PMDS" are the PMDs for the user copy of the page tables when * PTI is enabled. They do not exist when PTI is disabled. Note that * this is distinct from the user _portion_ of the kernel page tables * which always exists. * * We allocate separate PMDs for the kernel part of the user page-table * when PTI is enabled. We need them to map the per-process LDT into the * user-space page-table. */ #define PREALLOCATED_USER_PMDS (boot_cpu_has(X86_FEATURE_PTI) ? \ KERNEL_PGD_PTRS : 0) #define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) { paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); /* Note: almost everything apart from _PAGE_PRESENT is reserved at the pmd (PDPT) level. */ set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); /* * According to Intel App note "TLBs, Paging-Structure Caches, * and Their Invalidation", April 2007, document 317080-001, * section 8.1: in PAE mode we explicitly have to flush the * TLB via cr3 if the top-level pgd is changed... */ flush_tlb_mm(mm); } #else /* !CONFIG_X86_PAE */ /* No need to prepopulate any pagetable entries in non-PAE modes. */ #define PREALLOCATED_PMDS 0 #define PREALLOCATED_USER_PMDS 0 #define MAX_PREALLOCATED_USER_PMDS 0 #endif /* CONFIG_X86_PAE */ static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) { int i; struct ptdesc *ptdesc; for (i = 0; i < count; i++) if (pmds[i]) { ptdesc = virt_to_ptdesc(pmds[i]); pagetable_dtor(ptdesc); pagetable_free(ptdesc); mm_dec_nr_pmds(mm); } } static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) { int i; bool failed = false; gfp_t gfp = GFP_PGTABLE_USER; if (mm == &init_mm) gfp &= ~__GFP_ACCOUNT; gfp &= ~__GFP_HIGHMEM; for (i = 0; i < count; i++) { pmd_t *pmd = NULL; struct ptdesc *ptdesc = pagetable_alloc(gfp, 0); if (!ptdesc) failed = true; if (ptdesc && !pagetable_pmd_ctor(mm, ptdesc)) { pagetable_free(ptdesc); ptdesc = NULL; failed = true; } if (ptdesc) { mm_inc_nr_pmds(mm); pmd = ptdesc_address(ptdesc); } pmds[i] = pmd; } if (failed) { free_pmds(mm, pmds, count); return -ENOMEM; } return 0; } /* * Mop up any pmd pages which may still be attached to the pgd. * Normally they will be freed by munmap/exit_mmap, but any pmd we * preallocate which never got a corresponding vma will need to be * freed manually. */ static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp) { pgd_t pgd = *pgdp; if (pgd_val(pgd) != 0) { pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); pgd_clear(pgdp); paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); pmd_free(mm, pmd); mm_dec_nr_pmds(mm); } } static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) { int i; for (i = 0; i < PREALLOCATED_PMDS; i++) mop_up_one_pmd(mm, &pgdp[i]); #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION if (!boot_cpu_has(X86_FEATURE_PTI)) return; pgdp = kernel_to_user_pgdp(pgdp); for (i = 0; i < PREALLOCATED_USER_PMDS; i++) mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]); #endif } static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) { p4d_t *p4d; pud_t *pud; int i; p4d = p4d_offset(pgd, 0); pud = pud_offset(p4d, 0); for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) { pmd_t *pmd = pmds[i]; if (i >= KERNEL_PGD_BOUNDARY) memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), sizeof(pmd_t) * PTRS_PER_PMD); pud_populate(mm, pud, pmd); } } #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION static void pgd_prepopulate_user_pmd(struct mm_struct *mm, pgd_t *k_pgd, pmd_t *pmds[]) { pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir); pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); p4d_t *u_p4d; pud_t *u_pud; int i; u_p4d = p4d_offset(u_pgd, 0); u_pud = pud_offset(u_p4d, 0); s_pgd += KERNEL_PGD_BOUNDARY; u_pud += KERNEL_PGD_BOUNDARY; for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) { pmd_t *pmd = pmds[i]; memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd), sizeof(pmd_t) * PTRS_PER_PMD); pud_populate(mm, u_pud, pmd); } } #else static void pgd_prepopulate_user_pmd(struct mm_struct *mm, pgd_t *k_pgd, pmd_t *pmds[]) { } #endif static inline pgd_t *_pgd_alloc(struct mm_struct *mm) { /* * PTI and Xen need a whole page for the PAE PGD * even though the hardware only needs 32 bytes. * * For simplicity, allocate a page for all users. */ return __pgd_alloc(mm, pgd_allocation_order()); } static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd) { __pgd_free(mm, pgd); } pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS]; pmd_t *pmds[PREALLOCATED_PMDS]; pgd = _pgd_alloc(mm); if (pgd == NULL) goto out; mm->pgd = pgd; if (sizeof(pmds) != 0 && preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0) goto out_free_pgd; if (sizeof(u_pmds) != 0 && preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0) goto out_free_pmds; if (paravirt_pgd_alloc(mm) != 0) goto out_free_user_pmds; /* * Make sure that pre-populating the pmds is atomic with * respect to anything walking the pgd_list, so that they * never see a partially populated pgd. */ spin_lock(&pgd_lock); pgd_ctor(mm, pgd); if (sizeof(pmds) != 0) pgd_prepopulate_pmd(mm, pgd, pmds); if (sizeof(u_pmds) != 0) pgd_prepopulate_user_pmd(mm, pgd, u_pmds); spin_unlock(&pgd_lock); return pgd; out_free_user_pmds: if (sizeof(u_pmds) != 0) free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS); out_free_pmds: if (sizeof(pmds) != 0) free_pmds(mm, pmds, PREALLOCATED_PMDS); out_free_pgd: _pgd_free(mm, pgd); out: return NULL; } void pgd_free(struct mm_struct *mm, pgd_t *pgd) { pgd_mop_up_pmds(mm, pgd); pgd_dtor(pgd); paravirt_pgd_free(mm, pgd); _pgd_free(mm, pgd); } /* * Used to set accessed or dirty bits in the page table entries * on other architectures. On x86, the accessed and dirty bits * are tracked by hardware. However, do_wp_page calls this function * to also make the pte writeable at the same time the dirty bit is * set. In that case we do actually need to write the PTE. */ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) { int changed = !pte_same(*ptep, entry); if (changed && dirty) set_pte(ptep, entry); return changed; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { int changed = !pmd_same(*pmdp, entry); VM_BUG_ON(address & ~HPAGE_PMD_MASK); if (changed && dirty) { set_pmd(pmdp, entry); /* * We had a write-protection fault here and changed the pmd * to to more permissive. No need to flush the TLB for that, * #PF is architecturally guaranteed to do that and in the * worst-case we'll generate a spurious fault. */ } return changed; } int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty) { int changed = !pud_same(*pudp, entry); VM_BUG_ON(address & ~HPAGE_PUD_MASK); if (changed && dirty) { set_pud(pudp, entry); /* * We had a write-protection fault here and changed the pud * to to more permissive. No need to flush the TLB for that, * #PF is architecturally guaranteed to do that and in the * worst-case we'll generate a spurious fault. */ } return changed; } #endif int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { int ret = 0; if (pte_young(*ptep)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *) &ptep->pte); return ret; } #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { int ret = 0; if (pmd_young(*pmdp)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *)pmdp); return ret; } #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE int pudp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp) { int ret = 0; if (pud_young(*pudp)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *)pudp); return ret; } #endif int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { /* * On x86 CPUs, clearing the accessed bit without a TLB flush * doesn't cause data corruption. [ It could cause incorrect * page aging and the (mistaken) reclaim of hot pages, but the * chance of that should be relatively low. ] * * So as a performance optimization don't flush the TLB when * clearing the accessed bit, it will eventually be flushed by * a context switch or a VM operation anyway. [ In the rare * event of it not getting flushed for a long time the delay * shouldn't really matter because there's no real memory * pressure for swapout to react to. ] */ return ptep_test_and_clear_young(vma, address, ptep); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { int young; VM_BUG_ON(address & ~HPAGE_PMD_MASK); young = pmdp_test_and_clear_young(vma, address, pmdp); if (young) flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return young; } pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { VM_WARN_ON_ONCE(!pmd_present(*pmdp)); /* * No flush is necessary. Once an invalid PTE is established, the PTE's * access and dirty bits cannot be updated. */ return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp)); } #endif #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, pud_t *pudp) { VM_WARN_ON_ONCE(!pud_present(*pudp)); pud_t old = pudp_establish(vma, address, pudp, pud_mkinvalid(*pudp)); flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); return old; } #endif /** * reserve_top_address - Reserve a hole in the top of the kernel address space * @reserve: Size of hole to reserve * * Can be used to relocate the fixmap area and poke a hole in the top * of the kernel address space to make room for a hypervisor. */ void __init reserve_top_address(unsigned long reserve) { #ifdef CONFIG_X86_32 BUG_ON(fixmaps_set > 0); __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE; printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n", -reserve, __FIXADDR_TOP + PAGE_SIZE); #endif } int fixmaps_set; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) { unsigned long address = __fix_to_virt(idx); #ifdef CONFIG_X86_64 /* * Ensure that the static initial page tables are covering the * fixmap completely. */ BUILD_BUG_ON(__end_of_permanent_fixed_addresses > (FIXMAP_PMD_NUM * PTRS_PER_PTE)); #endif if (idx >= __end_of_fixed_addresses) { BUG(); return; } set_pte_vaddr(address, pte); fixmaps_set++; } void native_set_fixmap(unsigned /* enum fixed_addresses */ idx, phys_addr_t phys, pgprot_t flags) { /* Sanitize 'prot' against any unsupported bits: */ pgprot_val(flags) &= __default_kernel_pte_mask; __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); } #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP #if CONFIG_PGTABLE_LEVELS > 4 /** * p4d_set_huge - Set up kernel P4D mapping * @p4d: Pointer to the P4D entry * @addr: Virtual address associated with the P4D entry * @prot: Protection bits to use * * No 512GB pages yet -- always return 0 */ int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; } /** * p4d_clear_huge - Clear kernel P4D mapping when it is set * @p4d: Pointer to the P4D entry to clear * * No 512GB pages yet -- do nothing */ void p4d_clear_huge(p4d_t *p4d) { } #endif /** * pud_set_huge - Set up kernel PUD mapping * @pud: Pointer to the PUD entry * @addr: Virtual address associated with the PUD entry * @prot: Protection bits to use * * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this * function sets up a huge page only if the complete range has the same MTRR * caching mode. * * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger * page mapping attempt fails. * * Returns 1 on success and 0 on failure. */ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { u8 uniform; mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform); if (!uniform) return 0; /* Bail out if we are we on a populated non-leaf entry: */ if (pud_present(*pud) && !pud_leaf(*pud)) return 0; set_pte((pte_t *)pud, pfn_pte( (u64)addr >> PAGE_SHIFT, __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); return 1; } /** * pmd_set_huge - Set up kernel PMD mapping * @pmd: Pointer to the PMD entry * @addr: Virtual address associated with the PMD entry * @prot: Protection bits to use * * See text over pud_set_huge() above. * * Returns 1 on success and 0 on failure. */ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { u8 uniform; mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform); if (!uniform) { pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n", __func__, addr, addr + PMD_SIZE); return 0; } /* Bail out if we are we on a populated non-leaf entry: */ if (pmd_present(*pmd) && !pmd_leaf(*pmd)) return 0; set_pte((pte_t *)pmd, pfn_pte( (u64)addr >> PAGE_SHIFT, __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); return 1; } /** * pud_clear_huge - Clear kernel PUD mapping when it is set * @pud: Pointer to the PUD entry to clear. * * Returns 1 on success and 0 on failure (no PUD map is found). */ int pud_clear_huge(pud_t *pud) { if (pud_leaf(*pud)) { pud_clear(pud); return 1; } return 0; } /** * pmd_clear_huge - Clear kernel PMD mapping when it is set * @pmd: Pointer to the PMD entry to clear. * * Returns 1 on success and 0 on failure (no PMD map is found). */ int pmd_clear_huge(pmd_t *pmd) { if (pmd_leaf(*pmd)) { pmd_clear(pmd); return 1; } return 0; } #ifdef CONFIG_X86_64 /** * pud_free_pmd_page - Clear PUD entry and free PMD page * @pud: Pointer to a PUD * @addr: Virtual address associated with PUD * * Context: The PUD range has been unmapped and TLB purged. * Return: 1 if clearing the entry succeeded. 0 otherwise. * * NOTE: Callers must allow a single page allocation. */ int pud_free_pmd_page(pud_t *pud, unsigned long addr) { pmd_t *pmd, *pmd_sv; pte_t *pte; int i; pmd = pud_pgtable(*pud); pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL); if (!pmd_sv) return 0; for (i = 0; i < PTRS_PER_PMD; i++) { pmd_sv[i] = pmd[i]; if (!pmd_none(pmd[i])) pmd_clear(&pmd[i]); } pud_clear(pud); /* INVLPG to clear all paging-structure caches */ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); for (i = 0; i < PTRS_PER_PMD; i++) { if (!pmd_none(pmd_sv[i])) { pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]); pte_free_kernel(&init_mm, pte); } } free_page((unsigned long)pmd_sv); pmd_free(&init_mm, pmd); return 1; } /** * pmd_free_pte_page - Clear PMD entry and free PTE page. * @pmd: Pointer to the PMD * @addr: Virtual address associated with PMD * * Context: The PMD range has been unmapped and TLB purged. * Return: 1 if clearing the entry succeeded. 0 otherwise. */ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { pte_t *pte; pte = (pte_t *)pmd_page_vaddr(*pmd); pmd_clear(pmd); /* INVLPG to clear all paging-structure caches */ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); pte_free_kernel(&init_mm, pte); return 1; } #else /* !CONFIG_X86_64 */ /* * Disable free page handling on x86-PAE. This assures that ioremap() * does not update sync'd PMD entries. See vmalloc_sync_one(). */ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { return pmd_none(*pmd); } #endif /* CONFIG_X86_64 */ #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma) { if (vma->vm_flags & VM_SHADOW_STACK) return pte_mkwrite_shstk(pte); pte = pte_mkwrite_novma(pte); return pte_clear_saveddirty(pte); } pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (vma->vm_flags & VM_SHADOW_STACK) return pmd_mkwrite_shstk(pmd); pmd = pmd_mkwrite_novma(pmd); return pmd_clear_saveddirty(pmd); } void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte) { /* * Hardware before shadow stack can (rarely) set Dirty=1 * on a Write=0 PTE. So the below condition * only indicates a software bug when shadow stack is * supported by the HW. This checking is covered in * pte_shstk(). */ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pte_shstk(pte)); } void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd) { /* See note in arch_check_zapped_pte() */ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pmd_shstk(pmd)); } void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud) { /* See note in arch_check_zapped_pte() */ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pud_shstk(pud)); } |
| 504 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_FSGSBASE_H #define _ASM_FSGSBASE_H #ifndef __ASSEMBLER__ #ifdef CONFIG_X86_64 #include <asm/msr.h> /* * Read/write a task's FSBASE or GSBASE. This returns the value that * the FS/GS base would have (if the task were to be resumed). These * work on the current task or on a non-running (typically stopped * ptrace child) task. */ extern unsigned long x86_fsbase_read_task(struct task_struct *task); extern unsigned long x86_gsbase_read_task(struct task_struct *task); extern void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase); extern void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase); /* Must be protected by X86_FEATURE_FSGSBASE check. */ static __always_inline unsigned long rdfsbase(void) { unsigned long fsbase; asm volatile("rdfsbase %0" : "=r" (fsbase) :: "memory"); return fsbase; } static __always_inline unsigned long rdgsbase(void) { unsigned long gsbase; asm volatile("rdgsbase %0" : "=r" (gsbase) :: "memory"); return gsbase; } static __always_inline void wrfsbase(unsigned long fsbase) { asm volatile("wrfsbase %0" :: "r" (fsbase) : "memory"); } static __always_inline void wrgsbase(unsigned long gsbase) { asm volatile("wrgsbase %0" :: "r" (gsbase) : "memory"); } #include <asm/cpufeature.h> /* Helper functions for reading/writing FS/GS base */ static inline unsigned long x86_fsbase_read_cpu(void) { unsigned long fsbase; if (boot_cpu_has(X86_FEATURE_FSGSBASE)) fsbase = rdfsbase(); else rdmsrq(MSR_FS_BASE, fsbase); return fsbase; } static inline void x86_fsbase_write_cpu(unsigned long fsbase) { if (boot_cpu_has(X86_FEATURE_FSGSBASE)) wrfsbase(fsbase); else wrmsrq(MSR_FS_BASE, fsbase); } extern unsigned long x86_gsbase_read_cpu_inactive(void); extern void x86_gsbase_write_cpu_inactive(unsigned long gsbase); extern unsigned long x86_fsgsbase_read_task(struct task_struct *task, unsigned short selector); #endif /* CONFIG_X86_64 */ #endif /* __ASSEMBLER__ */ #endif /* _ASM_FSGSBASE_H */ |
| 29 52 157 158 196 84 44 44 43 93 54 54 57 57 44 44 44 44 44 43 44 44 44 44 44 117 117 117 117 93 54 117 94 80 20 20 60 34 117 44 44 44 19 42 117 49 117 44 19 44 43 44 19 43 44 44 44 44 44 43 19 43 44 43 44 44 44 44 44 19 8 8 19 19 19 19 19 19 19 19 19 44 7 7 7 7 7 7 6 7 7 7 7 29 29 29 29 29 29 109 110 110 110 110 110 155 155 155 145 145 144 145 154 155 154 155 155 44 44 44 155 145 7 7 7 145 29 155 56 11 11 1 57 15 15 148 149 150 146 144 144 148 172 172 173 170 170 170 172 171 115 116 7 7 7 7 7 7 7 7 7 7 6 7 7 57 111 77 57 34 34 34 34 34 57 57 57 76 77 77 1 43 43 37 34 43 37 7 42 3 123 123 53 76 47 38 38 76 37 37 37 37 37 37 37 6 6 6 6 6 6 6 5 31 25 73 66 69 69 69 52 69 10 68 123 556 487 499 502 7 496 574 581 568 26 573 575 579 574 573 569 579 573 153 556 238 224 267 327 180 182 24 24 24 220 521 41 525 526 521 527 526 523 525 42 502 503 10 10 501 494 498 495 496 30 76 78 78 76 56 76 75 76 19 1 124 126 90 22 90 85 84 84 72 84 83 84 84 84 82 84 76 84 79 84 79 84 79 1 79 67 79 22 22 22 22 22 22 22 21 20 21 21 21 21 21 22 17 9 17 9 9 9 9 9 9 9 9 9 9 9 9 9 8 9 8 8 8 9 9 8 9 9 8 9 9 9 91 92 91 91 90 91 91 92 91 92 92 91 92 91 92 92 30 91 92 91 10 10 92 92 3 3 3 3 3 3 3 3 3 3 3 3 3 2 8 9 4 9 9 9 9 5 4 2 3 3 7 6 5 6 4 6 7 4 9 9 4 9 2 9 9 9 4 9 9 36 36 36 35 2 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 7 7 7 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 | // SPDX-License-Identifier: GPL-2.0-or-later /* * * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet * & Swedish University of Agricultural Sciences. * * Jens Laas <jens.laas@data.slu.se> Swedish University of * Agricultural Sciences. * * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet * * This work is based on the LPC-trie which is originally described in: * * An experimental study of compression methods for dynamic tries * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. * https://www.csc.kth.se/~snilsson/software/dyntrie2/ * * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999 * * Code from fib_hash has been reused which includes the following header: * * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IPv4 FIB: lookup engine and maintenance routines. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Substantial contributions to this work comes from: * * David S. Miller, <davem@davemloft.net> * Stephen Hemminger <shemminger@osdl.org> * Paul E. McKenney <paulmck@us.ibm.com> * Patrick McHardy <kaber@trash.net> */ #include <linux/cache.h> #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/proc_fs.h> #include <linux/rcupdate.h> #include <linux/rcupdate_wait.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/vmalloc.h> #include <linux/notifier.h> #include <net/net_namespace.h> #include <net/inet_dscp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <net/tcp.h> #include <net/sock.h> #include <net/ip_fib.h> #include <net/fib_notifier.h> #include <trace/events/fib.h> #include "fib_lookup.h" static int call_fib_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, u32 dst, int dst_len, struct fib_alias *fa, struct netlink_ext_ack *extack) { struct fib_entry_notifier_info info = { .info.extack = extack, .dst = dst, .dst_len = dst_len, .fi = fa->fa_info, .dscp = fa->fa_dscp, .type = fa->fa_type, .tb_id = fa->tb_id, }; return call_fib4_notifier(nb, event_type, &info.info); } static int call_fib_entry_notifiers(struct net *net, enum fib_event_type event_type, u32 dst, int dst_len, struct fib_alias *fa, struct netlink_ext_ack *extack) { struct fib_entry_notifier_info info = { .info.extack = extack, .dst = dst, .dst_len = dst_len, .fi = fa->fa_info, .dscp = fa->fa_dscp, .type = fa->fa_type, .tb_id = fa->tb_id, }; return call_fib4_notifiers(net, event_type, &info.info); } #define MAX_STAT_DEPTH 32 #define KEYLENGTH (8*sizeof(t_key)) #define KEY_MAX ((t_key)~0) typedef unsigned int t_key; #define IS_TRIE(n) ((n)->pos >= KEYLENGTH) #define IS_TNODE(n) ((n)->bits) #define IS_LEAF(n) (!(n)->bits) struct key_vector { t_key key; unsigned char pos; /* 2log(KEYLENGTH) bits needed */ unsigned char bits; /* 2log(KEYLENGTH) bits needed */ unsigned char slen; union { /* This list pointer if valid if (pos | bits) == 0 (LEAF) */ struct hlist_head leaf; /* This array is valid if (pos | bits) > 0 (TNODE) */ DECLARE_FLEX_ARRAY(struct key_vector __rcu *, tnode); }; }; struct tnode { struct rcu_head rcu; t_key empty_children; /* KEYLENGTH bits needed */ t_key full_children; /* KEYLENGTH bits needed */ struct key_vector __rcu *parent; struct key_vector kv[1]; #define tn_bits kv[0].bits }; #define TNODE_SIZE(n) offsetof(struct tnode, kv[0].tnode[n]) #define LEAF_SIZE TNODE_SIZE(1) #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats { unsigned int gets; unsigned int backtrack; unsigned int semantic_match_passed; unsigned int semantic_match_miss; unsigned int null_node_hit; unsigned int resize_node_skipped; }; #endif struct trie_stat { unsigned int totdepth; unsigned int maxdepth; unsigned int tnodes; unsigned int leaves; unsigned int nullpointers; unsigned int prefixes; unsigned int nodesizes[MAX_STAT_DEPTH]; }; struct trie { struct key_vector kv[1]; #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats __percpu *stats; #endif }; static struct key_vector *resize(struct trie *t, struct key_vector *tn); static unsigned int tnode_free_size; /* * synchronize_rcu after call_rcu for outstanding dirty memory; it should be * especially useful before resizing the root node with PREEMPT_NONE configs; * the value was obtained experimentally, aiming to avoid visible slowdown. */ unsigned int sysctl_fib_sync_mem = 512 * 1024; unsigned int sysctl_fib_sync_mem_min = 64 * 1024; unsigned int sysctl_fib_sync_mem_max = 64 * 1024 * 1024; static struct kmem_cache *fn_alias_kmem __ro_after_init; static struct kmem_cache *trie_leaf_kmem __ro_after_init; static inline struct tnode *tn_info(struct key_vector *kv) { return container_of(kv, struct tnode, kv[0]); } /* caller must hold RTNL */ #define node_parent(tn) rtnl_dereference(tn_info(tn)->parent) #define get_child(tn, i) rtnl_dereference((tn)->tnode[i]) /* caller must hold RCU read lock or RTNL */ #define node_parent_rcu(tn) rcu_dereference_rtnl(tn_info(tn)->parent) #define get_child_rcu(tn, i) rcu_dereference_rtnl((tn)->tnode[i]) /* wrapper for rcu_assign_pointer */ static inline void node_set_parent(struct key_vector *n, struct key_vector *tp) { if (n) rcu_assign_pointer(tn_info(n)->parent, tp); } #define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER(tn_info(n)->parent, p) /* This provides us with the number of children in this node, in the case of a * leaf this will return 0 meaning none of the children are accessible. */ static inline unsigned long child_length(const struct key_vector *tn) { return (1ul << tn->bits) & ~(1ul); } #define get_cindex(key, kv) (((key) ^ (kv)->key) >> (kv)->pos) static inline unsigned long get_index(t_key key, struct key_vector *kv) { unsigned long index = key ^ kv->key; if ((BITS_PER_LONG <= KEYLENGTH) && (KEYLENGTH == kv->pos)) return 0; return index >> kv->pos; } /* To understand this stuff, an understanding of keys and all their bits is * necessary. Every node in the trie has a key associated with it, but not * all of the bits in that key are significant. * * Consider a node 'n' and its parent 'tp'. * * If n is a leaf, every bit in its key is significant. Its presence is * necessitated by path compression, since during a tree traversal (when * searching for a leaf - unless we are doing an insertion) we will completely * ignore all skipped bits we encounter. Thus we need to verify, at the end of * a potentially successful search, that we have indeed been walking the * correct key path. * * Note that we can never "miss" the correct key in the tree if present by * following the wrong path. Path compression ensures that segments of the key * that are the same for all keys with a given prefix are skipped, but the * skipped part *is* identical for each node in the subtrie below the skipped * bit! trie_insert() in this implementation takes care of that. * * if n is an internal node - a 'tnode' here, the various parts of its key * have many different meanings. * * Example: * _________________________________________________________________ * | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C | * ----------------------------------------------------------------- * 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 * * _________________________________________________________________ * | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u | * ----------------------------------------------------------------- * 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 * * tp->pos = 22 * tp->bits = 3 * n->pos = 13 * n->bits = 4 * * First, let's just ignore the bits that come before the parent tp, that is * the bits from (tp->pos + tp->bits) to 31. They are *known* but at this * point we do not use them for anything. * * The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the * index into the parent's child array. That is, they will be used to find * 'n' among tp's children. * * The bits from (n->pos + n->bits) to (tp->pos - 1) - "S" - are skipped bits * for the node n. * * All the bits we have seen so far are significant to the node n. The rest * of the bits are really not needed or indeed known in n->key. * * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into * n's child array, and will of course be different for each child. * * The rest of the bits, from 0 to (n->pos -1) - "u" - are completely unknown * at this point. */ static const int halve_threshold = 25; static const int inflate_threshold = 50; static const int halve_threshold_root = 15; static const int inflate_threshold_root = 30; static inline void alias_free_mem_rcu(struct fib_alias *fa) { kfree_rcu(fa, rcu); } #define TNODE_VMALLOC_MAX \ ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *)) static void __node_free_rcu(struct rcu_head *head) { struct tnode *n = container_of(head, struct tnode, rcu); if (!n->tn_bits) kmem_cache_free(trie_leaf_kmem, n); else kvfree(n); } #define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu) static struct tnode *tnode_alloc(int bits) { size_t size; /* verify bits is within bounds */ if (bits > TNODE_VMALLOC_MAX) return NULL; /* determine size and verify it is non-zero and didn't overflow */ size = TNODE_SIZE(1ul << bits); if (size <= PAGE_SIZE) return kzalloc(size, GFP_KERNEL); else return vzalloc(size); } static inline void empty_child_inc(struct key_vector *n) { tn_info(n)->empty_children++; if (!tn_info(n)->empty_children) tn_info(n)->full_children++; } static inline void empty_child_dec(struct key_vector *n) { if (!tn_info(n)->empty_children) tn_info(n)->full_children--; tn_info(n)->empty_children--; } static struct key_vector *leaf_new(t_key key, struct fib_alias *fa) { struct key_vector *l; struct tnode *kv; kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); if (!kv) return NULL; /* initialize key vector */ l = kv->kv; l->key = key; l->pos = 0; l->bits = 0; l->slen = fa->fa_slen; /* link leaf to fib alias */ INIT_HLIST_HEAD(&l->leaf); hlist_add_head(&fa->fa_list, &l->leaf); return l; } static struct key_vector *tnode_new(t_key key, int pos, int bits) { unsigned int shift = pos + bits; struct key_vector *tn; struct tnode *tnode; /* verify bits and pos their msb bits clear and values are valid */ BUG_ON(!bits || (shift > KEYLENGTH)); tnode = tnode_alloc(bits); if (!tnode) return NULL; pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0), sizeof(struct key_vector *) << bits); if (bits == KEYLENGTH) tnode->full_children = 1; else tnode->empty_children = 1ul << bits; tn = tnode->kv; tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0; tn->pos = pos; tn->bits = bits; tn->slen = pos; return tn; } /* Check whether a tnode 'n' is "full", i.e. it is an internal node * and no bits are skipped. See discussion in dyntree paper p. 6 */ static inline int tnode_full(struct key_vector *tn, struct key_vector *n) { return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n); } /* Add a child at position i overwriting the old value. * Update the value of full_children and empty_children. */ static void put_child(struct key_vector *tn, unsigned long i, struct key_vector *n) { struct key_vector *chi = get_child(tn, i); int isfull, wasfull; BUG_ON(i >= child_length(tn)); /* update emptyChildren, overflow into fullChildren */ if (!n && chi) empty_child_inc(tn); if (n && !chi) empty_child_dec(tn); /* update fullChildren */ wasfull = tnode_full(tn, chi); isfull = tnode_full(tn, n); if (wasfull && !isfull) tn_info(tn)->full_children--; else if (!wasfull && isfull) tn_info(tn)->full_children++; if (n && (tn->slen < n->slen)) tn->slen = n->slen; rcu_assign_pointer(tn->tnode[i], n); } static void update_children(struct key_vector *tn) { unsigned long i; /* update all of the child parent pointers */ for (i = child_length(tn); i;) { struct key_vector *inode = get_child(tn, --i); if (!inode) continue; /* Either update the children of a tnode that * already belongs to us or update the child * to point to ourselves. */ if (node_parent(inode) == tn) update_children(inode); else node_set_parent(inode, tn); } } static inline void put_child_root(struct key_vector *tp, t_key key, struct key_vector *n) { if (IS_TRIE(tp)) rcu_assign_pointer(tp->tnode[0], n); else put_child(tp, get_index(key, tp), n); } static inline void tnode_free_init(struct key_vector *tn) { tn_info(tn)->rcu.next = NULL; } static inline void tnode_free_append(struct key_vector *tn, struct key_vector *n) { tn_info(n)->rcu.next = tn_info(tn)->rcu.next; tn_info(tn)->rcu.next = &tn_info(n)->rcu; } static void tnode_free(struct key_vector *tn) { struct callback_head *head = &tn_info(tn)->rcu; while (head) { head = head->next; tnode_free_size += TNODE_SIZE(1ul << tn->bits); node_free(tn); tn = container_of(head, struct tnode, rcu)->kv; } if (tnode_free_size >= READ_ONCE(sysctl_fib_sync_mem)) { tnode_free_size = 0; synchronize_net(); } } static struct key_vector *replace(struct trie *t, struct key_vector *oldtnode, struct key_vector *tn) { struct key_vector *tp = node_parent(oldtnode); unsigned long i; /* setup the parent pointer out of and back into this node */ NODE_INIT_PARENT(tn, tp); put_child_root(tp, tn->key, tn); /* update all of the child parent pointers */ update_children(tn); /* all pointers should be clean so we are done */ tnode_free(oldtnode); /* resize children now that oldtnode is freed */ for (i = child_length(tn); i;) { struct key_vector *inode = get_child(tn, --i); /* resize child node */ if (tnode_full(tn, inode)) tn = resize(t, inode); } return tp; } static struct key_vector *inflate(struct trie *t, struct key_vector *oldtnode) { struct key_vector *tn; unsigned long i; t_key m; pr_debug("In inflate\n"); tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1); if (!tn) goto notnode; /* prepare oldtnode to be freed */ tnode_free_init(oldtnode); /* Assemble all of the pointers in our cluster, in this case that * represents all of the pointers out of our allocated nodes that * point to existing tnodes and the links between our allocated * nodes. */ for (i = child_length(oldtnode), m = 1u << tn->pos; i;) { struct key_vector *inode = get_child(oldtnode, --i); struct key_vector *node0, *node1; unsigned long j, k; /* An empty child */ if (!inode) continue; /* A leaf or an internal node with skipped bits */ if (!tnode_full(oldtnode, inode)) { put_child(tn, get_index(inode->key, tn), inode); continue; } /* drop the node in the old tnode free list */ tnode_free_append(oldtnode, inode); /* An internal node with two children */ if (inode->bits == 1) { put_child(tn, 2 * i + 1, get_child(inode, 1)); put_child(tn, 2 * i, get_child(inode, 0)); continue; } /* We will replace this node 'inode' with two new * ones, 'node0' and 'node1', each with half of the * original children. The two new nodes will have * a position one bit further down the key and this * means that the "significant" part of their keys * (see the discussion near the top of this file) * will differ by one bit, which will be "0" in * node0's key and "1" in node1's key. Since we are * moving the key position by one step, the bit that * we are moving away from - the bit at position * (tn->pos) - is the one that will differ between * node0 and node1. So... we synthesize that bit in the * two new keys. */ node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1); if (!node1) goto nomem; node0 = tnode_new(inode->key, inode->pos, inode->bits - 1); tnode_free_append(tn, node1); if (!node0) goto nomem; tnode_free_append(tn, node0); /* populate child pointers in new nodes */ for (k = child_length(inode), j = k / 2; j;) { put_child(node1, --j, get_child(inode, --k)); put_child(node0, j, get_child(inode, j)); put_child(node1, --j, get_child(inode, --k)); put_child(node0, j, get_child(inode, j)); } /* link new nodes to parent */ NODE_INIT_PARENT(node1, tn); NODE_INIT_PARENT(node0, tn); /* link parent to nodes */ put_child(tn, 2 * i + 1, node1); put_child(tn, 2 * i, node0); } /* setup the parent pointers into and out of this node */ return replace(t, oldtnode, tn); nomem: /* all pointers should be clean so we are done */ tnode_free(tn); notnode: return NULL; } static struct key_vector *halve(struct trie *t, struct key_vector *oldtnode) { struct key_vector *tn; unsigned long i; pr_debug("In halve\n"); tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1); if (!tn) goto notnode; /* prepare oldtnode to be freed */ tnode_free_init(oldtnode); /* Assemble all of the pointers in our cluster, in this case that * represents all of the pointers out of our allocated nodes that * point to existing tnodes and the links between our allocated * nodes. */ for (i = child_length(oldtnode); i;) { struct key_vector *node1 = get_child(oldtnode, --i); struct key_vector *node0 = get_child(oldtnode, --i); struct key_vector *inode; /* At least one of the children is empty */ if (!node1 || !node0) { put_child(tn, i / 2, node1 ? : node0); continue; } /* Two nonempty children */ inode = tnode_new(node0->key, oldtnode->pos, 1); if (!inode) goto nomem; tnode_free_append(tn, inode); /* initialize pointers out of node */ put_child(inode, 1, node1); put_child(inode, 0, node0); NODE_INIT_PARENT(inode, tn); /* link parent to node */ put_child(tn, i / 2, inode); } /* setup the parent pointers into and out of this node */ return replace(t, oldtnode, tn); nomem: /* all pointers should be clean so we are done */ tnode_free(tn); notnode: return NULL; } static struct key_vector *collapse(struct trie *t, struct key_vector *oldtnode) { struct key_vector *n, *tp; unsigned long i; /* scan the tnode looking for that one child that might still exist */ for (n = NULL, i = child_length(oldtnode); !n && i;) n = get_child(oldtnode, --i); /* compress one level */ tp = node_parent(oldtnode); put_child_root(tp, oldtnode->key, n); node_set_parent(n, tp); /* drop dead node */ node_free(oldtnode); return tp; } static unsigned char update_suffix(struct key_vector *tn) { unsigned char slen = tn->pos; unsigned long stride, i; unsigned char slen_max; /* only vector 0 can have a suffix length greater than or equal to * tn->pos + tn->bits, the second highest node will have a suffix * length at most of tn->pos + tn->bits - 1 */ slen_max = min_t(unsigned char, tn->pos + tn->bits - 1, tn->slen); /* search though the list of children looking for nodes that might * have a suffix greater than the one we currently have. This is * why we start with a stride of 2 since a stride of 1 would * represent the nodes with suffix length equal to tn->pos */ for (i = 0, stride = 0x2ul ; i < child_length(tn); i += stride) { struct key_vector *n = get_child(tn, i); if (!n || (n->slen <= slen)) continue; /* update stride and slen based on new value */ stride <<= (n->slen - slen); slen = n->slen; i &= ~(stride - 1); /* stop searching if we have hit the maximum possible value */ if (slen >= slen_max) break; } tn->slen = slen; return slen; } /* From "Implementing a dynamic compressed trie" by Stefan Nilsson of * the Helsinki University of Technology and Matti Tikkanen of Nokia * Telecommunications, page 6: * "A node is doubled if the ratio of non-empty children to all * children in the *doubled* node is at least 'high'." * * 'high' in this instance is the variable 'inflate_threshold'. It * is expressed as a percentage, so we multiply it with * child_length() and instead of multiplying by 2 (since the * child array will be doubled by inflate()) and multiplying * the left-hand side by 100 (to handle the percentage thing) we * multiply the left-hand side by 50. * * The left-hand side may look a bit weird: child_length(tn) * - tn->empty_children is of course the number of non-null children * in the current node. tn->full_children is the number of "full" * children, that is non-null tnodes with a skip value of 0. * All of those will be doubled in the resulting inflated tnode, so * we just count them one extra time here. * * A clearer way to write this would be: * * to_be_doubled = tn->full_children; * not_to_be_doubled = child_length(tn) - tn->empty_children - * tn->full_children; * * new_child_length = child_length(tn) * 2; * * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) / * new_child_length; * if (new_fill_factor >= inflate_threshold) * * ...and so on, tho it would mess up the while () loop. * * anyway, * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >= * inflate_threshold * * avoid a division: * 100 * (not_to_be_doubled + 2*to_be_doubled) >= * inflate_threshold * new_child_length * * expand not_to_be_doubled and to_be_doubled, and shorten: * 100 * (child_length(tn) - tn->empty_children + * tn->full_children) >= inflate_threshold * new_child_length * * expand new_child_length: * 100 * (child_length(tn) - tn->empty_children + * tn->full_children) >= * inflate_threshold * child_length(tn) * 2 * * shorten again: * 50 * (tn->full_children + child_length(tn) - * tn->empty_children) >= inflate_threshold * * child_length(tn) * */ static inline bool should_inflate(struct key_vector *tp, struct key_vector *tn) { unsigned long used = child_length(tn); unsigned long threshold = used; /* Keep root node larger */ threshold *= IS_TRIE(tp) ? inflate_threshold_root : inflate_threshold; used -= tn_info(tn)->empty_children; used += tn_info(tn)->full_children; /* if bits == KEYLENGTH then pos = 0, and will fail below */ return (used > 1) && tn->pos && ((50 * used) >= threshold); } static inline bool should_halve(struct key_vector *tp, struct key_vector *tn) { unsigned long used = child_length(tn); unsigned long threshold = used; /* Keep root node larger */ threshold *= IS_TRIE(tp) ? halve_threshold_root : halve_threshold; used -= tn_info(tn)->empty_children; /* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */ return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold); } static inline bool should_collapse(struct key_vector *tn) { unsigned long used = child_length(tn); used -= tn_info(tn)->empty_children; /* account for bits == KEYLENGTH case */ if ((tn->bits == KEYLENGTH) && tn_info(tn)->full_children) used -= KEY_MAX; /* One child or none, time to drop us from the trie */ return used < 2; } #define MAX_WORK 10 static struct key_vector *resize(struct trie *t, struct key_vector *tn) { #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats __percpu *stats = t->stats; #endif struct key_vector *tp = node_parent(tn); unsigned long cindex = get_index(tn->key, tp); int max_work = MAX_WORK; pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n", tn, inflate_threshold, halve_threshold); /* track the tnode via the pointer from the parent instead of * doing it ourselves. This way we can let RCU fully do its * thing without us interfering */ BUG_ON(tn != get_child(tp, cindex)); /* Double as long as the resulting node has a number of * nonempty nodes that are above the threshold. */ while (should_inflate(tp, tn) && max_work) { tp = inflate(t, tn); if (!tp) { #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->resize_node_skipped); #endif break; } max_work--; tn = get_child(tp, cindex); } /* update parent in case inflate failed */ tp = node_parent(tn); /* Return if at least one inflate is run */ if (max_work != MAX_WORK) return tp; /* Halve as long as the number of empty children in this * node is above threshold. */ while (should_halve(tp, tn) && max_work) { tp = halve(t, tn); if (!tp) { #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->resize_node_skipped); #endif break; } max_work--; tn = get_child(tp, cindex); } /* Only one child remains */ if (should_collapse(tn)) return collapse(t, tn); /* update parent in case halve failed */ return node_parent(tn); } static void node_pull_suffix(struct key_vector *tn, unsigned char slen) { unsigned char node_slen = tn->slen; while ((node_slen > tn->pos) && (node_slen > slen)) { slen = update_suffix(tn); if (node_slen == slen) break; tn = node_parent(tn); node_slen = tn->slen; } } static void node_push_suffix(struct key_vector *tn, unsigned char slen) { while (tn->slen < slen) { tn->slen = slen; tn = node_parent(tn); } } /* rcu_read_lock needs to be hold by caller from readside */ static struct key_vector *fib_find_node(struct trie *t, struct key_vector **tp, u32 key) { struct key_vector *pn, *n = t->kv; unsigned long index = 0; do { pn = n; n = get_child_rcu(n, index); if (!n) break; index = get_cindex(key, n); /* This bit of code is a bit tricky but it combines multiple * checks into a single check. The prefix consists of the * prefix plus zeros for the bits in the cindex. The index * is the difference between the key and this value. From * this we can actually derive several pieces of data. * if (index >= (1ul << bits)) * we have a mismatch in skip bits and failed * else * we know the value is cindex * * This check is safe even if bits == KEYLENGTH due to the * fact that we can only allocate a node with 32 bits if a * long is greater than 32 bits. */ if (index >= (1ul << n->bits)) { n = NULL; break; } /* keep searching until we find a perfect match leaf or NULL */ } while (IS_TNODE(n)); *tp = pn; return n; } /* Return the first fib alias matching DSCP with * priority less than or equal to PRIO. * If 'find_first' is set, return the first matching * fib alias, regardless of DSCP and priority. */ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen, dscp_t dscp, u32 prio, u32 tb_id, bool find_first) { struct fib_alias *fa; if (!fah) return NULL; hlist_for_each_entry(fa, fah, fa_list) { /* Avoid Sparse warning when using dscp_t in inequalities */ u8 __fa_dscp = inet_dscp_to_dsfield(fa->fa_dscp); u8 __dscp = inet_dscp_to_dsfield(dscp); if (fa->fa_slen < slen) continue; if (fa->fa_slen != slen) break; if (fa->tb_id > tb_id) continue; if (fa->tb_id != tb_id) break; if (find_first) return fa; if (__fa_dscp > __dscp) continue; if (fa->fa_info->fib_priority >= prio || __fa_dscp < __dscp) return fa; } return NULL; } static struct fib_alias * fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri) { u8 slen = KEYLENGTH - fri->dst_len; struct key_vector *l, *tp; struct fib_table *tb; struct fib_alias *fa; struct trie *t; tb = fib_get_table(net, fri->tb_id); if (!tb) return NULL; t = (struct trie *)tb->tb_data; l = fib_find_node(t, &tp, be32_to_cpu(fri->dst)); if (!l) return NULL; hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { if (fa->fa_slen == slen && fa->tb_id == fri->tb_id && fa->fa_dscp == fri->dscp && fa->fa_info == fri->fi && fa->fa_type == fri->type) return fa; } return NULL; } void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri) { u8 fib_notify_on_flag_change; struct fib_alias *fa_match; struct sk_buff *skb; int err; rcu_read_lock(); fa_match = fib_find_matching_alias(net, fri); if (!fa_match) goto out; /* These are paired with the WRITE_ONCE() happening in this function. * The reason is that we are only protected by RCU at this point. */ if (READ_ONCE(fa_match->offload) == fri->offload && READ_ONCE(fa_match->trap) == fri->trap && READ_ONCE(fa_match->offload_failed) == fri->offload_failed) goto out; WRITE_ONCE(fa_match->offload, fri->offload); WRITE_ONCE(fa_match->trap, fri->trap); fib_notify_on_flag_change = READ_ONCE(net->ipv4.sysctl_fib_notify_on_flag_change); /* 2 means send notifications only if offload_failed was changed. */ if (fib_notify_on_flag_change == 2 && READ_ONCE(fa_match->offload_failed) == fri->offload_failed) goto out; WRITE_ONCE(fa_match->offload_failed, fri->offload_failed); if (!fib_notify_on_flag_change) goto out; skb = nlmsg_new(fib_nlmsg_size(fa_match->fa_info), GFP_ATOMIC); if (!skb) { err = -ENOBUFS; goto errout; } err = fib_dump_info(skb, 0, 0, RTM_NEWROUTE, fri, 0); if (err < 0) { /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_IPV4_ROUTE, NULL, GFP_ATOMIC); goto out; errout: rtnl_set_sk_err(net, RTNLGRP_IPV4_ROUTE, err); out: rcu_read_unlock(); } EXPORT_SYMBOL_GPL(fib_alias_hw_flags_set); static void trie_rebalance(struct trie *t, struct key_vector *tn) { while (!IS_TRIE(tn)) tn = resize(t, tn); } static int fib_insert_node(struct trie *t, struct key_vector *tp, struct fib_alias *new, t_key key) { struct key_vector *n, *l; l = leaf_new(key, new); if (!l) goto noleaf; /* retrieve child from parent node */ n = get_child(tp, get_index(key, tp)); /* Case 2: n is a LEAF or a TNODE and the key doesn't match. * * Add a new tnode here * first tnode need some special handling * leaves us in position for handling as case 3 */ if (n) { struct key_vector *tn; tn = tnode_new(key, __fls(key ^ n->key), 1); if (!tn) goto notnode; /* initialize routes out of node */ NODE_INIT_PARENT(tn, tp); put_child(tn, get_index(key, tn) ^ 1, n); /* start adding routes into the node */ put_child_root(tp, key, tn); node_set_parent(n, tn); /* parent now has a NULL spot where the leaf can go */ tp = tn; } /* Case 3: n is NULL, and will just insert a new leaf */ node_push_suffix(tp, new->fa_slen); NODE_INIT_PARENT(l, tp); put_child_root(tp, key, l); trie_rebalance(t, tp); return 0; notnode: node_free(l); noleaf: return -ENOMEM; } static int fib_insert_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *new, struct fib_alias *fa, t_key key) { if (!l) return fib_insert_node(t, tp, new, key); if (fa) { hlist_add_before_rcu(&new->fa_list, &fa->fa_list); } else { struct fib_alias *last; hlist_for_each_entry(last, &l->leaf, fa_list) { if (new->fa_slen < last->fa_slen) break; if ((new->fa_slen == last->fa_slen) && (new->tb_id > last->tb_id)) break; fa = last; } if (fa) hlist_add_behind_rcu(&new->fa_list, &fa->fa_list); else hlist_add_head_rcu(&new->fa_list, &l->leaf); } /* if we added to the tail node then we need to update slen */ if (l->slen < new->fa_slen) { l->slen = new->fa_slen; node_push_suffix(tp, new->fa_slen); } return 0; } static void fib_remove_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *old); /* Caller must hold RTNL. */ int fib_table_insert(struct net *net, struct fib_table *tb, struct fib_config *cfg, struct netlink_ext_ack *extack) { struct trie *t = (struct trie *)tb->tb_data; struct fib_alias *fa, *new_fa; struct key_vector *l, *tp; u16 nlflags = NLM_F_EXCL; struct fib_info *fi; u8 plen = cfg->fc_dst_len; u8 slen = KEYLENGTH - plen; dscp_t dscp; u32 key; int err; key = ntohl(cfg->fc_dst); pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen); fi = fib_create_info(cfg, extack); if (IS_ERR(fi)) { err = PTR_ERR(fi); goto err; } dscp = cfg->fc_dscp; l = fib_find_node(t, &tp, key); fa = l ? fib_find_alias(&l->leaf, slen, dscp, fi->fib_priority, tb->tb_id, false) : NULL; /* Now fa, if non-NULL, points to the first fib alias * with the same keys [prefix,dscp,priority], if such key already * exists or to the node before which we will insert new one. * * If fa is NULL, we will need to allocate a new one and * insert to the tail of the section matching the suffix length * of the new alias. */ if (fa && fa->fa_dscp == dscp && fa->fa_info->fib_priority == fi->fib_priority) { struct fib_alias *fa_first, *fa_match; err = -EEXIST; if (cfg->fc_nlflags & NLM_F_EXCL) goto out; nlflags &= ~NLM_F_EXCL; /* We have 2 goals: * 1. Find exact match for type, scope, fib_info to avoid * duplicate routes * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it */ fa_match = NULL; fa_first = fa; hlist_for_each_entry_from(fa, fa_list) { if ((fa->fa_slen != slen) || (fa->tb_id != tb->tb_id) || (fa->fa_dscp != dscp)) break; if (fa->fa_info->fib_priority != fi->fib_priority) break; if (fa->fa_type == cfg->fc_type && fa->fa_info == fi) { fa_match = fa; break; } } if (cfg->fc_nlflags & NLM_F_REPLACE) { struct fib_info *fi_drop; u8 state; nlflags |= NLM_F_REPLACE; fa = fa_first; if (fa_match) { if (fa == fa_match) err = 0; goto out; } err = -ENOBUFS; new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); if (!new_fa) goto out; fi_drop = fa->fa_info; new_fa->fa_dscp = fa->fa_dscp; new_fa->fa_info = fi; new_fa->fa_type = cfg->fc_type; state = fa->fa_state; new_fa->fa_state = state & ~FA_S_ACCESSED; new_fa->fa_slen = fa->fa_slen; new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; new_fa->offload = 0; new_fa->trap = 0; new_fa->offload_failed = 0; hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list); if (fib_find_alias(&l->leaf, fa->fa_slen, 0, 0, tb->tb_id, true) == new_fa) { enum fib_event_type fib_event; fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib_entry_notifiers(net, fib_event, key, plen, new_fa, extack); if (err) { hlist_replace_rcu(&new_fa->fa_list, &fa->fa_list); goto out_free_new_fa; } } rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, nlflags); alias_free_mem_rcu(fa); fib_release_info(fi_drop); if (state & FA_S_ACCESSED) rt_cache_flush(cfg->fc_nlinfo.nl_net); goto succeeded; } /* Error if we find a perfect match which * uses the same scope, type, and nexthop * information. */ if (fa_match) goto out; if (cfg->fc_nlflags & NLM_F_APPEND) nlflags |= NLM_F_APPEND; else fa = fa_first; } err = -ENOENT; if (!(cfg->fc_nlflags & NLM_F_CREATE)) goto out; nlflags |= NLM_F_CREATE; err = -ENOBUFS; new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); if (!new_fa) goto out; new_fa->fa_info = fi; new_fa->fa_dscp = dscp; new_fa->fa_type = cfg->fc_type; new_fa->fa_state = 0; new_fa->fa_slen = slen; new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; new_fa->offload = 0; new_fa->trap = 0; new_fa->offload_failed = 0; /* Insert new entry to the list. */ err = fib_insert_alias(t, tp, l, new_fa, fa, key); if (err) goto out_free_new_fa; /* The alias was already inserted, so the node must exist. */ l = l ? l : fib_find_node(t, &tp, key); if (WARN_ON_ONCE(!l)) { err = -ENOENT; goto out_free_new_fa; } if (fib_find_alias(&l->leaf, new_fa->fa_slen, 0, 0, tb->tb_id, true) == new_fa) { enum fib_event_type fib_event; fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib_entry_notifiers(net, fib_event, key, plen, new_fa, extack); if (err) goto out_remove_new_fa; } if (!plen) tb->tb_num_default++; rt_cache_flush(cfg->fc_nlinfo.nl_net); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, &cfg->fc_nlinfo, nlflags); succeeded: return 0; out_remove_new_fa: fib_remove_alias(t, tp, l, new_fa); out_free_new_fa: kmem_cache_free(fn_alias_kmem, new_fa); out: fib_release_info(fi); err: return err; } static inline t_key prefix_mismatch(t_key key, struct key_vector *n) { t_key prefix = n->key; return (key ^ prefix) & (prefix | -prefix); } bool fib_lookup_good_nhc(const struct fib_nh_common *nhc, int fib_flags, const struct flowi4 *flp) { if (nhc->nhc_flags & RTNH_F_DEAD) return false; if (ip_ignore_linkdown(nhc->nhc_dev) && nhc->nhc_flags & RTNH_F_LINKDOWN && !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE)) return false; if (flp->flowi4_oif && flp->flowi4_oif != nhc->nhc_oif) return false; return true; } /* should be called with rcu_read_lock */ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, struct fib_result *res, int fib_flags) { struct trie *t = (struct trie *) tb->tb_data; #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats __percpu *stats = t->stats; #endif const t_key key = ntohl(flp->daddr); struct key_vector *n, *pn; struct fib_alias *fa; unsigned long index; t_key cindex; pn = t->kv; cindex = 0; n = get_child_rcu(pn, cindex); if (!n) { trace_fib_table_lookup(tb->tb_id, flp, NULL, -EAGAIN); return -EAGAIN; } #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->gets); #endif /* Step 1: Travel to the longest prefix match in the trie */ for (;;) { index = get_cindex(key, n); /* This bit of code is a bit tricky but it combines multiple * checks into a single check. The prefix consists of the * prefix plus zeros for the "bits" in the prefix. The index * is the difference between the key and this value. From * this we can actually derive several pieces of data. * if (index >= (1ul << bits)) * we have a mismatch in skip bits and failed * else * we know the value is cindex * * This check is safe even if bits == KEYLENGTH due to the * fact that we can only allocate a node with 32 bits if a * long is greater than 32 bits. */ if (index >= (1ul << n->bits)) break; /* we have found a leaf. Prefixes have already been compared */ if (IS_LEAF(n)) goto found; /* only record pn and cindex if we are going to be chopping * bits later. Otherwise we are just wasting cycles. */ if (n->slen > n->pos) { pn = n; cindex = index; } n = get_child_rcu(n, index); if (unlikely(!n)) goto backtrace; } /* Step 2: Sort out leaves and begin backtracing for longest prefix */ for (;;) { /* record the pointer where our next node pointer is stored */ struct key_vector __rcu **cptr = n->tnode; /* This test verifies that none of the bits that differ * between the key and the prefix exist in the region of * the lsb and higher in the prefix. */ if (unlikely(prefix_mismatch(key, n)) || (n->slen == n->pos)) goto backtrace; /* exit out and process leaf */ if (unlikely(IS_LEAF(n))) break; /* Don't bother recording parent info. Since we are in * prefix match mode we will have to come back to wherever * we started this traversal anyway */ while ((n = rcu_dereference(*cptr)) == NULL) { backtrace: #ifdef CONFIG_IP_FIB_TRIE_STATS if (!n) this_cpu_inc(stats->null_node_hit); #endif /* If we are at cindex 0 there are no more bits for * us to strip at this level so we must ascend back * up one level to see if there are any more bits to * be stripped there. */ while (!cindex) { t_key pkey = pn->key; /* If we don't have a parent then there is * nothing for us to do as we do not have any * further nodes to parse. */ if (IS_TRIE(pn)) { trace_fib_table_lookup(tb->tb_id, flp, NULL, -EAGAIN); return -EAGAIN; } #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->backtrack); #endif /* Get Child's index */ pn = node_parent_rcu(pn); cindex = get_index(pkey, pn); } /* strip the least significant bit from the cindex */ cindex &= cindex - 1; /* grab pointer for next child node */ cptr = &pn->tnode[cindex]; } } found: /* this line carries forward the xor from earlier in the function */ index = key ^ n->key; /* Step 3: Process the leaf, if that fails fall back to backtracing */ hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) { struct fib_info *fi = fa->fa_info; struct fib_nh_common *nhc; int nhsel, err; if ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen < KEYLENGTH)) { if (index >= (1ul << fa->fa_slen)) continue; } if (fa->fa_dscp && !fib_dscp_masked_match(fa->fa_dscp, flp)) continue; /* Paired with WRITE_ONCE() in fib_release_info() */ if (READ_ONCE(fi->fib_dead)) continue; if (fa->fa_info->fib_scope < flp->flowi4_scope) continue; fib_alias_accessed(fa); err = fib_props[fa->fa_type].error; if (unlikely(err < 0)) { out_reject: #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_passed); #endif trace_fib_table_lookup(tb->tb_id, flp, NULL, err); return err; } if (fi->fib_flags & RTNH_F_DEAD) continue; if (unlikely(fi->nh)) { if (nexthop_is_blackhole(fi->nh)) { err = fib_props[RTN_BLACKHOLE].error; goto out_reject; } nhc = nexthop_get_nhc_lookup(fi->nh, fib_flags, flp, &nhsel); if (nhc) goto set_result; goto miss; } for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) { nhc = fib_info_nhc(fi, nhsel); if (!fib_lookup_good_nhc(nhc, fib_flags, flp)) continue; set_result: if (!(fib_flags & FIB_LOOKUP_NOREF)) refcount_inc(&fi->fib_clntref); res->prefix = htonl(n->key); res->prefixlen = KEYLENGTH - fa->fa_slen; res->nh_sel = nhsel; res->nhc = nhc; res->type = fa->fa_type; res->scope = fi->fib_scope; res->dscp = fa->fa_dscp; res->fi = fi; res->table = tb; res->fa_head = &n->leaf; #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_passed); #endif trace_fib_table_lookup(tb->tb_id, flp, nhc, err); return err; } } miss: #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_miss); #endif goto backtrace; } EXPORT_SYMBOL_GPL(fib_table_lookup); static void fib_remove_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *old) { /* record the location of the previous list_info entry */ struct hlist_node **pprev = old->fa_list.pprev; struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next); /* remove the fib_alias from the list */ hlist_del_rcu(&old->fa_list); /* if we emptied the list this leaf will be freed and we can sort * out parent suffix lengths as a part of trie_rebalance */ if (hlist_empty(&l->leaf)) { if (tp->slen == l->slen) node_pull_suffix(tp, tp->pos); put_child_root(tp, l->key, NULL); node_free(l); trie_rebalance(t, tp); return; } /* only access fa if it is pointing at the last valid hlist_node */ if (*pprev) return; /* update the trie with the latest suffix length */ l->slen = fa->fa_slen; node_pull_suffix(tp, fa->fa_slen); } static void fib_notify_alias_delete(struct net *net, u32 key, struct hlist_head *fah, struct fib_alias *fa_to_delete, struct netlink_ext_ack *extack) { struct fib_alias *fa_next, *fa_to_notify; u32 tb_id = fa_to_delete->tb_id; u8 slen = fa_to_delete->fa_slen; enum fib_event_type fib_event; /* Do not notify if we do not care about the route. */ if (fib_find_alias(fah, slen, 0, 0, tb_id, true) != fa_to_delete) return; /* Determine if the route should be replaced by the next route in the * list. */ fa_next = hlist_entry_safe(fa_to_delete->fa_list.next, struct fib_alias, fa_list); if (fa_next && fa_next->fa_slen == slen && fa_next->tb_id == tb_id) { fib_event = FIB_EVENT_ENTRY_REPLACE; fa_to_notify = fa_next; } else { fib_event = FIB_EVENT_ENTRY_DEL; fa_to_notify = fa_to_delete; } call_fib_entry_notifiers(net, fib_event, key, KEYLENGTH - slen, fa_to_notify, extack); } /* Caller must hold RTNL. */ int fib_table_delete(struct net *net, struct fib_table *tb, struct fib_config *cfg, struct netlink_ext_ack *extack) { struct trie *t = (struct trie *) tb->tb_data; struct fib_alias *fa, *fa_to_delete; struct key_vector *l, *tp; u8 plen = cfg->fc_dst_len; u8 slen = KEYLENGTH - plen; dscp_t dscp; u32 key; key = ntohl(cfg->fc_dst); l = fib_find_node(t, &tp, key); if (!l) return -ESRCH; dscp = cfg->fc_dscp; fa = fib_find_alias(&l->leaf, slen, dscp, 0, tb->tb_id, false); if (!fa) return -ESRCH; pr_debug("Deleting %08x/%d dsfield=0x%02x t=%p\n", key, plen, inet_dscp_to_dsfield(dscp), t); fa_to_delete = NULL; hlist_for_each_entry_from(fa, fa_list) { struct fib_info *fi = fa->fa_info; if ((fa->fa_slen != slen) || (fa->tb_id != tb->tb_id) || (fa->fa_dscp != dscp)) break; if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) && (cfg->fc_scope == RT_SCOPE_NOWHERE || fa->fa_info->fib_scope == cfg->fc_scope) && (!cfg->fc_prefsrc || fi->fib_prefsrc == cfg->fc_prefsrc) && (!cfg->fc_protocol || fi->fib_protocol == cfg->fc_protocol) && fib_nh_match(net, cfg, fi, extack) == 0 && fib_metrics_match(cfg, fi)) { fa_to_delete = fa; break; } } if (!fa_to_delete) return -ESRCH; fib_notify_alias_delete(net, key, &l->leaf, fa_to_delete, extack); rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, &cfg->fc_nlinfo, 0); if (!plen) tb->tb_num_default--; fib_remove_alias(t, tp, l, fa_to_delete); if (fa_to_delete->fa_state & FA_S_ACCESSED) rt_cache_flush(cfg->fc_nlinfo.nl_net); fib_release_info(fa_to_delete->fa_info); alias_free_mem_rcu(fa_to_delete); return 0; } /* Scan for the next leaf starting at the provided key value */ static struct key_vector *leaf_walk_rcu(struct key_vector **tn, t_key key) { struct key_vector *pn, *n = *tn; unsigned long cindex; /* this loop is meant to try and find the key in the trie */ do { /* record parent and next child index */ pn = n; cindex = (key > pn->key) ? get_index(key, pn) : 0; if (cindex >> pn->bits) break; /* descend into the next child */ n = get_child_rcu(pn, cindex++); if (!n) break; /* guarantee forward progress on the keys */ if (IS_LEAF(n) && (n->key >= key)) goto found; } while (IS_TNODE(n)); /* this loop will search for the next leaf with a greater key */ while (!IS_TRIE(pn)) { /* if we exhausted the parent node we will need to climb */ if (cindex >= (1ul << pn->bits)) { t_key pkey = pn->key; pn = node_parent_rcu(pn); cindex = get_index(pkey, pn) + 1; continue; } /* grab the next available node */ n = get_child_rcu(pn, cindex++); if (!n) continue; /* no need to compare keys since we bumped the index */ if (IS_LEAF(n)) goto found; /* Rescan start scanning in new node */ pn = n; cindex = 0; } *tn = pn; return NULL; /* Root of trie */ found: /* if we are at the limit for keys just return NULL for the tnode */ *tn = pn; return n; } static void fib_trie_free(struct fib_table *tb) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *pn = t->kv; unsigned long cindex = 1; struct hlist_node *tmp; struct fib_alias *fa; /* walk trie in reverse order and free everything */ for (;;) { struct key_vector *n; if (!(cindex--)) { t_key pkey = pn->key; if (IS_TRIE(pn)) break; n = pn; pn = node_parent(pn); /* drop emptied tnode */ put_child_root(pn, n->key, NULL); node_free(n); cindex = get_index(pkey, pn); continue; } /* grab the next available node */ n = get_child(pn, cindex); if (!n) continue; if (IS_TNODE(n)) { /* record pn and cindex for leaf walking */ pn = n; cindex = 1ul << n->bits; continue; } hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { hlist_del_rcu(&fa->fa_list); alias_free_mem_rcu(fa); } put_child_root(pn, n->key, NULL); node_free(n); } #ifdef CONFIG_IP_FIB_TRIE_STATS free_percpu(t->stats); #endif kfree(tb); } struct fib_table *fib_trie_unmerge(struct fib_table *oldtb) { struct trie *ot = (struct trie *)oldtb->tb_data; struct key_vector *l, *tp = ot->kv; struct fib_table *local_tb; struct fib_alias *fa; struct trie *lt; t_key key = 0; if (oldtb->tb_data == oldtb->__data) return oldtb; local_tb = fib_trie_table(RT_TABLE_LOCAL, NULL); if (!local_tb) return NULL; lt = (struct trie *)local_tb->tb_data; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { struct key_vector *local_l = NULL, *local_tp; hlist_for_each_entry(fa, &l->leaf, fa_list) { struct fib_alias *new_fa; if (local_tb->tb_id != fa->tb_id) continue; /* clone fa for new local table */ new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); if (!new_fa) goto out; memcpy(new_fa, fa, sizeof(*fa)); /* insert clone into table */ if (!local_l) local_l = fib_find_node(lt, &local_tp, l->key); if (fib_insert_alias(lt, local_tp, local_l, new_fa, NULL, l->key)) { kmem_cache_free(fn_alias_kmem, new_fa); goto out; } } /* stop loop if key wrapped back to 0 */ key = l->key + 1; if (key < l->key) break; } return local_tb; out: fib_trie_free(local_tb); return NULL; } /* Caller must hold RTNL */ void fib_table_flush_external(struct fib_table *tb) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *pn = t->kv; unsigned long cindex = 1; struct hlist_node *tmp; struct fib_alias *fa; /* walk trie in reverse order */ for (;;) { unsigned char slen = 0; struct key_vector *n; if (!(cindex--)) { t_key pkey = pn->key; /* cannot resize the trie vector */ if (IS_TRIE(pn)) break; /* update the suffix to address pulled leaves */ if (pn->slen > pn->pos) update_suffix(pn); /* resize completed node */ pn = resize(t, pn); cindex = get_index(pkey, pn); continue; } /* grab the next available node */ n = get_child(pn, cindex); if (!n) continue; if (IS_TNODE(n)) { /* record pn and cindex for leaf walking */ pn = n; cindex = 1ul << n->bits; continue; } hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { /* if alias was cloned to local then we just * need to remove the local copy from main */ if (tb->tb_id != fa->tb_id) { hlist_del_rcu(&fa->fa_list); alias_free_mem_rcu(fa); continue; } /* record local slen */ slen = fa->fa_slen; } /* update leaf slen */ n->slen = slen; if (hlist_empty(&n->leaf)) { put_child_root(pn, n->key, NULL); node_free(n); } } } /* Caller must hold RTNL. */ int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all) { struct trie *t = (struct trie *)tb->tb_data; struct nl_info info = { .nl_net = net }; struct key_vector *pn = t->kv; unsigned long cindex = 1; struct hlist_node *tmp; struct fib_alias *fa; int found = 0; /* walk trie in reverse order */ for (;;) { unsigned char slen = 0; struct key_vector *n; if (!(cindex--)) { t_key pkey = pn->key; /* cannot resize the trie vector */ if (IS_TRIE(pn)) break; /* update the suffix to address pulled leaves */ if (pn->slen > pn->pos) update_suffix(pn); /* resize completed node */ pn = resize(t, pn); cindex = get_index(pkey, pn); continue; } /* grab the next available node */ n = get_child(pn, cindex); if (!n) continue; if (IS_TNODE(n)) { /* record pn and cindex for leaf walking */ pn = n; cindex = 1ul << n->bits; continue; } hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { struct fib_info *fi = fa->fa_info; if (!fi || tb->tb_id != fa->tb_id || (!(fi->fib_flags & RTNH_F_DEAD) && !fib_props[fa->fa_type].error)) { slen = fa->fa_slen; continue; } /* Do not flush error routes if network namespace is * not being dismantled */ if (!flush_all && fib_props[fa->fa_type].error) { slen = fa->fa_slen; continue; } fib_notify_alias_delete(net, n->key, &n->leaf, fa, NULL); if (fi->pfsrc_removed) rtmsg_fib(RTM_DELROUTE, htonl(n->key), fa, KEYLENGTH - fa->fa_slen, tb->tb_id, &info, 0); hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); found++; } /* update leaf slen */ n->slen = slen; if (hlist_empty(&n->leaf)) { put_child_root(pn, n->key, NULL); node_free(n); } } pr_debug("trie_flush found=%d\n", found); return found; } /* derived from fib_trie_free */ static void __fib_info_notify_update(struct net *net, struct fib_table *tb, struct nl_info *info) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *pn = t->kv; unsigned long cindex = 1; struct fib_alias *fa; for (;;) { struct key_vector *n; if (!(cindex--)) { t_key pkey = pn->key; if (IS_TRIE(pn)) break; pn = node_parent(pn); cindex = get_index(pkey, pn); continue; } /* grab the next available node */ n = get_child(pn, cindex); if (!n) continue; if (IS_TNODE(n)) { /* record pn and cindex for leaf walking */ pn = n; cindex = 1ul << n->bits; continue; } hlist_for_each_entry(fa, &n->leaf, fa_list) { struct fib_info *fi = fa->fa_info; if (!fi || !fi->nh_updated || fa->tb_id != tb->tb_id) continue; rtmsg_fib(RTM_NEWROUTE, htonl(n->key), fa, KEYLENGTH - fa->fa_slen, tb->tb_id, info, NLM_F_REPLACE); } } } void fib_info_notify_update(struct net *net, struct nl_info *info) { unsigned int h; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist, lockdep_rtnl_is_held()) __fib_info_notify_update(net, tb, info); } } static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct fib_alias *fa; int last_slen = -1; int err; hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { struct fib_info *fi = fa->fa_info; if (!fi) continue; /* local and main table can share the same trie, * so don't notify twice for the same entry. */ if (tb->tb_id != fa->tb_id) continue; if (fa->fa_slen == last_slen) continue; last_slen = fa->fa_slen; err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_REPLACE, l->key, KEYLENGTH - fa->fa_slen, fa, extack); if (err) return err; } return 0; } static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *l, *tp = t->kv; t_key key = 0; int err; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { err = fib_leaf_notify(l, tb, nb, extack); if (err) return err; key = l->key + 1; /* stop in case of wrap around */ if (key < l->key) break; } return 0; } int fib_notify(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { unsigned int h; int err; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist) { err = fib_table_notify(tb, nb, extack); if (err) return err; } } return 0; } static void __trie_free_rcu(struct rcu_head *head) { struct fib_table *tb = container_of(head, struct fib_table, rcu); #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie *t = (struct trie *)tb->tb_data; if (tb->tb_data == tb->__data) free_percpu(t->stats); #endif /* CONFIG_IP_FIB_TRIE_STATS */ kfree(tb); } void fib_free_table(struct fib_table *tb) { call_rcu(&tb->rcu, __trie_free_rcu); } static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb, struct fib_dump_filter *filter) { unsigned int flags = NLM_F_MULTI; __be32 xkey = htonl(l->key); int i, s_i, i_fa, s_fa, err; struct fib_alias *fa; if (filter->filter_set || !filter->dump_exceptions || !filter->dump_routes) flags |= NLM_F_DUMP_FILTERED; s_i = cb->args[4]; s_fa = cb->args[5]; i = 0; /* rcu_read_lock is hold by caller */ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { struct fib_info *fi = fa->fa_info; if (i < s_i) goto next; i_fa = 0; if (tb->tb_id != fa->tb_id) goto next; if (filter->filter_set) { if (filter->rt_type && fa->fa_type != filter->rt_type) goto next; if ((filter->protocol && fi->fib_protocol != filter->protocol)) goto next; if (filter->dev && !fib_info_nh_uses_dev(fi, filter->dev)) goto next; } if (filter->dump_routes) { if (!s_fa) { struct fib_rt_info fri; fri.fi = fi; fri.tb_id = tb->tb_id; fri.dst = xkey; fri.dst_len = KEYLENGTH - fa->fa_slen; fri.dscp = fa->fa_dscp; fri.type = fa->fa_type; fri.offload = READ_ONCE(fa->offload); fri.trap = READ_ONCE(fa->trap); fri.offload_failed = READ_ONCE(fa->offload_failed); err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, &fri, flags); if (err < 0) goto stop; } i_fa++; } if (filter->dump_exceptions) { err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi, &i_fa, s_fa, flags); if (err < 0) goto stop; } next: i++; } cb->args[4] = i; return skb->len; stop: cb->args[4] = i; cb->args[5] = i_fa; return err; } /* rcu_read_lock needs to be hold by caller from readside */ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb, struct fib_dump_filter *filter) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *l, *tp = t->kv; /* Dump starting at last key. * Note: 0.0.0.0/0 (ie default) is first key. */ int count = cb->args[2]; t_key key = cb->args[3]; /* First time here, count and key are both always 0. Count > 0 * and key == 0 means the dump has wrapped around and we are done. */ if (count && !key) return 0; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { int err; err = fn_trie_dump_leaf(l, tb, skb, cb, filter); if (err < 0) { cb->args[3] = key; cb->args[2] = count; return err; } ++count; key = l->key + 1; memset(&cb->args[4], 0, sizeof(cb->args) - 4*sizeof(cb->args[0])); /* stop loop if key wrapped back to 0 */ if (key < l->key) break; } cb->args[3] = key; cb->args[2] = count; return 0; } void __init fib_trie_init(void) { fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), 0, SLAB_PANIC | SLAB_ACCOUNT, NULL); trie_leaf_kmem = kmem_cache_create("ip_fib_trie", LEAF_SIZE, 0, SLAB_PANIC | SLAB_ACCOUNT, NULL); } struct fib_table *fib_trie_table(u32 id, struct fib_table *alias) { struct fib_table *tb; struct trie *t; size_t sz = sizeof(*tb); if (!alias) sz += sizeof(struct trie); tb = kzalloc(sz, GFP_KERNEL); if (!tb) return NULL; tb->tb_id = id; tb->tb_num_default = 0; tb->tb_data = (alias ? alias->__data : tb->__data); if (alias) return tb; t = (struct trie *) tb->tb_data; t->kv[0].pos = KEYLENGTH; t->kv[0].slen = KEYLENGTH; #ifdef CONFIG_IP_FIB_TRIE_STATS t->stats = alloc_percpu(struct trie_use_stats); if (!t->stats) { kfree(tb); tb = NULL; } #endif return tb; } #ifdef CONFIG_PROC_FS /* Depth first Trie walk iterator */ struct fib_trie_iter { struct seq_net_private p; struct fib_table *tb; struct key_vector *tnode; unsigned int index; unsigned int depth; }; static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter) { unsigned long cindex = iter->index; struct key_vector *pn = iter->tnode; t_key pkey; pr_debug("get_next iter={node=%p index=%d depth=%d}\n", iter->tnode, iter->index, iter->depth); while (!IS_TRIE(pn)) { while (cindex < child_length(pn)) { struct key_vector *n = get_child_rcu(pn, cindex++); if (!n) continue; if (IS_LEAF(n)) { iter->tnode = pn; iter->index = cindex; } else { /* push down one level */ iter->tnode = n; iter->index = 0; ++iter->depth; } return n; } /* Current node exhausted, pop back up */ pkey = pn->key; pn = node_parent_rcu(pn); cindex = get_index(pkey, pn) + 1; --iter->depth; } /* record root node so further searches know we are done */ iter->tnode = pn; iter->index = 0; return NULL; } static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter, struct trie *t) { struct key_vector *n, *pn; if (!t) return NULL; pn = t->kv; n = rcu_dereference(pn->tnode[0]); if (!n) return NULL; if (IS_TNODE(n)) { iter->tnode = n; iter->index = 0; iter->depth = 1; } else { iter->tnode = pn; iter->index = 0; iter->depth = 0; } return n; } static void trie_collect_stats(struct trie *t, struct trie_stat *s) { struct key_vector *n; struct fib_trie_iter iter; memset(s, 0, sizeof(*s)); rcu_read_lock(); for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) { if (IS_LEAF(n)) { struct fib_alias *fa; s->leaves++; s->totdepth += iter.depth; if (iter.depth > s->maxdepth) s->maxdepth = iter.depth; hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) ++s->prefixes; } else { s->tnodes++; if (n->bits < MAX_STAT_DEPTH) s->nodesizes[n->bits]++; s->nullpointers += tn_info(n)->empty_children; } } rcu_read_unlock(); } /* * This outputs /proc/net/fib_triestats */ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) { unsigned int i, max, pointers, bytes, avdepth; if (stat->leaves) avdepth = stat->totdepth*100 / stat->leaves; else avdepth = 0; seq_printf(seq, "\tAver depth: %u.%02d\n", avdepth / 100, avdepth % 100); seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth); seq_printf(seq, "\tLeaves: %u\n", stat->leaves); bytes = LEAF_SIZE * stat->leaves; seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes); bytes += sizeof(struct fib_alias) * stat->prefixes; seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes); bytes += TNODE_SIZE(0) * stat->tnodes; max = MAX_STAT_DEPTH; while (max > 0 && stat->nodesizes[max-1] == 0) max--; pointers = 0; for (i = 1; i < max; i++) if (stat->nodesizes[i] != 0) { seq_printf(seq, " %u: %u", i, stat->nodesizes[i]); pointers += (1<<i) * stat->nodesizes[i]; } seq_putc(seq, '\n'); seq_printf(seq, "\tPointers: %u\n", pointers); bytes += sizeof(struct key_vector *) * pointers; seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); } #ifdef CONFIG_IP_FIB_TRIE_STATS static void trie_show_usage(struct seq_file *seq, const struct trie_use_stats __percpu *stats) { struct trie_use_stats s = { 0 }; int cpu; /* loop through all of the CPUs and gather up the stats */ for_each_possible_cpu(cpu) { const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu); s.gets += pcpu->gets; s.backtrack += pcpu->backtrack; s.semantic_match_passed += pcpu->semantic_match_passed; s.semantic_match_miss += pcpu->semantic_match_miss; s.null_node_hit += pcpu->null_node_hit; s.resize_node_skipped += pcpu->resize_node_skipped; } seq_printf(seq, "\nCounters:\n---------\n"); seq_printf(seq, "gets = %u\n", s.gets); seq_printf(seq, "backtracks = %u\n", s.backtrack); seq_printf(seq, "semantic match passed = %u\n", s.semantic_match_passed); seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss); seq_printf(seq, "null node hit= %u\n", s.null_node_hit); seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped); } #endif /* CONFIG_IP_FIB_TRIE_STATS */ static void fib_table_print(struct seq_file *seq, struct fib_table *tb) { if (tb->tb_id == RT_TABLE_LOCAL) seq_puts(seq, "Local:\n"); else if (tb->tb_id == RT_TABLE_MAIN) seq_puts(seq, "Main:\n"); else seq_printf(seq, "Id %d:\n", tb->tb_id); } static int fib_triestat_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; unsigned int h; seq_printf(seq, "Basic info: size of leaf:" " %zd bytes, size of tnode: %zd bytes.\n", LEAF_SIZE, TNODE_SIZE(0)); rcu_read_lock(); for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist) { struct trie *t = (struct trie *) tb->tb_data; struct trie_stat stat; if (!t) continue; fib_table_print(seq, tb); trie_collect_stats(t, &stat); trie_show_stats(seq, &stat); #ifdef CONFIG_IP_FIB_TRIE_STATS trie_show_usage(seq, t->stats); #endif } cond_resched_rcu(); } rcu_read_unlock(); return 0; } static struct key_vector *fib_trie_get_idx(struct seq_file *seq, loff_t pos) { struct fib_trie_iter *iter = seq->private; struct net *net = seq_file_net(seq); loff_t idx = 0; unsigned int h; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist) { struct key_vector *n; for (n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); n; n = fib_trie_get_next(iter)) if (pos == idx++) { iter->tb = tb; return n; } } } return NULL; } static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); return fib_trie_get_idx(seq, *pos); } static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct fib_trie_iter *iter = seq->private; struct net *net = seq_file_net(seq); struct fib_table *tb = iter->tb; struct hlist_node *tb_node; unsigned int h; struct key_vector *n; ++*pos; /* next node in same table */ n = fib_trie_get_next(iter); if (n) return n; /* walk rest of this hash chain */ h = tb->tb_id & (FIB_TABLE_HASHSZ - 1); while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) { tb = hlist_entry(tb_node, struct fib_table, tb_hlist); n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); if (n) goto found; } /* new hash chain */ while (++h < FIB_TABLE_HASHSZ) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb_hlist) { n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); if (n) goto found; } } return NULL; found: iter->tb = tb; return n; } static void fib_trie_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static void seq_indent(struct seq_file *seq, int n) { while (n-- > 0) seq_puts(seq, " "); } static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s) { switch (s) { case RT_SCOPE_UNIVERSE: return "universe"; case RT_SCOPE_SITE: return "site"; case RT_SCOPE_LINK: return "link"; case RT_SCOPE_HOST: return "host"; case RT_SCOPE_NOWHERE: return "nowhere"; default: snprintf(buf, len, "scope=%d", s); return buf; } } static const char *const rtn_type_names[__RTN_MAX] = { [RTN_UNSPEC] = "UNSPEC", [RTN_UNICAST] = "UNICAST", [RTN_LOCAL] = "LOCAL", [RTN_BROADCAST] = "BROADCAST", [RTN_ANYCAST] = "ANYCAST", [RTN_MULTICAST] = "MULTICAST", [RTN_BLACKHOLE] = "BLACKHOLE", [RTN_UNREACHABLE] = "UNREACHABLE", [RTN_PROHIBIT] = "PROHIBIT", [RTN_THROW] = "THROW", [RTN_NAT] = "NAT", [RTN_XRESOLVE] = "XRESOLVE", }; static inline const char *rtn_type(char *buf, size_t len, unsigned int t) { if (t < __RTN_MAX && rtn_type_names[t]) return rtn_type_names[t]; snprintf(buf, len, "type %u", t); return buf; } /* Pretty print the trie */ static int fib_trie_seq_show(struct seq_file *seq, void *v) { const struct fib_trie_iter *iter = seq->private; struct key_vector *n = v; if (IS_TRIE(node_parent_rcu(n))) fib_table_print(seq, iter->tb); if (IS_TNODE(n)) { __be32 prf = htonl(n->key); seq_indent(seq, iter->depth-1); seq_printf(seq, " +-- %pI4/%zu %u %u %u\n", &prf, KEYLENGTH - n->pos - n->bits, n->bits, tn_info(n)->full_children, tn_info(n)->empty_children); } else { __be32 val = htonl(n->key); struct fib_alias *fa; seq_indent(seq, iter->depth); seq_printf(seq, " |-- %pI4\n", &val); hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) { char buf1[32], buf2[32]; seq_indent(seq, iter->depth + 1); seq_printf(seq, " /%zu %s %s", KEYLENGTH - fa->fa_slen, rtn_scope(buf1, sizeof(buf1), fa->fa_info->fib_scope), rtn_type(buf2, sizeof(buf2), fa->fa_type)); if (fa->fa_dscp) seq_printf(seq, " tos=%d", inet_dscp_to_dsfield(fa->fa_dscp)); seq_putc(seq, '\n'); } } return 0; } static const struct seq_operations fib_trie_seq_ops = { .start = fib_trie_seq_start, .next = fib_trie_seq_next, .stop = fib_trie_seq_stop, .show = fib_trie_seq_show, }; struct fib_route_iter { struct seq_net_private p; struct fib_table *main_tb; struct key_vector *tnode; loff_t pos; t_key key; }; static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos) { struct key_vector *l, **tp = &iter->tnode; t_key key; /* use cached location of previously found key */ if (iter->pos > 0 && pos >= iter->pos) { key = iter->key; } else { iter->pos = 1; key = 0; } pos -= iter->pos; while ((l = leaf_walk_rcu(tp, key)) && (pos-- > 0)) { key = l->key + 1; iter->pos++; l = NULL; /* handle unlikely case of a key wrap */ if (!key) break; } if (l) iter->key = l->key; /* remember it */ else iter->pos = 0; /* forget it */ return l; } static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct fib_route_iter *iter = seq->private; struct fib_table *tb; struct trie *t; rcu_read_lock(); tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN); if (!tb) return NULL; iter->main_tb = tb; t = (struct trie *)tb->tb_data; iter->tnode = t->kv; if (*pos != 0) return fib_route_get_idx(iter, *pos); iter->pos = 0; iter->key = KEY_MAX; return SEQ_START_TOKEN; } static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct fib_route_iter *iter = seq->private; struct key_vector *l = NULL; t_key key = iter->key + 1; ++*pos; /* only allow key of 0 for start of sequence */ if ((v == SEQ_START_TOKEN) || key) l = leaf_walk_rcu(&iter->tnode, key); if (l) { iter->key = l->key; iter->pos++; } else { iter->pos = 0; } return l; } static void fib_route_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static unsigned int fib_flag_trans(int type, __be32 mask, struct fib_info *fi) { unsigned int flags = 0; if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT) flags = RTF_REJECT; if (fi) { const struct fib_nh_common *nhc = fib_info_nhc(fi, 0); if (nhc->nhc_gw.ipv4) flags |= RTF_GATEWAY; } if (mask == htonl(0xFFFFFFFF)) flags |= RTF_HOST; flags |= RTF_UP; return flags; } /* * This outputs /proc/net/route. * The format of the file is not supposed to be changed * and needs to be same as fib_hash output to avoid breaking * legacy utilities */ static int fib_route_seq_show(struct seq_file *seq, void *v) { struct fib_route_iter *iter = seq->private; struct fib_table *tb = iter->main_tb; struct fib_alias *fa; struct key_vector *l = v; __be32 prefix; if (v == SEQ_START_TOKEN) { seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway " "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU" "\tWindow\tIRTT"); return 0; } prefix = htonl(l->key); hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { struct fib_info *fi = fa->fa_info; __be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen); unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi); if ((fa->fa_type == RTN_BROADCAST) || (fa->fa_type == RTN_MULTICAST)) continue; if (fa->tb_id != tb->tb_id) continue; seq_setwidth(seq, 127); if (fi) { struct fib_nh_common *nhc = fib_info_nhc(fi, 0); __be32 gw = 0; if (nhc->nhc_gw_family == AF_INET) gw = nhc->nhc_gw.ipv4; seq_printf(seq, "%s\t%08X\t%08X\t%04X\t%d\t%u\t" "%u\t%08X\t%d\t%u\t%u", nhc->nhc_dev ? nhc->nhc_dev->name : "*", prefix, gw, flags, 0, 0, fi->fib_priority, mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0), fi->fib_window, fi->fib_rtt >> 3); } else { seq_printf(seq, "*\t%08X\t%08X\t%04X\t%d\t%u\t" "%u\t%08X\t%d\t%u\t%u", prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0); } seq_pad(seq, '\n'); } return 0; } static const struct seq_operations fib_route_seq_ops = { .start = fib_route_seq_start, .next = fib_route_seq_next, .stop = fib_route_seq_stop, .show = fib_route_seq_show, }; int __net_init fib_proc_init(struct net *net) { if (!proc_create_net("fib_trie", 0444, net->proc_net, &fib_trie_seq_ops, sizeof(struct fib_trie_iter))) goto out1; if (!proc_create_net_single("fib_triestat", 0444, net->proc_net, fib_triestat_seq_show, NULL)) goto out2; if (!proc_create_net("route", 0444, net->proc_net, &fib_route_seq_ops, sizeof(struct fib_route_iter))) goto out3; return 0; out3: remove_proc_entry("fib_triestat", net->proc_net); out2: remove_proc_entry("fib_trie", net->proc_net); out1: return -ENOMEM; } void __net_exit fib_proc_exit(struct net *net) { remove_proc_entry("fib_trie", net->proc_net); remove_proc_entry("fib_triestat", net->proc_net); remove_proc_entry("route", net->proc_net); } #endif /* CONFIG_PROC_FS */ |
| 64 78 122 2 138 139 139 143 95 15 87 68 14 57 1 2 4 4 3 4 7 84 88 80 80 79 110 1 128 130 90 2 162 78 103 57 12 58 80 83 64 86 86 85 68 120 121 116 107 45 153 202 1 6 1 7 123 48 123 138 1 171 2 2 14 1 1 5 5 194 17 60 54 50 115 72 64 35 136 94 109 176 125 9 91 102 15 125 17 17 25 4 47 79 75 99 11 11 183 102 6 15 6 106 206 8 22 80 81 61 15 2 15 2 126 1 1 3 3 10 70 3 1 1 3 3 1 1 100 1 1 1 7 1 2 54 2 1 96 107 122 147 82 272 259 277 88 2 94 94 94 94 1 99 44 20 106 104 20 17 20 20 95 12 92 47 3 4 89 2 2 111 17 5 204 12 17 85 85 2 2 1 43 31 31 27 7 3 28 21 3 2 9 27 158 158 138 128 138 124 18 126 116 92 93 4 4 1 6 1 1 1 13 63 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the TCP module. * * Version: @(#)tcp.h 1.0.5 05/23/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _TCP_H #define _TCP_H #define FASTRETRANS_DEBUG 1 #include <linux/list.h> #include <linux/tcp.h> #include <linux/bug.h> #include <linux/slab.h> #include <linux/cache.h> #include <linux/percpu.h> #include <linux/skbuff.h> #include <linux/kref.h> #include <linux/ktime.h> #include <linux/indirect_call_wrapper.h> #include <linux/bits.h> #include <net/inet_connection_sock.h> #include <net/inet_timewait_sock.h> #include <net/inet_hashtables.h> #include <net/checksum.h> #include <net/request_sock.h> #include <net/sock_reuseport.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ip.h> #include <net/tcp_states.h> #include <net/tcp_ao.h> #include <net/inet_ecn.h> #include <net/dst.h> #include <net/mptcp.h> #include <net/xfrm.h> #include <linux/seq_file.h> #include <linux/memcontrol.h> #include <linux/bpf-cgroup.h> #include <linux/siphash.h> extern struct inet_hashinfo tcp_hashinfo; DECLARE_PER_CPU(unsigned int, tcp_orphan_count); int tcp_orphan_count_sum(void); DECLARE_PER_CPU(u32, tcp_tw_isn); void tcp_time_wait(struct sock *sk, int state, int timeo); #define MAX_TCP_HEADER L1_CACHE_ALIGN(128 + MAX_HEADER) #define MAX_TCP_OPTION_SPACE 40 #define TCP_MIN_SND_MSS 48 #define TCP_MIN_GSO_SIZE (TCP_MIN_SND_MSS - MAX_TCP_OPTION_SPACE) /* * Never offer a window over 32767 without using window scaling. Some * poor stacks do signed 16bit maths! */ #define MAX_TCP_WINDOW 32767U /* Minimal accepted MSS. It is (60+60+8) - (20+20). */ #define TCP_MIN_MSS 88U /* The initial MTU to use for probing */ #define TCP_BASE_MSS 1024 /* probing interval, default to 10 minutes as per RFC4821 */ #define TCP_PROBE_INTERVAL 600 /* Specify interval when tcp mtu probing will stop */ #define TCP_PROBE_THRESHOLD 8 /* After receiving this amount of duplicate ACKs fast retransmit starts. */ #define TCP_FASTRETRANS_THRESH 3 /* Maximal number of ACKs sent quickly to accelerate slow-start. */ #define TCP_MAX_QUICKACKS 16U /* Maximal number of window scale according to RFC1323 */ #define TCP_MAX_WSCALE 14U /* urg_data states */ #define TCP_URG_VALID 0x0100 #define TCP_URG_NOTYET 0x0200 #define TCP_URG_READ 0x0400 #define TCP_RETR1 3 /* * This is how many retries it does before it * tries to figure out if the gateway is * down. Minimal RFC value is 3; it corresponds * to ~3sec-8min depending on RTO. */ #define TCP_RETR2 15 /* * This should take at least * 90 minutes to time out. * RFC1122 says that the limit is 100 sec. * 15 is ~13-30min depending on RTO. */ #define TCP_SYN_RETRIES 6 /* This is how many retries are done * when active opening a connection. * RFC1122 says the minimum retry MUST * be at least 180secs. Nevertheless * this value is corresponding to * 63secs of retransmission with the * current initial RTO. */ #define TCP_SYNACK_RETRIES 5 /* This is how may retries are done * when passive opening a connection. * This is corresponding to 31secs of * retransmission with the current * initial RTO. */ #define TCP_TIMEWAIT_LEN (60*HZ) /* how long to wait to destroy TIME-WAIT * state, about 60 seconds */ #define TCP_FIN_TIMEOUT TCP_TIMEWAIT_LEN /* BSD style FIN_WAIT2 deadlock breaker. * It used to be 3min, new value is 60sec, * to combine FIN-WAIT-2 timeout with * TIME-WAIT timer. */ #define TCP_FIN_TIMEOUT_MAX (120 * HZ) /* max TCP_LINGER2 value (two minutes) */ #define TCP_DELACK_MAX ((unsigned)(HZ/5)) /* maximal time to delay before sending an ACK */ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #if HZ >= 100 #define TCP_DELACK_MIN ((unsigned)(HZ/25)) /* minimal time to delay before sending an ACK */ #define TCP_ATO_MIN ((unsigned)(HZ/25)) #else #define TCP_DELACK_MIN 4U #define TCP_ATO_MIN 4U #endif #define TCP_RTO_MAX_SEC 120 #define TCP_RTO_MAX ((unsigned)(TCP_RTO_MAX_SEC * HZ)) #define TCP_RTO_MIN ((unsigned)(HZ / 5)) #define TCP_TIMEOUT_MIN (2U) /* Min timeout for TCP timers in jiffies */ #define TCP_TIMEOUT_MIN_US (2*USEC_PER_MSEC) /* Min TCP timeout in microsecs */ #define TCP_TIMEOUT_INIT ((unsigned)(1*HZ)) /* RFC6298 2.1 initial RTO value */ #define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ)) /* RFC 1122 initial RTO value, now * used as a fallback RTO for the * initial data transmission if no * valid RTT sample has been acquired, * most likely due to retrans in 3WHS. */ #define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes * for local resources. */ #define TCP_KEEPALIVE_TIME (120*60*HZ) /* two hours */ #define TCP_KEEPALIVE_PROBES 9 /* Max of 9 keepalive probes */ #define TCP_KEEPALIVE_INTVL (75*HZ) #define MAX_TCP_KEEPIDLE 32767 #define MAX_TCP_KEEPINTVL 32767 #define MAX_TCP_KEEPCNT 127 #define MAX_TCP_SYNCNT 127 /* Ensure that TCP PAWS checks are relaxed after ~2147 seconds * to avoid overflows. This assumes a clock smaller than 1 Mhz. * Default clock is 1 Khz, tcp_usec_ts uses 1 Mhz. */ #define TCP_PAWS_WRAP (INT_MAX / USEC_PER_SEC) #define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated * after this time. It should be equal * (or greater than) TCP_TIMEWAIT_LEN * to provide reliability equal to one * provided by timewait state. */ #define TCP_PAWS_WINDOW 1 /* Replay window for per-host * timestamps. It must be less than * minimal timewait lifetime. */ /* * TCP option */ #define TCPOPT_NOP 1 /* Padding */ #define TCPOPT_EOL 0 /* End of options */ #define TCPOPT_MSS 2 /* Segment size negotiating */ #define TCPOPT_WINDOW 3 /* Window scaling */ #define TCPOPT_SACK_PERM 4 /* SACK Permitted */ #define TCPOPT_SACK 5 /* SACK Block */ #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */ #define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */ #define TCPOPT_AO 29 /* Authentication Option (RFC5925) */ #define TCPOPT_MPTCP 30 /* Multipath TCP (RFC6824) */ #define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */ #define TCPOPT_EXP 254 /* Experimental */ /* Magic number to be after the option value for sharing TCP * experimental options. See draft-ietf-tcpm-experimental-options-00.txt */ #define TCPOPT_FASTOPEN_MAGIC 0xF989 #define TCPOPT_SMC_MAGIC 0xE2D4C3D9 /* * TCP option lengths */ #define TCPOLEN_MSS 4 #define TCPOLEN_WINDOW 3 #define TCPOLEN_SACK_PERM 2 #define TCPOLEN_TIMESTAMP 10 #define TCPOLEN_MD5SIG 18 #define TCPOLEN_FASTOPEN_BASE 2 #define TCPOLEN_EXP_FASTOPEN_BASE 4 #define TCPOLEN_EXP_SMC_BASE 6 /* But this is what stacks really send out. */ #define TCPOLEN_TSTAMP_ALIGNED 12 #define TCPOLEN_WSCALE_ALIGNED 4 #define TCPOLEN_SACKPERM_ALIGNED 4 #define TCPOLEN_SACK_BASE 2 #define TCPOLEN_SACK_BASE_ALIGNED 4 #define TCPOLEN_SACK_PERBLOCK 8 #define TCPOLEN_MD5SIG_ALIGNED 20 #define TCPOLEN_MSS_ALIGNED 4 #define TCPOLEN_EXP_SMC_BASE_ALIGNED 8 /* Flags in tp->nonagle */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ #define TCP_NAGLE_CORK 2 /* Socket is corked */ #define TCP_NAGLE_PUSH 4 /* Cork is overridden for already queued data */ /* TCP thin-stream limits */ #define TCP_THIN_LINEAR_RETRIES 6 /* After 6 linear retries, do exp. backoff */ /* TCP initial congestion window as per rfc6928 */ #define TCP_INIT_CWND 10 /* Bit Flags for sysctl_tcp_fastopen */ #define TFO_CLIENT_ENABLE 1 #define TFO_SERVER_ENABLE 2 #define TFO_CLIENT_NO_COOKIE 4 /* Data in SYN w/o cookie option */ /* Accept SYN data w/o any cookie option */ #define TFO_SERVER_COOKIE_NOT_REQD 0x200 /* Force enable TFO on all listeners, i.e., not requiring the * TCP_FASTOPEN socket option. */ #define TFO_SERVER_WO_SOCKOPT1 0x400 /* sysctl variables for tcp */ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ #define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */ #define TCP_RACK_NO_DUPTHRESH 0x4 /* Do not use DUPACK threshold in RACK */ DECLARE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc); extern struct percpu_counter tcp_sockets_allocated; extern unsigned long tcp_memory_pressure; /* optimized version of sk_under_memory_pressure() for TCP sockets */ static inline bool tcp_under_memory_pressure(const struct sock *sk) { if (mem_cgroup_sockets_enabled && sk->sk_memcg && mem_cgroup_under_socket_pressure(sk->sk_memcg)) return true; return READ_ONCE(tcp_memory_pressure); } /* * The next routines deal with comparing 32 bit unsigned ints * and worry about wraparound (automatic with unsigned arithmetic). */ static inline bool before(__u32 seq1, __u32 seq2) { return (__s32)(seq1-seq2) < 0; } #define after(seq2, seq1) before(seq1, seq2) /* is s2<=s1<=s3 ? */ static inline bool between(__u32 seq1, __u32 seq2, __u32 seq3) { return seq3 - seq2 >= seq1 - seq2; } static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb) { sk_wmem_queued_add(sk, -skb->truesize); if (!skb_zcopy_pure(skb)) sk_mem_uncharge(sk, skb->truesize); else sk_mem_uncharge(sk, SKB_TRUESIZE(skb_end_offset(skb))); __kfree_skb(skb); } void sk_forced_mem_schedule(struct sock *sk, int size); bool tcp_check_oom(const struct sock *sk, int shift); extern struct proto tcp_prot; #define TCP_INC_STATS(net, field) SNMP_INC_STATS((net)->mib.tcp_statistics, field) #define __TCP_INC_STATS(net, field) __SNMP_INC_STATS((net)->mib.tcp_statistics, field) #define TCP_DEC_STATS(net, field) SNMP_DEC_STATS((net)->mib.tcp_statistics, field) #define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val) void tcp_tsq_work_init(void); int tcp_v4_err(struct sk_buff *skb, u32); void tcp_shutdown(struct sock *sk, int how); int tcp_v4_early_demux(struct sk_buff *skb); int tcp_v4_rcv(struct sk_buff *skb); void tcp_remove_empty_skb(struct sock *sk); int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size); int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, size_t size, struct ubuf_info *uarg); void tcp_splice_eof(struct socket *sock); int tcp_send_mss(struct sock *sk, int *size_goal, int flags); int tcp_wmem_schedule(struct sock *sk, int copy); void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle, int size_goal); void tcp_release_cb(struct sock *sk); void tcp_wfree(struct sk_buff *skb); void tcp_write_timer_handler(struct sock *sk); void tcp_delack_timer_handler(struct sock *sk); int tcp_ioctl(struct sock *sk, int cmd, int *karg); enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); void tcp_twsk_destructor(struct sock *sk); void tcp_twsk_purge(struct list_head *net_exit_list); ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp, bool force_schedule); static inline void tcp_dec_quickack_mode(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ack.quick) { /* How many ACKs S/ACKing new data have we sent? */ const unsigned int pkts = inet_csk_ack_scheduled(sk) ? 1 : 0; if (pkts >= icsk->icsk_ack.quick) { icsk->icsk_ack.quick = 0; /* Leaving quickack mode we deflate ATO. */ icsk->icsk_ack.ato = TCP_ATO_MIN; } else icsk->icsk_ack.quick -= pkts; } } #define TCP_ECN_MODE_RFC3168 BIT(0) #define TCP_ECN_QUEUE_CWR BIT(1) #define TCP_ECN_DEMAND_CWR BIT(2) #define TCP_ECN_SEEN BIT(3) #define TCP_ECN_MODE_ACCECN BIT(4) #define TCP_ECN_DISABLED 0 #define TCP_ECN_MODE_PENDING (TCP_ECN_MODE_RFC3168 | TCP_ECN_MODE_ACCECN) #define TCP_ECN_MODE_ANY (TCP_ECN_MODE_RFC3168 | TCP_ECN_MODE_ACCECN) static inline bool tcp_ecn_mode_any(const struct tcp_sock *tp) { return tp->ecn_flags & TCP_ECN_MODE_ANY; } static inline bool tcp_ecn_mode_rfc3168(const struct tcp_sock *tp) { return (tp->ecn_flags & TCP_ECN_MODE_ANY) == TCP_ECN_MODE_RFC3168; } static inline bool tcp_ecn_mode_accecn(const struct tcp_sock *tp) { return (tp->ecn_flags & TCP_ECN_MODE_ANY) == TCP_ECN_MODE_ACCECN; } static inline bool tcp_ecn_disabled(const struct tcp_sock *tp) { return !tcp_ecn_mode_any(tp); } static inline bool tcp_ecn_mode_pending(const struct tcp_sock *tp) { return (tp->ecn_flags & TCP_ECN_MODE_PENDING) == TCP_ECN_MODE_PENDING; } static inline void tcp_ecn_mode_set(struct tcp_sock *tp, u8 mode) { tp->ecn_flags &= ~TCP_ECN_MODE_ANY; tp->ecn_flags |= mode; } enum tcp_tw_status { TCP_TW_SUCCESS = 0, TCP_TW_RST = 1, TCP_TW_ACK = 2, TCP_TW_SYN = 3, TCP_TW_ACK_OOW = 4 }; enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, const struct tcphdr *th, u32 *tw_isn, enum skb_drop_reason *drop_reason); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, bool fastopen, bool *lost_race, enum skb_drop_reason *drop_reason); enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child, struct sk_buff *skb); void tcp_enter_loss(struct sock *sk); void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag); void tcp_clear_retrans(struct tcp_sock *tp); void tcp_update_metrics(struct sock *sk); void tcp_init_metrics(struct sock *sk); void tcp_metrics_init(void); bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst); void __tcp_close(struct sock *sk, long timeout); void tcp_close(struct sock *sk, long timeout); void tcp_init_sock(struct sock *sk); void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb); __poll_t tcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int do_tcp_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); bool tcp_bpf_bypass_getsockopt(int level, int optname); int do_tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); void tcp_reset_keepalive_timer(struct sock *sk, unsigned long timeout); void tcp_set_keepalive(struct sock *sk, int val); void tcp_syn_ack_timeout(const struct request_sock *req); int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); int tcp_set_rcvlowat(struct sock *sk, int val); int tcp_set_window_clamp(struct sock *sk, int val); void tcp_update_recv_tstamps(struct sk_buff *skb, struct scm_timestamping_internal *tss); void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, struct scm_timestamping_internal *tss); void tcp_data_ready(struct sock *sk); #ifdef CONFIG_MMU int tcp_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma); #endif void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc); /* * BPF SKB-less helpers */ u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, struct tcphdr *th, u32 *cookie); u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph, struct tcphdr *th, u32 *cookie); u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss); u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct tcphdr *th); /* * TCP v4 functions exported for the inet6 API */ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb); void tcp_v4_mtu_reduced(struct sock *sk); void tcp_req_err(struct sock *sk, u32 seq, bool abort); void tcp_ld_RTO_revert(struct sock *sk, u32 seq); int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb); struct sock *tcp_create_openreq_child(const struct sock *sk, struct request_sock *req, struct sk_buff *skb); void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst); struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, bool *own_req); int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb); int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int tcp_connect(struct sock *sk); enum tcp_synack_type { TCP_SYNACK_NORMAL, TCP_SYNACK_FASTOPEN, TCP_SYNACK_COOKIE, }; struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb); int tcp_disconnect(struct sock *sk, int flags); void tcp_finish_connect(struct sock *sk, struct sk_buff *skb); int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size); void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb); /* From syncookies.c */ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst); int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th); struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb); struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk, struct sk_buff *skb, struct tcp_options_received *tcp_opt, int mss, u32 tsoff); #if IS_ENABLED(CONFIG_BPF) struct bpf_tcp_req_attrs { u32 rcv_tsval; u32 rcv_tsecr; u16 mss; u8 rcv_wscale; u8 snd_wscale; u8 ecn_ok; u8 wscale_ok; u8 sack_ok; u8 tstamp_ok; u8 usec_ts_ok; u8 reserved[3]; }; #endif #ifdef CONFIG_SYN_COOKIES /* Syncookies use a monotonic timer which increments every 60 seconds. * This counter is used both as a hash input and partially encoded into * the cookie value. A cookie is only validated further if the delta * between the current counter value and the encoded one is less than this, * i.e. a sent cookie is valid only at most for 2*60 seconds (or less if * the counter advances immediately after a cookie is generated). */ #define MAX_SYNCOOKIE_AGE 2 #define TCP_SYNCOOKIE_PERIOD (60 * HZ) #define TCP_SYNCOOKIE_VALID (MAX_SYNCOOKIE_AGE * TCP_SYNCOOKIE_PERIOD) /* syncookies: remember time of last synqueue overflow * But do not dirty this field too often (once per second is enough) * It is racy as we do not hold a lock, but race is very minor. */ static inline void tcp_synq_overflow(const struct sock *sk) { unsigned int last_overflow; unsigned int now = jiffies; if (sk->sk_reuseport) { struct sock_reuseport *reuse; reuse = rcu_dereference(sk->sk_reuseport_cb); if (likely(reuse)) { last_overflow = READ_ONCE(reuse->synq_overflow_ts); if (!time_between32(now, last_overflow, last_overflow + HZ)) WRITE_ONCE(reuse->synq_overflow_ts, now); return; } } last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp); if (!time_between32(now, last_overflow, last_overflow + HZ)) WRITE_ONCE(tcp_sk_rw(sk)->rx_opt.ts_recent_stamp, now); } /* syncookies: no recent synqueue overflow on this listening socket? */ static inline bool tcp_synq_no_recent_overflow(const struct sock *sk) { unsigned int last_overflow; unsigned int now = jiffies; if (sk->sk_reuseport) { struct sock_reuseport *reuse; reuse = rcu_dereference(sk->sk_reuseport_cb); if (likely(reuse)) { last_overflow = READ_ONCE(reuse->synq_overflow_ts); return !time_between32(now, last_overflow - HZ, last_overflow + TCP_SYNCOOKIE_VALID); } } last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp); /* If last_overflow <= jiffies <= last_overflow + TCP_SYNCOOKIE_VALID, * then we're under synflood. However, we have to use * 'last_overflow - HZ' as lower bound. That's because a concurrent * tcp_synq_overflow() could update .ts_recent_stamp after we read * jiffies but before we store .ts_recent_stamp into last_overflow, * which could lead to rejecting a valid syncookie. */ return !time_between32(now, last_overflow - HZ, last_overflow + TCP_SYNCOOKIE_VALID); } static inline u32 tcp_cookie_time(void) { u64 val = get_jiffies_64(); do_div(val, TCP_SYNCOOKIE_PERIOD); return val; } /* Convert one nsec 64bit timestamp to ts (ms or usec resolution) */ static inline u64 tcp_ns_to_ts(bool usec_ts, u64 val) { if (usec_ts) return div_u64(val, NSEC_PER_USEC); return div_u64(val, NSEC_PER_MSEC); } u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, u16 *mssp); __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss); u64 cookie_init_timestamp(struct request_sock *req, u64 now); bool cookie_timestamp_decode(const struct net *net, struct tcp_options_received *opt); static inline bool cookie_ecn_ok(const struct net *net, const struct dst_entry *dst) { return READ_ONCE(net->ipv4.sysctl_tcp_ecn) || dst_feature(dst, RTAX_FEATURE_ECN); } #if IS_ENABLED(CONFIG_BPF) static inline bool cookie_bpf_ok(struct sk_buff *skb) { return skb->sk; } struct request_sock *cookie_bpf_check(struct sock *sk, struct sk_buff *skb); #else static inline bool cookie_bpf_ok(struct sk_buff *skb) { return false; } static inline struct request_sock *cookie_bpf_check(struct net *net, struct sock *sk, struct sk_buff *skb) { return NULL; } #endif /* From net/ipv6/syncookies.c */ int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th); struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb); u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, const struct tcphdr *th, u16 *mssp); __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss); #endif /* tcp_output.c */ void tcp_skb_entail(struct sock *sk, struct sk_buff *skb); void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb); void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, int nonagle); int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); void tcp_retransmit_timer(struct sock *sk); void tcp_xmit_retransmit_queue(struct sock *); void tcp_simple_retransmit(struct sock *); void tcp_enter_recovery(struct sock *sk, bool ece_ack); int tcp_trim_head(struct sock *, struct sk_buff *, u32); enum tcp_queue { TCP_FRAG_IN_WRITE_QUEUE, TCP_FRAG_IN_RTX_QUEUE, }; int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, struct sk_buff *skb, u32 len, unsigned int mss_now, gfp_t gfp); void tcp_send_probe0(struct sock *); int tcp_write_wakeup(struct sock *, int mib); void tcp_send_fin(struct sock *sk); void tcp_send_active_reset(struct sock *sk, gfp_t priority, enum sk_rst_reason reason); int tcp_send_synack(struct sock *); void tcp_push_one(struct sock *, unsigned int mss_now); void __tcp_send_ack(struct sock *sk, u32 rcv_nxt, u16 flags); void tcp_send_ack(struct sock *sk); void tcp_send_delayed_ack(struct sock *sk); void tcp_send_loss_probe(struct sock *sk); bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto); void tcp_skb_collapse_tstamp(struct sk_buff *skb, const struct sk_buff *next_skb); /* tcp_input.c */ void tcp_rearm_rto(struct sock *sk); void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); void tcp_done_with_error(struct sock *sk, int err); void tcp_reset(struct sock *sk, struct sk_buff *skb); void tcp_fin(struct sock *sk); void tcp_check_space(struct sock *sk); void tcp_sack_compress_send_ack(struct sock *sk); static inline void tcp_cleanup_skb(struct sk_buff *skb) { skb_dst_drop(skb); secpath_reset(skb); } static inline void tcp_add_receive_queue(struct sock *sk, struct sk_buff *skb) { DEBUG_NET_WARN_ON_ONCE(skb_dst(skb)); DEBUG_NET_WARN_ON_ONCE(secpath_exists(skb)); __skb_queue_tail(&sk->sk_receive_queue, skb); } /* tcp_timer.c */ void tcp_init_xmit_timers(struct sock *); static inline void tcp_clear_xmit_timers(struct sock *sk) { if (hrtimer_try_to_cancel(&tcp_sk(sk)->pacing_timer) == 1) __sock_put(sk); if (hrtimer_try_to_cancel(&tcp_sk(sk)->compressed_ack_timer) == 1) __sock_put(sk); inet_csk_clear_xmit_timers(sk); } unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu); unsigned int tcp_current_mss(struct sock *sk); u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when); /* Bound MSS / TSO packet size with the half of the window */ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) { int cutoff; /* When peer uses tiny windows, there is no use in packetizing * to sub-MSS pieces for the sake of SWS or making sure there * are enough packets in the pipe for fast recovery. * * On the other hand, for extremely large MSS devices, handling * smaller than MSS windows in this way does make sense. */ if (tp->max_window > TCP_MSS_DEFAULT) cutoff = (tp->max_window >> 1); else cutoff = tp->max_window; if (cutoff && pktsize > cutoff) return max_t(int, cutoff, 68U - tp->tcp_header_len); else return pktsize; } /* tcp.c */ void tcp_get_info(struct sock *, struct tcp_info *); /* Read 'sendfile()'-style from a TCP socket */ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); int tcp_read_sock_noack(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor, bool noack, u32 *copied_seq); int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor); struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off); void tcp_read_done(struct sock *sk, size_t len); void tcp_initialize_rcv_mss(struct sock *sk); int tcp_mtu_to_mss(struct sock *sk, int pmtu); int tcp_mss_to_mtu(struct sock *sk, int mss); void tcp_mtup_init(struct sock *sk); static inline unsigned int tcp_rto_max(const struct sock *sk) { return READ_ONCE(inet_csk(sk)->icsk_rto_max); } static inline void tcp_bound_rto(struct sock *sk) { inet_csk(sk)->icsk_rto = min(inet_csk(sk)->icsk_rto, tcp_rto_max(sk)); } static inline u32 __tcp_set_rto(const struct tcp_sock *tp) { return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us); } static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) { /* mptcp hooks are only on the slow path */ if (sk_is_mptcp((struct sock *)tp)) return; tp->pred_flags = htonl((tp->tcp_header_len << 26) | ntohl(TCP_FLAG_ACK) | snd_wnd); } static inline void tcp_fast_path_on(struct tcp_sock *tp) { __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale); } static inline void tcp_fast_path_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (RB_EMPTY_ROOT(&tp->out_of_order_queue) && tp->rcv_wnd && atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf && !tp->urg_data) tcp_fast_path_on(tp); } u32 tcp_delack_max(const struct sock *sk); /* Compute the actual rto_min value */ static inline u32 tcp_rto_min(const struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); u32 rto_min = READ_ONCE(inet_csk(sk)->icsk_rto_min); if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN); return rto_min; } static inline u32 tcp_rto_min_us(const struct sock *sk) { return jiffies_to_usecs(tcp_rto_min(sk)); } static inline bool tcp_ca_dst_locked(const struct dst_entry *dst) { return dst_metric_locked(dst, RTAX_CC_ALGO); } /* Minimum RTT in usec. ~0 means not available. */ static inline u32 tcp_min_rtt(const struct tcp_sock *tp) { return minmax_get(&tp->rtt_min); } /* Compute the actual receive window we are currently advertising. * Rcv_nxt can be after the window if our peer push more data * than the offered window. */ static inline u32 tcp_receive_window(const struct tcp_sock *tp) { s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt; if (win < 0) win = 0; return (u32) win; } /* Choose a new window, without checks for shrinking, and without * scaling applied to the result. The caller does these things * if necessary. This is a "raw" window selection. */ u32 __tcp_select_window(struct sock *sk); void tcp_send_window_probe(struct sock *sk); /* TCP uses 32bit jiffies to save some space. * Note that this is different from tcp_time_stamp, which * historically has been the same until linux-4.13. */ #define tcp_jiffies32 ((u32)jiffies) /* * Deliver a 32bit value for TCP timestamp option (RFC 7323) * It is no longer tied to jiffies, but to 1 ms clock. * Note: double check if you want to use tcp_jiffies32 instead of this. */ #define TCP_TS_HZ 1000 static inline u64 tcp_clock_ns(void) { return ktime_get_ns(); } static inline u64 tcp_clock_us(void) { return div_u64(tcp_clock_ns(), NSEC_PER_USEC); } static inline u64 tcp_clock_ms(void) { return div_u64(tcp_clock_ns(), NSEC_PER_MSEC); } /* TCP Timestamp included in TS option (RFC 1323) can either use ms * or usec resolution. Each socket carries a flag to select one or other * resolution, as the route attribute could change anytime. * Each flow must stick to initial resolution. */ static inline u32 tcp_clock_ts(bool usec_ts) { return usec_ts ? tcp_clock_us() : tcp_clock_ms(); } static inline u32 tcp_time_stamp_ms(const struct tcp_sock *tp) { return div_u64(tp->tcp_mstamp, USEC_PER_MSEC); } static inline u32 tcp_time_stamp_ts(const struct tcp_sock *tp) { if (tp->tcp_usec_ts) return tp->tcp_mstamp; return tcp_time_stamp_ms(tp); } void tcp_mstamp_refresh(struct tcp_sock *tp); static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) { return max_t(s64, t1 - t0, 0); } /* provide the departure time in us unit */ static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) { return div_u64(skb->skb_mstamp_ns, NSEC_PER_USEC); } /* Provide skb TSval in usec or ms unit */ static inline u32 tcp_skb_timestamp_ts(bool usec_ts, const struct sk_buff *skb) { if (usec_ts) return tcp_skb_timestamp_us(skb); return div_u64(skb->skb_mstamp_ns, NSEC_PER_MSEC); } static inline u32 tcp_tw_tsval(const struct tcp_timewait_sock *tcptw) { return tcp_clock_ts(tcptw->tw_sk.tw_usec_ts) + tcptw->tw_ts_offset; } static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) { return tcp_clock_ts(treq->req_usec_ts) + treq->ts_off; } #define tcp_flag_byte(th) (((u_int8_t *)th)[13]) #define TCPHDR_FIN BIT(0) #define TCPHDR_SYN BIT(1) #define TCPHDR_RST BIT(2) #define TCPHDR_PSH BIT(3) #define TCPHDR_ACK BIT(4) #define TCPHDR_URG BIT(5) #define TCPHDR_ECE BIT(6) #define TCPHDR_CWR BIT(7) #define TCPHDR_AE BIT(8) #define TCPHDR_FLAGS_MASK (TCPHDR_FIN | TCPHDR_SYN | TCPHDR_RST | \ TCPHDR_PSH | TCPHDR_ACK | TCPHDR_URG | \ TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE) #define tcp_flags_ntohs(th) (ntohs(*(__be16 *)&tcp_flag_word(th)) & \ TCPHDR_FLAGS_MASK) #define TCPHDR_ACE (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE) #define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR) /* State flags for sacked in struct tcp_skb_cb */ enum tcp_skb_cb_sacked_flags { TCPCB_SACKED_ACKED = (1 << 0), /* SKB ACK'd by a SACK block */ TCPCB_SACKED_RETRANS = (1 << 1), /* SKB retransmitted */ TCPCB_LOST = (1 << 2), /* SKB is lost */ TCPCB_TAGBITS = (TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS | TCPCB_LOST), /* All tag bits */ TCPCB_REPAIRED = (1 << 4), /* SKB repaired (no skb_mstamp_ns) */ TCPCB_EVER_RETRANS = (1 << 7), /* Ever retransmitted frame */ TCPCB_RETRANS = (TCPCB_SACKED_RETRANS | TCPCB_EVER_RETRANS | TCPCB_REPAIRED), }; /* This is what the send packet queuing engine uses to pass * TCP per-packet control information to the transmission code. * We also store the host-order sequence numbers in here too. * This is 44 bytes if IPV6 is enabled. * If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately. */ struct tcp_skb_cb { __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ union { /* Note : * tcp_gso_segs/size are used in write queue only, * cf tcp_skb_pcount()/tcp_skb_mss() */ struct { u16 tcp_gso_segs; u16 tcp_gso_size; }; }; __u16 tcp_flags; /* TCP header flags (tcp[12-13])*/ __u8 sacked; /* State flags for SACK. */ __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ #define TSTAMP_ACK_SK 0x1 #define TSTAMP_ACK_BPF 0x2 __u8 txstamp_ack:2, /* Record TX timestamp for ack? */ eor:1, /* Is skb MSG_EOR marked? */ has_rxtstamp:1, /* SKB has a RX timestamp */ unused:4; __u32 ack_seq; /* Sequence number ACK'd */ union { struct { #define TCPCB_DELIVERED_CE_MASK ((1U<<20) - 1) /* There is space for up to 24 bytes */ __u32 is_app_limited:1, /* cwnd not fully used? */ delivered_ce:20, unused:11; /* pkts S/ACKed so far upon tx of skb, incl retrans: */ __u32 delivered; /* start of send pipeline phase */ u64 first_tx_mstamp; /* when we reached the "delivered" count */ u64 delivered_mstamp; } tx; /* only used for outgoing skbs */ union { struct inet_skb_parm h4; #if IS_ENABLED(CONFIG_IPV6) struct inet6_skb_parm h6; #endif } header; /* For incoming skbs */ }; }; #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) extern const struct inet_connection_sock_af_ops ipv4_specific; #if IS_ENABLED(CONFIG_IPV6) /* This is the variant of inet6_iif() that must be used by TCP, * as TCP moves IP6CB into a different location in skb->cb[] */ static inline int tcp_v6_iif(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->header.h6.iif; } static inline int tcp_v6_iif_l3_slave(const struct sk_buff *skb) { bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags); return l3_slave ? skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif; } /* TCP_SKB_CB reference means this can not be used from early demux */ static inline int tcp_v6_sdif(const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (skb && ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags)) return TCP_SKB_CB(skb)->header.h6.iif; #endif return 0; } extern const struct inet_connection_sock_af_ops ipv6_specific; INDIRECT_CALLABLE_DECLARE(void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb)); INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *skb)); void tcp_v6_early_demux(struct sk_buff *skb); #endif /* TCP_SKB_CB reference means this can not be used from early demux */ static inline int tcp_v4_sdif(struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (skb && ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags)) return TCP_SKB_CB(skb)->header.h4.iif; #endif return 0; } /* Due to TSO, an SKB can be composed of multiple actual * packets. To keep these tracked properly, we use this. */ static inline int tcp_skb_pcount(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->tcp_gso_segs; } static inline void tcp_skb_pcount_set(struct sk_buff *skb, int segs) { TCP_SKB_CB(skb)->tcp_gso_segs = segs; } static inline void tcp_skb_pcount_add(struct sk_buff *skb, int segs) { TCP_SKB_CB(skb)->tcp_gso_segs += segs; } /* This is valid iff skb is in write queue and tcp_skb_pcount() > 1. */ static inline int tcp_skb_mss(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->tcp_gso_size; } static inline bool tcp_skb_can_collapse_to(const struct sk_buff *skb) { return likely(!TCP_SKB_CB(skb)->eor); } static inline bool tcp_skb_can_collapse(const struct sk_buff *to, const struct sk_buff *from) { /* skb_cmp_decrypted() not needed, use tcp_write_collapse_fence() */ return likely(tcp_skb_can_collapse_to(to) && mptcp_skb_can_collapse(to, from) && skb_pure_zcopy_same(to, from) && skb_frags_readable(to) == skb_frags_readable(from)); } static inline bool tcp_skb_can_collapse_rx(const struct sk_buff *to, const struct sk_buff *from) { return likely(mptcp_skb_can_collapse(to, from) && !skb_cmp_decrypted(to, from)); } /* Events passed to congestion control interface */ enum tcp_ca_event { CA_EVENT_TX_START, /* first transmit when no packets in flight */ CA_EVENT_CWND_RESTART, /* congestion window restart */ CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */ CA_EVENT_LOSS, /* loss timeout */ CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ }; /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ enum tcp_ca_ack_event_flags { CA_ACK_SLOWPATH = (1 << 0), /* In slow path processing */ CA_ACK_WIN_UPDATE = (1 << 1), /* ACK updated window */ CA_ACK_ECE = (1 << 2), /* ECE bit is set on ack */ }; /* * Interface for adding new TCP congestion control handlers */ #define TCP_CA_NAME_MAX 16 #define TCP_CA_MAX 128 #define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX) #define TCP_CA_UNSPEC 0 /* Algorithm can be set on socket without CAP_NET_ADMIN privileges */ #define TCP_CONG_NON_RESTRICTED BIT(0) /* Requires ECN/ECT set on all packets */ #define TCP_CONG_NEEDS_ECN BIT(1) #define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) union tcp_cc_info; struct ack_sample { u32 pkts_acked; s32 rtt_us; u32 in_flight; }; /* A rate sample measures the number of (original/retransmitted) data * packets delivered "delivered" over an interval of time "interval_us". * The tcp_rate.c code fills in the rate sample, and congestion * control modules that define a cong_control function to run at the end * of ACK processing can optionally chose to consult this sample when * setting cwnd and pacing rate. * A sample is invalid if "delivered" or "interval_us" is negative. */ struct rate_sample { u64 prior_mstamp; /* starting timestamp for interval */ u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ s32 delivered; /* number of packets delivered over interval */ s32 delivered_ce; /* number of packets delivered w/ CE marks*/ long interval_us; /* time for tp->delivered to incr "delivered" */ u32 snd_interval_us; /* snd interval for delivered packets */ u32 rcv_interval_us; /* rcv interval for delivered packets */ long rtt_us; /* RTT of last (S)ACKed packet (or -1) */ int losses; /* number of packets marked lost upon ACK */ u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */ u32 prior_in_flight; /* in flight before this ACK */ u32 last_end_seq; /* end_seq of most recently ACKed packet */ bool is_app_limited; /* is sample from packet with bubble in pipe? */ bool is_retrans; /* is sample from retransmission? */ bool is_ack_delayed; /* is this (likely) a delayed ACK? */ }; struct tcp_congestion_ops { /* fast path fields are put first to fill one cache line */ /* return slow start threshold (required) */ u32 (*ssthresh)(struct sock *sk); /* do new cwnd calculation (required) */ void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked); /* call before changing ca_state (optional) */ void (*set_state)(struct sock *sk, u8 new_state); /* call when cwnd event occurs (optional) */ void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev); /* call when ack arrives (optional) */ void (*in_ack_event)(struct sock *sk, u32 flags); /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); /* override sysctl_tcp_min_tso_segs */ u32 (*min_tso_segs)(struct sock *sk); /* call when packets are delivered to update cwnd and pacing rate, * after all the ca_state processing. (optional) */ void (*cong_control)(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs); /* new value of cwnd after loss (required) */ u32 (*undo_cwnd)(struct sock *sk); /* returns the multiplier used in tcp_sndbuf_expand (optional) */ u32 (*sndbuf_expand)(struct sock *sk); /* control/slow paths put last */ /* get info for inet_diag (optional) */ size_t (*get_info)(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info); char name[TCP_CA_NAME_MAX]; struct module *owner; struct list_head list; u32 key; u32 flags; /* initialize private data (optional) */ void (*init)(struct sock *sk); /* cleanup private data (optional) */ void (*release)(struct sock *sk); } ____cacheline_aligned_in_smp; int tcp_register_congestion_control(struct tcp_congestion_ops *type); void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); int tcp_update_congestion_control(struct tcp_congestion_ops *type, struct tcp_congestion_ops *old_type); int tcp_validate_congestion_control(struct tcp_congestion_ops *ca); void tcp_assign_congestion_control(struct sock *sk); void tcp_init_congestion_control(struct sock *sk); void tcp_cleanup_congestion_control(struct sock *sk); int tcp_set_default_congestion_control(struct net *net, const char *name); void tcp_get_default_congestion_control(struct net *net, char *name); void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool cap_net_admin); u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); u32 tcp_reno_ssthresh(struct sock *sk); u32 tcp_reno_undo_cwnd(struct sock *sk); void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); extern struct tcp_congestion_ops tcp_reno; struct tcp_congestion_ops *tcp_ca_find(const char *name); struct tcp_congestion_ops *tcp_ca_find_key(u32 key); u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca); #ifdef CONFIG_INET char *tcp_ca_get_name_by_key(u32 key, char *buffer); #else static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) { return NULL; } #endif static inline bool tcp_ca_needs_ecn(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN; } static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) { const struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_ops->cwnd_event) icsk->icsk_ca_ops->cwnd_event(sk, event); } /* From tcp_cong.c */ void tcp_set_ca_state(struct sock *sk, const u8 ca_state); /* From tcp_rate.c */ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct rate_sample *rs); void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, bool is_sack_reneg, struct rate_sample *rs); void tcp_rate_check_app_limited(struct sock *sk); static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) { return t1 > t2 || (t1 == t2 && after(seq1, seq2)); } /* These functions determine how the current flow behaves in respect of SACK * handling. SACK is negotiated with the peer, and therefore it can vary * between different flows. * * tcp_is_sack - SACK enabled * tcp_is_reno - No SACK */ static inline int tcp_is_sack(const struct tcp_sock *tp) { return likely(tp->rx_opt.sack_ok); } static inline bool tcp_is_reno(const struct tcp_sock *tp) { return !tcp_is_sack(tp); } static inline unsigned int tcp_left_out(const struct tcp_sock *tp) { return tp->sacked_out + tp->lost_out; } /* This determines how many packets are "in the network" to the best * of our knowledge. In many cases it is conservative, but where * detailed information is available from the receiver (via SACK * blocks etc.) we can make more aggressive calculations. * * Use this for decisions involving congestion control, use just * tp->packets_out to determine if the send queue is empty or not. * * Read this equation as: * * "Packets sent once on transmission queue" MINUS * "Packets left network, but not honestly ACKed yet" PLUS * "Packets fast retransmitted" */ static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) { return tp->packets_out - tcp_left_out(tp) + tp->retrans_out; } #define TCP_INFINITE_SSTHRESH 0x7fffffff static inline u32 tcp_snd_cwnd(const struct tcp_sock *tp) { return tp->snd_cwnd; } static inline void tcp_snd_cwnd_set(struct tcp_sock *tp, u32 val) { WARN_ON_ONCE((int)val <= 0); tp->snd_cwnd = val; } static inline bool tcp_in_slow_start(const struct tcp_sock *tp) { return tcp_snd_cwnd(tp) < tp->snd_ssthresh; } static inline bool tcp_in_initial_slowstart(const struct tcp_sock *tp) { return tp->snd_ssthresh >= TCP_INFINITE_SSTHRESH; } static inline bool tcp_in_cwnd_reduction(const struct sock *sk) { return (TCPF_CA_CWR | TCPF_CA_Recovery) & (1 << inet_csk(sk)->icsk_ca_state); } /* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd. * The exception is cwnd reduction phase, when cwnd is decreasing towards * ssthresh. */ static inline __u32 tcp_current_ssthresh(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); if (tcp_in_cwnd_reduction(sk)) return tp->snd_ssthresh; else return max(tp->snd_ssthresh, ((tcp_snd_cwnd(tp) >> 1) + (tcp_snd_cwnd(tp) >> 2))); } /* Use define here intentionally to get WARN_ON location shown at the caller */ #define tcp_verify_left_out(tp) WARN_ON(tcp_left_out(tp) > tp->packets_out) void tcp_enter_cwr(struct sock *sk); __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst); /* The maximum number of MSS of available cwnd for which TSO defers * sending if not using sysctl_tcp_tso_win_divisor. */ static inline __u32 tcp_max_tso_deferred_mss(const struct tcp_sock *tp) { return 3; } /* Returns end sequence number of the receiver's advertised window */ static inline u32 tcp_wnd_end(const struct tcp_sock *tp) { return tp->snd_una + tp->snd_wnd; } /* We follow the spirit of RFC2861 to validate cwnd but implement a more * flexible approach. The RFC suggests cwnd should not be raised unless * it was fully used previously. And that's exactly what we do in * congestion avoidance mode. But in slow start we allow cwnd to grow * as long as the application has used half the cwnd. * Example : * cwnd is 10 (IW10), but application sends 9 frames. * We allow cwnd to reach 18 when all frames are ACKed. * This check is safe because it's as aggressive as slow start which already * risks 100% overshoot. The advantage is that we discourage application to * either send more filler packets or data to artificially blow up the cwnd * usage, and allow application-limited process to probe bw more aggressively. */ static inline bool tcp_is_cwnd_limited(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); if (tp->is_cwnd_limited) return true; /* If in slow start, ensure cwnd grows to twice what was ACKed. */ if (tcp_in_slow_start(tp)) return tcp_snd_cwnd(tp) < 2 * tp->max_packets_out; return false; } /* BBR congestion control needs pacing. * Same remark for SO_MAX_PACING_RATE. * sch_fq packet scheduler is efficiently handling pacing, * but is not always installed/used. * Return true if TCP stack should pace packets itself. */ static inline bool tcp_needs_internal_pacing(const struct sock *sk) { return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED; } /* Estimates in how many jiffies next packet for this flow can be sent. * Scheduling a retransmit timer too early would be silly. */ static inline unsigned long tcp_pacing_delay(const struct sock *sk) { s64 delay = tcp_sk(sk)->tcp_wstamp_ns - tcp_sk(sk)->tcp_clock_cache; return delay > 0 ? nsecs_to_jiffies(delay) : 0; } static inline void tcp_reset_xmit_timer(struct sock *sk, const int what, unsigned long when, bool pace_delay) { if (pace_delay) when += tcp_pacing_delay(sk); inet_csk_reset_xmit_timer(sk, what, when, tcp_rto_max(sk)); } /* Something is really bad, we could not queue an additional packet, * because qdisc is full or receiver sent a 0 window, or we are paced. * We do not want to add fuel to the fire, or abort too early, * so make sure the timer we arm now is at least 200ms in the future, * regardless of current icsk_rto value (as it could be ~2ms) */ static inline unsigned long tcp_probe0_base(const struct sock *sk) { return max_t(unsigned long, inet_csk(sk)->icsk_rto, TCP_RTO_MIN); } /* Variant of inet_csk_rto_backoff() used for zero window probes */ static inline unsigned long tcp_probe0_when(const struct sock *sk, unsigned long max_when) { u8 backoff = min_t(u8, ilog2(TCP_RTO_MAX / TCP_RTO_MIN) + 1, inet_csk(sk)->icsk_backoff); u64 when = (u64)tcp_probe0_base(sk) << backoff; return (unsigned long)min_t(u64, when, max_when); } static inline void tcp_check_probe_timer(struct sock *sk) { if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending) tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, tcp_probe0_base(sk), true); } static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq) { tp->snd_wl1 = seq; } static inline void tcp_update_wl(struct tcp_sock *tp, u32 seq) { tp->snd_wl1 = seq; } /* * Calculate(/check) TCP checksum */ static inline __sum16 tcp_v4_check(int len, __be32 saddr, __be32 daddr, __wsum base) { return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_TCP, base); } static inline bool tcp_checksum_complete(struct sk_buff *skb) { return !skb_csum_unnecessary(skb) && __skb_checksum_complete(skb); } bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason); int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason); void tcp_set_state(struct sock *sk, int state); void tcp_done(struct sock *sk); int tcp_abort(struct sock *sk, int err); static inline void tcp_sack_reset(struct tcp_options_received *rx_opt) { rx_opt->dsack = 0; rx_opt->num_sacks = 0; } void tcp_cwnd_restart(struct sock *sk, s32 delta); static inline void tcp_slow_start_after_idle_check(struct sock *sk) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; struct tcp_sock *tp = tcp_sk(sk); s32 delta; if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) || tp->packets_out || ca_ops->cong_control) return; delta = tcp_jiffies32 - tp->lsndtime; if (delta > inet_csk(sk)->icsk_rto) tcp_cwnd_restart(sk, delta); } /* Determine a window scaling and initial window to offer. */ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd); static inline int __tcp_win_from_space(u8 scaling_ratio, int space) { s64 scaled_space = (s64)space * scaling_ratio; return scaled_space >> TCP_RMEM_TO_WIN_SCALE; } static inline int tcp_win_from_space(const struct sock *sk, int space) { return __tcp_win_from_space(tcp_sk(sk)->scaling_ratio, space); } /* inverse of __tcp_win_from_space() */ static inline int __tcp_space_from_win(u8 scaling_ratio, int win) { u64 val = (u64)win << TCP_RMEM_TO_WIN_SCALE; do_div(val, scaling_ratio); return val; } static inline int tcp_space_from_win(const struct sock *sk, int win) { return __tcp_space_from_win(tcp_sk(sk)->scaling_ratio, win); } /* Assume a 50% default for skb->len/skb->truesize ratio. * This may be adjusted later in tcp_measure_rcv_mss(). */ #define TCP_DEFAULT_SCALING_RATIO (1 << (TCP_RMEM_TO_WIN_SCALE - 1)) static inline void tcp_scaling_ratio_init(struct sock *sk) { tcp_sk(sk)->scaling_ratio = TCP_DEFAULT_SCALING_RATIO; } /* Note: caller must be prepared to deal with negative returns */ static inline int tcp_space(const struct sock *sk) { return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - READ_ONCE(sk->sk_backlog.len) - atomic_read(&sk->sk_rmem_alloc)); } static inline int tcp_full_space(const struct sock *sk) { return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf)); } static inline void __tcp_adjust_rcv_ssthresh(struct sock *sk, u32 new_ssthresh) { int unused_mem = sk_unused_reserved_mem(sk); struct tcp_sock *tp = tcp_sk(sk); tp->rcv_ssthresh = min(tp->rcv_ssthresh, new_ssthresh); if (unused_mem) tp->rcv_ssthresh = max_t(u32, tp->rcv_ssthresh, tcp_win_from_space(sk, unused_mem)); } static inline void tcp_adjust_rcv_ssthresh(struct sock *sk) { __tcp_adjust_rcv_ssthresh(sk, 4U * tcp_sk(sk)->advmss); } void tcp_cleanup_rbuf(struct sock *sk, int copied); void __tcp_cleanup_rbuf(struct sock *sk, int copied); /* We provision sk_rcvbuf around 200% of sk_rcvlowat. * If 87.5 % (7/8) of the space has been consumed, we want to override * SO_RCVLOWAT constraint, since we are receiving skbs with too small * len/truesize ratio. */ static inline bool tcp_rmem_pressure(const struct sock *sk) { int rcvbuf, threshold; if (tcp_under_memory_pressure(sk)) return true; rcvbuf = READ_ONCE(sk->sk_rcvbuf); threshold = rcvbuf - (rcvbuf >> 3); return atomic_read(&sk->sk_rmem_alloc) > threshold; } static inline bool tcp_epollin_ready(const struct sock *sk, int target) { const struct tcp_sock *tp = tcp_sk(sk); int avail = READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq); if (avail <= 0) return false; return (avail >= target) || tcp_rmem_pressure(sk) || (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss); } extern void tcp_openreq_init_rwin(struct request_sock *req, const struct sock *sk_listener, const struct dst_entry *dst); void tcp_enter_memory_pressure(struct sock *sk); void tcp_leave_memory_pressure(struct sock *sk); static inline int keepalive_intvl_when(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); int val; /* Paired with WRITE_ONCE() in tcp_sock_set_keepintvl() * and do_tcp_setsockopt(). */ val = READ_ONCE(tp->keepalive_intvl); return val ? : READ_ONCE(net->ipv4.sysctl_tcp_keepalive_intvl); } static inline int keepalive_time_when(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); int val; /* Paired with WRITE_ONCE() in tcp_sock_set_keepidle_locked() */ val = READ_ONCE(tp->keepalive_time); return val ? : READ_ONCE(net->ipv4.sysctl_tcp_keepalive_time); } static inline int keepalive_probes(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); int val; /* Paired with WRITE_ONCE() in tcp_sock_set_keepcnt() * and do_tcp_setsockopt(). */ val = READ_ONCE(tp->keepalive_probes); return val ? : READ_ONCE(net->ipv4.sysctl_tcp_keepalive_probes); } static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp) { const struct inet_connection_sock *icsk = &tp->inet_conn; return min_t(u32, tcp_jiffies32 - icsk->icsk_ack.lrcvtime, tcp_jiffies32 - tp->rcv_tstamp); } static inline int tcp_fin_time(const struct sock *sk) { int fin_timeout = tcp_sk(sk)->linger2 ? : READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fin_timeout); const int rto = inet_csk(sk)->icsk_rto; if (fin_timeout < (rto << 2) - (rto >> 1)) fin_timeout = (rto << 2) - (rto >> 1); return fin_timeout; } static inline bool tcp_paws_check(const struct tcp_options_received *rx_opt, int paws_win) { if ((s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win) return true; if (unlikely(!time_before32(ktime_get_seconds(), rx_opt->ts_recent_stamp + TCP_PAWS_WRAP))) return true; /* * Some OSes send SYN and SYNACK messages with tsval=0 tsecr=0, * then following tcp messages have valid values. Ignore 0 value, * or else 'negative' tsval might forbid us to accept their packets. */ if (!rx_opt->ts_recent) return true; return false; } static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt, int rst) { if (tcp_paws_check(rx_opt, 0)) return false; /* RST segments are not recommended to carry timestamp, and, if they do, it is recommended to ignore PAWS because "their cleanup function should take precedence over timestamps." Certainly, it is mistake. It is necessary to understand the reasons of this constraint to relax it: if peer reboots, clock may go out-of-sync and half-open connections will not be reset. Actually, the problem would be not existing if all the implementations followed draft about maintaining clock via reboots. Linux-2.2 DOES NOT! However, we can relax time bounds for RST segments to MSL. */ if (rst && !time_before32(ktime_get_seconds(), rx_opt->ts_recent_stamp + TCP_PAWS_MSL)) return false; return true; } bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, int mib_idx, u32 *last_oow_ack_time); static inline void tcp_mib_init(struct net *net) { /* See RFC 2012 */ TCP_ADD_STATS(net, TCP_MIB_RTOALGORITHM, 1); TCP_ADD_STATS(net, TCP_MIB_RTOMIN, TCP_RTO_MIN*1000/HZ); TCP_ADD_STATS(net, TCP_MIB_RTOMAX, TCP_RTO_MAX*1000/HZ); TCP_ADD_STATS(net, TCP_MIB_MAXCONN, -1); } /* from STCP */ static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) { tp->retransmit_skb_hint = NULL; } #define tcp_md5_addr tcp_ao_addr /* - key database */ struct tcp_md5sig_key { struct hlist_node node; u8 keylen; u8 family; /* AF_INET or AF_INET6 */ u8 prefixlen; u8 flags; union tcp_md5_addr addr; int l3index; /* set if key added with L3 scope */ u8 key[TCP_MD5SIG_MAXKEYLEN]; struct rcu_head rcu; }; /* - sock block */ struct tcp_md5sig_info { struct hlist_head head; struct rcu_head rcu; }; /* - pseudo header */ struct tcp4_pseudohdr { __be32 saddr; __be32 daddr; __u8 pad; __u8 protocol; __be16 len; }; struct tcp6_pseudohdr { struct in6_addr saddr; struct in6_addr daddr; __be32 len; __be32 protocol; /* including padding */ }; union tcp_md5sum_block { struct tcp4_pseudohdr ip4; #if IS_ENABLED(CONFIG_IPV6) struct tcp6_pseudohdr ip6; #endif }; /* * struct tcp_sigpool - per-CPU pool of ahash_requests * @scratch: per-CPU temporary area, that can be used between * tcp_sigpool_start() and tcp_sigpool_end() to perform * crypto request * @req: pre-allocated ahash request */ struct tcp_sigpool { void *scratch; struct ahash_request *req; }; int tcp_sigpool_alloc_ahash(const char *alg, size_t scratch_size); void tcp_sigpool_get(unsigned int id); void tcp_sigpool_release(unsigned int id); int tcp_sigpool_hash_skb_data(struct tcp_sigpool *hp, const struct sk_buff *skb, unsigned int header_len); /** * tcp_sigpool_start - disable bh and start using tcp_sigpool_ahash * @id: tcp_sigpool that was previously allocated by tcp_sigpool_alloc_ahash() * @c: returned tcp_sigpool for usage (uninitialized on failure) * * Returns: 0 on success, error otherwise. */ int tcp_sigpool_start(unsigned int id, struct tcp_sigpool *c); /** * tcp_sigpool_end - enable bh and stop using tcp_sigpool * @c: tcp_sigpool context that was returned by tcp_sigpool_start() */ void tcp_sigpool_end(struct tcp_sigpool *c); size_t tcp_sigpool_algo(unsigned int id, char *buf, size_t buf_len); /* - functions */ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, const struct sock *sk, const struct sk_buff *skb); int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags, const u8 *newkey, u8 newkeylen); int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, struct tcp_md5sig_key *key); int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags); void tcp_clear_md5_list(struct sock *sk); struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, const struct sock *addr_sk); #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family, bool any_l3index); static inline struct tcp_md5sig_key * tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family) { if (!static_branch_unlikely(&tcp_md5_needed.key)) return NULL; return __tcp_md5_do_lookup(sk, l3index, addr, family, false); } static inline struct tcp_md5sig_key * tcp_md5_do_lookup_any_l3index(const struct sock *sk, const union tcp_md5_addr *addr, int family) { if (!static_branch_unlikely(&tcp_md5_needed.key)) return NULL; return __tcp_md5_do_lookup(sk, 0, addr, family, true); } #define tcp_twsk_md5_key(twsk) ((twsk)->tw_md5_key) #else static inline struct tcp_md5sig_key * tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family) { return NULL; } static inline struct tcp_md5sig_key * tcp_md5_do_lookup_any_l3index(const struct sock *sk, const union tcp_md5_addr *addr, int family) { return NULL; } #define tcp_twsk_md5_key(twsk) NULL #endif int tcp_md5_alloc_sigpool(void); void tcp_md5_release_sigpool(void); void tcp_md5_add_sigpool(void); extern int tcp_md5_sigpool_id; int tcp_md5_hash_key(struct tcp_sigpool *hp, const struct tcp_md5sig_key *key); /* From tcp_fastopen.c */ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie); void tcp_fastopen_cache_set(struct sock *sk, u16 mss, struct tcp_fastopen_cookie *cookie, bool syn_lost, u16 try_exp); struct tcp_fastopen_request { /* Fast Open cookie. Size 0 means a cookie request */ struct tcp_fastopen_cookie cookie; struct msghdr *data; /* data in MSG_FASTOPEN */ size_t size; int copied; /* queued in tcp_connect() */ struct ubuf_info *uarg; }; void tcp_free_fastopen_req(struct tcp_sock *tp); void tcp_fastopen_destroy_cipher(struct sock *sk); void tcp_fastopen_ctx_destroy(struct net *net); int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, void *primary_key, void *backup_key); int tcp_fastopen_get_cipher(struct net *net, struct inet_connection_sock *icsk, u64 *key); void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb); struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct tcp_fastopen_cookie *foc, const struct dst_entry *dst); void tcp_fastopen_init_key_once(struct net *net); bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie); bool tcp_fastopen_defer_connect(struct sock *sk, int *err); #define TCP_FASTOPEN_KEY_LENGTH sizeof(siphash_key_t) #define TCP_FASTOPEN_KEY_MAX 2 #define TCP_FASTOPEN_KEY_BUF_LENGTH \ (TCP_FASTOPEN_KEY_LENGTH * TCP_FASTOPEN_KEY_MAX) /* Fastopen key context */ struct tcp_fastopen_context { siphash_key_t key[TCP_FASTOPEN_KEY_MAX]; int num; struct rcu_head rcu; }; void tcp_fastopen_active_disable(struct sock *sk); bool tcp_fastopen_active_should_disable(struct sock *sk); void tcp_fastopen_active_disable_ofo_check(struct sock *sk); void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired); /* Caller needs to wrap with rcu_read_(un)lock() */ static inline struct tcp_fastopen_context *tcp_fastopen_get_ctx(const struct sock *sk) { struct tcp_fastopen_context *ctx; ctx = rcu_dereference(inet_csk(sk)->icsk_accept_queue.fastopenq.ctx); if (!ctx) ctx = rcu_dereference(sock_net(sk)->ipv4.tcp_fastopen_ctx); return ctx; } static inline bool tcp_fastopen_cookie_match(const struct tcp_fastopen_cookie *foc, const struct tcp_fastopen_cookie *orig) { if (orig->len == TCP_FASTOPEN_COOKIE_SIZE && orig->len == foc->len && !memcmp(orig->val, foc->val, foc->len)) return true; return false; } static inline int tcp_fastopen_context_len(const struct tcp_fastopen_context *ctx) { return ctx->num; } /* Latencies incurred by various limits for a sender. They are * chronograph-like stats that are mutually exclusive. */ enum tcp_chrono { TCP_CHRONO_UNSPEC, TCP_CHRONO_BUSY, /* Actively sending data (non-empty write queue) */ TCP_CHRONO_RWND_LIMITED, /* Stalled by insufficient receive window */ TCP_CHRONO_SNDBUF_LIMITED, /* Stalled by insufficient send buffer */ __TCP_CHRONO_MAX, }; void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type); void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type); /* This helper is needed, because skb->tcp_tsorted_anchor uses * the same memory storage than skb->destructor/_skb_refdst */ static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb) { skb->destructor = NULL; skb->_skb_refdst = 0UL; } #define tcp_skb_tsorted_save(skb) { \ unsigned long _save = skb->_skb_refdst; \ skb->_skb_refdst = 0UL; #define tcp_skb_tsorted_restore(skb) \ skb->_skb_refdst = _save; \ } void tcp_write_queue_purge(struct sock *sk); static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk) { return skb_rb_first(&sk->tcp_rtx_queue); } static inline struct sk_buff *tcp_rtx_queue_tail(const struct sock *sk) { return skb_rb_last(&sk->tcp_rtx_queue); } static inline struct sk_buff *tcp_write_queue_tail(const struct sock *sk) { return skb_peek_tail(&sk->sk_write_queue); } #define tcp_for_write_queue_from_safe(skb, tmp, sk) \ skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp) static inline struct sk_buff *tcp_send_head(const struct sock *sk) { return skb_peek(&sk->sk_write_queue); } static inline bool tcp_skb_is_last(const struct sock *sk, const struct sk_buff *skb) { return skb_queue_is_last(&sk->sk_write_queue, skb); } /** * tcp_write_queue_empty - test if any payload (or FIN) is available in write queue * @sk: socket * * Since the write queue can have a temporary empty skb in it, * we must not use "return skb_queue_empty(&sk->sk_write_queue)" */ static inline bool tcp_write_queue_empty(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); return tp->write_seq == tp->snd_nxt; } static inline bool tcp_rtx_queue_empty(const struct sock *sk) { return RB_EMPTY_ROOT(&sk->tcp_rtx_queue); } static inline bool tcp_rtx_and_write_queues_empty(const struct sock *sk) { return tcp_rtx_queue_empty(sk) && tcp_write_queue_empty(sk); } static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb) { __skb_queue_tail(&sk->sk_write_queue, skb); /* Queue it, remembering where we must start sending. */ if (sk->sk_write_queue.next == skb) tcp_chrono_start(sk, TCP_CHRONO_BUSY); } /* Insert new before skb on the write queue of sk. */ static inline void tcp_insert_write_queue_before(struct sk_buff *new, struct sk_buff *skb, struct sock *sk) { __skb_queue_before(&sk->sk_write_queue, skb, new); } static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk) { tcp_skb_tsorted_anchor_cleanup(skb); __skb_unlink(skb, &sk->sk_write_queue); } void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb); static inline void tcp_rtx_queue_unlink(struct sk_buff *skb, struct sock *sk) { tcp_skb_tsorted_anchor_cleanup(skb); rb_erase(&skb->rbnode, &sk->tcp_rtx_queue); } static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct sock *sk) { list_del(&skb->tcp_tsorted_anchor); tcp_rtx_queue_unlink(skb, sk); tcp_wmem_free_skb(sk, skb); } static inline void tcp_write_collapse_fence(struct sock *sk) { struct sk_buff *skb = tcp_write_queue_tail(sk); if (skb) TCP_SKB_CB(skb)->eor = 1; } static inline void tcp_push_pending_frames(struct sock *sk) { if (tcp_send_head(sk)) { struct tcp_sock *tp = tcp_sk(sk); __tcp_push_pending_frames(sk, tcp_current_mss(sk), tp->nonagle); } } /* Start sequence of the skb just after the highest skb with SACKed * bit, valid only if sacked_out > 0 or when the caller has ensured * validity by itself. */ static inline u32 tcp_highest_sack_seq(struct tcp_sock *tp) { if (!tp->sacked_out) return tp->snd_una; if (tp->highest_sack == NULL) return tp->snd_nxt; return TCP_SKB_CB(tp->highest_sack)->seq; } static inline void tcp_advance_highest_sack(struct sock *sk, struct sk_buff *skb) { tcp_sk(sk)->highest_sack = skb_rb_next(skb); } static inline struct sk_buff *tcp_highest_sack(struct sock *sk) { return tcp_sk(sk)->highest_sack; } static inline void tcp_highest_sack_reset(struct sock *sk) { tcp_sk(sk)->highest_sack = tcp_rtx_queue_head(sk); } /* Called when old skb is about to be deleted and replaced by new skb */ static inline void tcp_highest_sack_replace(struct sock *sk, struct sk_buff *old, struct sk_buff *new) { if (old == tcp_highest_sack(sk)) tcp_sk(sk)->highest_sack = new; } /* This helper checks if socket has IP_TRANSPARENT set */ static inline bool inet_sk_transparent(const struct sock *sk) { switch (sk->sk_state) { case TCP_TIME_WAIT: return inet_twsk(sk)->tw_transparent; case TCP_NEW_SYN_RECV: return inet_rsk(inet_reqsk(sk))->no_srccheck; } return inet_test_bit(TRANSPARENT, sk); } /* Determines whether this is a thin stream (which may suffer from * increased latency). Used to trigger latency-reducing mechanisms. */ static inline bool tcp_stream_is_thin(struct tcp_sock *tp) { return tp->packets_out < 4 && !tcp_in_initial_slowstart(tp); } /* /proc */ enum tcp_seq_states { TCP_SEQ_STATE_LISTENING, TCP_SEQ_STATE_ESTABLISHED, }; void *tcp_seq_start(struct seq_file *seq, loff_t *pos); void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos); void tcp_seq_stop(struct seq_file *seq, void *v); struct tcp_seq_afinfo { sa_family_t family; }; struct tcp_iter_state { struct seq_net_private p; enum tcp_seq_states state; struct sock *syn_wait_sk; int bucket, offset, sbucket, num; loff_t last_pos; }; extern struct request_sock_ops tcp_request_sock_ops; extern struct request_sock_ops tcp6_request_sock_ops; void tcp_v4_destroy_sock(struct sock *sk); struct sk_buff *tcp_gso_segment(struct sk_buff *skb, netdev_features_t features); struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb); struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th); struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb, struct tcphdr *th); INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)); INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb)); #ifdef CONFIG_INET void tcp_gro_complete(struct sk_buff *skb); #else static inline void tcp_gro_complete(struct sk_buff *skb) { } #endif void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr); static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); u32 val; val = READ_ONCE(tp->notsent_lowat); return val ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat); } bool tcp_stream_memory_free(const struct sock *sk, int wake); #ifdef CONFIG_PROC_FS int tcp4_proc_init(void); void tcp4_proc_exit(void); #endif int tcp_rtx_synack(const struct sock *sk, struct request_sock *req); int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb); /* TCP af-specific functions */ struct tcp_sock_af_ops { #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *(*md5_lookup) (const struct sock *sk, const struct sock *addr_sk); int (*calc_md5_hash)(char *location, const struct tcp_md5sig_key *md5, const struct sock *sk, const struct sk_buff *skb); int (*md5_parse)(struct sock *sk, int optname, sockptr_t optval, int optlen); #endif #ifdef CONFIG_TCP_AO int (*ao_parse)(struct sock *sk, int optname, sockptr_t optval, int optlen); struct tcp_ao_key *(*ao_lookup)(const struct sock *sk, struct sock *addr_sk, int sndid, int rcvid); int (*ao_calc_key_sk)(struct tcp_ao_key *mkt, u8 *key, const struct sock *sk, __be32 sisn, __be32 disn, bool send); int (*calc_ao_hash)(char *location, struct tcp_ao_key *ao, const struct sock *sk, const struct sk_buff *skb, const u8 *tkey, int hash_offset, u32 sne); #endif }; struct tcp_request_sock_ops { u16 mss_clamp; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *(*req_md5_lookup)(const struct sock *sk, const struct sock *addr_sk); int (*calc_md5_hash) (char *location, const struct tcp_md5sig_key *md5, const struct sock *sk, const struct sk_buff *skb); #endif #ifdef CONFIG_TCP_AO struct tcp_ao_key *(*ao_lookup)(const struct sock *sk, struct request_sock *req, int sndid, int rcvid); int (*ao_calc_key)(struct tcp_ao_key *mkt, u8 *key, struct request_sock *sk); int (*ao_synack_hash)(char *ao_hash, struct tcp_ao_key *mkt, struct request_sock *req, const struct sk_buff *skb, int hash_offset, u32 sne); #endif #ifdef CONFIG_SYN_COOKIES __u32 (*cookie_init_seq)(const struct sk_buff *skb, __u16 *mss); #endif struct dst_entry *(*route_req)(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, struct request_sock *req, u32 tw_isn); u32 (*init_seq)(const struct sk_buff *skb); u32 (*init_ts_off)(const struct net *net, const struct sk_buff *skb); int (*send_synack)(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb); }; extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops; #if IS_ENABLED(CONFIG_IPV6) extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops; #endif #ifdef CONFIG_SYN_COOKIES static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops, const struct sock *sk, struct sk_buff *skb, __u16 *mss) { tcp_synq_overflow(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); return ops->cookie_init_seq(skb, mss); } #else static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops, const struct sock *sk, struct sk_buff *skb, __u16 *mss) { return 0; } #endif struct tcp_key { union { struct { struct tcp_ao_key *ao_key; char *traffic_key; u32 sne; u8 rcv_next; }; struct tcp_md5sig_key *md5_key; }; enum { TCP_KEY_NONE = 0, TCP_KEY_MD5, TCP_KEY_AO, } type; }; static inline void tcp_get_current_key(const struct sock *sk, struct tcp_key *out) { #if defined(CONFIG_TCP_AO) || defined(CONFIG_TCP_MD5SIG) const struct tcp_sock *tp = tcp_sk(sk); #endif #ifdef CONFIG_TCP_AO if (static_branch_unlikely(&tcp_ao_needed.key)) { struct tcp_ao_info *ao; ao = rcu_dereference_protected(tp->ao_info, lockdep_sock_is_held(sk)); if (ao) { out->ao_key = READ_ONCE(ao->current_key); out->type = TCP_KEY_AO; return; } } #endif #ifdef CONFIG_TCP_MD5SIG if (static_branch_unlikely(&tcp_md5_needed.key) && rcu_access_pointer(tp->md5sig_info)) { out->md5_key = tp->af_specific->md5_lookup(sk, sk); if (out->md5_key) { out->type = TCP_KEY_MD5; return; } } #endif out->type = TCP_KEY_NONE; } static inline bool tcp_key_is_md5(const struct tcp_key *key) { if (static_branch_tcp_md5()) return key->type == TCP_KEY_MD5; return false; } static inline bool tcp_key_is_ao(const struct tcp_key *key) { if (static_branch_tcp_ao()) return key->type == TCP_KEY_AO; return false; } int tcpv4_offload_init(void); void tcp_v4_init(void); void tcp_init(void); /* tcp_recovery.c */ void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb); void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced); extern s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd); extern bool tcp_rack_mark_lost(struct sock *sk); extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, u64 xmit_time); extern void tcp_rack_reo_timeout(struct sock *sk); extern void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs); /* tcp_plb.c */ /* * Scaling factor for fractions in PLB. For example, tcp_plb_update_state * expects cong_ratio which represents fraction of traffic that experienced * congestion over a single RTT. In order to avoid floating point operations, * this fraction should be mapped to (1 << TCP_PLB_SCALE) and passed in. */ #define TCP_PLB_SCALE 8 /* State for PLB (Protective Load Balancing) for a single TCP connection. */ struct tcp_plb_state { u8 consec_cong_rounds:5, /* consecutive congested rounds */ unused:3; u32 pause_until; /* jiffies32 when PLB can resume rerouting */ }; static inline void tcp_plb_init(const struct sock *sk, struct tcp_plb_state *plb) { plb->consec_cong_rounds = 0; plb->pause_until = 0; } void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb, const int cong_ratio); void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb); void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb); static inline void tcp_warn_once(const struct sock *sk, bool cond, const char *str) { WARN_ONCE(cond, "%scwn:%u out:%u sacked:%u lost:%u retrans:%u tlp_high_seq:%u sk_state:%u ca_state:%u advmss:%u mss_cache:%u pmtu:%u\n", str, tcp_snd_cwnd(tcp_sk(sk)), tcp_sk(sk)->packets_out, tcp_sk(sk)->sacked_out, tcp_sk(sk)->lost_out, tcp_sk(sk)->retrans_out, tcp_sk(sk)->tlp_high_seq, sk->sk_state, inet_csk(sk)->icsk_ca_state, tcp_sk(sk)->advmss, tcp_sk(sk)->mss_cache, inet_csk(sk)->icsk_pmtu_cookie); } /* At how many usecs into the future should the RTO fire? */ static inline s64 tcp_rto_delta_us(const struct sock *sk) { const struct sk_buff *skb = tcp_rtx_queue_head(sk); u32 rto = inet_csk(sk)->icsk_rto; if (likely(skb)) { u64 rto_time_stamp_us = tcp_skb_timestamp_us(skb) + jiffies_to_usecs(rto); return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp; } else { tcp_warn_once(sk, 1, "rtx queue empty: "); return jiffies_to_usecs(rto); } } /* * Save and compile IPv4 options, return a pointer to it */ static inline struct ip_options_rcu *tcp_v4_save_options(struct net *net, struct sk_buff *skb) { const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; struct ip_options_rcu *dopt = NULL; if (opt->optlen) { int opt_size = sizeof(*dopt) + opt->optlen; dopt = kmalloc(opt_size, GFP_ATOMIC); if (dopt && __ip_options_echo(net, &dopt->opt, skb, opt)) { kfree(dopt); dopt = NULL; } } return dopt; } /* locally generated TCP pure ACKs have skb->truesize == 2 * (check tcp_send_ack() in net/ipv4/tcp_output.c ) * This is much faster than dissecting the packet to find out. * (Think of GRE encapsulations, IPv4, IPv6, ...) */ static inline bool skb_is_tcp_pure_ack(const struct sk_buff *skb) { return skb->truesize == 2; } static inline void skb_set_tcp_pure_ack(struct sk_buff *skb) { skb->truesize = 2; } static inline int tcp_inq(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); int answ; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { answ = 0; } else if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data || before(tp->urg_seq, tp->copied_seq) || !before(tp->urg_seq, tp->rcv_nxt)) { answ = tp->rcv_nxt - tp->copied_seq; /* Subtract 1, if FIN was received */ if (answ && sock_flag(sk, SOCK_DONE)) answ--; } else { answ = tp->urg_seq - tp->copied_seq; } return answ; } int tcp_peek_len(struct socket *sock); static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb) { u16 segs_in; segs_in = max_t(u16, 1, skb_shinfo(skb)->gso_segs); /* We update these fields while other threads might * read them from tcp_get_info() */ WRITE_ONCE(tp->segs_in, tp->segs_in + segs_in); if (skb->len > tcp_hdrlen(skb)) WRITE_ONCE(tp->data_segs_in, tp->data_segs_in + segs_in); } /* * TCP listen path runs lockless. * We forced "struct sock" to be const qualified to make sure * we don't modify one of its field by mistake. * Here, we increment sk_drops which is an atomic_t, so we can safely * make sock writable again. */ static inline void tcp_listendrop(const struct sock *sk) { atomic_inc(&((struct sock *)sk)->sk_drops); __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); } enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer); /* * Interface for adding Upper Level Protocols over TCP */ #define TCP_ULP_NAME_MAX 16 #define TCP_ULP_MAX 128 #define TCP_ULP_BUF_MAX (TCP_ULP_NAME_MAX*TCP_ULP_MAX) struct tcp_ulp_ops { struct list_head list; /* initialize ulp */ int (*init)(struct sock *sk); /* update ulp */ void (*update)(struct sock *sk, struct proto *p, void (*write_space)(struct sock *sk)); /* cleanup ulp */ void (*release)(struct sock *sk); /* diagnostic */ int (*get_info)(struct sock *sk, struct sk_buff *skb, bool net_admin); size_t (*get_info_size)(const struct sock *sk, bool net_admin); /* clone ulp */ void (*clone)(const struct request_sock *req, struct sock *newsk, const gfp_t priority); char name[TCP_ULP_NAME_MAX]; struct module *owner; }; int tcp_register_ulp(struct tcp_ulp_ops *type); void tcp_unregister_ulp(struct tcp_ulp_ops *type); int tcp_set_ulp(struct sock *sk, const char *name); void tcp_get_available_ulp(char *buf, size_t len); void tcp_cleanup_ulp(struct sock *sk); void tcp_update_ulp(struct sock *sk, struct proto *p, void (*write_space)(struct sock *sk)); #define MODULE_ALIAS_TCP_ULP(name) \ MODULE_INFO(alias, name); \ MODULE_INFO(alias, "tcp-ulp-" name) #ifdef CONFIG_NET_SOCK_MSG struct sk_msg; struct sk_psock; #ifdef CONFIG_BPF_SYSCALL int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); #ifdef CONFIG_BPF_STREAM_PARSER struct strparser; int tcp_bpf_strp_read_sock(struct strparser *strp, read_descriptor_t *desc, sk_read_actor_t recv_actor); #endif /* CONFIG_BPF_STREAM_PARSER */ #endif /* CONFIG_BPF_SYSCALL */ #ifdef CONFIG_INET void tcp_eat_skb(struct sock *sk, struct sk_buff *skb); #else static inline void tcp_eat_skb(struct sock *sk, struct sk_buff *skb) { } #endif int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress, struct sk_msg *msg, u32 bytes, int flags); #endif /* CONFIG_NET_SOCK_MSG */ #if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG) static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) { } #endif #ifdef CONFIG_CGROUP_BPF static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, struct sk_buff *skb, unsigned int end_offset) { skops->skb = skb; skops->skb_data_end = skb->data + end_offset; } #else static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, struct sk_buff *skb, unsigned int end_offset) { } #endif /* Call BPF_SOCK_OPS program that returns an int. If the return value * is < 0, then the BPF op failed (for example if the loaded BPF * program does not support the chosen operation or there is no BPF * program loaded). */ #ifdef CONFIG_BPF static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) { struct bpf_sock_ops_kern sock_ops; int ret; memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); if (sk_fullsock(sk)) { sock_ops.is_fullsock = 1; sock_ops.is_locked_tcp_sock = 1; sock_owned_by_me(sk); } sock_ops.sk = sk; sock_ops.op = op; if (nargs > 0) memcpy(sock_ops.args, args, nargs * sizeof(*args)); ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); if (ret == 0) ret = sock_ops.reply; else ret = -1; return ret; } static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2) { u32 args[2] = {arg1, arg2}; return tcp_call_bpf(sk, op, 2, args); } static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2, u32 arg3) { u32 args[3] = {arg1, arg2, arg3}; return tcp_call_bpf(sk, op, 3, args); } #else static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) { return -EPERM; } static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2) { return -EPERM; } static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2, u32 arg3) { return -EPERM; } #endif static inline u32 tcp_timeout_init(struct sock *sk) { int timeout; timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT, 0, NULL); if (timeout <= 0) timeout = TCP_TIMEOUT_INIT; return min_t(int, timeout, TCP_RTO_MAX); } static inline u32 tcp_rwnd_init_bpf(struct sock *sk) { int rwnd; rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT, 0, NULL); if (rwnd < 0) rwnd = 0; return rwnd; } static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) { return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1); } static inline void tcp_bpf_rtt(struct sock *sk, long mrtt, u32 srtt) { if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_RTT_CB_FLAG)) tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_RTT_CB, mrtt, srtt); } #if IS_ENABLED(CONFIG_SMC) extern struct static_key_false tcp_have_smc; #endif #if IS_ENABLED(CONFIG_TLS_DEVICE) void clean_acked_data_enable(struct tcp_sock *tp, void (*cad)(struct sock *sk, u32 ack_seq)); void clean_acked_data_disable(struct tcp_sock *tp); void clean_acked_data_flush(void); #endif DECLARE_STATIC_KEY_FALSE(tcp_tx_delay_enabled); static inline void tcp_add_tx_delay(struct sk_buff *skb, const struct tcp_sock *tp) { if (static_branch_unlikely(&tcp_tx_delay_enabled)) skb->skb_mstamp_ns += (u64)tp->tcp_tx_delay * NSEC_PER_USEC; } /* Compute Earliest Departure Time for some control packets * like ACK or RST for TIME_WAIT or non ESTABLISHED sockets. */ static inline u64 tcp_transmit_time(const struct sock *sk) { if (static_branch_unlikely(&tcp_tx_delay_enabled)) { u32 delay = (sk->sk_state == TCP_TIME_WAIT) ? tcp_twsk(sk)->tw_tx_delay : tcp_sk(sk)->tcp_tx_delay; return tcp_clock_ns() + (u64)delay * NSEC_PER_USEC; } return 0; } static inline int tcp_parse_auth_options(const struct tcphdr *th, const u8 **md5_hash, const struct tcp_ao_hdr **aoh) { const u8 *md5_tmp, *ao_tmp; int ret; ret = tcp_do_parse_auth_options(th, &md5_tmp, &ao_tmp); if (ret) return ret; if (md5_hash) *md5_hash = md5_tmp; if (aoh) { if (!ao_tmp) *aoh = NULL; else *aoh = (struct tcp_ao_hdr *)(ao_tmp - 2); } return 0; } static inline bool tcp_ao_required(struct sock *sk, const void *saddr, int family, int l3index, bool stat_inc) { #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao_info; struct tcp_ao_key *ao_key; if (!static_branch_unlikely(&tcp_ao_needed.key)) return false; ao_info = rcu_dereference_check(tcp_sk(sk)->ao_info, lockdep_sock_is_held(sk)); if (!ao_info) return false; ao_key = tcp_ao_do_lookup(sk, l3index, saddr, family, -1, -1); if (ao_info->ao_required || ao_key) { if (stat_inc) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOREQUIRED); atomic64_inc(&ao_info->counters.ao_required); } return true; } #endif return false; } enum skb_drop_reason tcp_inbound_hash(struct sock *sk, const struct request_sock *req, const struct sk_buff *skb, const void *saddr, const void *daddr, int family, int dif, int sdif); #endif /* _TCP_H */ |
| 22 23 23 23 23 12 12 12 12 3 12 12 12 5 12 164 165 161 165 395 162 391 19 389 10 165 167 162 166 23 165 167 154 16 18 108 108 108 253 108 108 106 108 108 107 339 395 393 395 401 342 342 64 385 5 396 389 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 | #include <linux/gfp.h> #include <linux/highmem.h> #include <linux/kernel.h> #include <linux/mmdebug.h> #include <linux/mm_types.h> #include <linux/mm_inline.h> #include <linux/pagemap.h> #include <linux/rcupdate.h> #include <linux/smp.h> #include <linux/swap.h> #include <linux/rmap.h> #include <asm/pgalloc.h> #include <asm/tlb.h> #ifndef CONFIG_MMU_GATHER_NO_GATHER static bool tlb_next_batch(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; /* Limit batching if we have delayed rmaps pending */ if (tlb->delayed_rmap && tlb->active != &tlb->local) return false; batch = tlb->active; if (batch->next) { tlb->active = batch->next; return true; } if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) return false; batch = (void *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); if (!batch) return false; tlb->batch_count++; batch->next = NULL; batch->nr = 0; batch->max = MAX_GATHER_BATCH; tlb->active->next = batch; tlb->active = batch; return true; } #ifdef CONFIG_SMP static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma) { struct encoded_page **pages = batch->encoded_pages; for (int i = 0; i < batch->nr; i++) { struct encoded_page *enc = pages[i]; if (encoded_page_flags(enc) & ENCODED_PAGE_BIT_DELAY_RMAP) { struct page *page = encoded_page_ptr(enc); unsigned int nr_pages = 1; if (unlikely(encoded_page_flags(enc) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) nr_pages = encoded_nr_pages(pages[++i]); folio_remove_rmap_ptes(page_folio(page), page, nr_pages, vma); } } } /** * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB * @tlb: the current mmu_gather * @vma: The memory area from which the pages are being removed. * * Note that because of how tlb_next_batch() above works, we will * never start multiple new batches with pending delayed rmaps, so * we only need to walk through the current active batch and the * original local one. */ void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) { if (!tlb->delayed_rmap) return; tlb_flush_rmap_batch(&tlb->local, vma); if (tlb->active != &tlb->local) tlb_flush_rmap_batch(tlb->active, vma); tlb->delayed_rmap = 0; } #endif /* * We might end up freeing a lot of pages. Reschedule on a regular * basis to avoid soft lockups in configurations without full * preemption enabled. The magic number of 512 folios seems to work. */ #define MAX_NR_FOLIOS_PER_FREE 512 static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch) { struct encoded_page **pages = batch->encoded_pages; unsigned int nr, nr_pages; while (batch->nr) { if (!page_poisoning_enabled_static() && !want_init_on_free()) { nr = min(MAX_NR_FOLIOS_PER_FREE, batch->nr); /* * Make sure we cover page + nr_pages, and don't leave * nr_pages behind when capping the number of entries. */ if (unlikely(encoded_page_flags(pages[nr - 1]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) nr++; } else { /* * With page poisoning and init_on_free, the time it * takes to free memory grows proportionally with the * actual memory size. Therefore, limit based on the * actual memory size and not the number of involved * folios. */ for (nr = 0, nr_pages = 0; nr < batch->nr && nr_pages < MAX_NR_FOLIOS_PER_FREE; nr++) { if (unlikely(encoded_page_flags(pages[nr]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) nr_pages += encoded_nr_pages(pages[++nr]); else nr_pages++; } } free_pages_and_swap_cache(pages, nr); pages += nr; batch->nr -= nr; cond_resched(); } } static void tlb_batch_pages_flush(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; for (batch = &tlb->local; batch && batch->nr; batch = batch->next) __tlb_batch_free_encoded_pages(batch); tlb->active = &tlb->local; } static void tlb_batch_list_free(struct mmu_gather *tlb) { struct mmu_gather_batch *batch, *next; for (batch = tlb->local.next; batch; batch = next) { next = batch->next; free_pages((unsigned long)batch, 0); } tlb->local.next = NULL; } static bool __tlb_remove_folio_pages_size(struct mmu_gather *tlb, struct page *page, unsigned int nr_pages, bool delay_rmap, int page_size) { int flags = delay_rmap ? ENCODED_PAGE_BIT_DELAY_RMAP : 0; struct mmu_gather_batch *batch; VM_BUG_ON(!tlb->end); #ifdef CONFIG_MMU_GATHER_PAGE_SIZE VM_WARN_ON(tlb->page_size != page_size); VM_WARN_ON_ONCE(nr_pages != 1 && page_size != PAGE_SIZE); VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1)); #endif batch = tlb->active; /* * Add the page and check if we are full. If so * force a flush. */ if (likely(nr_pages == 1)) { batch->encoded_pages[batch->nr++] = encode_page(page, flags); } else { flags |= ENCODED_PAGE_BIT_NR_PAGES_NEXT; batch->encoded_pages[batch->nr++] = encode_page(page, flags); batch->encoded_pages[batch->nr++] = encode_nr_pages(nr_pages); } /* * Make sure that we can always add another "page" + "nr_pages", * requiring two entries instead of only a single one. */ if (batch->nr >= batch->max - 1) { if (!tlb_next_batch(tlb)) return true; batch = tlb->active; } VM_BUG_ON_PAGE(batch->nr > batch->max - 1, page); return false; } bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, unsigned int nr_pages, bool delay_rmap) { return __tlb_remove_folio_pages_size(tlb, page, nr_pages, delay_rmap, PAGE_SIZE); } bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, bool delay_rmap, int page_size) { return __tlb_remove_folio_pages_size(tlb, page, 1, delay_rmap, page_size); } #endif /* MMU_GATHER_NO_GATHER */ #ifdef CONFIG_MMU_GATHER_TABLE_FREE static void __tlb_remove_table_free(struct mmu_table_batch *batch) { int i; for (i = 0; i < batch->nr; i++) __tlb_remove_table(batch->tables[i]); free_page((unsigned long)batch); } #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE /* * Semi RCU freeing of the page directories. * * This is needed by some architectures to implement software pagetable walkers. * * gup_fast() and other software pagetable walkers do a lockless page-table * walk and therefore needs some synchronization with the freeing of the page * directories. The chosen means to accomplish that is by disabling IRQs over * the walk. * * Architectures that use IPIs to flush TLBs will then automagically DTRT, * since we unlink the page, flush TLBs, free the page. Since the disabling of * IRQs delays the completion of the TLB flush we can never observe an already * freed page. * * Not all systems IPI every CPU for this purpose: * * - Some architectures have HW support for cross-CPU synchronisation of TLB * flushes, so there's no IPI at all. * * - Paravirt guests can do this TLB flushing in the hypervisor, or coordinate * with the hypervisor to defer flushing on preempted vCPUs. * * Such systems need to delay the freeing by some other means, this is that * means. * * What we do is batch the freed directory pages (tables) and RCU free them. * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling * holds off grace periods. * * However, in order to batch these pages we need to allocate storage, this * allocation is deep inside the MM code and can thus easily fail on memory * pressure. To guarantee progress we fall back to single table freeing, see * the implementation of tlb_remove_table_one(). * */ static void tlb_remove_table_smp_sync(void *arg) { /* Simply deliver the interrupt */ } void tlb_remove_table_sync_one(void) { /* * This isn't an RCU grace period and hence the page-tables cannot be * assumed to be actually RCU-freed. * * It is however sufficient for software page-table walkers that rely on * IRQ disabling. */ smp_call_function(tlb_remove_table_smp_sync, NULL, 1); } static void tlb_remove_table_rcu(struct rcu_head *head) { __tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu)); } static void tlb_remove_table_free(struct mmu_table_batch *batch) { call_rcu(&batch->rcu, tlb_remove_table_rcu); } #else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */ static void tlb_remove_table_free(struct mmu_table_batch *batch) { __tlb_remove_table_free(batch); } #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */ /* * If we want tlb_remove_table() to imply TLB invalidates. */ static inline void tlb_table_invalidate(struct mmu_gather *tlb) { if (tlb_needs_table_invalidate()) { /* * Invalidate page-table caches used by hardware walkers. Then * we still need to RCU-sched wait while freeing the pages * because software walkers can still be in-flight. */ tlb_flush_mmu_tlbonly(tlb); } } #ifdef CONFIG_PT_RECLAIM static inline void __tlb_remove_table_one_rcu(struct rcu_head *head) { struct ptdesc *ptdesc; ptdesc = container_of(head, struct ptdesc, pt_rcu_head); __tlb_remove_table(ptdesc); } static inline void __tlb_remove_table_one(void *table) { struct ptdesc *ptdesc; ptdesc = table; call_rcu(&ptdesc->pt_rcu_head, __tlb_remove_table_one_rcu); } #else static inline void __tlb_remove_table_one(void *table) { tlb_remove_table_sync_one(); __tlb_remove_table(table); } #endif /* CONFIG_PT_RECLAIM */ static void tlb_remove_table_one(void *table) { __tlb_remove_table_one(table); } static void tlb_table_flush(struct mmu_gather *tlb) { struct mmu_table_batch **batch = &tlb->batch; if (*batch) { tlb_table_invalidate(tlb); tlb_remove_table_free(*batch); *batch = NULL; } } void tlb_remove_table(struct mmu_gather *tlb, void *table) { struct mmu_table_batch **batch = &tlb->batch; if (*batch == NULL) { *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); if (*batch == NULL) { tlb_table_invalidate(tlb); tlb_remove_table_one(table); return; } (*batch)->nr = 0; } (*batch)->tables[(*batch)->nr++] = table; if ((*batch)->nr == MAX_TABLE_BATCH) tlb_table_flush(tlb); } static inline void tlb_table_init(struct mmu_gather *tlb) { tlb->batch = NULL; } #else /* !CONFIG_MMU_GATHER_TABLE_FREE */ static inline void tlb_table_flush(struct mmu_gather *tlb) { } static inline void tlb_table_init(struct mmu_gather *tlb) { } #endif /* CONFIG_MMU_GATHER_TABLE_FREE */ static void tlb_flush_mmu_free(struct mmu_gather *tlb) { tlb_table_flush(tlb); #ifndef CONFIG_MMU_GATHER_NO_GATHER tlb_batch_pages_flush(tlb); #endif } void tlb_flush_mmu(struct mmu_gather *tlb) { tlb_flush_mmu_tlbonly(tlb); tlb_flush_mmu_free(tlb); } static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) { tlb->mm = mm; tlb->fullmm = fullmm; #ifndef CONFIG_MMU_GATHER_NO_GATHER tlb->need_flush_all = 0; tlb->local.next = NULL; tlb->local.nr = 0; tlb->local.max = ARRAY_SIZE(tlb->__pages); tlb->active = &tlb->local; tlb->batch_count = 0; #endif tlb->delayed_rmap = 0; tlb_table_init(tlb); #ifdef CONFIG_MMU_GATHER_PAGE_SIZE tlb->page_size = 0; #endif tlb->vma_pfn = 0; __tlb_reset_range(tlb); inc_tlb_flush_pending(tlb->mm); } /** * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down * @tlb: the mmu_gather structure to initialize * @mm: the mm_struct of the target address space * * Called to initialize an (on-stack) mmu_gather structure for page-table * tear-down from @mm. */ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm) { __tlb_gather_mmu(tlb, mm, false); } /** * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down * @tlb: the mmu_gather structure to initialize * @mm: the mm_struct of the target address space * * In this case, @mm is without users and we're going to destroy the * full address space (exit/execve). * * Called to initialize an (on-stack) mmu_gather structure for page-table * tear-down from @mm. */ void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm) { __tlb_gather_mmu(tlb, mm, true); } /** * tlb_finish_mmu - finish an mmu_gather structure * @tlb: the mmu_gather structure to finish * * Called at the end of the shootdown operation to free up any resources that * were required. */ void tlb_finish_mmu(struct mmu_gather *tlb) { /* * If there are parallel threads are doing PTE changes on same range * under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB * flush by batching, one thread may end up seeing inconsistent PTEs * and result in having stale TLB entries. So flush TLB forcefully * if we detect parallel PTE batching threads. * * However, some syscalls, e.g. munmap(), may free page tables, this * needs force flush everything in the given range. Otherwise this * may result in having stale TLB entries for some architectures, * e.g. aarch64, that could specify flush what level TLB. */ if (mm_tlb_flush_nested(tlb->mm)) { /* * The aarch64 yields better performance with fullmm by * avoiding multiple CPUs spamming TLBI messages at the * same time. * * On x86 non-fullmm doesn't yield significant difference * against fullmm. */ tlb->fullmm = 1; __tlb_reset_range(tlb); tlb->freed_tables = 1; } tlb_flush_mmu(tlb); #ifndef CONFIG_MMU_GATHER_NO_GATHER tlb_batch_list_free(tlb); #endif dec_tlb_flush_pending(tlb->mm); } |
| 3 3 3 3 3 3 3 3 2 1 3 706 701 9 9 704 3 3 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 | // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ /* Copyright (c) 2008-2019, IBM Corporation */ #include <linux/init.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <net/net_namespace.h> #include <linux/rtnetlink.h> #include <linux/if_arp.h> #include <linux/list.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/dma-mapping.h> #include <net/addrconf.h> #include <rdma/ib_verbs.h> #include <rdma/ib_user_verbs.h> #include <rdma/rdma_netlink.h> #include <linux/kthread.h> #include "siw.h" #include "siw_verbs.h" MODULE_AUTHOR("Bernard Metzler"); MODULE_DESCRIPTION("Software iWARP Driver"); MODULE_LICENSE("Dual BSD/GPL"); /* transmit from user buffer, if possible */ const bool zcopy_tx = true; /* Restrict usage of GSO, if hardware peer iwarp is unable to process * large packets. try_gso = true lets siw try to use local GSO, * if peer agrees. Not using GSO severly limits siw maximum tx bandwidth. */ const bool try_gso; /* Attach siw also with loopback devices */ const bool loopback_enabled = true; /* We try to negotiate CRC on, if true */ const bool mpa_crc_required; /* MPA CRC on/off enforced */ const bool mpa_crc_strict; /* Control TCP_NODELAY socket option */ const bool siw_tcp_nagle; /* Select MPA version to be used during connection setup */ u_char mpa_version = MPA_REVISION_2; /* Selects MPA P2P mode (additional handshake during connection * setup, if true. */ const bool peer_to_peer; struct task_struct *siw_tx_thread[NR_CPUS]; static int siw_device_register(struct siw_device *sdev, const char *name) { struct ib_device *base_dev = &sdev->base_dev; static int dev_id = 1; int rv; sdev->vendor_part_id = dev_id++; rv = ib_register_device(base_dev, name, NULL); if (rv) { pr_warn("siw: device registration error %d\n", rv); return rv; } siw_dbg(base_dev, "HWaddr=%pM\n", sdev->raw_gid); return 0; } static void siw_device_cleanup(struct ib_device *base_dev) { struct siw_device *sdev = to_siw_dev(base_dev); xa_destroy(&sdev->qp_xa); xa_destroy(&sdev->mem_xa); } static int siw_dev_qualified(struct net_device *netdev) { /* * Additional hardware support can be added here * (e.g. ARPHRD_FDDI, ARPHRD_ATM, ...) - see * <linux/if_arp.h> for type identifiers. */ if (netdev->type == ARPHRD_ETHER || netdev->type == ARPHRD_IEEE802 || netdev->type == ARPHRD_NONE || (netdev->type == ARPHRD_LOOPBACK && loopback_enabled)) return 1; return 0; } static DEFINE_PER_CPU(atomic_t, siw_use_cnt); static struct { struct cpumask **tx_valid_cpus; int num_nodes; } siw_cpu_info; static void siw_destroy_cpulist(int number) { int i = 0; while (i < number) kfree(siw_cpu_info.tx_valid_cpus[i++]); kfree(siw_cpu_info.tx_valid_cpus); siw_cpu_info.tx_valid_cpus = NULL; } static int siw_init_cpulist(void) { int i, num_nodes = nr_node_ids; memset(siw_tx_thread, 0, sizeof(siw_tx_thread)); siw_cpu_info.num_nodes = num_nodes; siw_cpu_info.tx_valid_cpus = kcalloc(num_nodes, sizeof(struct cpumask *), GFP_KERNEL); if (!siw_cpu_info.tx_valid_cpus) { siw_cpu_info.num_nodes = 0; return -ENOMEM; } for (i = 0; i < siw_cpu_info.num_nodes; i++) { siw_cpu_info.tx_valid_cpus[i] = kzalloc(sizeof(struct cpumask), GFP_KERNEL); if (!siw_cpu_info.tx_valid_cpus[i]) goto out_err; cpumask_clear(siw_cpu_info.tx_valid_cpus[i]); } for_each_possible_cpu(i) cpumask_set_cpu(i, siw_cpu_info.tx_valid_cpus[cpu_to_node(i)]); return 0; out_err: siw_cpu_info.num_nodes = 0; siw_destroy_cpulist(i); return -ENOMEM; } /* * Choose CPU with least number of active QP's from NUMA node of * TX interface. */ int siw_get_tx_cpu(struct siw_device *sdev) { const struct cpumask *tx_cpumask; int i, num_cpus, cpu, min_use, node = sdev->numa_node, tx_cpu = -1; if (node < 0) tx_cpumask = cpu_online_mask; else tx_cpumask = siw_cpu_info.tx_valid_cpus[node]; num_cpus = cpumask_weight(tx_cpumask); if (!num_cpus) { /* no CPU on this NUMA node */ tx_cpumask = cpu_online_mask; num_cpus = cpumask_weight(tx_cpumask); } if (!num_cpus) goto out; cpu = cpumask_first(tx_cpumask); for (i = 0, min_use = SIW_MAX_QP; i < num_cpus; i++, cpu = cpumask_next(cpu, tx_cpumask)) { int usage; /* Skip any cores which have no TX thread */ if (!siw_tx_thread[cpu]) continue; usage = atomic_read(&per_cpu(siw_use_cnt, cpu)); if (usage <= min_use) { tx_cpu = cpu; min_use = usage; } } siw_dbg(&sdev->base_dev, "tx cpu %d, node %d, %d qp's\n", tx_cpu, node, min_use); out: if (tx_cpu >= 0) atomic_inc(&per_cpu(siw_use_cnt, tx_cpu)); else pr_warn("siw: no tx cpu found\n"); return tx_cpu; } void siw_put_tx_cpu(int cpu) { atomic_dec(&per_cpu(siw_use_cnt, cpu)); } static struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id) { struct siw_qp *qp = siw_qp_id2obj(to_siw_dev(base_dev), id); if (qp) { /* * siw_qp_id2obj() increments object reference count */ siw_qp_put(qp); return &qp->base_qp; } return NULL; } static const struct ib_device_ops siw_device_ops = { .owner = THIS_MODULE, .uverbs_abi_ver = SIW_ABI_VERSION, .driver_id = RDMA_DRIVER_SIW, .alloc_mr = siw_alloc_mr, .alloc_pd = siw_alloc_pd, .alloc_ucontext = siw_alloc_ucontext, .create_cq = siw_create_cq, .create_qp = siw_create_qp, .create_srq = siw_create_srq, .dealloc_driver = siw_device_cleanup, .dealloc_pd = siw_dealloc_pd, .dealloc_ucontext = siw_dealloc_ucontext, .dereg_mr = siw_dereg_mr, .destroy_cq = siw_destroy_cq, .destroy_qp = siw_destroy_qp, .destroy_srq = siw_destroy_srq, .get_dma_mr = siw_get_dma_mr, .get_port_immutable = siw_get_port_immutable, .iw_accept = siw_accept, .iw_add_ref = siw_qp_get_ref, .iw_connect = siw_connect, .iw_create_listen = siw_create_listen, .iw_destroy_listen = siw_destroy_listen, .iw_get_qp = siw_get_base_qp, .iw_reject = siw_reject, .iw_rem_ref = siw_qp_put_ref, .map_mr_sg = siw_map_mr_sg, .mmap = siw_mmap, .mmap_free = siw_mmap_free, .modify_qp = siw_verbs_modify_qp, .modify_srq = siw_modify_srq, .poll_cq = siw_poll_cq, .post_recv = siw_post_receive, .post_send = siw_post_send, .post_srq_recv = siw_post_srq_recv, .query_device = siw_query_device, .query_gid = siw_query_gid, .query_port = siw_query_port, .query_qp = siw_query_qp, .query_srq = siw_query_srq, .req_notify_cq = siw_req_notify_cq, .reg_user_mr = siw_reg_user_mr, INIT_RDMA_OBJ_SIZE(ib_cq, siw_cq, base_cq), INIT_RDMA_OBJ_SIZE(ib_pd, siw_pd, base_pd), INIT_RDMA_OBJ_SIZE(ib_qp, siw_qp, base_qp), INIT_RDMA_OBJ_SIZE(ib_srq, siw_srq, base_srq), INIT_RDMA_OBJ_SIZE(ib_ucontext, siw_ucontext, base_ucontext), }; static struct siw_device *siw_device_create(struct net_device *netdev) { struct siw_device *sdev = NULL; struct ib_device *base_dev; int rv; sdev = ib_alloc_device(siw_device, base_dev); if (!sdev) return NULL; base_dev = &sdev->base_dev; if (netdev->addr_len) { memcpy(sdev->raw_gid, netdev->dev_addr, min_t(unsigned int, netdev->addr_len, ETH_ALEN)); } else { /* * This device does not have a HW address, but * connection mangagement requires a unique gid. */ eth_random_addr(sdev->raw_gid); } addrconf_addr_eui48((u8 *)&base_dev->node_guid, sdev->raw_gid); base_dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND); base_dev->node_type = RDMA_NODE_RNIC; memcpy(base_dev->node_desc, SIW_NODE_DESC_COMMON, sizeof(SIW_NODE_DESC_COMMON)); /* * Current model (one-to-one device association): * One Softiwarp device per net_device or, equivalently, * per physical port. */ base_dev->phys_port_cnt = 1; base_dev->num_comp_vectors = num_possible_cpus(); xa_init_flags(&sdev->qp_xa, XA_FLAGS_ALLOC1); xa_init_flags(&sdev->mem_xa, XA_FLAGS_ALLOC1); ib_set_device_ops(base_dev, &siw_device_ops); rv = ib_device_set_netdev(base_dev, netdev, 1); if (rv) goto error; memcpy(base_dev->iw_ifname, netdev->name, sizeof(base_dev->iw_ifname)); /* Disable TCP port mapping */ base_dev->iw_driver_flags = IW_F_NO_PORT_MAP; sdev->attrs.max_qp = SIW_MAX_QP; sdev->attrs.max_qp_wr = SIW_MAX_QP_WR; sdev->attrs.max_ord = SIW_MAX_ORD_QP; sdev->attrs.max_ird = SIW_MAX_IRD_QP; sdev->attrs.max_sge = SIW_MAX_SGE; sdev->attrs.max_sge_rd = SIW_MAX_SGE_RD; sdev->attrs.max_cq = SIW_MAX_CQ; sdev->attrs.max_cqe = SIW_MAX_CQE; sdev->attrs.max_mr = SIW_MAX_MR; sdev->attrs.max_pd = SIW_MAX_PD; sdev->attrs.max_mw = SIW_MAX_MW; sdev->attrs.max_srq = SIW_MAX_SRQ; sdev->attrs.max_srq_wr = SIW_MAX_SRQ_WR; sdev->attrs.max_srq_sge = SIW_MAX_SGE; INIT_LIST_HEAD(&sdev->cep_list); INIT_LIST_HEAD(&sdev->qp_list); atomic_set(&sdev->num_ctx, 0); atomic_set(&sdev->num_srq, 0); atomic_set(&sdev->num_qp, 0); atomic_set(&sdev->num_cq, 0); atomic_set(&sdev->num_mr, 0); atomic_set(&sdev->num_pd, 0); sdev->numa_node = dev_to_node(&netdev->dev); spin_lock_init(&sdev->lock); return sdev; error: ib_dealloc_device(base_dev); return NULL; } static int siw_netdev_event(struct notifier_block *nb, unsigned long event, void *arg) { struct net_device *netdev = netdev_notifier_info_to_dev(arg); struct ib_device *base_dev; struct siw_device *sdev; dev_dbg(&netdev->dev, "siw: event %lu\n", event); base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW); if (!base_dev) return NOTIFY_OK; sdev = to_siw_dev(base_dev); switch (event) { case NETDEV_REGISTER: /* * Device registration now handled only by * rdma netlink commands. So it shall be impossible * to end up here with a valid siw device. */ siw_dbg(base_dev, "unexpected NETDEV_REGISTER event\n"); break; case NETDEV_UNREGISTER: ib_unregister_device_queued(&sdev->base_dev); break; case NETDEV_CHANGEADDR: siw_port_event(sdev, 1, IB_EVENT_LID_CHANGE); break; /* * All other events are not handled */ default: break; } ib_device_put(&sdev->base_dev); return NOTIFY_OK; } static struct notifier_block siw_netdev_nb = { .notifier_call = siw_netdev_event, }; static int siw_newlink(const char *basedev_name, struct net_device *netdev) { struct ib_device *base_dev; struct siw_device *sdev = NULL; int rv = -ENOMEM; if (!siw_dev_qualified(netdev)) return -EINVAL; base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW); if (base_dev) { ib_device_put(base_dev); return -EEXIST; } sdev = siw_device_create(netdev); if (sdev) { dev_dbg(&netdev->dev, "siw: new device\n"); ib_mark_name_assigned_by_user(&sdev->base_dev); rv = siw_device_register(sdev, basedev_name); if (rv) ib_dealloc_device(&sdev->base_dev); } return rv; } static struct rdma_link_ops siw_link_ops = { .type = "siw", .newlink = siw_newlink, }; /* * siw_init_module - Initialize Softiwarp module and register with netdev * subsystem. */ static __init int siw_init_module(void) { int rv; if (SENDPAGE_THRESH < SIW_MAX_INLINE) { pr_info("siw: sendpage threshold too small: %u\n", (int)SENDPAGE_THRESH); rv = -EINVAL; goto out_error; } rv = siw_init_cpulist(); if (rv) goto out_error; rv = siw_cm_init(); if (rv) goto out_error; if (!siw_create_tx_threads()) { pr_info("siw: Could not start any TX thread\n"); rv = -ENOMEM; goto out_error; } rv = register_netdevice_notifier(&siw_netdev_nb); if (rv) goto out_error; rdma_link_register(&siw_link_ops); pr_info("SoftiWARP attached\n"); return 0; out_error: siw_stop_tx_threads(); pr_info("SoftIWARP attach failed. Error: %d\n", rv); siw_cm_exit(); siw_destroy_cpulist(siw_cpu_info.num_nodes); return rv; } static void __exit siw_exit_module(void) { siw_stop_tx_threads(); unregister_netdevice_notifier(&siw_netdev_nb); rdma_link_unregister(&siw_link_ops); ib_unregister_driver(RDMA_DRIVER_SIW); siw_cm_exit(); siw_destroy_cpulist(siw_cpu_info.num_nodes); pr_info("SoftiWARP detached\n"); } module_init(siw_init_module); module_exit(siw_exit_module); MODULE_ALIAS_RDMA_LINK("siw"); |
| 2 2 2 2 2 2 2 2 2 1 1 2 1 1 2 2 2 2 2 1 2 2 2 2 2 2 1 1 8 1 1 7 1 1 6 1 1 5 1 1 4 4 4 8 4 1 4 4 4 4 4 4 4 4 4 4 4 4 4 4 1 3 1 2 1 1 1 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 | // SPDX-License-Identifier: GPL-2.0-only #include <net/netdev_queues.h> #include "netlink.h" #include "common.h" struct rings_req_info { struct ethnl_req_info base; }; struct rings_reply_data { struct ethnl_reply_data base; struct ethtool_ringparam ringparam; struct kernel_ethtool_ringparam kernel_ringparam; u32 supported_ring_params; }; #define RINGS_REPDATA(__reply_base) \ container_of(__reply_base, struct rings_reply_data, base) const struct nla_policy ethnl_rings_get_policy[] = { [ETHTOOL_A_RINGS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int rings_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct rings_reply_data *data = RINGS_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; if (!dev->ethtool_ops->get_ringparam) return -EOPNOTSUPP; data->supported_ring_params = dev->ethtool_ops->supported_ring_params; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; data->kernel_ringparam.tcp_data_split = dev->cfg->hds_config; data->kernel_ringparam.hds_thresh = dev->cfg->hds_thresh; dev->ethtool_ops->get_ringparam(dev, &data->ringparam, &data->kernel_ringparam, info->extack); ethnl_ops_complete(dev); return 0; } static int rings_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { return nla_total_size(sizeof(u32)) + /* _RINGS_RX_MAX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_MINI_MAX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_JUMBO_MAX */ nla_total_size(sizeof(u32)) + /* _RINGS_TX_MAX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_MINI */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_JUMBO */ nla_total_size(sizeof(u32)) + /* _RINGS_TX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_BUF_LEN */ nla_total_size(sizeof(u8)) + /* _RINGS_TCP_DATA_SPLIT */ nla_total_size(sizeof(u32) + /* _RINGS_CQE_SIZE */ nla_total_size(sizeof(u8)) + /* _RINGS_TX_PUSH */ nla_total_size(sizeof(u8))) + /* _RINGS_RX_PUSH */ nla_total_size(sizeof(u32)) + /* _RINGS_TX_PUSH_BUF_LEN */ nla_total_size(sizeof(u32)) + /* _RINGS_TX_PUSH_BUF_LEN_MAX */ nla_total_size(sizeof(u32)) + /* _RINGS_HDS_THRESH */ nla_total_size(sizeof(u32)); /* _RINGS_HDS_THRESH_MAX*/ } static int rings_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct rings_reply_data *data = RINGS_REPDATA(reply_base); const struct kernel_ethtool_ringparam *kr = &data->kernel_ringparam; const struct ethtool_ringparam *ringparam = &data->ringparam; u32 supported_ring_params = data->supported_ring_params; WARN_ON(kr->tcp_data_split > ETHTOOL_TCP_DATA_SPLIT_ENABLED); if ((ringparam->rx_max_pending && (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MAX, ringparam->rx_max_pending) || nla_put_u32(skb, ETHTOOL_A_RINGS_RX, ringparam->rx_pending))) || (ringparam->rx_mini_max_pending && (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MINI_MAX, ringparam->rx_mini_max_pending) || nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MINI, ringparam->rx_mini_pending))) || (ringparam->rx_jumbo_max_pending && (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_JUMBO_MAX, ringparam->rx_jumbo_max_pending) || nla_put_u32(skb, ETHTOOL_A_RINGS_RX_JUMBO, ringparam->rx_jumbo_pending))) || (ringparam->tx_max_pending && (nla_put_u32(skb, ETHTOOL_A_RINGS_TX_MAX, ringparam->tx_max_pending) || nla_put_u32(skb, ETHTOOL_A_RINGS_TX, ringparam->tx_pending))) || (kr->rx_buf_len && (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_BUF_LEN, kr->rx_buf_len))) || (kr->tcp_data_split && (nla_put_u8(skb, ETHTOOL_A_RINGS_TCP_DATA_SPLIT, kr->tcp_data_split))) || (kr->cqe_size && (nla_put_u32(skb, ETHTOOL_A_RINGS_CQE_SIZE, kr->cqe_size))) || nla_put_u8(skb, ETHTOOL_A_RINGS_TX_PUSH, !!kr->tx_push) || nla_put_u8(skb, ETHTOOL_A_RINGS_RX_PUSH, !!kr->rx_push) || ((supported_ring_params & ETHTOOL_RING_USE_TX_PUSH_BUF_LEN) && (nla_put_u32(skb, ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX, kr->tx_push_buf_max_len) || nla_put_u32(skb, ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN, kr->tx_push_buf_len))) || ((supported_ring_params & ETHTOOL_RING_USE_HDS_THRS) && (nla_put_u32(skb, ETHTOOL_A_RINGS_HDS_THRESH, kr->hds_thresh) || nla_put_u32(skb, ETHTOOL_A_RINGS_HDS_THRESH_MAX, kr->hds_thresh_max)))) return -EMSGSIZE; return 0; } /* RINGS_SET */ const struct nla_policy ethnl_rings_set_policy[] = { [ETHTOOL_A_RINGS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_RINGS_RX] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_RX_MINI] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_RX_JUMBO] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_TX] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_RX_BUF_LEN] = NLA_POLICY_MIN(NLA_U32, 1), [ETHTOOL_A_RINGS_TCP_DATA_SPLIT] = NLA_POLICY_MAX(NLA_U8, ETHTOOL_TCP_DATA_SPLIT_ENABLED), [ETHTOOL_A_RINGS_CQE_SIZE] = NLA_POLICY_MIN(NLA_U32, 1), [ETHTOOL_A_RINGS_TX_PUSH] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_RINGS_RX_PUSH] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_HDS_THRESH] = { .type = NLA_U32 }, }; static int ethnl_set_rings_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; struct nlattr **tb = info->attrs; if (tb[ETHTOOL_A_RINGS_RX_BUF_LEN] && !(ops->supported_ring_params & ETHTOOL_RING_USE_RX_BUF_LEN)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_RX_BUF_LEN], "setting rx buf len not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT] && !(ops->supported_ring_params & ETHTOOL_RING_USE_TCP_DATA_SPLIT)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT], "setting TCP data split is not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_HDS_THRESH] && !(ops->supported_ring_params & ETHTOOL_RING_USE_HDS_THRS)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_HDS_THRESH], "setting hds-thresh is not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_CQE_SIZE] && !(ops->supported_ring_params & ETHTOOL_RING_USE_CQE_SIZE)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_CQE_SIZE], "setting cqe size not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_TX_PUSH] && !(ops->supported_ring_params & ETHTOOL_RING_USE_TX_PUSH)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_TX_PUSH], "setting tx push not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_RX_PUSH] && !(ops->supported_ring_params & ETHTOOL_RING_USE_RX_PUSH)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_RX_PUSH], "setting rx push not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN] && !(ops->supported_ring_params & ETHTOOL_RING_USE_TX_PUSH_BUF_LEN)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN], "setting tx push buf len is not supported"); return -EOPNOTSUPP; } return ops->get_ringparam && ops->set_ringparam ? 1 : -EOPNOTSUPP; } static int ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info) { struct kernel_ethtool_ringparam kernel_ringparam; struct net_device *dev = req_info->dev; struct ethtool_ringparam ringparam; struct nlattr **tb = info->attrs; const struct nlattr *err_attr; bool mod = false; int ret; ethtool_ringparam_get_cfg(dev, &ringparam, &kernel_ringparam, info->extack); ethnl_update_u32(&ringparam.rx_pending, tb[ETHTOOL_A_RINGS_RX], &mod); ethnl_update_u32(&ringparam.rx_mini_pending, tb[ETHTOOL_A_RINGS_RX_MINI], &mod); ethnl_update_u32(&ringparam.rx_jumbo_pending, tb[ETHTOOL_A_RINGS_RX_JUMBO], &mod); ethnl_update_u32(&ringparam.tx_pending, tb[ETHTOOL_A_RINGS_TX], &mod); ethnl_update_u32(&kernel_ringparam.rx_buf_len, tb[ETHTOOL_A_RINGS_RX_BUF_LEN], &mod); ethnl_update_u8(&kernel_ringparam.tcp_data_split, tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT], &mod); ethnl_update_u32(&kernel_ringparam.cqe_size, tb[ETHTOOL_A_RINGS_CQE_SIZE], &mod); ethnl_update_u8(&kernel_ringparam.tx_push, tb[ETHTOOL_A_RINGS_TX_PUSH], &mod); ethnl_update_u8(&kernel_ringparam.rx_push, tb[ETHTOOL_A_RINGS_RX_PUSH], &mod); ethnl_update_u32(&kernel_ringparam.tx_push_buf_len, tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN], &mod); ethnl_update_u32(&kernel_ringparam.hds_thresh, tb[ETHTOOL_A_RINGS_HDS_THRESH], &mod); if (!mod) return 0; if (kernel_ringparam.tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_ENABLED && dev_xdp_sb_prog_count(dev)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT], "tcp-data-split can not be enabled with single buffer XDP"); return -EINVAL; } if (dev_get_min_mp_channel_count(dev)) { if (kernel_ringparam.tcp_data_split != ETHTOOL_TCP_DATA_SPLIT_ENABLED) { NL_SET_ERR_MSG(info->extack, "can't disable tcp-data-split while device has memory provider enabled"); return -EINVAL; } else if (kernel_ringparam.hds_thresh) { NL_SET_ERR_MSG(info->extack, "can't set non-zero hds_thresh while device is memory provider enabled"); return -EINVAL; } } /* ensure new ring parameters are within limits */ if (ringparam.rx_pending > ringparam.rx_max_pending) err_attr = tb[ETHTOOL_A_RINGS_RX]; else if (ringparam.rx_mini_pending > ringparam.rx_mini_max_pending) err_attr = tb[ETHTOOL_A_RINGS_RX_MINI]; else if (ringparam.rx_jumbo_pending > ringparam.rx_jumbo_max_pending) err_attr = tb[ETHTOOL_A_RINGS_RX_JUMBO]; else if (ringparam.tx_pending > ringparam.tx_max_pending) err_attr = tb[ETHTOOL_A_RINGS_TX]; else if (kernel_ringparam.hds_thresh > kernel_ringparam.hds_thresh_max) err_attr = tb[ETHTOOL_A_RINGS_HDS_THRESH]; else err_attr = NULL; if (err_attr) { NL_SET_ERR_MSG_ATTR(info->extack, err_attr, "requested ring size exceeds maximum"); return -EINVAL; } if (kernel_ringparam.tx_push_buf_len > kernel_ringparam.tx_push_buf_max_len) { NL_SET_ERR_MSG_ATTR_FMT(info->extack, tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN], "Requested TX push buffer exceeds the maximum of %u", kernel_ringparam.tx_push_buf_max_len); return -EINVAL; } dev->cfg_pending->hds_config = kernel_ringparam.tcp_data_split; dev->cfg_pending->hds_thresh = kernel_ringparam.hds_thresh; ret = dev->ethtool_ops->set_ringparam(dev, &ringparam, &kernel_ringparam, info->extack); return ret < 0 ? ret : 1; } const struct ethnl_request_ops ethnl_rings_request_ops = { .request_cmd = ETHTOOL_MSG_RINGS_GET, .reply_cmd = ETHTOOL_MSG_RINGS_GET_REPLY, .hdr_attr = ETHTOOL_A_RINGS_HEADER, .req_info_size = sizeof(struct rings_req_info), .reply_data_size = sizeof(struct rings_reply_data), .prepare_data = rings_prepare_data, .reply_size = rings_reply_size, .fill_reply = rings_fill_reply, .set_validate = ethnl_set_rings_validate, .set = ethnl_set_rings, .set_ntf_cmd = ETHTOOL_MSG_RINGS_NTF, }; |
| 16 16 16 40 16 16 16 40 23 128 125 47 45 39 19 31 31 16 16 5 41 32 32 32 73 38 49 48 56 57 57 12 12 35 13 45 67 66 43 52 52 66 84 84 83 83 55 52 32 31 14 27 6 54 51 51 24 25 25 3 3 3 3 24 5 19 24 20 21 1 20 21 66 81 26 23 10 10 2 3 96 94 94 31 86 94 89 85 52 42 66 56 56 38 9 57 44 24 20 6 44 50 24 22 24 24 12 8 36 3 13 3 10 44 3 3 12 12 1 23 23 23 23 23 11 11 4 58 56 39 43 43 42 42 43 43 43 16 16 1 48 48 44 44 48 16 16 16 16 16 25 25 27 39 27 37 27 27 27 16 16 16 16 15 15 15 1 14 13 1 14 14 13 2 2 16 41 41 31 31 20 21 21 21 17 17 16 16 18 18 18 18 2 2 1 1 1 1 1 2 2 2 26 26 26 13 13 13 28 20 16 10 14 12 1 1 1 1 1 1 1 1 1 1 1 1 1 1 11 11 1 11 11 11 11 1 11 11 1 1 1 11 5 5 5 5 30 30 26 26 26 3 3 2 2 2 19 19 15 1 9 9 3 3 3 3 3 3 3 3 3 9 2 2 4 2 4 76 25 25 24 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2011 Novell Inc. * Copyright (C) 2016 Red Hat, Inc. */ #include <linux/fs.h> #include <linux/mount.h> #include <linux/slab.h> #include <linux/cred.h> #include <linux/xattr.h> #include <linux/exportfs.h> #include <linux/file.h> #include <linux/fileattr.h> #include <linux/uuid.h> #include <linux/namei.h> #include <linux/ratelimit.h> #include <linux/overflow.h> #include "overlayfs.h" /* Get write access to upper mnt - may fail if upper sb was remounted ro */ int ovl_get_write_access(struct dentry *dentry) { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); return mnt_get_write_access(ovl_upper_mnt(ofs)); } /* Get write access to upper sb - may block if upper sb is frozen */ void ovl_start_write(struct dentry *dentry) { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); sb_start_write(ovl_upper_mnt(ofs)->mnt_sb); } int ovl_want_write(struct dentry *dentry) { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); return mnt_want_write(ovl_upper_mnt(ofs)); } void ovl_put_write_access(struct dentry *dentry) { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); mnt_put_write_access(ovl_upper_mnt(ofs)); } void ovl_end_write(struct dentry *dentry) { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); sb_end_write(ovl_upper_mnt(ofs)->mnt_sb); } void ovl_drop_write(struct dentry *dentry) { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); mnt_drop_write(ovl_upper_mnt(ofs)); } struct dentry *ovl_workdir(struct dentry *dentry) { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); return ofs->workdir; } const struct cred *ovl_override_creds(struct super_block *sb) { struct ovl_fs *ofs = OVL_FS(sb); return override_creds(ofs->creator_cred); } void ovl_revert_creds(const struct cred *old_cred) { revert_creds(old_cred); } /* * Check if underlying fs supports file handles and try to determine encoding * type, in order to deduce maximum inode number used by fs. * * Return 0 if file handles are not supported. * Return 1 (FILEID_INO32_GEN) if fs uses the default 32bit inode encoding. * Return -1 if fs uses a non default encoding with unknown inode size. */ int ovl_can_decode_fh(struct super_block *sb) { if (!capable(CAP_DAC_READ_SEARCH)) return 0; if (!exportfs_can_decode_fh(sb->s_export_op)) return 0; return sb->s_export_op->encode_fh ? -1 : FILEID_INO32_GEN; } struct dentry *ovl_indexdir(struct super_block *sb) { struct ovl_fs *ofs = OVL_FS(sb); return ofs->config.index ? ofs->workdir : NULL; } /* Index all files on copy up. For now only enabled for NFS export */ bool ovl_index_all(struct super_block *sb) { struct ovl_fs *ofs = OVL_FS(sb); return ofs->config.nfs_export && ofs->config.index; } /* Verify lower origin on lookup. For now only enabled for NFS export */ bool ovl_verify_lower(struct super_block *sb) { struct ovl_fs *ofs = OVL_FS(sb); return ofs->config.nfs_export && ofs->config.index; } struct ovl_path *ovl_stack_alloc(unsigned int n) { return kcalloc(n, sizeof(struct ovl_path), GFP_KERNEL); } void ovl_stack_cpy(struct ovl_path *dst, struct ovl_path *src, unsigned int n) { unsigned int i; memcpy(dst, src, sizeof(struct ovl_path) * n); for (i = 0; i < n; i++) dget(src[i].dentry); } void ovl_stack_put(struct ovl_path *stack, unsigned int n) { unsigned int i; for (i = 0; stack && i < n; i++) dput(stack[i].dentry); } void ovl_stack_free(struct ovl_path *stack, unsigned int n) { ovl_stack_put(stack, n); kfree(stack); } struct ovl_entry *ovl_alloc_entry(unsigned int numlower) { struct ovl_entry *oe; oe = kzalloc(struct_size(oe, __lowerstack, numlower), GFP_KERNEL); if (oe) oe->__numlower = numlower; return oe; } void ovl_free_entry(struct ovl_entry *oe) { ovl_stack_put(ovl_lowerstack(oe), ovl_numlower(oe)); kfree(oe); } #define OVL_D_REVALIDATE (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE) bool ovl_dentry_remote(struct dentry *dentry) { return dentry->d_flags & OVL_D_REVALIDATE; } void ovl_dentry_update_reval(struct dentry *dentry, struct dentry *realdentry) { if (!ovl_dentry_remote(realdentry)) return; spin_lock(&dentry->d_lock); dentry->d_flags |= realdentry->d_flags & OVL_D_REVALIDATE; spin_unlock(&dentry->d_lock); } void ovl_dentry_init_reval(struct dentry *dentry, struct dentry *upperdentry, struct ovl_entry *oe) { return ovl_dentry_init_flags(dentry, upperdentry, oe, OVL_D_REVALIDATE); } void ovl_dentry_init_flags(struct dentry *dentry, struct dentry *upperdentry, struct ovl_entry *oe, unsigned int mask) { struct ovl_path *lowerstack = ovl_lowerstack(oe); unsigned int i, flags = 0; if (upperdentry) flags |= upperdentry->d_flags; for (i = 0; i < ovl_numlower(oe) && lowerstack[i].dentry; i++) flags |= lowerstack[i].dentry->d_flags; spin_lock(&dentry->d_lock); dentry->d_flags &= ~mask; dentry->d_flags |= flags & mask; spin_unlock(&dentry->d_lock); } bool ovl_dentry_weird(struct dentry *dentry) { if (!d_can_lookup(dentry) && !d_is_file(dentry) && !d_is_symlink(dentry)) return true; if (dentry->d_flags & (DCACHE_NEED_AUTOMOUNT | DCACHE_MANAGE_TRANSIT)) return true; /* * Allow filesystems that are case-folding capable but deny composing * ovl stack from case-folded directories. */ if (sb_has_encoding(dentry->d_sb)) return IS_CASEFOLDED(d_inode(dentry)); return dentry->d_flags & (DCACHE_OP_HASH | DCACHE_OP_COMPARE); } enum ovl_path_type ovl_path_type(struct dentry *dentry) { struct ovl_entry *oe = OVL_E(dentry); enum ovl_path_type type = 0; if (ovl_dentry_upper(dentry)) { type = __OVL_PATH_UPPER; /* * Non-dir dentry can hold lower dentry of its copy up origin. */ if (ovl_numlower(oe)) { if (ovl_test_flag(OVL_CONST_INO, d_inode(dentry))) type |= __OVL_PATH_ORIGIN; if (d_is_dir(dentry) || !ovl_has_upperdata(d_inode(dentry))) type |= __OVL_PATH_MERGE; } } else { if (ovl_numlower(oe) > 1) type |= __OVL_PATH_MERGE; } return type; } void ovl_path_upper(struct dentry *dentry, struct path *path) { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); path->mnt = ovl_upper_mnt(ofs); path->dentry = ovl_dentry_upper(dentry); } void ovl_path_lower(struct dentry *dentry, struct path *path) { struct ovl_entry *oe = OVL_E(dentry); struct ovl_path *lowerpath = ovl_lowerstack(oe); if (ovl_numlower(oe)) { path->mnt = lowerpath->layer->mnt; path->dentry = lowerpath->dentry; } else { *path = (struct path) { }; } } void ovl_path_lowerdata(struct dentry *dentry, struct path *path) { struct ovl_entry *oe = OVL_E(dentry); struct ovl_path *lowerdata = ovl_lowerdata(oe); struct dentry *lowerdata_dentry = ovl_lowerdata_dentry(oe); if (lowerdata_dentry) { path->dentry = lowerdata_dentry; /* * Pairs with smp_wmb() in ovl_dentry_set_lowerdata(). * Make sure that if lowerdata->dentry is visible, then * datapath->layer is visible as well. */ smp_rmb(); path->mnt = READ_ONCE(lowerdata->layer)->mnt; } else { *path = (struct path) { }; } } enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path) { enum ovl_path_type type = ovl_path_type(dentry); if (!OVL_TYPE_UPPER(type)) ovl_path_lower(dentry, path); else ovl_path_upper(dentry, path); return type; } enum ovl_path_type ovl_path_realdata(struct dentry *dentry, struct path *path) { enum ovl_path_type type = ovl_path_type(dentry); WARN_ON_ONCE(d_is_dir(dentry)); if (!OVL_TYPE_UPPER(type) || OVL_TYPE_MERGE(type)) ovl_path_lowerdata(dentry, path); else ovl_path_upper(dentry, path); return type; } struct dentry *ovl_dentry_upper(struct dentry *dentry) { struct inode *inode = d_inode(dentry); return inode ? ovl_upperdentry_dereference(OVL_I(inode)) : NULL; } struct dentry *ovl_dentry_lower(struct dentry *dentry) { struct ovl_entry *oe = OVL_E(dentry); return ovl_numlower(oe) ? ovl_lowerstack(oe)->dentry : NULL; } const struct ovl_layer *ovl_layer_lower(struct dentry *dentry) { struct ovl_entry *oe = OVL_E(dentry); return ovl_numlower(oe) ? ovl_lowerstack(oe)->layer : NULL; } /* * ovl_dentry_lower() could return either a data dentry or metacopy dentry * depending on what is stored in lowerstack[0]. At times we need to find * lower dentry which has data (and not metacopy dentry). This helper * returns the lower data dentry. */ struct dentry *ovl_dentry_lowerdata(struct dentry *dentry) { return ovl_lowerdata_dentry(OVL_E(dentry)); } int ovl_dentry_set_lowerdata(struct dentry *dentry, struct ovl_path *datapath) { struct ovl_entry *oe = OVL_E(dentry); struct ovl_path *lowerdata = ovl_lowerdata(oe); struct dentry *datadentry = datapath->dentry; if (WARN_ON_ONCE(ovl_numlower(oe) <= 1)) return -EIO; WRITE_ONCE(lowerdata->layer, datapath->layer); /* * Pairs with smp_rmb() in ovl_path_lowerdata(). * Make sure that if lowerdata->dentry is visible, then * lowerdata->layer is visible as well. */ smp_wmb(); WRITE_ONCE(lowerdata->dentry, dget(datadentry)); ovl_dentry_update_reval(dentry, datadentry); return 0; } struct dentry *ovl_dentry_real(struct dentry *dentry) { return ovl_dentry_upper(dentry) ?: ovl_dentry_lower(dentry); } struct dentry *ovl_i_dentry_upper(struct inode *inode) { return ovl_upperdentry_dereference(OVL_I(inode)); } struct inode *ovl_i_path_real(struct inode *inode, struct path *path) { struct ovl_path *lowerpath = ovl_lowerpath(OVL_I_E(inode)); path->dentry = ovl_i_dentry_upper(inode); if (!path->dentry) { path->dentry = lowerpath->dentry; path->mnt = lowerpath->layer->mnt; } else { path->mnt = ovl_upper_mnt(OVL_FS(inode->i_sb)); } return path->dentry ? d_inode_rcu(path->dentry) : NULL; } struct inode *ovl_inode_upper(struct inode *inode) { struct dentry *upperdentry = ovl_i_dentry_upper(inode); return upperdentry ? d_inode(upperdentry) : NULL; } struct inode *ovl_inode_lower(struct inode *inode) { struct ovl_path *lowerpath = ovl_lowerpath(OVL_I_E(inode)); return lowerpath ? d_inode(lowerpath->dentry) : NULL; } struct inode *ovl_inode_real(struct inode *inode) { return ovl_inode_upper(inode) ?: ovl_inode_lower(inode); } /* Return inode which contains lower data. Do not return metacopy */ struct inode *ovl_inode_lowerdata(struct inode *inode) { struct dentry *lowerdata = ovl_lowerdata_dentry(OVL_I_E(inode)); if (WARN_ON(!S_ISREG(inode->i_mode))) return NULL; return lowerdata ? d_inode(lowerdata) : NULL; } /* Return real inode which contains data. Does not return metacopy inode */ struct inode *ovl_inode_realdata(struct inode *inode) { struct inode *upperinode; upperinode = ovl_inode_upper(inode); if (upperinode && ovl_has_upperdata(inode)) return upperinode; return ovl_inode_lowerdata(inode); } const char *ovl_lowerdata_redirect(struct inode *inode) { return inode && S_ISREG(inode->i_mode) ? OVL_I(inode)->lowerdata_redirect : NULL; } struct ovl_dir_cache *ovl_dir_cache(struct inode *inode) { return inode && S_ISDIR(inode->i_mode) ? OVL_I(inode)->cache : NULL; } void ovl_set_dir_cache(struct inode *inode, struct ovl_dir_cache *cache) { OVL_I(inode)->cache = cache; } void ovl_dentry_set_flag(unsigned long flag, struct dentry *dentry) { set_bit(flag, OVL_E_FLAGS(dentry)); } void ovl_dentry_clear_flag(unsigned long flag, struct dentry *dentry) { clear_bit(flag, OVL_E_FLAGS(dentry)); } bool ovl_dentry_test_flag(unsigned long flag, struct dentry *dentry) { return test_bit(flag, OVL_E_FLAGS(dentry)); } bool ovl_dentry_is_opaque(struct dentry *dentry) { return ovl_dentry_test_flag(OVL_E_OPAQUE, dentry); } bool ovl_dentry_is_whiteout(struct dentry *dentry) { return !dentry->d_inode && ovl_dentry_is_opaque(dentry); } void ovl_dentry_set_opaque(struct dentry *dentry) { ovl_dentry_set_flag(OVL_E_OPAQUE, dentry); } bool ovl_dentry_has_xwhiteouts(struct dentry *dentry) { return ovl_dentry_test_flag(OVL_E_XWHITEOUTS, dentry); } void ovl_dentry_set_xwhiteouts(struct dentry *dentry) { ovl_dentry_set_flag(OVL_E_XWHITEOUTS, dentry); } /* * ovl_layer_set_xwhiteouts() is called before adding the overlay dir * dentry to dcache, while readdir of that same directory happens after * the overlay dir dentry is in dcache, so if some cpu observes that * ovl_dentry_is_xwhiteouts(), it will also observe layer->has_xwhiteouts * for the layers where xwhiteouts marker was found in that merge dir. */ void ovl_layer_set_xwhiteouts(struct ovl_fs *ofs, const struct ovl_layer *layer) { if (layer->has_xwhiteouts) return; /* Write once to read-mostly layer properties */ ofs->layers[layer->idx].has_xwhiteouts = true; } /* * For hard links and decoded file handles, it's possible for ovl_dentry_upper() * to return positive, while there's no actual upper alias for the inode. * Copy up code needs to know about the existence of the upper alias, so it * can't use ovl_dentry_upper(). */ bool ovl_dentry_has_upper_alias(struct dentry *dentry) { return ovl_dentry_test_flag(OVL_E_UPPER_ALIAS, dentry); } void ovl_dentry_set_upper_alias(struct dentry *dentry) { ovl_dentry_set_flag(OVL_E_UPPER_ALIAS, dentry); } static bool ovl_should_check_upperdata(struct inode *inode) { if (!S_ISREG(inode->i_mode)) return false; if (!ovl_inode_lower(inode)) return false; return true; } bool ovl_has_upperdata(struct inode *inode) { if (!ovl_should_check_upperdata(inode)) return true; if (!ovl_test_flag(OVL_UPPERDATA, inode)) return false; /* * Pairs with smp_wmb() in ovl_set_upperdata(). Main user of * ovl_has_upperdata() is ovl_copy_up_meta_inode_data(). Make sure * if setting of OVL_UPPERDATA is visible, then effects of writes * before that are visible too. */ smp_rmb(); return true; } void ovl_set_upperdata(struct inode *inode) { /* * Pairs with smp_rmb() in ovl_has_upperdata(). Make sure * if OVL_UPPERDATA flag is visible, then effects of write operations * before it are visible as well. */ smp_wmb(); ovl_set_flag(OVL_UPPERDATA, inode); } /* Caller should hold ovl_inode->lock */ bool ovl_dentry_needs_data_copy_up_locked(struct dentry *dentry, int flags) { if (!ovl_open_flags_need_copy_up(flags)) return false; return !ovl_test_flag(OVL_UPPERDATA, d_inode(dentry)); } bool ovl_dentry_needs_data_copy_up(struct dentry *dentry, int flags) { if (!ovl_open_flags_need_copy_up(flags)) return false; return !ovl_has_upperdata(d_inode(dentry)); } const char *ovl_dentry_get_redirect(struct dentry *dentry) { return OVL_I(d_inode(dentry))->redirect; } void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect) { struct ovl_inode *oi = OVL_I(d_inode(dentry)); kfree(oi->redirect); oi->redirect = redirect; } void ovl_inode_update(struct inode *inode, struct dentry *upperdentry) { struct inode *upperinode = d_inode(upperdentry); WARN_ON(OVL_I(inode)->__upperdentry); /* * Make sure upperdentry is consistent before making it visible */ smp_wmb(); OVL_I(inode)->__upperdentry = upperdentry; if (inode_unhashed(inode)) { inode->i_private = upperinode; __insert_inode_hash(inode, (unsigned long) upperinode); } } static void ovl_dir_version_inc(struct dentry *dentry, bool impurity) { struct inode *inode = d_inode(dentry); WARN_ON(!inode_is_locked(inode)); WARN_ON(!d_is_dir(dentry)); /* * Version is used by readdir code to keep cache consistent. * For merge dirs (or dirs with origin) all changes need to be noted. * For non-merge dirs, cache contains only impure entries (i.e. ones * which have been copied up and have origins), so only need to note * changes to impure entries. */ if (!ovl_dir_is_real(inode) || impurity) OVL_I(inode)->version++; } void ovl_dir_modified(struct dentry *dentry, bool impurity) { /* Copy mtime/ctime */ ovl_copyattr(d_inode(dentry)); ovl_dir_version_inc(dentry, impurity); } u64 ovl_inode_version_get(struct inode *inode) { WARN_ON(!inode_is_locked(inode)); return OVL_I(inode)->version; } bool ovl_is_whiteout(struct dentry *dentry) { struct inode *inode = dentry->d_inode; return inode && IS_WHITEOUT(inode); } /* * Use this over ovl_is_whiteout for upper and lower files, as it also * handles overlay.whiteout xattr whiteout files. */ bool ovl_path_is_whiteout(struct ovl_fs *ofs, const struct path *path) { return ovl_is_whiteout(path->dentry) || ovl_path_check_xwhiteout_xattr(ofs, path); } struct file *ovl_path_open(const struct path *path, int flags) { struct inode *inode = d_inode(path->dentry); struct mnt_idmap *real_idmap = mnt_idmap(path->mnt); int err, acc_mode; if (flags & ~(O_ACCMODE | O_LARGEFILE)) BUG(); switch (flags & O_ACCMODE) { case O_RDONLY: acc_mode = MAY_READ; break; case O_WRONLY: acc_mode = MAY_WRITE; break; default: BUG(); } err = inode_permission(real_idmap, inode, acc_mode | MAY_OPEN); if (err) return ERR_PTR(err); /* O_NOATIME is an optimization, don't fail if not permitted */ if (inode_owner_or_capable(real_idmap, inode)) flags |= O_NOATIME; return dentry_open(path, flags, current_cred()); } /* Caller should hold ovl_inode->lock */ static bool ovl_already_copied_up_locked(struct dentry *dentry, int flags) { bool disconnected = dentry->d_flags & DCACHE_DISCONNECTED; if (ovl_dentry_upper(dentry) && (ovl_dentry_has_upper_alias(dentry) || disconnected) && !ovl_dentry_needs_data_copy_up_locked(dentry, flags)) return true; return false; } bool ovl_already_copied_up(struct dentry *dentry, int flags) { bool disconnected = dentry->d_flags & DCACHE_DISCONNECTED; /* * Check if copy-up has happened as well as for upper alias (in * case of hard links) is there. * * Both checks are lockless: * - false negatives: will recheck under oi->lock * - false positives: * + ovl_dentry_upper() uses memory barriers to ensure the * upper dentry is up-to-date * + ovl_dentry_has_upper_alias() relies on locking of * upper parent i_rwsem to prevent reordering copy-up * with rename. */ if (ovl_dentry_upper(dentry) && (ovl_dentry_has_upper_alias(dentry) || disconnected) && !ovl_dentry_needs_data_copy_up(dentry, flags)) return true; return false; } /* * The copy up "transaction" keeps an elevated mnt write count on upper mnt, * but leaves taking freeze protection on upper sb to lower level helpers. */ int ovl_copy_up_start(struct dentry *dentry, int flags) { struct inode *inode = d_inode(dentry); int err; err = ovl_inode_lock_interruptible(inode); if (err) return err; if (ovl_already_copied_up_locked(dentry, flags)) err = 1; /* Already copied up */ else err = ovl_get_write_access(dentry); if (err) goto out_unlock; return 0; out_unlock: ovl_inode_unlock(inode); return err; } void ovl_copy_up_end(struct dentry *dentry) { ovl_put_write_access(dentry); ovl_inode_unlock(d_inode(dentry)); } bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path) { int res; res = ovl_path_getxattr(ofs, path, OVL_XATTR_ORIGIN, NULL, 0); /* Zero size value means "copied up but origin unknown" */ if (res >= 0) return true; return false; } bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path) { struct dentry *dentry = path->dentry; int res; /* xattr.whiteout must be a zero size regular file */ if (!d_is_reg(dentry) || i_size_read(d_inode(dentry)) != 0) return false; res = ovl_path_getxattr(ofs, path, OVL_XATTR_XWHITEOUT, NULL, 0); return res >= 0; } /* * Load persistent uuid from xattr into s_uuid if found, or store a new * random generated value in s_uuid and in xattr. */ bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs, const struct path *upperpath) { bool set = false; uuid_t uuid; int res; /* Try to load existing persistent uuid */ res = ovl_path_getxattr(ofs, upperpath, OVL_XATTR_UUID, uuid.b, UUID_SIZE); if (res == UUID_SIZE) goto set_uuid; if (res != -ENODATA) goto fail; /* * With uuid=auto, if uuid xattr is found, it will be used. * If uuid xattrs is not found, generate a persistent uuid only on mount * of new overlays where upper root dir is not yet marked as impure. * An upper dir is marked as impure on copy up or lookup of its subdirs. */ if (ofs->config.uuid == OVL_UUID_AUTO) { res = ovl_path_getxattr(ofs, upperpath, OVL_XATTR_IMPURE, NULL, 0); if (res > 0) { /* Any mount of old overlay - downgrade to uuid=null */ ofs->config.uuid = OVL_UUID_NULL; return true; } else if (res == -ENODATA) { /* First mount of new overlay - upgrade to uuid=on */ ofs->config.uuid = OVL_UUID_ON; } else if (res < 0) { goto fail; } } /* Generate overlay instance uuid */ uuid_gen(&uuid); /* Try to store persistent uuid */ set = true; res = ovl_setxattr(ofs, upperpath->dentry, OVL_XATTR_UUID, uuid.b, UUID_SIZE); if (res) goto fail; set_uuid: super_set_uuid(sb, uuid.b, sizeof(uuid)); return true; fail: ofs->config.uuid = OVL_UUID_NULL; pr_warn("failed to %s uuid (%pd2, err=%i); falling back to uuid=null.\n", set ? "set" : "get", upperpath->dentry, res); return false; } char ovl_get_dir_xattr_val(struct ovl_fs *ofs, const struct path *path, enum ovl_xattr ox) { int res; char val; if (!d_is_dir(path->dentry)) return 0; res = ovl_path_getxattr(ofs, path, ox, &val, 1); return res == 1 ? val : 0; } #define OVL_XATTR_OPAQUE_POSTFIX "opaque" #define OVL_XATTR_REDIRECT_POSTFIX "redirect" #define OVL_XATTR_ORIGIN_POSTFIX "origin" #define OVL_XATTR_IMPURE_POSTFIX "impure" #define OVL_XATTR_NLINK_POSTFIX "nlink" #define OVL_XATTR_UPPER_POSTFIX "upper" #define OVL_XATTR_UUID_POSTFIX "uuid" #define OVL_XATTR_METACOPY_POSTFIX "metacopy" #define OVL_XATTR_PROTATTR_POSTFIX "protattr" #define OVL_XATTR_XWHITEOUT_POSTFIX "whiteout" #define OVL_XATTR_TAB_ENTRY(x) \ [x] = { [false] = OVL_XATTR_TRUSTED_PREFIX x ## _POSTFIX, \ [true] = OVL_XATTR_USER_PREFIX x ## _POSTFIX } const char *const ovl_xattr_table[][2] = { OVL_XATTR_TAB_ENTRY(OVL_XATTR_OPAQUE), OVL_XATTR_TAB_ENTRY(OVL_XATTR_REDIRECT), OVL_XATTR_TAB_ENTRY(OVL_XATTR_ORIGIN), OVL_XATTR_TAB_ENTRY(OVL_XATTR_IMPURE), OVL_XATTR_TAB_ENTRY(OVL_XATTR_NLINK), OVL_XATTR_TAB_ENTRY(OVL_XATTR_UPPER), OVL_XATTR_TAB_ENTRY(OVL_XATTR_UUID), OVL_XATTR_TAB_ENTRY(OVL_XATTR_METACOPY), OVL_XATTR_TAB_ENTRY(OVL_XATTR_PROTATTR), OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUT), }; int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry, enum ovl_xattr ox, const void *value, size_t size, int xerr) { int err; if (ofs->noxattr) return xerr; err = ovl_setxattr(ofs, upperdentry, ox, value, size); if (err == -EOPNOTSUPP) { pr_warn("cannot set %s xattr on upper\n", ovl_xattr(ofs, ox)); ofs->noxattr = true; return xerr; } return err; } int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry) { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); int err; if (ovl_test_flag(OVL_IMPURE, d_inode(dentry))) return 0; /* * Do not fail when upper doesn't support xattrs. * Upper inodes won't have origin nor redirect xattr anyway. */ err = ovl_check_setxattr(ofs, upperdentry, OVL_XATTR_IMPURE, "y", 1, 0); if (!err) ovl_set_flag(OVL_IMPURE, d_inode(dentry)); return err; } #define OVL_PROTATTR_MAX 32 /* Reserved for future flags */ void ovl_check_protattr(struct inode *inode, struct dentry *upper) { struct ovl_fs *ofs = OVL_FS(inode->i_sb); u32 iflags = inode->i_flags & OVL_PROT_I_FLAGS_MASK; char buf[OVL_PROTATTR_MAX+1]; int res, n; res = ovl_getxattr_upper(ofs, upper, OVL_XATTR_PROTATTR, buf, OVL_PROTATTR_MAX); if (res < 0) return; /* * Initialize inode flags from overlay.protattr xattr and upper inode * flags. If upper inode has those fileattr flags set (i.e. from old * kernel), we do not clear them on ovl_get_inode(), but we will clear * them on next fileattr_set(). */ for (n = 0; n < res; n++) { if (buf[n] == 'a') iflags |= S_APPEND; else if (buf[n] == 'i') iflags |= S_IMMUTABLE; else break; } if (!res || n < res) { pr_warn_ratelimited("incompatible overlay.protattr format (%pd2, len=%d)\n", upper, res); } else { inode_set_flags(inode, iflags, OVL_PROT_I_FLAGS_MASK); } } int ovl_set_protattr(struct inode *inode, struct dentry *upper, struct file_kattr *fa) { struct ovl_fs *ofs = OVL_FS(inode->i_sb); char buf[OVL_PROTATTR_MAX]; int len = 0, err = 0; u32 iflags = 0; BUILD_BUG_ON(HWEIGHT32(OVL_PROT_FS_FLAGS_MASK) > OVL_PROTATTR_MAX); if (fa->flags & FS_APPEND_FL) { buf[len++] = 'a'; iflags |= S_APPEND; } if (fa->flags & FS_IMMUTABLE_FL) { buf[len++] = 'i'; iflags |= S_IMMUTABLE; } /* * Do not allow to set protection flags when upper doesn't support * xattrs, because we do not set those fileattr flags on upper inode. * Remove xattr if it exist and all protection flags are cleared. */ if (len) { err = ovl_check_setxattr(ofs, upper, OVL_XATTR_PROTATTR, buf, len, -EPERM); } else if (inode->i_flags & OVL_PROT_I_FLAGS_MASK) { err = ovl_removexattr(ofs, upper, OVL_XATTR_PROTATTR); if (err == -EOPNOTSUPP || err == -ENODATA) err = 0; } if (err) return err; inode_set_flags(inode, iflags, OVL_PROT_I_FLAGS_MASK); /* Mask out the fileattr flags that should not be set in upper inode */ fa->flags &= ~OVL_PROT_FS_FLAGS_MASK; fa->fsx_xflags &= ~OVL_PROT_FSX_FLAGS_MASK; return 0; } /* * Caller must hold a reference to inode to prevent it from being freed while * it is marked inuse. */ bool ovl_inuse_trylock(struct dentry *dentry) { struct inode *inode = d_inode(dentry); bool locked = false; spin_lock(&inode->i_lock); if (!(inode->i_state & I_OVL_INUSE)) { inode->i_state |= I_OVL_INUSE; locked = true; } spin_unlock(&inode->i_lock); return locked; } void ovl_inuse_unlock(struct dentry *dentry) { if (dentry) { struct inode *inode = d_inode(dentry); spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_OVL_INUSE)); inode->i_state &= ~I_OVL_INUSE; spin_unlock(&inode->i_lock); } } bool ovl_is_inuse(struct dentry *dentry) { struct inode *inode = d_inode(dentry); bool inuse; spin_lock(&inode->i_lock); inuse = (inode->i_state & I_OVL_INUSE); spin_unlock(&inode->i_lock); return inuse; } /* * Does this overlay dentry need to be indexed on copy up? */ bool ovl_need_index(struct dentry *dentry) { struct dentry *lower = ovl_dentry_lower(dentry); if (!lower || !ovl_indexdir(dentry->d_sb)) return false; /* Index all files for NFS export and consistency verification */ if (ovl_index_all(dentry->d_sb)) return true; /* Index only lower hardlinks on copy up */ if (!d_is_dir(lower) && d_inode(lower)->i_nlink > 1) return true; return false; } /* Caller must hold OVL_I(inode)->lock */ static void ovl_cleanup_index(struct dentry *dentry) { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *indexdir = ovl_indexdir(dentry->d_sb); struct dentry *lowerdentry = ovl_dentry_lower(dentry); struct dentry *upperdentry = ovl_dentry_upper(dentry); struct dentry *index = NULL; struct inode *inode; struct qstr name = { }; bool got_write = false; int err; err = ovl_get_index_name(ofs, lowerdentry, &name); if (err) goto fail; err = ovl_want_write(dentry); if (err) goto fail; got_write = true; inode = d_inode(upperdentry); if (!S_ISDIR(inode->i_mode) && inode->i_nlink != 1) { pr_warn_ratelimited("cleanup linked index (%pd2, ino=%lu, nlink=%u)\n", upperdentry, inode->i_ino, inode->i_nlink); /* * We either have a bug with persistent union nlink or a lower * hardlink was added while overlay is mounted. Adding a lower * hardlink and then unlinking all overlay hardlinks would drop * overlay nlink to zero before all upper inodes are unlinked. * As a safety measure, when that situation is detected, set * the overlay nlink to the index inode nlink minus one for the * index entry itself. */ set_nlink(d_inode(dentry), inode->i_nlink - 1); ovl_set_nlink_upper(dentry); goto out; } index = ovl_lookup_upper_unlocked(ofs, name.name, indexdir, name.len); err = PTR_ERR(index); if (IS_ERR(index)) { index = NULL; } else if (ovl_index_all(dentry->d_sb)) { /* Whiteout orphan index to block future open by handle */ err = ovl_cleanup_and_whiteout(OVL_FS(dentry->d_sb), indexdir, index); } else { /* Cleanup orphan index entries */ err = ovl_cleanup(ofs, indexdir, index); } if (err) goto fail; out: if (got_write) ovl_drop_write(dentry); kfree(name.name); dput(index); return; fail: pr_err("cleanup index of '%pd2' failed (%i)\n", dentry, err); goto out; } /* * Operations that change overlay inode and upper inode nlink need to be * synchronized with copy up for persistent nlink accounting. */ int ovl_nlink_start(struct dentry *dentry) { struct inode *inode = d_inode(dentry); const struct cred *old_cred; int err; if (WARN_ON(!inode)) return -ENOENT; /* * With inodes index is enabled, we store the union overlay nlink * in an xattr on the index inode. When whiting out an indexed lower, * we need to decrement the overlay persistent nlink, but before the * first copy up, we have no upper index inode to store the xattr. * * As a workaround, before whiteout/rename over an indexed lower, * copy up to create the upper index. Creating the upper index will * initialize the overlay nlink, so it could be dropped if unlink * or rename succeeds. * * TODO: implement metadata only index copy up when called with * ovl_copy_up_flags(dentry, O_PATH). */ if (ovl_need_index(dentry) && !ovl_dentry_has_upper_alias(dentry)) { err = ovl_copy_up(dentry); if (err) return err; } err = ovl_inode_lock_interruptible(inode); if (err) return err; err = ovl_want_write(dentry); if (err) goto out_unlock; if (d_is_dir(dentry) || !ovl_test_flag(OVL_INDEX, inode)) return 0; old_cred = ovl_override_creds(dentry->d_sb); /* * The overlay inode nlink should be incremented/decremented IFF the * upper operation succeeds, along with nlink change of upper inode. * Therefore, before link/unlink/rename, we store the union nlink * value relative to the upper inode nlink in an upper inode xattr. */ err = ovl_set_nlink_upper(dentry); ovl_revert_creds(old_cred); if (err) goto out_drop_write; return 0; out_drop_write: ovl_drop_write(dentry); out_unlock: ovl_inode_unlock(inode); return err; } void ovl_nlink_end(struct dentry *dentry) { struct inode *inode = d_inode(dentry); ovl_drop_write(dentry); if (ovl_test_flag(OVL_INDEX, inode) && inode->i_nlink == 0) { const struct cred *old_cred; old_cred = ovl_override_creds(dentry->d_sb); ovl_cleanup_index(dentry); ovl_revert_creds(old_cred); } ovl_inode_unlock(inode); } int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *work, struct dentry *upperdir, struct dentry *upper) { struct dentry *trap; /* Workdir should not be subdir of upperdir and vice versa */ trap = lock_rename(workdir, upperdir); if (IS_ERR(trap)) goto err; if (trap) goto err_unlock; if (work && work->d_parent != workdir) goto err_unlock; if (upper && upper->d_parent != upperdir) goto err_unlock; return 0; err_unlock: unlock_rename(workdir, upperdir); err: pr_err("failed to lock workdir+upperdir\n"); return -EIO; } /* * err < 0, 0 if no metacopy xattr, metacopy data size if xattr found. * an empty xattr returns OVL_METACOPY_MIN_SIZE to distinguish from no xattr value. */ int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path, struct ovl_metacopy *data) { int res; /* Only regular files can have metacopy xattr */ if (!S_ISREG(d_inode(path->dentry)->i_mode)) return 0; res = ovl_path_getxattr(ofs, path, OVL_XATTR_METACOPY, data, data ? OVL_METACOPY_MAX_SIZE : 0); if (res < 0) { if (res == -ENODATA || res == -EOPNOTSUPP) return 0; /* * getxattr on user.* may fail with EACCES in case there's no * read permission on the inode. Not much we can do, other than * tell the caller that this is not a metacopy inode. */ if (ofs->config.userxattr && res == -EACCES) return 0; goto out; } if (res == 0) { /* Emulate empty data for zero size metacopy xattr */ res = OVL_METACOPY_MIN_SIZE; if (data) { memset(data, 0, res); data->len = res; } } else if (res < OVL_METACOPY_MIN_SIZE) { pr_warn_ratelimited("metacopy file '%pd' has too small xattr\n", path->dentry); return -EIO; } else if (data) { if (data->version != 0) { pr_warn_ratelimited("metacopy file '%pd' has unsupported version\n", path->dentry); return -EIO; } if (res != data->len) { pr_warn_ratelimited("metacopy file '%pd' has invalid xattr size\n", path->dentry); return -EIO; } } return res; out: pr_warn_ratelimited("failed to get metacopy (%i)\n", res); return res; } int ovl_set_metacopy_xattr(struct ovl_fs *ofs, struct dentry *d, struct ovl_metacopy *metacopy) { size_t len = metacopy->len; /* If no flags or digest fall back to empty metacopy file */ if (metacopy->version == 0 && metacopy->flags == 0 && metacopy->digest_algo == 0) len = 0; return ovl_check_setxattr(ofs, d, OVL_XATTR_METACOPY, metacopy, len, -EOPNOTSUPP); } bool ovl_is_metacopy_dentry(struct dentry *dentry) { struct ovl_entry *oe = OVL_E(dentry); if (!d_is_reg(dentry)) return false; if (ovl_dentry_upper(dentry)) { if (!ovl_has_upperdata(d_inode(dentry))) return true; return false; } return (ovl_numlower(oe) > 1); } char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding) { int res; char *s, *next, *buf = NULL; res = ovl_path_getxattr(ofs, path, OVL_XATTR_REDIRECT, NULL, 0); if (res == -ENODATA || res == -EOPNOTSUPP) return NULL; if (res < 0) goto fail; if (res == 0) goto invalid; buf = kzalloc(res + padding + 1, GFP_KERNEL); if (!buf) return ERR_PTR(-ENOMEM); res = ovl_path_getxattr(ofs, path, OVL_XATTR_REDIRECT, buf, res); if (res < 0) goto fail; if (res == 0) goto invalid; if (buf[0] == '/') { for (s = buf; *s++ == '/'; s = next) { next = strchrnul(s, '/'); if (s == next) goto invalid; } } else { if (strchr(buf, '/') != NULL) goto invalid; } return buf; invalid: pr_warn_ratelimited("invalid redirect (%s)\n", buf); res = -EINVAL; goto err_free; fail: pr_warn_ratelimited("failed to get redirect (%i)\n", res); err_free: kfree(buf); return ERR_PTR(res); } /* Call with mounter creds as it may open the file */ int ovl_ensure_verity_loaded(struct path *datapath) { struct inode *inode = d_inode(datapath->dentry); struct file *filp; if (!fsverity_active(inode) && IS_VERITY(inode)) { /* * If this inode was not yet opened, the verity info hasn't been * loaded yet, so we need to do that here to force it into memory. */ filp = kernel_file_open(datapath, O_RDONLY, current_cred()); if (IS_ERR(filp)) return PTR_ERR(filp); fput(filp); } return 0; } int ovl_validate_verity(struct ovl_fs *ofs, struct path *metapath, struct path *datapath) { struct ovl_metacopy metacopy_data; u8 actual_digest[FS_VERITY_MAX_DIGEST_SIZE]; int xattr_digest_size, digest_size; int xattr_size, err; u8 verity_algo; if (!ofs->config.verity_mode || /* Verity only works on regular files */ !S_ISREG(d_inode(metapath->dentry)->i_mode)) return 0; xattr_size = ovl_check_metacopy_xattr(ofs, metapath, &metacopy_data); if (xattr_size < 0) return xattr_size; if (!xattr_size || !metacopy_data.digest_algo) { if (ofs->config.verity_mode == OVL_VERITY_REQUIRE) { pr_warn_ratelimited("metacopy file '%pd' has no digest specified\n", metapath->dentry); return -EIO; } return 0; } xattr_digest_size = ovl_metadata_digest_size(&metacopy_data); err = ovl_ensure_verity_loaded(datapath); if (err < 0) { pr_warn_ratelimited("lower file '%pd' failed to load fs-verity info\n", datapath->dentry); return -EIO; } digest_size = fsverity_get_digest(d_inode(datapath->dentry), actual_digest, &verity_algo, NULL); if (digest_size == 0) { pr_warn_ratelimited("lower file '%pd' has no fs-verity digest\n", datapath->dentry); return -EIO; } if (xattr_digest_size != digest_size || metacopy_data.digest_algo != verity_algo || memcmp(metacopy_data.digest, actual_digest, xattr_digest_size) != 0) { pr_warn_ratelimited("lower file '%pd' has the wrong fs-verity digest\n", datapath->dentry); return -EIO; } return 0; } int ovl_get_verity_digest(struct ovl_fs *ofs, struct path *src, struct ovl_metacopy *metacopy) { int err, digest_size; if (!ofs->config.verity_mode || !S_ISREG(d_inode(src->dentry)->i_mode)) return 0; err = ovl_ensure_verity_loaded(src); if (err < 0) { pr_warn_ratelimited("lower file '%pd' failed to load fs-verity info\n", src->dentry); return -EIO; } digest_size = fsverity_get_digest(d_inode(src->dentry), metacopy->digest, &metacopy->digest_algo, NULL); if (digest_size == 0 || WARN_ON_ONCE(digest_size > FS_VERITY_MAX_DIGEST_SIZE)) { if (ofs->config.verity_mode == OVL_VERITY_REQUIRE) { pr_warn_ratelimited("lower file '%pd' has no fs-verity digest\n", src->dentry); return -EIO; } return 0; } metacopy->len += digest_size; return 0; } /* * ovl_sync_status() - Check fs sync status for volatile mounts * * Returns 1 if this is not a volatile mount and a real sync is required. * * Returns 0 if syncing can be skipped because mount is volatile, and no errors * have occurred on the upperdir since the mount. * * Returns -errno if it is a volatile mount, and the error that occurred since * the last mount. If the error code changes, it'll return the latest error * code. */ int ovl_sync_status(struct ovl_fs *ofs) { struct vfsmount *mnt; if (ovl_should_sync(ofs)) return 1; mnt = ovl_upper_mnt(ofs); if (!mnt) return 0; return errseq_check(&mnt->mnt_sb->s_wb_err, ofs->errseq); } /* * ovl_copyattr() - copy inode attributes from layer to ovl inode * * When overlay copies inode information from an upper or lower layer to the * relevant overlay inode it will apply the idmapping of the upper or lower * layer when doing so ensuring that the ovl inode ownership will correctly * reflect the ownership of the idmapped upper or lower layer. For example, an * idmapped upper or lower layer mapping id 1001 to id 1000 will take care to * map any lower or upper inode owned by id 1001 to id 1000. These mapping * helpers are nops when the relevant layer isn't idmapped. */ void ovl_copyattr(struct inode *inode) { struct path realpath; struct inode *realinode; struct mnt_idmap *real_idmap; vfsuid_t vfsuid; vfsgid_t vfsgid; realinode = ovl_i_path_real(inode, &realpath); real_idmap = mnt_idmap(realpath.mnt); spin_lock(&inode->i_lock); vfsuid = i_uid_into_vfsuid(real_idmap, realinode); vfsgid = i_gid_into_vfsgid(real_idmap, realinode); inode->i_uid = vfsuid_into_kuid(vfsuid); inode->i_gid = vfsgid_into_kgid(vfsgid); inode->i_mode = realinode->i_mode; inode_set_atime_to_ts(inode, inode_get_atime(realinode)); inode_set_mtime_to_ts(inode, inode_get_mtime(realinode)); inode_set_ctime_to_ts(inode, inode_get_ctime(realinode)); i_size_write(inode, i_size_read(realinode)); spin_unlock(&inode->i_lock); } int ovl_parent_lock(struct dentry *parent, struct dentry *child) { inode_lock_nested(parent->d_inode, I_MUTEX_PARENT); if (!child || child->d_parent == parent) return 0; inode_unlock(parent->d_inode); return -EINVAL; } |
| 23 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 | /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef _INPUT_COMPAT_H #define _INPUT_COMPAT_H /* * 32bit compatibility wrappers for the input subsystem. * * Very heavily based on evdev.c - Copyright (c) 1999-2002 Vojtech Pavlik */ #include <linux/compiler.h> #include <linux/compat.h> #include <linux/input.h> #ifdef CONFIG_COMPAT struct input_event_compat { compat_ulong_t sec; compat_ulong_t usec; __u16 type; __u16 code; __s32 value; }; struct ff_periodic_effect_compat { __u16 waveform; __u16 period; __s16 magnitude; __s16 offset; __u16 phase; struct ff_envelope envelope; __u32 custom_len; compat_uptr_t custom_data; }; struct ff_effect_compat { __u16 type; __s16 id; __u16 direction; struct ff_trigger trigger; struct ff_replay replay; union { struct ff_constant_effect constant; struct ff_ramp_effect ramp; struct ff_periodic_effect_compat periodic; struct ff_condition_effect condition[2]; /* One for each axis */ struct ff_rumble_effect rumble; } u; }; static inline size_t input_event_size(void) { return (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) ? sizeof(struct input_event_compat) : sizeof(struct input_event); } #else static inline size_t input_event_size(void) { return sizeof(struct input_event); } #endif /* CONFIG_COMPAT */ int input_event_from_user(const char __user *buffer, struct input_event *event); int input_event_to_user(char __user *buffer, const struct input_event *event); int input_ff_effect_from_user(const char __user *buffer, size_t size, struct ff_effect *effect); #endif /* _INPUT_COMPAT_H */ |
| 19 17 19 7 19 19 19 19 19 19 19 13 5 8 8 5 2 4 2 2 4 2 2 4 3 4 3 4 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 2 2 2 1 1 1 2 2 2 2 2 1 2 1 1 1 1 2 1 2 14 14 14 7 5 3 2 2 2 2 2 19 19 8 8 7 58 7 6 7 7 7 7 57 58 58 57 57 1 1 58 58 19 7 39 19 14 19 16 19 14 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991-1998 Linus Torvalds * Re-organised Feb 1998 Russell King * Copyright (C) 2020 Christoph Hellwig */ #include <linux/fs.h> #include <linux/major.h> #include <linux/slab.h> #include <linux/ctype.h> #include <linux/vmalloc.h> #include <linux/raid/detect.h> #include "check.h" static int (*const check_part[])(struct parsed_partitions *) = { /* * Probe partition formats with tables at disk address 0 * that also have an ADFS boot block at 0xdc0. */ #ifdef CONFIG_ACORN_PARTITION_ICS adfspart_check_ICS, #endif #ifdef CONFIG_ACORN_PARTITION_POWERTEC adfspart_check_POWERTEC, #endif #ifdef CONFIG_ACORN_PARTITION_EESOX adfspart_check_EESOX, #endif /* * Now move on to formats that only have partition info at * disk address 0xdc0. Since these may also have stale * PC/BIOS partition tables, they need to come before * the msdos entry. */ #ifdef CONFIG_ACORN_PARTITION_CUMANA adfspart_check_CUMANA, #endif #ifdef CONFIG_ACORN_PARTITION_ADFS adfspart_check_ADFS, #endif #ifdef CONFIG_CMDLINE_PARTITION cmdline_partition, #endif #ifdef CONFIG_OF_PARTITION of_partition, /* cmdline have priority to OF */ #endif #ifdef CONFIG_EFI_PARTITION efi_partition, /* this must come before msdos */ #endif #ifdef CONFIG_SGI_PARTITION sgi_partition, #endif #ifdef CONFIG_LDM_PARTITION ldm_partition, /* this must come before msdos */ #endif #ifdef CONFIG_MSDOS_PARTITION msdos_partition, #endif #ifdef CONFIG_OSF_PARTITION osf_partition, #endif #ifdef CONFIG_SUN_PARTITION sun_partition, #endif #ifdef CONFIG_AMIGA_PARTITION amiga_partition, #endif #ifdef CONFIG_ATARI_PARTITION atari_partition, #endif #ifdef CONFIG_MAC_PARTITION mac_partition, #endif #ifdef CONFIG_ULTRIX_PARTITION ultrix_partition, #endif #ifdef CONFIG_IBM_PARTITION ibm_partition, #endif #ifdef CONFIG_KARMA_PARTITION karma_partition, #endif #ifdef CONFIG_SYSV68_PARTITION sysv68_partition, #endif NULL }; static struct parsed_partitions *allocate_partitions(struct gendisk *hd) { struct parsed_partitions *state; int nr = DISK_MAX_PARTS; state = kzalloc(sizeof(*state), GFP_KERNEL); if (!state) return NULL; state->parts = vzalloc(array_size(nr, sizeof(state->parts[0]))); if (!state->parts) { kfree(state); return NULL; } state->limit = nr; return state; } static void free_partitions(struct parsed_partitions *state) { vfree(state->parts); kfree(state); } static struct parsed_partitions *check_partition(struct gendisk *hd) { struct parsed_partitions *state; int i, res, err; state = allocate_partitions(hd); if (!state) return NULL; state->pp_buf = (char *)__get_free_page(GFP_KERNEL); if (!state->pp_buf) { free_partitions(state); return NULL; } state->pp_buf[0] = '\0'; state->disk = hd; snprintf(state->name, BDEVNAME_SIZE, "%s", hd->disk_name); snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); if (isdigit(state->name[strlen(state->name)-1])) sprintf(state->name, "p"); i = res = err = 0; while (!res && check_part[i]) { memset(state->parts, 0, state->limit * sizeof(state->parts[0])); res = check_part[i++](state); if (res < 0) { /* * We have hit an I/O error which we don't report now. * But record it, and let the others do their job. */ err = res; res = 0; } } if (res > 0) { printk(KERN_INFO "%s", state->pp_buf); free_page((unsigned long)state->pp_buf); return state; } if (state->access_beyond_eod) err = -ENOSPC; /* * The partition is unrecognized. So report I/O errors if there were any */ if (err) res = err; if (res) { strlcat(state->pp_buf, " unable to read partition table\n", PAGE_SIZE); printk(KERN_INFO "%s", state->pp_buf); } free_page((unsigned long)state->pp_buf); free_partitions(state); return ERR_PTR(res); } static ssize_t part_partition_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", bdev_partno(dev_to_bdev(dev))); } static ssize_t part_start_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%llu\n", dev_to_bdev(dev)->bd_start_sect); } static ssize_t part_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", bdev_read_only(dev_to_bdev(dev))); } static ssize_t part_alignment_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%u\n", bdev_alignment_offset(dev_to_bdev(dev))); } static ssize_t part_discard_alignment_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%u\n", bdev_discard_alignment(dev_to_bdev(dev))); } static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); static DEVICE_ATTR(start, 0444, part_start_show, NULL); static DEVICE_ATTR(size, 0444, part_size_show, NULL); static DEVICE_ATTR(ro, 0444, part_ro_show, NULL); static DEVICE_ATTR(alignment_offset, 0444, part_alignment_offset_show, NULL); static DEVICE_ATTR(discard_alignment, 0444, part_discard_alignment_show, NULL); static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store); #endif static struct attribute *part_attrs[] = { &dev_attr_partition.attr, &dev_attr_start.attr, &dev_attr_size.attr, &dev_attr_ro.attr, &dev_attr_alignment_offset.attr, &dev_attr_discard_alignment.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif NULL }; static const struct attribute_group part_attr_group = { .attrs = part_attrs, }; static const struct attribute_group *part_attr_groups[] = { &part_attr_group, #ifdef CONFIG_BLK_DEV_IO_TRACE &blk_trace_attr_group, #endif NULL }; static void part_release(struct device *dev) { put_disk(dev_to_bdev(dev)->bd_disk); bdev_drop(dev_to_bdev(dev)); } static int part_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct block_device *part = dev_to_bdev(dev); add_uevent_var(env, "PARTN=%u", bdev_partno(part)); if (part->bd_meta_info && part->bd_meta_info->volname[0]) add_uevent_var(env, "PARTNAME=%s", part->bd_meta_info->volname); if (part->bd_meta_info && part->bd_meta_info->uuid[0]) add_uevent_var(env, "PARTUUID=%s", part->bd_meta_info->uuid); return 0; } const struct device_type part_type = { .name = "partition", .groups = part_attr_groups, .release = part_release, .uevent = part_uevent, }; void drop_partition(struct block_device *part) { lockdep_assert_held(&part->bd_disk->open_mutex); xa_erase(&part->bd_disk->part_tbl, bdev_partno(part)); kobject_put(part->bd_holder_dir); device_del(&part->bd_device); put_device(&part->bd_device); } static ssize_t whole_disk_show(struct device *dev, struct device_attribute *attr, char *buf) { return 0; } static const DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL); /* * Must be called either with open_mutex held, before a disk can be opened or * after all disk users are gone. */ static struct block_device *add_partition(struct gendisk *disk, int partno, sector_t start, sector_t len, int flags, struct partition_meta_info *info) { dev_t devt = MKDEV(0, 0); struct device *ddev = disk_to_dev(disk); struct device *pdev; struct block_device *bdev; const char *dname; int err; lockdep_assert_held(&disk->open_mutex); if (partno >= DISK_MAX_PARTS) return ERR_PTR(-EINVAL); /* * Partitions are not supported on zoned block devices that are used as * such. */ if (bdev_is_zoned(disk->part0)) { pr_warn("%s: partitions not supported on host managed zoned block device\n", disk->disk_name); return ERR_PTR(-ENXIO); } if (xa_load(&disk->part_tbl, partno)) return ERR_PTR(-EBUSY); /* ensure we always have a reference to the whole disk */ get_device(disk_to_dev(disk)); err = -ENOMEM; bdev = bdev_alloc(disk, partno); if (!bdev) goto out_put_disk; bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); pdev = &bdev->bd_device; dname = dev_name(ddev); if (isdigit(dname[strlen(dname) - 1])) dev_set_name(pdev, "%sp%d", dname, partno); else dev_set_name(pdev, "%s%d", dname, partno); device_initialize(pdev); pdev->class = &block_class; pdev->type = &part_type; pdev->parent = ddev; /* in consecutive minor range? */ if (bdev_partno(bdev) < disk->minors) { devt = MKDEV(disk->major, disk->first_minor + bdev_partno(bdev)); } else { err = blk_alloc_ext_minor(); if (err < 0) goto out_put; devt = MKDEV(BLOCK_EXT_MAJOR, err); } pdev->devt = devt; if (info) { err = -ENOMEM; bdev->bd_meta_info = kmemdup(info, sizeof(*info), GFP_KERNEL); if (!bdev->bd_meta_info) goto out_put; } /* delay uevent until 'holders' subdir is created */ dev_set_uevent_suppress(pdev, 1); err = device_add(pdev); if (err) goto out_put; err = -ENOMEM; bdev->bd_holder_dir = kobject_create_and_add("holders", &pdev->kobj); if (!bdev->bd_holder_dir) goto out_del; dev_set_uevent_suppress(pdev, 0); if (flags & ADDPART_FLAG_WHOLEDISK) { err = device_create_file(pdev, &dev_attr_whole_disk); if (err) goto out_del; } if (flags & ADDPART_FLAG_READONLY) bdev_set_flag(bdev, BD_READ_ONLY); /* everything is up and running, commence */ err = xa_insert(&disk->part_tbl, partno, bdev, GFP_KERNEL); if (err) goto out_del; bdev_add(bdev, devt); /* suppress uevent if the disk suppresses it */ if (!dev_get_uevent_suppress(ddev)) kobject_uevent(&pdev->kobj, KOBJ_ADD); return bdev; out_del: kobject_put(bdev->bd_holder_dir); device_del(pdev); out_put: put_device(pdev); return ERR_PTR(err); out_put_disk: put_disk(disk); return ERR_PTR(err); } static bool partition_overlaps(struct gendisk *disk, sector_t start, sector_t length, int skip_partno) { struct block_device *part; bool overlap = false; unsigned long idx; rcu_read_lock(); xa_for_each_start(&disk->part_tbl, idx, part, 1) { if (bdev_partno(part) != skip_partno && start < part->bd_start_sect + bdev_nr_sectors(part) && start + length > part->bd_start_sect) { overlap = true; break; } } rcu_read_unlock(); return overlap; } int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, sector_t length) { struct block_device *part; int ret; mutex_lock(&disk->open_mutex); if (!disk_live(disk)) { ret = -ENXIO; goto out; } if (disk->flags & GENHD_FL_NO_PART) { ret = -EINVAL; goto out; } if (partition_overlaps(disk, start, length, -1)) { ret = -EBUSY; goto out; } part = add_partition(disk, partno, start, length, ADDPART_FLAG_NONE, NULL); ret = PTR_ERR_OR_ZERO(part); out: mutex_unlock(&disk->open_mutex); return ret; } int bdev_del_partition(struct gendisk *disk, int partno) { struct block_device *part = NULL; int ret = -ENXIO; mutex_lock(&disk->open_mutex); part = xa_load(&disk->part_tbl, partno); if (!part) goto out_unlock; ret = -EBUSY; if (atomic_read(&part->bd_openers)) goto out_unlock; /* * We verified that @part->bd_openers is zero above and so * @part->bd_holder{_ops} can't be set. And since we hold * @disk->open_mutex the device can't be claimed by anyone. * * So no need to call @part->bd_holder_ops->mark_dead() here. * Just delete the partition and invalidate it. */ bdev_unhash(part); invalidate_bdev(part); drop_partition(part); ret = 0; out_unlock: mutex_unlock(&disk->open_mutex); return ret; } int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start, sector_t length) { struct block_device *part = NULL; int ret = -ENXIO; mutex_lock(&disk->open_mutex); part = xa_load(&disk->part_tbl, partno); if (!part) goto out_unlock; ret = -EINVAL; if (start != part->bd_start_sect) goto out_unlock; ret = -EBUSY; if (partition_overlaps(disk, start, length, partno)) goto out_unlock; bdev_set_nr_sectors(part, length); ret = 0; out_unlock: mutex_unlock(&disk->open_mutex); return ret; } static bool disk_unlock_native_capacity(struct gendisk *disk) { if (!disk->fops->unlock_native_capacity || test_and_set_bit(GD_NATIVE_CAPACITY, &disk->state)) { printk(KERN_CONT "truncated\n"); return false; } printk(KERN_CONT "enabling native capacity\n"); disk->fops->unlock_native_capacity(disk); return true; } static bool blk_add_partition(struct gendisk *disk, struct parsed_partitions *state, int p) { sector_t size = state->parts[p].size; sector_t from = state->parts[p].from; struct block_device *part; if (!size) return true; if (from >= get_capacity(disk)) { printk(KERN_WARNING "%s: p%d start %llu is beyond EOD, ", disk->disk_name, p, (unsigned long long) from); if (disk_unlock_native_capacity(disk)) return false; return true; } if (from + size > get_capacity(disk)) { printk(KERN_WARNING "%s: p%d size %llu extends beyond EOD, ", disk->disk_name, p, (unsigned long long) size); if (disk_unlock_native_capacity(disk)) return false; /* * We can not ignore partitions of broken tables created by for * example camera firmware, but we limit them to the end of the * disk to avoid creating invalid block devices. */ size = get_capacity(disk) - from; } part = add_partition(disk, p, from, size, state->parts[p].flags, &state->parts[p].info); if (IS_ERR(part)) { if (PTR_ERR(part) != -ENXIO) { printk(KERN_ERR " %s: p%d could not be added: %pe\n", disk->disk_name, p, part); } return true; } if (IS_BUILTIN(CONFIG_BLK_DEV_MD) && (state->parts[p].flags & ADDPART_FLAG_RAID)) md_autodetect_dev(part->bd_dev); return true; } static int blk_add_partitions(struct gendisk *disk) { struct parsed_partitions *state; int ret = -EAGAIN, p; if (!disk_has_partscan(disk)) return 0; state = check_partition(disk); if (!state) return 0; if (IS_ERR(state)) { /* * I/O error reading the partition table. If we tried to read * beyond EOD, retry after unlocking the native capacity. */ if (PTR_ERR(state) == -ENOSPC) { printk(KERN_WARNING "%s: partition table beyond EOD, ", disk->disk_name); if (disk_unlock_native_capacity(disk)) return -EAGAIN; } return -EIO; } /* * Partitions are not supported on host managed zoned block devices. */ if (bdev_is_zoned(disk->part0)) { pr_warn("%s: ignoring partition table on host managed zoned block device\n", disk->disk_name); ret = 0; goto out_free_state; } /* * If we read beyond EOD, try unlocking native capacity even if the * partition table was successfully read as we could be missing some * partitions. */ if (state->access_beyond_eod) { printk(KERN_WARNING "%s: partition table partially beyond EOD, ", disk->disk_name); if (disk_unlock_native_capacity(disk)) goto out_free_state; } /* tell userspace that the media / partition table may have changed */ kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); for (p = 1; p < state->limit; p++) if (!blk_add_partition(disk, state, p)) goto out_free_state; ret = 0; out_free_state: free_partitions(state); return ret; } int bdev_disk_changed(struct gendisk *disk, bool invalidate) { struct block_device *part; unsigned long idx; int ret = 0; lockdep_assert_held(&disk->open_mutex); if (!disk_live(disk)) return -ENXIO; rescan: if (disk->open_partitions) return -EBUSY; sync_blockdev(disk->part0); invalidate_bdev(disk->part0); xa_for_each_start(&disk->part_tbl, idx, part, 1) { /* * Remove the block device from the inode hash, so that * it cannot be looked up any more even when openers * still hold references. */ bdev_unhash(part); /* * If @disk->open_partitions isn't elevated but there's * still an active holder of that block device things * are broken. */ WARN_ON_ONCE(atomic_read(&part->bd_openers)); invalidate_bdev(part); drop_partition(part); } clear_bit(GD_NEED_PART_SCAN, &disk->state); /* * Historically we only set the capacity to zero for devices that * support partitions (independ of actually having partitions created). * Doing that is rather inconsistent, but changing it broke legacy * udisks polling for legacy ide-cdrom devices. Use the crude check * below to get the sane behavior for most device while not breaking * userspace for this particular setup. */ if (invalidate) { if (!(disk->flags & GENHD_FL_NO_PART) || !(disk->flags & GENHD_FL_REMOVABLE)) set_capacity(disk, 0); } if (get_capacity(disk)) { ret = blk_add_partitions(disk); if (ret == -EAGAIN) goto rescan; } else if (invalidate) { /* * Tell userspace that the media / partition table may have * changed. */ kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); } return ret; } /* * Only exported for loop and dasd for historic reasons. Don't use in new * code! */ EXPORT_SYMBOL_GPL(bdev_disk_changed); void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p) { struct address_space *mapping = state->disk->part0->bd_mapping; struct folio *folio; if (n >= get_capacity(state->disk)) { state->access_beyond_eod = true; goto out; } folio = read_mapping_folio(mapping, n >> PAGE_SECTORS_SHIFT, NULL); if (IS_ERR(folio)) goto out; p->v = folio; return folio_address(folio) + offset_in_folio(folio, n * SECTOR_SIZE); out: p->v = NULL; return NULL; } |
| 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/hfsplus/super.c * * Copyright (C) 2001 * Brad Boyer (flar@allandria.com) * (C) 2003 Ardis Technologies <roman@ardistech.com> * */ #include <linux/module.h> #include <linux/init.h> #include <linux/pagemap.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/fs.h> #include <linux/fs_context.h> #include <linux/slab.h> #include <linux/vfs.h> #include <linux/nls.h> static struct inode *hfsplus_alloc_inode(struct super_block *sb); static void hfsplus_free_inode(struct inode *inode); #include "hfsplus_fs.h" #include "xattr.h" static int hfsplus_system_read_inode(struct inode *inode) { struct hfsplus_vh *vhdr = HFSPLUS_SB(inode->i_sb)->s_vhdr; switch (inode->i_ino) { case HFSPLUS_EXT_CNID: hfsplus_inode_read_fork(inode, &vhdr->ext_file); inode->i_mapping->a_ops = &hfsplus_btree_aops; break; case HFSPLUS_CAT_CNID: hfsplus_inode_read_fork(inode, &vhdr->cat_file); inode->i_mapping->a_ops = &hfsplus_btree_aops; break; case HFSPLUS_ALLOC_CNID: hfsplus_inode_read_fork(inode, &vhdr->alloc_file); inode->i_mapping->a_ops = &hfsplus_aops; break; case HFSPLUS_START_CNID: hfsplus_inode_read_fork(inode, &vhdr->start_file); break; case HFSPLUS_ATTR_CNID: hfsplus_inode_read_fork(inode, &vhdr->attr_file); inode->i_mapping->a_ops = &hfsplus_btree_aops; break; default: return -EIO; } return 0; } struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) { struct hfs_find_data fd; struct inode *inode; int err; inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list); spin_lock_init(&HFSPLUS_I(inode)->open_dir_lock); mutex_init(&HFSPLUS_I(inode)->extents_lock); HFSPLUS_I(inode)->flags = 0; HFSPLUS_I(inode)->extent_state = 0; HFSPLUS_I(inode)->rsrc_inode = NULL; atomic_set(&HFSPLUS_I(inode)->opencnt, 0); if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID || inode->i_ino == HFSPLUS_ROOT_CNID) { err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); if (!err) { err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); if (!err) err = hfsplus_cat_read_inode(inode, &fd); hfs_find_exit(&fd); } } else { err = hfsplus_system_read_inode(inode); } if (err) { iget_failed(inode); return ERR_PTR(err); } unlock_new_inode(inode); return inode; } static int hfsplus_system_write_inode(struct inode *inode) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); struct hfsplus_vh *vhdr = sbi->s_vhdr; struct hfsplus_fork_raw *fork; struct hfs_btree *tree = NULL; switch (inode->i_ino) { case HFSPLUS_EXT_CNID: fork = &vhdr->ext_file; tree = sbi->ext_tree; break; case HFSPLUS_CAT_CNID: fork = &vhdr->cat_file; tree = sbi->cat_tree; break; case HFSPLUS_ALLOC_CNID: fork = &vhdr->alloc_file; break; case HFSPLUS_START_CNID: fork = &vhdr->start_file; break; case HFSPLUS_ATTR_CNID: fork = &vhdr->attr_file; tree = sbi->attr_tree; break; default: return -EIO; } if (fork->total_size != cpu_to_be64(inode->i_size)) { set_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags); hfsplus_mark_mdb_dirty(inode->i_sb); } hfsplus_inode_write_fork(inode, fork); if (tree) { int err = hfs_btree_write(tree); if (err) { pr_err("b-tree write err: %d, ino %lu\n", err, inode->i_ino); return err; } } return 0; } static int hfsplus_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; hfs_dbg(INODE, "hfsplus_write_inode: %lu\n", inode->i_ino); err = hfsplus_ext_write_extent(inode); if (err) return err; if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID || inode->i_ino == HFSPLUS_ROOT_CNID) return hfsplus_cat_write_inode(inode); else return hfsplus_system_write_inode(inode); } static void hfsplus_evict_inode(struct inode *inode) { hfs_dbg(INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino); truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (HFSPLUS_IS_RSRC(inode)) { HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL; iput(HFSPLUS_I(inode)->rsrc_inode); } } static int hfsplus_sync_fs(struct super_block *sb, int wait) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct hfsplus_vh *vhdr = sbi->s_vhdr; int write_backup = 0; int error, error2; if (!wait) return 0; hfs_dbg(SUPER, "hfsplus_sync_fs\n"); /* * Explicitly write out the special metadata inodes. * * While these special inodes are marked as hashed and written * out peridocically by the flusher threads we redirty them * during writeout of normal inodes, and thus the life lock * prevents us from getting the latest state to disk. */ error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping); error2 = filemap_write_and_wait(sbi->ext_tree->inode->i_mapping); if (!error) error = error2; if (sbi->attr_tree) { error2 = filemap_write_and_wait(sbi->attr_tree->inode->i_mapping); if (!error) error = error2; } error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping); if (!error) error = error2; mutex_lock(&sbi->vh_mutex); mutex_lock(&sbi->alloc_mutex); vhdr->free_blocks = cpu_to_be32(sbi->free_blocks); vhdr->next_cnid = cpu_to_be32(sbi->next_cnid); vhdr->folder_count = cpu_to_be32(sbi->folder_count); vhdr->file_count = cpu_to_be32(sbi->file_count); if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) { memcpy(sbi->s_backup_vhdr, sbi->s_vhdr, sizeof(*sbi->s_vhdr)); write_backup = 1; } error2 = hfsplus_submit_bio(sb, sbi->part_start + HFSPLUS_VOLHEAD_SECTOR, sbi->s_vhdr_buf, NULL, REQ_OP_WRITE); if (!error) error = error2; if (!write_backup) goto out; error2 = hfsplus_submit_bio(sb, sbi->part_start + sbi->sect_count - 2, sbi->s_backup_vhdr_buf, NULL, REQ_OP_WRITE); if (!error) error2 = error; out: mutex_unlock(&sbi->alloc_mutex); mutex_unlock(&sbi->vh_mutex); if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) blkdev_issue_flush(sb->s_bdev); return error; } static void delayed_sync_fs(struct work_struct *work) { int err; struct hfsplus_sb_info *sbi; sbi = container_of(work, struct hfsplus_sb_info, sync_work.work); spin_lock(&sbi->work_lock); sbi->work_queued = 0; spin_unlock(&sbi->work_lock); err = hfsplus_sync_fs(sbi->alloc_file->i_sb, 1); if (err) pr_err("delayed sync fs err %d\n", err); } void hfsplus_mark_mdb_dirty(struct super_block *sb) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); unsigned long delay; if (sb_rdonly(sb)) return; spin_lock(&sbi->work_lock); if (!sbi->work_queued) { delay = msecs_to_jiffies(dirty_writeback_interval * 10); queue_delayed_work(system_long_wq, &sbi->sync_work, delay); sbi->work_queued = 1; } spin_unlock(&sbi->work_lock); } static void delayed_free(struct rcu_head *p) { struct hfsplus_sb_info *sbi = container_of(p, struct hfsplus_sb_info, rcu); unload_nls(sbi->nls); kfree(sbi); } static void hfsplus_put_super(struct super_block *sb) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); hfs_dbg(SUPER, "hfsplus_put_super\n"); cancel_delayed_work_sync(&sbi->sync_work); if (!sb_rdonly(sb) && sbi->s_vhdr) { struct hfsplus_vh *vhdr = sbi->s_vhdr; vhdr->modify_date = hfsp_now2mt(); vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT); vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT); hfsplus_sync_fs(sb, 1); } iput(sbi->alloc_file); iput(sbi->hidden_dir); hfs_btree_close(sbi->attr_tree); hfs_btree_close(sbi->cat_tree); hfs_btree_close(sbi->ext_tree); kfree(sbi->s_vhdr_buf); kfree(sbi->s_backup_vhdr_buf); call_rcu(&sbi->rcu, delayed_free); } static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = HFSPLUS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = sbi->total_blocks << sbi->fs_shift; buf->f_bfree = sbi->free_blocks << sbi->fs_shift; buf->f_bavail = buf->f_bfree; buf->f_files = 0xFFFFFFFF; buf->f_ffree = 0xFFFFFFFF - sbi->next_cnid; buf->f_fsid = u64_to_fsid(id); buf->f_namelen = HFSPLUS_MAX_STRLEN; return 0; } static int hfsplus_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; sync_filesystem(sb); if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb)) return 0; if (!(fc->sb_flags & SB_RDONLY)) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct hfsplus_vh *vhdr = sbi->s_vhdr; if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { pr_warn("filesystem was not cleanly unmounted, running fsck.hfsplus is recommended. leaving read-only.\n"); sb->s_flags |= SB_RDONLY; fc->sb_flags |= SB_RDONLY; } else if (test_bit(HFSPLUS_SB_FORCE, &sbi->flags)) { /* nothing */ } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { pr_warn("filesystem is marked locked, leaving read-only.\n"); sb->s_flags |= SB_RDONLY; fc->sb_flags |= SB_RDONLY; } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) { pr_warn("filesystem is marked journaled, leaving read-only.\n"); sb->s_flags |= SB_RDONLY; fc->sb_flags |= SB_RDONLY; } } return 0; } static const struct super_operations hfsplus_sops = { .alloc_inode = hfsplus_alloc_inode, .free_inode = hfsplus_free_inode, .write_inode = hfsplus_write_inode, .evict_inode = hfsplus_evict_inode, .put_super = hfsplus_put_super, .sync_fs = hfsplus_sync_fs, .statfs = hfsplus_statfs, .show_options = hfsplus_show_options, }; static int hfsplus_fill_super(struct super_block *sb, struct fs_context *fc) { struct hfsplus_vh *vhdr; struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); hfsplus_cat_entry entry; struct hfs_find_data fd; struct inode *root, *inode; struct qstr str; struct nls_table *nls; u64 last_fs_block, last_fs_page; int silent = fc->sb_flags & SB_SILENT; int err; mutex_init(&sbi->alloc_mutex); mutex_init(&sbi->vh_mutex); spin_lock_init(&sbi->work_lock); INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs); err = -EINVAL; if (!sbi->nls) { /* try utf8 first, as this is the old default behaviour */ sbi->nls = load_nls("utf8"); if (!sbi->nls) sbi->nls = load_nls_default(); } /* temporarily use utf8 to correctly find the hidden dir below */ nls = sbi->nls; sbi->nls = load_nls("utf8"); if (!sbi->nls) { pr_err("unable to load nls for utf8\n"); goto out_unload_nls; } /* Grab the volume header */ if (hfsplus_read_wrapper(sb)) { if (!silent) pr_warn("unable to find HFS+ superblock\n"); goto out_unload_nls; } vhdr = sbi->s_vhdr; /* Copy parts of the volume header into the superblock */ sb->s_magic = HFSPLUS_VOLHEAD_SIG; if (be16_to_cpu(vhdr->version) < HFSPLUS_MIN_VERSION || be16_to_cpu(vhdr->version) > HFSPLUS_CURRENT_VERSION) { pr_err("wrong filesystem version\n"); goto out_free_vhdr; } sbi->total_blocks = be32_to_cpu(vhdr->total_blocks); sbi->free_blocks = be32_to_cpu(vhdr->free_blocks); sbi->next_cnid = be32_to_cpu(vhdr->next_cnid); sbi->file_count = be32_to_cpu(vhdr->file_count); sbi->folder_count = be32_to_cpu(vhdr->folder_count); sbi->data_clump_blocks = be32_to_cpu(vhdr->data_clump_sz) >> sbi->alloc_blksz_shift; if (!sbi->data_clump_blocks) sbi->data_clump_blocks = 1; sbi->rsrc_clump_blocks = be32_to_cpu(vhdr->rsrc_clump_sz) >> sbi->alloc_blksz_shift; if (!sbi->rsrc_clump_blocks) sbi->rsrc_clump_blocks = 1; err = -EFBIG; last_fs_block = sbi->total_blocks - 1; last_fs_page = (last_fs_block << sbi->alloc_blksz_shift) >> PAGE_SHIFT; if ((last_fs_block > (sector_t)(~0ULL) >> (sbi->alloc_blksz_shift - 9)) || (last_fs_page > (pgoff_t)(~0ULL))) { pr_err("filesystem size too large\n"); goto out_free_vhdr; } /* Set up operations so we can load metadata */ sb->s_op = &hfsplus_sops; sb->s_maxbytes = MAX_LFS_FILESIZE; if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { pr_warn("Filesystem was not cleanly unmounted, running fsck.hfsplus is recommended. mounting read-only.\n"); sb->s_flags |= SB_RDONLY; } else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) { /* nothing */ } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { pr_warn("Filesystem is marked locked, mounting read-only.\n"); sb->s_flags |= SB_RDONLY; } else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) && !sb_rdonly(sb)) { pr_warn("write access to a journaled filesystem is not supported, use the force option at your own risk, mounting read-only.\n"); sb->s_flags |= SB_RDONLY; } err = -EINVAL; /* Load metadata objects (B*Trees) */ sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); if (!sbi->ext_tree) { pr_err("failed to load extents file\n"); goto out_free_vhdr; } sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID); if (!sbi->cat_tree) { pr_err("failed to load catalog file\n"); goto out_close_ext_tree; } atomic_set(&sbi->attr_tree_state, HFSPLUS_EMPTY_ATTR_TREE); if (vhdr->attr_file.total_blocks != 0) { sbi->attr_tree = hfs_btree_open(sb, HFSPLUS_ATTR_CNID); if (!sbi->attr_tree) { pr_err("failed to load attributes file\n"); goto out_close_cat_tree; } atomic_set(&sbi->attr_tree_state, HFSPLUS_VALID_ATTR_TREE); } sb->s_xattr = hfsplus_xattr_handlers; inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID); if (IS_ERR(inode)) { pr_err("failed to load allocation file\n"); err = PTR_ERR(inode); goto out_close_attr_tree; } sbi->alloc_file = inode; /* Load the root directory */ root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID); if (IS_ERR(root)) { pr_err("failed to load root directory\n"); err = PTR_ERR(root); goto out_put_alloc_file; } set_default_d_op(sb, &hfsplus_dentry_operations); sb->s_root = d_make_root(root); if (!sb->s_root) { err = -ENOMEM; goto out_put_alloc_file; } str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; str.name = HFSP_HIDDENDIR_NAME; err = hfs_find_init(sbi->cat_tree, &fd); if (err) goto out_put_root; err = hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str); if (unlikely(err < 0)) goto out_put_root; if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { hfs_find_exit(&fd); if (entry.type != cpu_to_be16(HFSPLUS_FOLDER)) { err = -EINVAL; goto out_put_root; } inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id)); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_put_root; } sbi->hidden_dir = inode; } else hfs_find_exit(&fd); if (!sb_rdonly(sb)) { /* * H+LX == hfsplusutils, H+Lx == this driver, H+lx is unused * all three are registered with Apple for our use */ vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION); vhdr->modify_date = hfsp_now2mt(); be32_add_cpu(&vhdr->write_count, 1); vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT); vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT); hfsplus_sync_fs(sb, 1); if (!sbi->hidden_dir) { mutex_lock(&sbi->vh_mutex); sbi->hidden_dir = hfsplus_new_inode(sb, root, S_IFDIR); if (!sbi->hidden_dir) { mutex_unlock(&sbi->vh_mutex); err = -ENOMEM; goto out_put_root; } err = hfsplus_create_cat(sbi->hidden_dir->i_ino, root, &str, sbi->hidden_dir); if (err) { mutex_unlock(&sbi->vh_mutex); goto out_put_hidden_dir; } err = hfsplus_init_security(sbi->hidden_dir, root, &str); if (err == -EOPNOTSUPP) err = 0; /* Operation is not supported. */ else if (err) { /* * Try to delete anyway without * error analysis. */ hfsplus_delete_cat(sbi->hidden_dir->i_ino, root, &str); mutex_unlock(&sbi->vh_mutex); goto out_put_hidden_dir; } mutex_unlock(&sbi->vh_mutex); hfsplus_mark_inode_dirty(sbi->hidden_dir, HFSPLUS_I_CAT_DIRTY); } } unload_nls(sbi->nls); sbi->nls = nls; return 0; out_put_hidden_dir: cancel_delayed_work_sync(&sbi->sync_work); iput(sbi->hidden_dir); out_put_root: dput(sb->s_root); sb->s_root = NULL; out_put_alloc_file: iput(sbi->alloc_file); out_close_attr_tree: hfs_btree_close(sbi->attr_tree); out_close_cat_tree: hfs_btree_close(sbi->cat_tree); out_close_ext_tree: hfs_btree_close(sbi->ext_tree); out_free_vhdr: kfree(sbi->s_vhdr_buf); kfree(sbi->s_backup_vhdr_buf); out_unload_nls: unload_nls(sbi->nls); unload_nls(nls); kfree(sbi); return err; } MODULE_AUTHOR("Brad Boyer"); MODULE_DESCRIPTION("Extended Macintosh Filesystem"); MODULE_LICENSE("GPL"); static struct kmem_cache *hfsplus_inode_cachep; static struct inode *hfsplus_alloc_inode(struct super_block *sb) { struct hfsplus_inode_info *i; i = alloc_inode_sb(sb, hfsplus_inode_cachep, GFP_KERNEL); return i ? &i->vfs_inode : NULL; } static void hfsplus_free_inode(struct inode *inode) { kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode)); } #define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info) static int hfsplus_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, hfsplus_fill_super); } static void hfsplus_free_fc(struct fs_context *fc) { kfree(fc->s_fs_info); } static const struct fs_context_operations hfsplus_context_ops = { .parse_param = hfsplus_parse_param, .get_tree = hfsplus_get_tree, .reconfigure = hfsplus_reconfigure, .free = hfsplus_free_fc, }; static int hfsplus_init_fs_context(struct fs_context *fc) { struct hfsplus_sb_info *sbi; sbi = kzalloc(sizeof(struct hfsplus_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) hfsplus_fill_defaults(sbi); fc->s_fs_info = sbi; fc->ops = &hfsplus_context_ops; return 0; } static struct file_system_type hfsplus_fs_type = { .owner = THIS_MODULE, .name = "hfsplus", .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, .init_fs_context = hfsplus_init_fs_context, }; MODULE_ALIAS_FS("hfsplus"); static void hfsplus_init_once(void *p) { struct hfsplus_inode_info *i = p; inode_init_once(&i->vfs_inode); } static int __init init_hfsplus_fs(void) { int err; hfsplus_inode_cachep = kmem_cache_create("hfsplus_icache", HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, hfsplus_init_once); if (!hfsplus_inode_cachep) return -ENOMEM; err = hfsplus_create_attr_tree_cache(); if (err) goto destroy_inode_cache; err = register_filesystem(&hfsplus_fs_type); if (err) goto destroy_attr_tree_cache; return 0; destroy_attr_tree_cache: hfsplus_destroy_attr_tree_cache(); destroy_inode_cache: kmem_cache_destroy(hfsplus_inode_cachep); return err; } static void __exit exit_hfsplus_fs(void) { unregister_filesystem(&hfsplus_fs_type); /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); hfsplus_destroy_attr_tree_cache(); kmem_cache_destroy(hfsplus_inode_cachep); } module_init(init_hfsplus_fs) module_exit(exit_hfsplus_fs) |
| 160 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Kernel Electric-Fence (KFENCE). Public interface for allocator and fault * handler integration. For more info see Documentation/dev-tools/kfence.rst. * * Copyright (C) 2020, Google LLC. */ #ifndef _LINUX_KFENCE_H #define _LINUX_KFENCE_H #include <linux/mm.h> #include <linux/types.h> #ifdef CONFIG_KFENCE #include <linux/atomic.h> #include <linux/static_key.h> extern unsigned long kfence_sample_interval; /* * We allocate an even number of pages, as it simplifies calculations to map * address to metadata indices; effectively, the very first page serves as an * extended guard page, but otherwise has no special purpose. */ #define KFENCE_POOL_SIZE ((CONFIG_KFENCE_NUM_OBJECTS + 1) * 2 * PAGE_SIZE) extern char *__kfence_pool; DECLARE_STATIC_KEY_FALSE(kfence_allocation_key); extern atomic_t kfence_allocation_gate; /** * is_kfence_address() - check if an address belongs to KFENCE pool * @addr: address to check * * Return: true or false depending on whether the address is within the KFENCE * object range. * * KFENCE objects live in a separate page range and are not to be intermixed * with regular heap objects (e.g. KFENCE objects must never be added to the * allocator freelists). Failing to do so may and will result in heap * corruptions, therefore is_kfence_address() must be used to check whether * an object requires specific handling. * * Note: This function may be used in fast-paths, and is performance critical. * Future changes should take this into account; for instance, we want to avoid * introducing another load and therefore need to keep KFENCE_POOL_SIZE a * constant (until immediate patching support is added to the kernel). */ static __always_inline bool is_kfence_address(const void *addr) { /* * The __kfence_pool != NULL check is required to deal with the case * where __kfence_pool == NULL && addr < KFENCE_POOL_SIZE. Keep it in * the slow-path after the range-check! */ return unlikely((unsigned long)((char *)addr - __kfence_pool) < KFENCE_POOL_SIZE && __kfence_pool); } /** * kfence_alloc_pool_and_metadata() - allocate the KFENCE pool and KFENCE * metadata via memblock */ void __init kfence_alloc_pool_and_metadata(void); /** * kfence_init() - perform KFENCE initialization at boot time * * Requires that kfence_alloc_pool_and_metadata() was called before. This sets * up the allocation gate timer, and requires that workqueues are available. */ void __init kfence_init(void); /** * kfence_shutdown_cache() - handle shutdown_cache() for KFENCE objects * @s: cache being shut down * * Before shutting down a cache, one must ensure there are no remaining objects * allocated from it. Because KFENCE objects are not referenced from the cache * directly, we need to check them here. * * Note that shutdown_cache() is internal to SL*B, and kmem_cache_destroy() does * not return if allocated objects still exist: it prints an error message and * simply aborts destruction of a cache, leaking memory. * * If the only such objects are KFENCE objects, we will not leak the entire * cache, but instead try to provide more useful debug info by making allocated * objects "zombie allocations". Objects may then still be used or freed (which * is handled gracefully), but usage will result in showing KFENCE error reports * which include stack traces to the user of the object, the original allocation * site, and caller to shutdown_cache(). */ void kfence_shutdown_cache(struct kmem_cache *s); /* * Allocate a KFENCE object. Allocators must not call this function directly, * use kfence_alloc() instead. */ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags); /** * kfence_alloc() - allocate a KFENCE object with a low probability * @s: struct kmem_cache with object requirements * @size: exact size of the object to allocate (can be less than @s->size * e.g. for kmalloc caches) * @flags: GFP flags * * Return: * * NULL - must proceed with allocating as usual, * * non-NULL - pointer to a KFENCE object. * * kfence_alloc() should be inserted into the heap allocation fast path, * allowing it to transparently return KFENCE-allocated objects with a low * probability using a static branch (the probability is controlled by the * kfence.sample_interval boot parameter). */ static __always_inline void *kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) { #if defined(CONFIG_KFENCE_STATIC_KEYS) || CONFIG_KFENCE_SAMPLE_INTERVAL == 0 if (!static_branch_unlikely(&kfence_allocation_key)) return NULL; #else if (!static_branch_likely(&kfence_allocation_key)) return NULL; #endif if (likely(atomic_read(&kfence_allocation_gate) > 0)) return NULL; return __kfence_alloc(s, size, flags); } /** * kfence_ksize() - get actual amount of memory allocated for a KFENCE object * @addr: pointer to a heap object * * Return: * * 0 - not a KFENCE object, must call __ksize() instead, * * non-0 - this many bytes can be accessed without causing a memory error. * * kfence_ksize() returns the number of bytes requested for a KFENCE object at * allocation time. This number may be less than the object size of the * corresponding struct kmem_cache. */ size_t kfence_ksize(const void *addr); /** * kfence_object_start() - find the beginning of a KFENCE object * @addr: address within a KFENCE-allocated object * * Return: address of the beginning of the object. * * SL[AU]B-allocated objects are laid out within a page one by one, so it is * easy to calculate the beginning of an object given a pointer inside it and * the object size. The same is not true for KFENCE, which places a single * object at either end of the page. This helper function is used to find the * beginning of a KFENCE-allocated object. */ void *kfence_object_start(const void *addr); /** * __kfence_free() - release a KFENCE heap object to KFENCE pool * @addr: object to be freed * * Requires: is_kfence_address(addr) * * Release a KFENCE object and mark it as freed. */ void __kfence_free(void *addr); /** * kfence_free() - try to release an arbitrary heap object to KFENCE pool * @addr: object to be freed * * Return: * * false - object doesn't belong to KFENCE pool and was ignored, * * true - object was released to KFENCE pool. * * Release a KFENCE object and mark it as freed. May be called on any object, * even non-KFENCE objects, to simplify integration of the hooks into the * allocator's free codepath. The allocator must check the return value to * determine if it was a KFENCE object or not. */ static __always_inline __must_check bool kfence_free(void *addr) { if (!is_kfence_address(addr)) return false; __kfence_free(addr); return true; } /** * kfence_handle_page_fault() - perform page fault handling for KFENCE pages * @addr: faulting address * @is_write: is access a write * @regs: current struct pt_regs (can be NULL, but shows full stack trace) * * Return: * * false - address outside KFENCE pool, * * true - page fault handled by KFENCE, no additional handling required. * * A page fault inside KFENCE pool indicates a memory error, such as an * out-of-bounds access, a use-after-free or an invalid memory access. In these * cases KFENCE prints an error message and marks the offending page as * present, so that the kernel can proceed. */ bool __must_check kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs); #ifdef CONFIG_PRINTK struct kmem_obj_info; /** * __kfence_obj_info() - fill kmem_obj_info struct * @kpp: kmem_obj_info to be filled * @object: the object * * Return: * * false - not a KFENCE object * * true - a KFENCE object, filled @kpp * * Copies information to @kpp for KFENCE objects. */ bool __kfence_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab); #endif #else /* CONFIG_KFENCE */ #define kfence_sample_interval (0) static inline bool is_kfence_address(const void *addr) { return false; } static inline void kfence_alloc_pool_and_metadata(void) { } static inline void kfence_init(void) { } static inline void kfence_shutdown_cache(struct kmem_cache *s) { } static inline void *kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) { return NULL; } static inline size_t kfence_ksize(const void *addr) { return 0; } static inline void *kfence_object_start(const void *addr) { return NULL; } static inline void __kfence_free(void *addr) { } static inline bool __must_check kfence_free(void *addr) { return false; } static inline bool __must_check kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs) { return false; } #ifdef CONFIG_PRINTK struct kmem_obj_info; static inline bool __kfence_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) { return false; } #endif #endif #endif /* _LINUX_KFENCE_H */ |
| 314 312 316 317 317 317 316 317 317 106 2 1 104 1 1 107 97 106 106 107 104 104 310 316 5 308 3 2 2 1 2 312 311 8 8 6 6 6 1 2 1 1 1 312 314 315 313 312 316 316 8 8 8 8 7 1 299 2 1 1 1 302 304 303 1 304 206 206 205 302 310 310 314 312 311 226 12 12 12 2 312 274 307 19 19 19 313 1 315 312 222 224 200 312 312 316 317 312 223 22 19 220 199 22 4 308 1 1 1 1 269 1 270 6 5 1 1 6 317 313 312 308 1 308 286 103 3 271 6 279 1 312 315 16 313 312 297 296 297 265 28 300 297 297 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 | // SPDX-License-Identifier: GPL-2.0-or-later /* * x86 instruction analysis * * Copyright (C) IBM Corporation, 2002, 2004, 2009 */ #include <linux/kernel.h> #ifdef __KERNEL__ #include <linux/string.h> #else #include <string.h> #endif #include <asm/inat.h> /*__ignore_sync_check__ */ #include <asm/insn.h> /* __ignore_sync_check__ */ #include <linux/unaligned.h> /* __ignore_sync_check__ */ #include <linux/errno.h> #include <linux/kconfig.h> #include <asm/emulate_prefix.h> /* __ignore_sync_check__ */ #define leXX_to_cpu(t, r) \ ({ \ __typeof__(t) v; \ switch (sizeof(t)) { \ case 4: v = le32_to_cpu(r); break; \ case 2: v = le16_to_cpu(r); break; \ case 1: v = r; break; \ default: \ BUILD_BUG(); break; \ } \ v; \ }) /* Verify next sizeof(t) bytes can be on the same instruction */ #define validate_next(t, insn, n) \ ((insn)->next_byte + sizeof(t) + n <= (insn)->end_kaddr) #define __get_next(t, insn) \ ({ t r = get_unaligned((t *)(insn)->next_byte); (insn)->next_byte += sizeof(t); leXX_to_cpu(t, r); }) #define __peek_nbyte_next(t, insn, n) \ ({ t r = get_unaligned((t *)(insn)->next_byte + n); leXX_to_cpu(t, r); }) #define get_next(t, insn) \ ({ if (unlikely(!validate_next(t, insn, 0))) goto err_out; __get_next(t, insn); }) #define peek_nbyte_next(t, insn, n) \ ({ if (unlikely(!validate_next(t, insn, n))) goto err_out; __peek_nbyte_next(t, insn, n); }) #define peek_next(t, insn) peek_nbyte_next(t, insn, 0) /** * insn_init() - initialize struct insn * @insn: &struct insn to be initialized * @kaddr: address (in kernel memory) of instruction (or copy thereof) * @buf_len: length of the insn buffer at @kaddr * @x86_64: !0 for 64-bit kernel or 64-bit app */ void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64) { /* * Instructions longer than MAX_INSN_SIZE (15 bytes) are invalid * even if the input buffer is long enough to hold them. */ if (buf_len > MAX_INSN_SIZE) buf_len = MAX_INSN_SIZE; memset(insn, 0, sizeof(*insn)); insn->kaddr = kaddr; insn->end_kaddr = kaddr + buf_len; insn->next_byte = kaddr; insn->x86_64 = x86_64; insn->opnd_bytes = 4; if (x86_64) insn->addr_bytes = 8; else insn->addr_bytes = 4; } static const insn_byte_t xen_prefix[] = { __XEN_EMULATE_PREFIX }; static const insn_byte_t kvm_prefix[] = { __KVM_EMULATE_PREFIX }; static int __insn_get_emulate_prefix(struct insn *insn, const insn_byte_t *prefix, size_t len) { size_t i; for (i = 0; i < len; i++) { if (peek_nbyte_next(insn_byte_t, insn, i) != prefix[i]) goto err_out; } insn->emulate_prefix_size = len; insn->next_byte += len; return 1; err_out: return 0; } static void insn_get_emulate_prefix(struct insn *insn) { if (__insn_get_emulate_prefix(insn, xen_prefix, sizeof(xen_prefix))) return; __insn_get_emulate_prefix(insn, kvm_prefix, sizeof(kvm_prefix)); } /** * insn_get_prefixes - scan x86 instruction prefix bytes * @insn: &struct insn containing instruction * * Populates the @insn->prefixes bitmap, and updates @insn->next_byte * to point to the (first) opcode. No effect if @insn->prefixes.got * is already set. * * * Returns: * 0: on success * < 0: on error */ int insn_get_prefixes(struct insn *insn) { struct insn_field *prefixes = &insn->prefixes; insn_attr_t attr; insn_byte_t b, lb; int i, nb; if (prefixes->got) return 0; insn_get_emulate_prefix(insn); nb = 0; lb = 0; b = peek_next(insn_byte_t, insn); attr = inat_get_opcode_attribute(b); while (inat_is_legacy_prefix(attr)) { /* Skip if same prefix */ for (i = 0; i < nb; i++) if (prefixes->bytes[i] == b) goto found; if (nb == 4) /* Invalid instruction */ break; prefixes->bytes[nb++] = b; if (inat_is_address_size_prefix(attr)) { /* address size switches 2/4 or 4/8 */ if (insn->x86_64) insn->addr_bytes ^= 12; else insn->addr_bytes ^= 6; } else if (inat_is_operand_size_prefix(attr)) { /* oprand size switches 2/4 */ insn->opnd_bytes ^= 6; } found: prefixes->nbytes++; insn->next_byte++; lb = b; b = peek_next(insn_byte_t, insn); attr = inat_get_opcode_attribute(b); } /* Set the last prefix */ if (lb && lb != insn->prefixes.bytes[3]) { if (unlikely(insn->prefixes.bytes[3])) { /* Swap the last prefix */ b = insn->prefixes.bytes[3]; for (i = 0; i < nb; i++) if (prefixes->bytes[i] == lb) insn_set_byte(prefixes, i, b); } insn_set_byte(&insn->prefixes, 3, lb); } /* Decode REX prefix */ if (insn->x86_64) { b = peek_next(insn_byte_t, insn); attr = inat_get_opcode_attribute(b); if (inat_is_rex_prefix(attr)) { insn_field_set(&insn->rex_prefix, b, 1); insn->next_byte++; if (X86_REX_W(b)) /* REX.W overrides opnd_size */ insn->opnd_bytes = 8; } else if (inat_is_rex2_prefix(attr)) { insn_set_byte(&insn->rex_prefix, 0, b); b = peek_nbyte_next(insn_byte_t, insn, 1); insn_set_byte(&insn->rex_prefix, 1, b); insn->rex_prefix.nbytes = 2; insn->next_byte += 2; if (X86_REX_W(b)) /* REX.W overrides opnd_size */ insn->opnd_bytes = 8; insn->rex_prefix.got = 1; goto vex_end; } } insn->rex_prefix.got = 1; /* Decode VEX prefix */ b = peek_next(insn_byte_t, insn); attr = inat_get_opcode_attribute(b); if (inat_is_vex_prefix(attr)) { insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1); if (!insn->x86_64) { /* * In 32-bits mode, if the [7:6] bits (mod bits of * ModRM) on the second byte are not 11b, it is * LDS or LES or BOUND. */ if (X86_MODRM_MOD(b2) != 3) goto vex_end; } insn_set_byte(&insn->vex_prefix, 0, b); insn_set_byte(&insn->vex_prefix, 1, b2); if (inat_is_evex_prefix(attr)) { b2 = peek_nbyte_next(insn_byte_t, insn, 2); insn_set_byte(&insn->vex_prefix, 2, b2); b2 = peek_nbyte_next(insn_byte_t, insn, 3); insn_set_byte(&insn->vex_prefix, 3, b2); insn->vex_prefix.nbytes = 4; insn->next_byte += 4; if (insn->x86_64 && X86_VEX_W(b2)) /* VEX.W overrides opnd_size */ insn->opnd_bytes = 8; } else if (inat_is_vex3_prefix(attr)) { b2 = peek_nbyte_next(insn_byte_t, insn, 2); insn_set_byte(&insn->vex_prefix, 2, b2); insn->vex_prefix.nbytes = 3; insn->next_byte += 3; if (insn->x86_64 && X86_VEX_W(b2)) /* VEX.W overrides opnd_size */ insn->opnd_bytes = 8; } else { /* * For VEX2, fake VEX3-like byte#2. * Makes it easier to decode vex.W, vex.vvvv, * vex.L and vex.pp. Masking with 0x7f sets vex.W == 0. */ insn_set_byte(&insn->vex_prefix, 2, b2 & 0x7f); insn->vex_prefix.nbytes = 2; insn->next_byte += 2; } } vex_end: insn->vex_prefix.got = 1; prefixes->got = 1; return 0; err_out: return -ENODATA; } /** * insn_get_opcode - collect opcode(s) * @insn: &struct insn containing instruction * * Populates @insn->opcode, updates @insn->next_byte to point past the * opcode byte(s), and set @insn->attr (except for groups). * If necessary, first collects any preceding (prefix) bytes. * Sets @insn->opcode.value = opcode1. No effect if @insn->opcode.got * is already 1. * * Returns: * 0: on success * < 0: on error */ int insn_get_opcode(struct insn *insn) { struct insn_field *opcode = &insn->opcode; int pfx_id, ret; insn_byte_t op; if (opcode->got) return 0; ret = insn_get_prefixes(insn); if (ret) return ret; /* Get first opcode */ op = get_next(insn_byte_t, insn); insn_set_byte(opcode, 0, op); opcode->nbytes = 1; /* Check if there is VEX prefix or not */ if (insn_is_avx(insn)) { insn_byte_t m, p; m = insn_vex_m_bits(insn); p = insn_vex_p_bits(insn); insn->attr = inat_get_avx_attribute(op, m, p); /* SCALABLE EVEX uses p bits to encode operand size */ if (inat_evex_scalable(insn->attr) && !insn_vex_w_bit(insn) && p == INAT_PFX_OPNDSZ) insn->opnd_bytes = 2; if ((inat_must_evex(insn->attr) && !insn_is_evex(insn)) || (!inat_accept_vex(insn->attr) && !inat_is_group(insn->attr))) { /* This instruction is bad */ insn->attr = 0; return -EINVAL; } /* VEX has only 1 byte for opcode */ goto end; } /* Check if there is REX2 prefix or not */ if (insn_is_rex2(insn)) { if (insn_rex2_m_bit(insn)) { /* map 1 is escape 0x0f */ insn_attr_t esc_attr = inat_get_opcode_attribute(0x0f); pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_escape_attribute(op, pfx_id, esc_attr); } else { insn->attr = inat_get_opcode_attribute(op); } goto end; } insn->attr = inat_get_opcode_attribute(op); if (insn->x86_64 && inat_is_invalid64(insn->attr)) { /* This instruction is invalid, like UD2. Stop decoding. */ insn->attr &= INAT_INV64; } while (inat_is_escape(insn->attr)) { /* Get escaped opcode */ op = get_next(insn_byte_t, insn); opcode->bytes[opcode->nbytes++] = op; pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_escape_attribute(op, pfx_id, insn->attr); } if (inat_must_vex(insn->attr)) { /* This instruction is bad */ insn->attr = 0; return -EINVAL; } end: opcode->got = 1; return 0; err_out: return -ENODATA; } /** * insn_get_modrm - collect ModRM byte, if any * @insn: &struct insn containing instruction * * Populates @insn->modrm and updates @insn->next_byte to point past the * ModRM byte, if any. If necessary, first collects the preceding bytes * (prefixes and opcode(s)). No effect if @insn->modrm.got is already 1. * * Returns: * 0: on success * < 0: on error */ int insn_get_modrm(struct insn *insn) { struct insn_field *modrm = &insn->modrm; insn_byte_t pfx_id, mod; int ret; if (modrm->got) return 0; ret = insn_get_opcode(insn); if (ret) return ret; if (inat_has_modrm(insn->attr)) { mod = get_next(insn_byte_t, insn); insn_field_set(modrm, mod, 1); if (inat_is_group(insn->attr)) { pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_group_attribute(mod, pfx_id, insn->attr); if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) { /* Bad insn */ insn->attr = 0; return -EINVAL; } } } if (insn->x86_64 && inat_is_force64(insn->attr)) insn->opnd_bytes = 8; modrm->got = 1; return 0; err_out: return -ENODATA; } /** * insn_rip_relative() - Does instruction use RIP-relative addressing mode? * @insn: &struct insn containing instruction * * If necessary, first collects the instruction up to and including the * ModRM byte. No effect if @insn->x86_64 is 0. */ int insn_rip_relative(struct insn *insn) { struct insn_field *modrm = &insn->modrm; int ret; if (!insn->x86_64) return 0; ret = insn_get_modrm(insn); if (ret) return 0; /* * For rip-relative instructions, the mod field (top 2 bits) * is zero and the r/m field (bottom 3 bits) is 0x5. */ return (modrm->nbytes && (modrm->bytes[0] & 0xc7) == 0x5); } /** * insn_get_sib() - Get the SIB byte of instruction * @insn: &struct insn containing instruction * * If necessary, first collects the instruction up to and including the * ModRM byte. * * Returns: * 0: if decoding succeeded * < 0: otherwise. */ int insn_get_sib(struct insn *insn) { insn_byte_t modrm; int ret; if (insn->sib.got) return 0; ret = insn_get_modrm(insn); if (ret) return ret; if (insn->modrm.nbytes) { modrm = insn->modrm.bytes[0]; if (insn->addr_bytes != 2 && X86_MODRM_MOD(modrm) != 3 && X86_MODRM_RM(modrm) == 4) { insn_field_set(&insn->sib, get_next(insn_byte_t, insn), 1); } } insn->sib.got = 1; return 0; err_out: return -ENODATA; } /** * insn_get_displacement() - Get the displacement of instruction * @insn: &struct insn containing instruction * * If necessary, first collects the instruction up to and including the * SIB byte. * Displacement value is sign-expanded. * * * Returns: * 0: if decoding succeeded * < 0: otherwise. */ int insn_get_displacement(struct insn *insn) { insn_byte_t mod, rm, base; int ret; if (insn->displacement.got) return 0; ret = insn_get_sib(insn); if (ret) return ret; if (insn->modrm.nbytes) { /* * Interpreting the modrm byte: * mod = 00 - no displacement fields (exceptions below) * mod = 01 - 1-byte displacement field * mod = 10 - displacement field is 4 bytes, or 2 bytes if * address size = 2 (0x67 prefix in 32-bit mode) * mod = 11 - no memory operand * * If address size = 2... * mod = 00, r/m = 110 - displacement field is 2 bytes * * If address size != 2... * mod != 11, r/m = 100 - SIB byte exists * mod = 00, SIB base = 101 - displacement field is 4 bytes * mod = 00, r/m = 101 - rip-relative addressing, displacement * field is 4 bytes */ mod = X86_MODRM_MOD(insn->modrm.value); rm = X86_MODRM_RM(insn->modrm.value); base = X86_SIB_BASE(insn->sib.value); if (mod == 3) goto out; if (mod == 1) { insn_field_set(&insn->displacement, get_next(signed char, insn), 1); } else if (insn->addr_bytes == 2) { if ((mod == 0 && rm == 6) || mod == 2) { insn_field_set(&insn->displacement, get_next(short, insn), 2); } } else { if ((mod == 0 && rm == 5) || mod == 2 || (mod == 0 && base == 5)) { insn_field_set(&insn->displacement, get_next(int, insn), 4); } } } out: insn->displacement.got = 1; return 0; err_out: return -ENODATA; } /* Decode moffset16/32/64. Return 0 if failed */ static int __get_moffset(struct insn *insn) { switch (insn->addr_bytes) { case 2: insn_field_set(&insn->moffset1, get_next(short, insn), 2); break; case 4: insn_field_set(&insn->moffset1, get_next(int, insn), 4); break; case 8: insn_field_set(&insn->moffset1, get_next(int, insn), 4); insn_field_set(&insn->moffset2, get_next(int, insn), 4); break; default: /* opnd_bytes must be modified manually */ goto err_out; } insn->moffset1.got = insn->moffset2.got = 1; return 1; err_out: return 0; } /* Decode imm v32(Iz). Return 0 if failed */ static int __get_immv32(struct insn *insn) { switch (insn->opnd_bytes) { case 2: insn_field_set(&insn->immediate, get_next(short, insn), 2); break; case 4: case 8: insn_field_set(&insn->immediate, get_next(int, insn), 4); break; default: /* opnd_bytes must be modified manually */ goto err_out; } return 1; err_out: return 0; } /* Decode imm v64(Iv/Ov), Return 0 if failed */ static int __get_immv(struct insn *insn) { switch (insn->opnd_bytes) { case 2: insn_field_set(&insn->immediate1, get_next(short, insn), 2); break; case 4: insn_field_set(&insn->immediate1, get_next(int, insn), 4); insn->immediate1.nbytes = 4; break; case 8: insn_field_set(&insn->immediate1, get_next(int, insn), 4); insn_field_set(&insn->immediate2, get_next(int, insn), 4); break; default: /* opnd_bytes must be modified manually */ goto err_out; } insn->immediate1.got = insn->immediate2.got = 1; return 1; err_out: return 0; } /* Decode ptr16:16/32(Ap) */ static int __get_immptr(struct insn *insn) { switch (insn->opnd_bytes) { case 2: insn_field_set(&insn->immediate1, get_next(short, insn), 2); break; case 4: insn_field_set(&insn->immediate1, get_next(int, insn), 4); break; case 8: /* ptr16:64 is not exist (no segment) */ return 0; default: /* opnd_bytes must be modified manually */ goto err_out; } insn_field_set(&insn->immediate2, get_next(unsigned short, insn), 2); insn->immediate1.got = insn->immediate2.got = 1; return 1; err_out: return 0; } /** * insn_get_immediate() - Get the immediate in an instruction * @insn: &struct insn containing instruction * * If necessary, first collects the instruction up to and including the * displacement bytes. * Basically, most of immediates are sign-expanded. Unsigned-value can be * computed by bit masking with ((1 << (nbytes * 8)) - 1) * * Returns: * 0: on success * < 0: on error */ int insn_get_immediate(struct insn *insn) { int ret; if (insn->immediate.got) return 0; ret = insn_get_displacement(insn); if (ret) return ret; if (inat_has_moffset(insn->attr)) { if (!__get_moffset(insn)) goto err_out; goto done; } if (!inat_has_immediate(insn->attr)) goto done; switch (inat_immediate_size(insn->attr)) { case INAT_IMM_BYTE: insn_field_set(&insn->immediate, get_next(signed char, insn), 1); break; case INAT_IMM_WORD: insn_field_set(&insn->immediate, get_next(short, insn), 2); break; case INAT_IMM_DWORD: insn_field_set(&insn->immediate, get_next(int, insn), 4); break; case INAT_IMM_QWORD: insn_field_set(&insn->immediate1, get_next(int, insn), 4); insn_field_set(&insn->immediate2, get_next(int, insn), 4); break; case INAT_IMM_PTR: if (!__get_immptr(insn)) goto err_out; break; case INAT_IMM_VWORD32: if (!__get_immv32(insn)) goto err_out; break; case INAT_IMM_VWORD: if (!__get_immv(insn)) goto err_out; break; default: /* Here, insn must have an immediate, but failed */ goto err_out; } if (inat_has_second_immediate(insn->attr)) { insn_field_set(&insn->immediate2, get_next(signed char, insn), 1); } done: insn->immediate.got = 1; return 0; err_out: return -ENODATA; } /** * insn_get_length() - Get the length of instruction * @insn: &struct insn containing instruction * * If necessary, first collects the instruction up to and including the * immediates bytes. * * Returns: * - 0 on success * - < 0 on error */ int insn_get_length(struct insn *insn) { int ret; if (insn->length) return 0; ret = insn_get_immediate(insn); if (ret) return ret; insn->length = (unsigned char)((unsigned long)insn->next_byte - (unsigned long)insn->kaddr); return 0; } /* Ensure this instruction is decoded completely */ static inline int insn_complete(struct insn *insn) { return insn->opcode.got && insn->modrm.got && insn->sib.got && insn->displacement.got && insn->immediate.got; } /** * insn_decode() - Decode an x86 instruction * @insn: &struct insn to be initialized * @kaddr: address (in kernel memory) of instruction (or copy thereof) * @buf_len: length of the insn buffer at @kaddr * @m: insn mode, see enum insn_mode * * Returns: * 0: if decoding succeeded * < 0: otherwise. */ int insn_decode(struct insn *insn, const void *kaddr, int buf_len, enum insn_mode m) { int ret; /* #define INSN_MODE_KERN -1 __ignore_sync_check__ mode is only valid in the kernel */ if (m == INSN_MODE_KERN) insn_init(insn, kaddr, buf_len, IS_ENABLED(CONFIG_X86_64)); else insn_init(insn, kaddr, buf_len, m == INSN_MODE_64); ret = insn_get_length(insn); if (ret) return ret; if (insn_complete(insn)) return 0; return -EINVAL; } |
| 3 3 3 3 2 1 4 3 1 2 3 1 3 4 1 3 2 1 3 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Crypto API wrappers for the ChaCha20, XChaCha20, and XChaCha12 stream ciphers * * Copyright (C) 2015 Martin Willi * Copyright (C) 2018 Google LLC */ #include <linux/unaligned.h> #include <crypto/algapi.h> #include <crypto/chacha.h> #include <crypto/internal/skcipher.h> #include <linux/module.h> struct chacha_ctx { u32 key[8]; int nrounds; }; static int chacha_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keysize, int nrounds) { struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); int i; if (keysize != CHACHA_KEY_SIZE) return -EINVAL; for (i = 0; i < ARRAY_SIZE(ctx->key); i++) ctx->key[i] = get_unaligned_le32(key + i * sizeof(u32)); ctx->nrounds = nrounds; return 0; } static int chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keysize) { return chacha_setkey(tfm, key, keysize, 20); } static int chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keysize) { return chacha_setkey(tfm, key, keysize, 12); } static int chacha_stream_xor(struct skcipher_request *req, const struct chacha_ctx *ctx, const u8 iv[CHACHA_IV_SIZE], bool arch) { struct skcipher_walk walk; struct chacha_state state; int err; err = skcipher_walk_virt(&walk, req, false); chacha_init(&state, ctx->key, iv); while (walk.nbytes > 0) { unsigned int nbytes = walk.nbytes; if (nbytes < walk.total) nbytes = round_down(nbytes, CHACHA_BLOCK_SIZE); if (arch) chacha_crypt(&state, walk.dst.virt.addr, walk.src.virt.addr, nbytes, ctx->nrounds); else chacha_crypt_generic(&state, walk.dst.virt.addr, walk.src.virt.addr, nbytes, ctx->nrounds); err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } return err; } static int crypto_chacha_crypt_generic(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); const struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); return chacha_stream_xor(req, ctx, req->iv, false); } static int crypto_chacha_crypt_arch(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); const struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); return chacha_stream_xor(req, ctx, req->iv, true); } static int crypto_xchacha_crypt(struct skcipher_request *req, bool arch) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); const struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); struct chacha_ctx subctx; struct chacha_state state; u8 real_iv[16]; /* Compute the subkey given the original key and first 128 nonce bits */ chacha_init(&state, ctx->key, req->iv); if (arch) hchacha_block(&state, subctx.key, ctx->nrounds); else hchacha_block_generic(&state, subctx.key, ctx->nrounds); subctx.nrounds = ctx->nrounds; /* Build the real IV */ memcpy(&real_iv[0], req->iv + 24, 8); /* stream position */ memcpy(&real_iv[8], req->iv + 16, 8); /* remaining 64 nonce bits */ /* Generate the stream and XOR it with the data */ return chacha_stream_xor(req, &subctx, real_iv, arch); } static int crypto_xchacha_crypt_generic(struct skcipher_request *req) { return crypto_xchacha_crypt(req, false); } static int crypto_xchacha_crypt_arch(struct skcipher_request *req) { return crypto_xchacha_crypt(req, true); } static struct skcipher_alg algs[] = { { .base.cra_name = "chacha20", .base.cra_driver_name = "chacha20-generic", .base.cra_priority = 100, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct chacha_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CHACHA_KEY_SIZE, .max_keysize = CHACHA_KEY_SIZE, .ivsize = CHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, .setkey = chacha20_setkey, .encrypt = crypto_chacha_crypt_generic, .decrypt = crypto_chacha_crypt_generic, }, { .base.cra_name = "xchacha20", .base.cra_driver_name = "xchacha20-generic", .base.cra_priority = 100, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct chacha_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CHACHA_KEY_SIZE, .max_keysize = CHACHA_KEY_SIZE, .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, .setkey = chacha20_setkey, .encrypt = crypto_xchacha_crypt_generic, .decrypt = crypto_xchacha_crypt_generic, }, { .base.cra_name = "xchacha12", .base.cra_driver_name = "xchacha12-generic", .base.cra_priority = 100, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct chacha_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CHACHA_KEY_SIZE, .max_keysize = CHACHA_KEY_SIZE, .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, .setkey = chacha12_setkey, .encrypt = crypto_xchacha_crypt_generic, .decrypt = crypto_xchacha_crypt_generic, }, { .base.cra_name = "chacha20", .base.cra_driver_name = "chacha20-" __stringify(ARCH), .base.cra_priority = 300, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct chacha_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CHACHA_KEY_SIZE, .max_keysize = CHACHA_KEY_SIZE, .ivsize = CHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, .setkey = chacha20_setkey, .encrypt = crypto_chacha_crypt_arch, .decrypt = crypto_chacha_crypt_arch, }, { .base.cra_name = "xchacha20", .base.cra_driver_name = "xchacha20-" __stringify(ARCH), .base.cra_priority = 300, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct chacha_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CHACHA_KEY_SIZE, .max_keysize = CHACHA_KEY_SIZE, .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, .setkey = chacha20_setkey, .encrypt = crypto_xchacha_crypt_arch, .decrypt = crypto_xchacha_crypt_arch, }, { .base.cra_name = "xchacha12", .base.cra_driver_name = "xchacha12-" __stringify(ARCH), .base.cra_priority = 300, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct chacha_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CHACHA_KEY_SIZE, .max_keysize = CHACHA_KEY_SIZE, .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, .setkey = chacha12_setkey, .encrypt = crypto_xchacha_crypt_arch, .decrypt = crypto_xchacha_crypt_arch, } }; static unsigned int num_algs; static int __init crypto_chacha_mod_init(void) { /* register the arch flavours only if they differ from generic */ num_algs = ARRAY_SIZE(algs); BUILD_BUG_ON(ARRAY_SIZE(algs) % 2 != 0); if (!chacha_is_arch_optimized()) num_algs /= 2; return crypto_register_skciphers(algs, num_algs); } static void __exit crypto_chacha_mod_fini(void) { crypto_unregister_skciphers(algs, num_algs); } module_init(crypto_chacha_mod_init); module_exit(crypto_chacha_mod_fini); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); MODULE_DESCRIPTION("Crypto API wrappers for the ChaCha20, XChaCha20, and XChaCha12 stream ciphers"); MODULE_ALIAS_CRYPTO("chacha20"); MODULE_ALIAS_CRYPTO("chacha20-generic"); MODULE_ALIAS_CRYPTO("chacha20-" __stringify(ARCH)); MODULE_ALIAS_CRYPTO("xchacha20"); MODULE_ALIAS_CRYPTO("xchacha20-generic"); MODULE_ALIAS_CRYPTO("xchacha20-" __stringify(ARCH)); MODULE_ALIAS_CRYPTO("xchacha12"); MODULE_ALIAS_CRYPTO("xchacha12-generic"); MODULE_ALIAS_CRYPTO("xchacha12-" __stringify(ARCH)); |
| 8 8 7 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 | // SPDX-License-Identifier: GPL-2.0-or-later /* General filesystem local caching manager * * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define FSCACHE_DEBUG_LEVEL CACHE #include <linux/module.h> #include <linux/init.h> #include "internal.h" #define CREATE_TRACE_POINTS #include <trace/events/fscache.h> EXPORT_TRACEPOINT_SYMBOL(fscache_access_cache); EXPORT_TRACEPOINT_SYMBOL(fscache_access_volume); EXPORT_TRACEPOINT_SYMBOL(fscache_access); struct workqueue_struct *fscache_wq; EXPORT_SYMBOL(fscache_wq); /* * Mixing scores (in bits) for (7,20): * Input delta: 1-bit 2-bit * 1 round: 330.3 9201.6 * 2 rounds: 1246.4 25475.4 * 3 rounds: 1907.1 31295.1 * 4 rounds: 2042.3 31718.6 * Perfect: 2048 31744 * (32*64) (32*31/2 * 64) */ #define HASH_MIX(x, y, a) \ ( x ^= (a), \ y ^= x, x = rol32(x, 7),\ x += y, y = rol32(y,20),\ y *= 9 ) static inline unsigned int fold_hash(unsigned long x, unsigned long y) { /* Use arch-optimized multiply if one exists */ return __hash_32(y ^ __hash_32(x)); } /* * Generate a hash. This is derived from full_name_hash(), but we want to be * sure it is arch independent and that it doesn't change as bits of the * computed hash value might appear on disk. The caller must guarantee that * the source data is a multiple of four bytes in size. */ unsigned int fscache_hash(unsigned int salt, const void *data, size_t len) { const __le32 *p = data; unsigned int a, x = 0, y = salt, n = len / sizeof(__le32); for (; n; n--) { a = le32_to_cpu(*p++); HASH_MIX(x, y, a); } return fold_hash(x, y); } /* * initialise the fs caching module */ int __init fscache_init(void) { int ret = -ENOMEM; fscache_wq = alloc_workqueue("fscache", WQ_UNBOUND | WQ_FREEZABLE, 0); if (!fscache_wq) goto error_wq; ret = fscache_proc_init(); if (ret < 0) goto error_proc; fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar", sizeof(struct fscache_cookie), 0, 0, NULL); if (!fscache_cookie_jar) { pr_notice("Failed to allocate a cookie jar\n"); ret = -ENOMEM; goto error_cookie_jar; } pr_notice("FS-Cache loaded\n"); return 0; error_cookie_jar: fscache_proc_cleanup(); error_proc: destroy_workqueue(fscache_wq); error_wq: return ret; } /* * clean up on module removal */ void __exit fscache_exit(void) { _enter(""); kmem_cache_destroy(fscache_cookie_jar); fscache_proc_cleanup(); timer_shutdown_sync(&fscache_cookie_lru_timer); destroy_workqueue(fscache_wq); pr_notice("FS-Cache unloaded\n"); } |
| 1221 914 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 | // SPDX-License-Identifier: GPL-2.0 #include <linux/export.h> #include <linux/spinlock.h> #include <linux/atomic.h> /* * This is an implementation of the notion of "decrement a * reference count, and return locked if it decremented to zero". * * NOTE NOTE NOTE! This is _not_ equivalent to * * if (atomic_dec_and_test(&atomic)) { * spin_lock(&lock); * return 1; * } * return 0; * * because the spin-lock and the decrement must be * "atomic". */ int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) { /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ if (atomic_add_unless(atomic, -1, 1)) return 0; /* Otherwise do it the slow way */ spin_lock(lock); if (atomic_dec_and_test(atomic)) return 1; spin_unlock(lock); return 0; } EXPORT_SYMBOL(_atomic_dec_and_lock); int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock, unsigned long *flags) { /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ if (atomic_add_unless(atomic, -1, 1)) return 0; /* Otherwise do it the slow way */ spin_lock_irqsave(lock, *flags); if (atomic_dec_and_test(atomic)) return 1; spin_unlock_irqrestore(lock, *flags); return 0; } EXPORT_SYMBOL(_atomic_dec_and_lock_irqsave); int _atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock) { /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ if (atomic_add_unless(atomic, -1, 1)) return 0; /* Otherwise do it the slow way */ raw_spin_lock(lock); if (atomic_dec_and_test(atomic)) return 1; raw_spin_unlock(lock); return 0; } EXPORT_SYMBOL(_atomic_dec_and_raw_lock); int _atomic_dec_and_raw_lock_irqsave(atomic_t *atomic, raw_spinlock_t *lock, unsigned long *flags) { /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ if (atomic_add_unless(atomic, -1, 1)) return 0; /* Otherwise do it the slow way */ raw_spin_lock_irqsave(lock, *flags); if (atomic_dec_and_test(atomic)) return 1; raw_spin_unlock_irqrestore(lock, *flags); return 0; } EXPORT_SYMBOL(_atomic_dec_and_raw_lock_irqsave); |
| 35 406 186 315 312 314 313 305 83 82 6 82 334 164 4 8 6 165 3 144 123 137 118 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the Forwarding Information Base. * * Authors: A.N.Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #ifndef _NET_IP_FIB_H #define _NET_IP_FIB_H #include <net/flow.h> #include <linux/seq_file.h> #include <linux/rcupdate.h> #include <net/fib_notifier.h> #include <net/fib_rules.h> #include <net/inet_dscp.h> #include <net/inetpeer.h> #include <linux/percpu.h> #include <linux/notifier.h> #include <linux/refcount.h> #include <linux/ip.h> #include <linux/in_route.h> struct fib_config { u8 fc_dst_len; dscp_t fc_dscp; u8 fc_protocol; u8 fc_scope; u8 fc_type; u8 fc_gw_family; /* 2 bytes unused */ u32 fc_table; __be32 fc_dst; union { __be32 fc_gw4; struct in6_addr fc_gw6; }; int fc_oif; u32 fc_flags; u32 fc_priority; __be32 fc_prefsrc; u32 fc_nh_id; struct nlattr *fc_mx; struct rtnexthop *fc_mp; int fc_mx_len; int fc_mp_len; u32 fc_flow; u32 fc_nlflags; struct nl_info fc_nlinfo; struct nlattr *fc_encap; u16 fc_encap_type; }; struct fib_info; struct rtable; struct fib_nh_exception { struct fib_nh_exception __rcu *fnhe_next; int fnhe_genid; __be32 fnhe_daddr; u32 fnhe_pmtu; bool fnhe_mtu_locked; __be32 fnhe_gw; unsigned long fnhe_expires; struct rtable __rcu *fnhe_rth_input; struct rtable __rcu *fnhe_rth_output; unsigned long fnhe_stamp; struct rcu_head rcu; }; struct fnhe_hash_bucket { struct fib_nh_exception __rcu *chain; }; #define FNHE_HASH_SHIFT 11 #define FNHE_HASH_SIZE (1 << FNHE_HASH_SHIFT) #define FNHE_RECLAIM_DEPTH 5 struct fib_nh_common { struct net_device *nhc_dev; netdevice_tracker nhc_dev_tracker; int nhc_oif; unsigned char nhc_scope; u8 nhc_family; u8 nhc_gw_family; unsigned char nhc_flags; struct lwtunnel_state *nhc_lwtstate; union { __be32 ipv4; struct in6_addr ipv6; } nhc_gw; int nhc_weight; atomic_t nhc_upper_bound; /* v4 specific, but allows fib6_nh with v4 routes */ struct rtable __rcu * __percpu *nhc_pcpu_rth_output; struct rtable __rcu *nhc_rth_input; struct fnhe_hash_bucket __rcu *nhc_exceptions; }; struct fib_nh { struct fib_nh_common nh_common; struct hlist_node nh_hash; struct fib_info *nh_parent; #ifdef CONFIG_IP_ROUTE_CLASSID __u32 nh_tclassid; #endif __be32 nh_saddr; int nh_saddr_genid; #define fib_nh_family nh_common.nhc_family #define fib_nh_dev nh_common.nhc_dev #define fib_nh_dev_tracker nh_common.nhc_dev_tracker #define fib_nh_oif nh_common.nhc_oif #define fib_nh_flags nh_common.nhc_flags #define fib_nh_lws nh_common.nhc_lwtstate #define fib_nh_scope nh_common.nhc_scope #define fib_nh_gw_family nh_common.nhc_gw_family #define fib_nh_gw4 nh_common.nhc_gw.ipv4 #define fib_nh_gw6 nh_common.nhc_gw.ipv6 #define fib_nh_weight nh_common.nhc_weight #define fib_nh_upper_bound nh_common.nhc_upper_bound }; /* * This structure contains data shared by many of routes. */ struct nexthop; struct fib_info { struct hlist_node fib_hash; struct hlist_node fib_lhash; struct list_head nh_list; struct net *fib_net; refcount_t fib_treeref; refcount_t fib_clntref; unsigned int fib_flags; unsigned char fib_dead; unsigned char fib_protocol; unsigned char fib_scope; unsigned char fib_type; __be32 fib_prefsrc; u32 fib_tb_id; u32 fib_priority; struct dst_metrics *fib_metrics; #define fib_mtu fib_metrics->metrics[RTAX_MTU-1] #define fib_window fib_metrics->metrics[RTAX_WINDOW-1] #define fib_rtt fib_metrics->metrics[RTAX_RTT-1] #define fib_advmss fib_metrics->metrics[RTAX_ADVMSS-1] int fib_nhs; bool fib_nh_is_v6; bool nh_updated; bool pfsrc_removed; struct nexthop *nh; struct rcu_head rcu; struct fib_nh fib_nh[] __counted_by(fib_nhs); }; int __net_init fib4_semantics_init(struct net *net); void __net_exit fib4_semantics_exit(struct net *net); #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rule; #endif struct fib_table; struct fib_result { __be32 prefix; unsigned char prefixlen; unsigned char nh_sel; unsigned char type; unsigned char scope; u32 tclassid; dscp_t dscp; struct fib_nh_common *nhc; struct fib_info *fi; struct fib_table *table; struct hlist_head *fa_head; }; struct fib_result_nl { __be32 fl_addr; /* To be looked up*/ u32 fl_mark; unsigned char fl_tos; unsigned char fl_scope; unsigned char tb_id_in; unsigned char tb_id; /* Results */ unsigned char prefixlen; unsigned char nh_sel; unsigned char type; unsigned char scope; int err; }; #ifdef CONFIG_IP_MULTIPLE_TABLES #define FIB_TABLE_HASHSZ 256 #else #define FIB_TABLE_HASHSZ 2 #endif __be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc, unsigned char scope); __be32 fib_result_prefsrc(struct net *net, struct fib_result *res); #define FIB_RES_NHC(res) ((res).nhc) #define FIB_RES_DEV(res) (FIB_RES_NHC(res)->nhc_dev) #define FIB_RES_OIF(res) (FIB_RES_NHC(res)->nhc_oif) struct fib_rt_info { struct fib_info *fi; u32 tb_id; __be32 dst; int dst_len; dscp_t dscp; u8 type; u8 offload:1, trap:1, offload_failed:1, unused:5; }; struct fib_entry_notifier_info { struct fib_notifier_info info; /* must be first */ u32 dst; int dst_len; struct fib_info *fi; dscp_t dscp; u8 type; u32 tb_id; }; struct fib_nh_notifier_info { struct fib_notifier_info info; /* must be first */ struct fib_nh *fib_nh; }; int call_fib4_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info); int call_fib4_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info); int __net_init fib4_notifier_init(struct net *net); void __net_exit fib4_notifier_exit(struct net *net); void fib_info_notify_update(struct net *net, struct nl_info *info); int fib_notify(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); struct fib_table { struct hlist_node tb_hlist; u32 tb_id; int tb_num_default; struct rcu_head rcu; unsigned long *tb_data; unsigned long __data[]; }; struct fib_dump_filter { u32 table_id; /* filter_set is an optimization that an entry is set */ bool filter_set; bool dump_routes; bool dump_exceptions; bool rtnl_held; unsigned char protocol; unsigned char rt_type; unsigned int flags; struct net_device *dev; }; int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, struct fib_result *res, int fib_flags); int fib_table_insert(struct net *, struct fib_table *, struct fib_config *, struct netlink_ext_ack *extack); int fib_table_delete(struct net *, struct fib_table *, struct fib_config *, struct netlink_ext_ack *extack); int fib_table_dump(struct fib_table *table, struct sk_buff *skb, struct netlink_callback *cb, struct fib_dump_filter *filter); int fib_table_flush(struct net *net, struct fib_table *table, bool flush_all); struct fib_table *fib_trie_unmerge(struct fib_table *main_tb); void fib_table_flush_external(struct fib_table *table); void fib_free_table(struct fib_table *tb); #ifndef CONFIG_IP_MULTIPLE_TABLES #define TABLE_LOCAL_INDEX (RT_TABLE_LOCAL & (FIB_TABLE_HASHSZ - 1)) #define TABLE_MAIN_INDEX (RT_TABLE_MAIN & (FIB_TABLE_HASHSZ - 1)) static inline struct fib_table *fib_get_table(struct net *net, u32 id) { struct hlist_node *tb_hlist; struct hlist_head *ptr; ptr = id == RT_TABLE_LOCAL ? &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX] : &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]; tb_hlist = rcu_dereference_rtnl(hlist_first_rcu(ptr)); return hlist_entry(tb_hlist, struct fib_table, tb_hlist); } static inline struct fib_table *fib_new_table(struct net *net, u32 id) { return fib_get_table(net, id); } static inline int fib_lookup(struct net *net, const struct flowi4 *flp, struct fib_result *res, unsigned int flags) { struct fib_table *tb; int err = -ENETUNREACH; rcu_read_lock(); tb = fib_get_table(net, RT_TABLE_MAIN); if (tb) err = fib_table_lookup(tb, flp, res, flags | FIB_LOOKUP_NOREF); if (err == -EAGAIN) err = -ENETUNREACH; rcu_read_unlock(); return err; } static inline bool fib4_has_custom_rules(const struct net *net) { return false; } static inline bool fib4_rule_default(const struct fib_rule *rule) { return true; } static inline int fib4_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return 0; } static inline unsigned int fib4_rules_seq_read(const struct net *net) { return 0; } static inline bool fib4_rules_early_flow_dissect(struct net *net, struct sk_buff *skb, struct flowi4 *fl4, struct flow_keys *flkeys) { return false; } #else /* CONFIG_IP_MULTIPLE_TABLES */ int __net_init fib4_rules_init(struct net *net); void __net_exit fib4_rules_exit(struct net *net); struct fib_table *fib_new_table(struct net *net, u32 id); struct fib_table *fib_get_table(struct net *net, u32 id); int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res, unsigned int flags); static inline int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res, unsigned int flags) { struct fib_table *tb; int err = -ENETUNREACH; flags |= FIB_LOOKUP_NOREF; if (net->ipv4.fib_has_custom_rules) return __fib_lookup(net, flp, res, flags); rcu_read_lock(); res->tclassid = 0; tb = rcu_dereference_rtnl(net->ipv4.fib_main); if (tb) err = fib_table_lookup(tb, flp, res, flags); if (!err) goto out; tb = rcu_dereference_rtnl(net->ipv4.fib_default); if (tb) err = fib_table_lookup(tb, flp, res, flags); out: if (err == -EAGAIN) err = -ENETUNREACH; rcu_read_unlock(); return err; } static inline bool fib4_has_custom_rules(const struct net *net) { return net->ipv4.fib_has_custom_rules; } bool fib4_rule_default(const struct fib_rule *rule); int fib4_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); unsigned int fib4_rules_seq_read(const struct net *net); static inline bool fib4_rules_early_flow_dissect(struct net *net, struct sk_buff *skb, struct flowi4 *fl4, struct flow_keys *flkeys) { unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; if (!net->ipv4.fib_rules_require_fldissect) return false; memset(flkeys, 0, sizeof(*flkeys)); __skb_flow_dissect(net, skb, &flow_keys_dissector, flkeys, NULL, 0, 0, 0, flag); fl4->fl4_sport = flkeys->ports.src; fl4->fl4_dport = flkeys->ports.dst; fl4->flowi4_proto = flkeys->basic.ip_proto; return true; } #endif /* CONFIG_IP_MULTIPLE_TABLES */ static inline bool fib_dscp_masked_match(dscp_t dscp, const struct flowi4 *fl4) { return dscp == inet_dsfield_to_dscp(RT_TOS(fl4->flowi4_tos)); } /* Exported by fib_frontend.c */ extern const struct nla_policy rtm_ipv4_policy[]; void ip_fib_init(void); int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla, struct netlink_ext_ack *extack); __be32 fib_compute_spec_dst(struct sk_buff *skb); bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev); int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, dscp_t dscp, int oif, struct net_device *dev, struct in_device *idev, u32 *itag); static inline enum skb_drop_reason fib_validate_source_reason(struct sk_buff *skb, __be32 src, __be32 dst, dscp_t dscp, int oif, struct net_device *dev, struct in_device *idev, u32 *itag) { int err = fib_validate_source(skb, src, dst, dscp, oif, dev, idev, itag); if (err < 0) return -err; return SKB_NOT_DROPPED_YET; } #ifdef CONFIG_IP_ROUTE_CLASSID static inline int fib_num_tclassid_users(struct net *net) { return atomic_read(&net->ipv4.fib_num_tclassid_users); } #else static inline int fib_num_tclassid_users(struct net *net) { return 0; } #endif int fib_unmerge(struct net *net); static inline bool nhc_l3mdev_matches_dev(const struct fib_nh_common *nhc, const struct net_device *dev) { if (nhc->nhc_dev == dev || l3mdev_master_ifindex_rcu(nhc->nhc_dev) == dev->ifindex) return true; return false; } /* Exported by fib_semantics.c */ int ip_fib_check_default(__be32 gw, struct net_device *dev); int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force); int fib_sync_down_addr(struct net_device *dev, __be32 local); int fib_sync_up(struct net_device *dev, unsigned char nh_flags); void fib_sync_mtu(struct net_device *dev, u32 orig_mtu); void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig); /* Fields used for sysctl_fib_multipath_hash_fields. * Common to IPv4 and IPv6. * * Add new fields at the end. This is user API. */ #define FIB_MULTIPATH_HASH_FIELD_SRC_IP BIT(0) #define FIB_MULTIPATH_HASH_FIELD_DST_IP BIT(1) #define FIB_MULTIPATH_HASH_FIELD_IP_PROTO BIT(2) #define FIB_MULTIPATH_HASH_FIELD_FLOWLABEL BIT(3) #define FIB_MULTIPATH_HASH_FIELD_SRC_PORT BIT(4) #define FIB_MULTIPATH_HASH_FIELD_DST_PORT BIT(5) #define FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP BIT(6) #define FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP BIT(7) #define FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO BIT(8) #define FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL BIT(9) #define FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT BIT(10) #define FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT BIT(11) #define FIB_MULTIPATH_HASH_FIELD_OUTER_MASK \ (FIB_MULTIPATH_HASH_FIELD_SRC_IP | \ FIB_MULTIPATH_HASH_FIELD_DST_IP | \ FIB_MULTIPATH_HASH_FIELD_IP_PROTO | \ FIB_MULTIPATH_HASH_FIELD_FLOWLABEL | \ FIB_MULTIPATH_HASH_FIELD_SRC_PORT | \ FIB_MULTIPATH_HASH_FIELD_DST_PORT) #define FIB_MULTIPATH_HASH_FIELD_INNER_MASK \ (FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP | \ FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP | \ FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO | \ FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL | \ FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT | \ FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) #define FIB_MULTIPATH_HASH_FIELD_ALL_MASK \ (FIB_MULTIPATH_HASH_FIELD_OUTER_MASK | \ FIB_MULTIPATH_HASH_FIELD_INNER_MASK) #define FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK \ (FIB_MULTIPATH_HASH_FIELD_SRC_IP | \ FIB_MULTIPATH_HASH_FIELD_DST_IP | \ FIB_MULTIPATH_HASH_FIELD_IP_PROTO) #ifdef CONFIG_IP_ROUTE_MULTIPATH int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, const struct sk_buff *skb, struct flow_keys *flkeys); static void fib_multipath_hash_construct_key(siphash_key_t *key, u32 mp_seed) { u64 mp_seed_64 = mp_seed; key->key[0] = (mp_seed_64 << 32) | mp_seed_64; key->key[1] = key->key[0]; } static inline u32 fib_multipath_hash_from_keys(const struct net *net, struct flow_keys *keys) { siphash_aligned_key_t hash_key; u32 mp_seed; mp_seed = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed).mp_seed; fib_multipath_hash_construct_key(&hash_key, mp_seed); return flow_hash_from_keys_seed(keys, &hash_key); } #else static inline u32 fib_multipath_hash_from_keys(const struct net *net, struct flow_keys *keys) { return flow_hash_from_keys(keys); } #endif int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope, struct netlink_ext_ack *extack); void fib_select_multipath(struct fib_result *res, int hash, const struct flowi4 *fl4); void fib_select_path(struct net *net, struct fib_result *res, struct flowi4 *fl4, const struct sk_buff *skb); int fib_nh_init(struct net *net, struct fib_nh *fib_nh, struct fib_config *cfg, int nh_weight, struct netlink_ext_ack *extack); void fib_nh_release(struct net *net, struct fib_nh *fib_nh); int fib_nh_common_init(struct net *net, struct fib_nh_common *nhc, struct nlattr *fc_encap, u16 fc_encap_type, void *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); void fib_nh_common_release(struct fib_nh_common *nhc); /* Exported by fib_trie.c */ void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri); void fib_trie_init(void); struct fib_table *fib_trie_table(u32 id, struct fib_table *alias); bool fib_lookup_good_nhc(const struct fib_nh_common *nhc, int fib_flags, const struct flowi4 *flp); static inline void fib_combine_itag(u32 *itag, const struct fib_result *res) { #ifdef CONFIG_IP_ROUTE_CLASSID struct fib_nh_common *nhc = res->nhc; #ifdef CONFIG_IP_MULTIPLE_TABLES u32 rtag; #endif if (nhc->nhc_family == AF_INET) { struct fib_nh *nh; nh = container_of(nhc, struct fib_nh, nh_common); *itag = nh->nh_tclassid << 16; } else { *itag = 0; } #ifdef CONFIG_IP_MULTIPLE_TABLES rtag = res->tclassid; if (*itag == 0) *itag = (rtag<<16); *itag |= (rtag>>16); #endif #endif } void fib_flush(struct net *net); void free_fib_info(struct fib_info *fi); static inline void fib_info_hold(struct fib_info *fi) { refcount_inc(&fi->fib_clntref); } static inline void fib_info_put(struct fib_info *fi) { if (refcount_dec_and_test(&fi->fib_clntref)) free_fib_info(fi); } #ifdef CONFIG_PROC_FS int __net_init fib_proc_init(struct net *net); void __net_exit fib_proc_exit(struct net *net); #else static inline int fib_proc_init(struct net *net) { return 0; } static inline void fib_proc_exit(struct net *net) { } #endif u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr); int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, struct fib_dump_filter *filter, struct netlink_callback *cb); int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nh, u8 rt_family, unsigned char *flags, bool skip_oif); int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nh, int nh_weight, u8 rt_family, u32 nh_tclassid); #endif /* _NET_FIB_H */ |
| 283 285 38 36 37 10 38 38 38 38 38 37 37 36 38 37 35 35 35 17 17 17 17 34 35 18 31 27 17 18 14 20 12 21 21 5 2 21 20 21 21 21 21 21 21 5 5 21 21 20 21 5 5 5 5 21 14 14 22 22 21 21 21 20 20 21 21 21 22 22 22 22 32 13 13 32 12 12 12 12 12 12 12 12 12 13 13 14 12 14 4 14 12 2 14 2 12 46 46 36 20 10 10 10 10 10 10 10 10 25 31 31 31 31 27 31 37 11 37 36 25 12 12 12 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 | // SPDX-License-Identifier: GPL-2.0 /* * drivers/base/dd.c - The core device/driver interactions. * * This file contains the (sometimes tricky) code that controls the * interactions between devices and drivers, which primarily includes * driver binding and unbinding. * * All of this code used to exist in drivers/base/bus.c, but was * relocated to here in the name of compartmentalization (since it wasn't * strictly code just for the 'struct bus_type'. * * Copyright (c) 2002-5 Patrick Mochel * Copyright (c) 2002-3 Open Source Development Labs * Copyright (c) 2007-2009 Greg Kroah-Hartman <gregkh@suse.de> * Copyright (c) 2007-2009 Novell Inc. */ #include <linux/debugfs.h> #include <linux/device.h> #include <linux/delay.h> #include <linux/dma-map-ops.h> #include <linux/init.h> #include <linux/module.h> #include <linux/kthread.h> #include <linux/wait.h> #include <linux/async.h> #include <linux/pm_domain.h> #include <linux/pm_runtime.h> #include <linux/pinctrl/devinfo.h> #include <linux/slab.h> #include "base.h" #include "power/power.h" /* * Deferred Probe infrastructure. * * Sometimes driver probe order matters, but the kernel doesn't always have * dependency information which means some drivers will get probed before a * resource it depends on is available. For example, an SDHCI driver may * first need a GPIO line from an i2c GPIO controller before it can be * initialized. If a required resource is not available yet, a driver can * request probing to be deferred by returning -EPROBE_DEFER from its probe hook * * Deferred probe maintains two lists of devices, a pending list and an active * list. A driver returning -EPROBE_DEFER causes the device to be added to the * pending list. A successful driver probe will trigger moving all devices * from the pending to the active list so that the workqueue will eventually * retry them. * * The deferred_probe_mutex must be held any time the deferred_probe_*_list * of the (struct device*)->p->deferred_probe pointers are manipulated */ static DEFINE_MUTEX(deferred_probe_mutex); static LIST_HEAD(deferred_probe_pending_list); static LIST_HEAD(deferred_probe_active_list); static atomic_t deferred_trigger_count = ATOMIC_INIT(0); static bool initcalls_done; /* Save the async probe drivers' name from kernel cmdline */ #define ASYNC_DRV_NAMES_MAX_LEN 256 static char async_probe_drv_names[ASYNC_DRV_NAMES_MAX_LEN]; static bool async_probe_default; /* * In some cases, like suspend to RAM or hibernation, It might be reasonable * to prohibit probing of devices as it could be unsafe. * Once defer_all_probes is true all drivers probes will be forcibly deferred. */ static bool defer_all_probes; static void __device_set_deferred_probe_reason(const struct device *dev, char *reason) { kfree(dev->p->deferred_probe_reason); dev->p->deferred_probe_reason = reason; } /* * deferred_probe_work_func() - Retry probing devices in the active list. */ static void deferred_probe_work_func(struct work_struct *work) { struct device *dev; struct device_private *private; /* * This block processes every device in the deferred 'active' list. * Each device is removed from the active list and passed to * bus_probe_device() to re-attempt the probe. The loop continues * until every device in the active list is removed and retried. * * Note: Once the device is removed from the list and the mutex is * released, it is possible for the device get freed by another thread * and cause a illegal pointer dereference. This code uses * get/put_device() to ensure the device structure cannot disappear * from under our feet. */ mutex_lock(&deferred_probe_mutex); while (!list_empty(&deferred_probe_active_list)) { private = list_first_entry(&deferred_probe_active_list, typeof(*dev->p), deferred_probe); dev = private->device; list_del_init(&private->deferred_probe); get_device(dev); __device_set_deferred_probe_reason(dev, NULL); /* * Drop the mutex while probing each device; the probe path may * manipulate the deferred list */ mutex_unlock(&deferred_probe_mutex); /* * Force the device to the end of the dpm_list since * the PM code assumes that the order we add things to * the list is a good order for suspend but deferred * probe makes that very unsafe. */ device_pm_move_to_tail(dev); dev_dbg(dev, "Retrying from deferred list\n"); bus_probe_device(dev); mutex_lock(&deferred_probe_mutex); put_device(dev); } mutex_unlock(&deferred_probe_mutex); } static DECLARE_WORK(deferred_probe_work, deferred_probe_work_func); void driver_deferred_probe_add(struct device *dev) { if (!dev->can_match) return; mutex_lock(&deferred_probe_mutex); if (list_empty(&dev->p->deferred_probe)) { dev_dbg(dev, "Added to deferred list\n"); list_add_tail(&dev->p->deferred_probe, &deferred_probe_pending_list); } mutex_unlock(&deferred_probe_mutex); } void driver_deferred_probe_del(struct device *dev) { mutex_lock(&deferred_probe_mutex); if (!list_empty(&dev->p->deferred_probe)) { dev_dbg(dev, "Removed from deferred list\n"); list_del_init(&dev->p->deferred_probe); __device_set_deferred_probe_reason(dev, NULL); } mutex_unlock(&deferred_probe_mutex); } static bool driver_deferred_probe_enable; /** * driver_deferred_probe_trigger() - Kick off re-probing deferred devices * * This functions moves all devices from the pending list to the active * list and schedules the deferred probe workqueue to process them. It * should be called anytime a driver is successfully bound to a device. * * Note, there is a race condition in multi-threaded probe. In the case where * more than one device is probing at the same time, it is possible for one * probe to complete successfully while another is about to defer. If the second * depends on the first, then it will get put on the pending list after the * trigger event has already occurred and will be stuck there. * * The atomic 'deferred_trigger_count' is used to determine if a successful * trigger has occurred in the midst of probing a driver. If the trigger count * changes in the midst of a probe, then deferred processing should be triggered * again. */ void driver_deferred_probe_trigger(void) { if (!driver_deferred_probe_enable) return; /* * A successful probe means that all the devices in the pending list * should be triggered to be reprobed. Move all the deferred devices * into the active list so they can be retried by the workqueue */ mutex_lock(&deferred_probe_mutex); atomic_inc(&deferred_trigger_count); list_splice_tail_init(&deferred_probe_pending_list, &deferred_probe_active_list); mutex_unlock(&deferred_probe_mutex); /* * Kick the re-probe thread. It may already be scheduled, but it is * safe to kick it again. */ queue_work(system_unbound_wq, &deferred_probe_work); } /** * device_block_probing() - Block/defer device's probes * * It will disable probing of devices and defer their probes instead. */ void device_block_probing(void) { defer_all_probes = true; /* sync with probes to avoid races. */ wait_for_device_probe(); } /** * device_unblock_probing() - Unblock/enable device's probes * * It will restore normal behavior and trigger re-probing of deferred * devices. */ void device_unblock_probing(void) { defer_all_probes = false; driver_deferred_probe_trigger(); } /** * device_set_deferred_probe_reason() - Set defer probe reason message for device * @dev: the pointer to the struct device * @vaf: the pointer to va_format structure with message */ void device_set_deferred_probe_reason(const struct device *dev, struct va_format *vaf) { const char *drv = dev_driver_string(dev); char *reason; mutex_lock(&deferred_probe_mutex); reason = kasprintf(GFP_KERNEL, "%s: %pV", drv, vaf); __device_set_deferred_probe_reason(dev, reason); mutex_unlock(&deferred_probe_mutex); } /* * deferred_devs_show() - Show the devices in the deferred probe pending list. */ static int deferred_devs_show(struct seq_file *s, void *data) { struct device_private *curr; mutex_lock(&deferred_probe_mutex); list_for_each_entry(curr, &deferred_probe_pending_list, deferred_probe) seq_printf(s, "%s\t%s", dev_name(curr->device), curr->deferred_probe_reason ?: "\n"); mutex_unlock(&deferred_probe_mutex); return 0; } DEFINE_SHOW_ATTRIBUTE(deferred_devs); #ifdef CONFIG_MODULES static int driver_deferred_probe_timeout = 10; #else static int driver_deferred_probe_timeout; #endif static int __init deferred_probe_timeout_setup(char *str) { int timeout; if (!kstrtoint(str, 10, &timeout)) driver_deferred_probe_timeout = timeout; return 1; } __setup("deferred_probe_timeout=", deferred_probe_timeout_setup); /** * driver_deferred_probe_check_state() - Check deferred probe state * @dev: device to check * * Return: * * -ENODEV if initcalls have completed and modules are disabled. * * -ETIMEDOUT if the deferred probe timeout was set and has expired * and modules are enabled. * * -EPROBE_DEFER in other cases. * * Drivers or subsystems can opt-in to calling this function instead of directly * returning -EPROBE_DEFER. */ int driver_deferred_probe_check_state(struct device *dev) { if (!IS_ENABLED(CONFIG_MODULES) && initcalls_done) { dev_warn(dev, "ignoring dependency for device, assuming no driver\n"); return -ENODEV; } if (!driver_deferred_probe_timeout && initcalls_done) { dev_warn(dev, "deferred probe timeout, ignoring dependency\n"); return -ETIMEDOUT; } return -EPROBE_DEFER; } EXPORT_SYMBOL_GPL(driver_deferred_probe_check_state); static void deferred_probe_timeout_work_func(struct work_struct *work) { struct device_private *p; fw_devlink_drivers_done(); driver_deferred_probe_timeout = 0; driver_deferred_probe_trigger(); flush_work(&deferred_probe_work); mutex_lock(&deferred_probe_mutex); list_for_each_entry(p, &deferred_probe_pending_list, deferred_probe) dev_warn(p->device, "deferred probe pending: %s", p->deferred_probe_reason ?: "(reason unknown)\n"); mutex_unlock(&deferred_probe_mutex); fw_devlink_probing_done(); } static DECLARE_DELAYED_WORK(deferred_probe_timeout_work, deferred_probe_timeout_work_func); void deferred_probe_extend_timeout(void) { /* * If the work hasn't been queued yet or if the work expired, don't * start a new one. */ if (cancel_delayed_work(&deferred_probe_timeout_work)) { schedule_delayed_work(&deferred_probe_timeout_work, driver_deferred_probe_timeout * HZ); pr_debug("Extended deferred probe timeout by %d secs\n", driver_deferred_probe_timeout); } } /** * deferred_probe_initcall() - Enable probing of deferred devices * * We don't want to get in the way when the bulk of drivers are getting probed. * Instead, this initcall makes sure that deferred probing is delayed until * late_initcall time. */ static int deferred_probe_initcall(void) { debugfs_create_file("devices_deferred", 0444, NULL, NULL, &deferred_devs_fops); driver_deferred_probe_enable = true; driver_deferred_probe_trigger(); /* Sort as many dependencies as possible before exiting initcalls */ flush_work(&deferred_probe_work); initcalls_done = true; if (!IS_ENABLED(CONFIG_MODULES)) fw_devlink_drivers_done(); /* * Trigger deferred probe again, this time we won't defer anything * that is optional */ driver_deferred_probe_trigger(); flush_work(&deferred_probe_work); if (driver_deferred_probe_timeout > 0) { schedule_delayed_work(&deferred_probe_timeout_work, driver_deferred_probe_timeout * HZ); } if (!IS_ENABLED(CONFIG_MODULES)) fw_devlink_probing_done(); return 0; } late_initcall(deferred_probe_initcall); static void __exit deferred_probe_exit(void) { debugfs_lookup_and_remove("devices_deferred", NULL); } __exitcall(deferred_probe_exit); /** * device_is_bound() - Check if device is bound to a driver * @dev: device to check * * Returns true if passed device has already finished probing successfully * against a driver. * * This function must be called with the device lock held. */ bool device_is_bound(struct device *dev) { return dev->p && klist_node_attached(&dev->p->knode_driver); } EXPORT_SYMBOL_GPL(device_is_bound); static void driver_bound(struct device *dev) { if (device_is_bound(dev)) { dev_warn(dev, "%s: device already bound\n", __func__); return; } dev_dbg(dev, "driver: '%s': %s: bound to device\n", dev->driver->name, __func__); klist_add_tail(&dev->p->knode_driver, &dev->driver->p->klist_devices); device_links_driver_bound(dev); device_pm_check_callbacks(dev); /* * Make sure the device is no longer in one of the deferred lists and * kick off retrying all pending devices */ driver_deferred_probe_del(dev); driver_deferred_probe_trigger(); bus_notify(dev, BUS_NOTIFY_BOUND_DRIVER); kobject_uevent(&dev->kobj, KOBJ_BIND); } static ssize_t coredump_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { device_lock(dev); dev->driver->coredump(dev); device_unlock(dev); return count; } static DEVICE_ATTR_WO(coredump); static int driver_sysfs_add(struct device *dev) { int ret; bus_notify(dev, BUS_NOTIFY_BIND_DRIVER); ret = sysfs_create_link(&dev->driver->p->kobj, &dev->kobj, kobject_name(&dev->kobj)); if (ret) goto fail; ret = sysfs_create_link(&dev->kobj, &dev->driver->p->kobj, "driver"); if (ret) goto rm_dev; if (!IS_ENABLED(CONFIG_DEV_COREDUMP) || !dev->driver->coredump) return 0; ret = device_create_file(dev, &dev_attr_coredump); if (!ret) return 0; sysfs_remove_link(&dev->kobj, "driver"); rm_dev: sysfs_remove_link(&dev->driver->p->kobj, kobject_name(&dev->kobj)); fail: return ret; } static void driver_sysfs_remove(struct device *dev) { struct device_driver *drv = dev->driver; if (drv) { if (drv->coredump) device_remove_file(dev, &dev_attr_coredump); sysfs_remove_link(&drv->p->kobj, kobject_name(&dev->kobj)); sysfs_remove_link(&dev->kobj, "driver"); } } /** * device_bind_driver - bind a driver to one device. * @dev: device. * * Allow manual attachment of a driver to a device. * Caller must have already set @dev->driver. * * Note that this does not modify the bus reference count. * Please verify that is accounted for before calling this. * (It is ok to call with no other effort from a driver's probe() method.) * * This function must be called with the device lock held. * * Callers should prefer to use device_driver_attach() instead. */ int device_bind_driver(struct device *dev) { int ret; ret = driver_sysfs_add(dev); if (!ret) { device_links_force_bind(dev); driver_bound(dev); } else bus_notify(dev, BUS_NOTIFY_DRIVER_NOT_BOUND); return ret; } EXPORT_SYMBOL_GPL(device_bind_driver); static atomic_t probe_count = ATOMIC_INIT(0); static DECLARE_WAIT_QUEUE_HEAD(probe_waitqueue); static ssize_t state_synced_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int ret = 0; if (strcmp("1", buf)) return -EINVAL; device_lock(dev); if (!dev->state_synced) { dev->state_synced = true; dev_sync_state(dev); } else { ret = -EINVAL; } device_unlock(dev); return ret ? ret : count; } static ssize_t state_synced_show(struct device *dev, struct device_attribute *attr, char *buf) { bool val; device_lock(dev); val = dev->state_synced; device_unlock(dev); return sysfs_emit(buf, "%u\n", val); } static DEVICE_ATTR_RW(state_synced); static void device_unbind_cleanup(struct device *dev) { devres_release_all(dev); arch_teardown_dma_ops(dev); kfree(dev->dma_range_map); dev->dma_range_map = NULL; device_set_driver(dev, NULL); dev_set_drvdata(dev, NULL); dev_pm_domain_detach(dev, dev->power.detach_power_off); if (dev->pm_domain && dev->pm_domain->dismiss) dev->pm_domain->dismiss(dev); pm_runtime_reinit(dev); dev_pm_set_driver_flags(dev, 0); } static void device_remove(struct device *dev) { device_remove_file(dev, &dev_attr_state_synced); device_remove_groups(dev, dev->driver->dev_groups); if (dev->bus && dev->bus->remove) dev->bus->remove(dev); else if (dev->driver->remove) dev->driver->remove(dev); } static int call_driver_probe(struct device *dev, const struct device_driver *drv) { int ret = 0; if (dev->bus->probe) ret = dev->bus->probe(dev); else if (drv->probe) ret = drv->probe(dev); switch (ret) { case 0: break; case -EPROBE_DEFER: /* Driver requested deferred probing */ dev_dbg(dev, "Driver %s requests probe deferral\n", drv->name); break; case -ENODEV: case -ENXIO: dev_dbg(dev, "probe with driver %s rejects match %d\n", drv->name, ret); break; default: /* driver matched but the probe failed */ dev_err(dev, "probe with driver %s failed with error %d\n", drv->name, ret); break; } return ret; } static int really_probe(struct device *dev, const struct device_driver *drv) { bool test_remove = IS_ENABLED(CONFIG_DEBUG_TEST_DRIVER_REMOVE) && !drv->suppress_bind_attrs; int ret, link_ret; if (defer_all_probes) { /* * Value of defer_all_probes can be set only by * device_block_probing() which, in turn, will call * wait_for_device_probe() right after that to avoid any races. */ dev_dbg(dev, "Driver %s force probe deferral\n", drv->name); return -EPROBE_DEFER; } link_ret = device_links_check_suppliers(dev); if (link_ret == -EPROBE_DEFER) return link_ret; dev_dbg(dev, "bus: '%s': %s: probing driver %s with device\n", drv->bus->name, __func__, drv->name); if (!list_empty(&dev->devres_head)) { dev_crit(dev, "Resources present before probing\n"); ret = -EBUSY; goto done; } re_probe: device_set_driver(dev, drv); /* If using pinctrl, bind pins now before probing */ ret = pinctrl_bind_pins(dev); if (ret) goto pinctrl_bind_failed; if (dev->bus->dma_configure) { ret = dev->bus->dma_configure(dev); if (ret) goto pinctrl_bind_failed; } ret = driver_sysfs_add(dev); if (ret) { dev_err(dev, "%s: driver_sysfs_add failed\n", __func__); goto sysfs_failed; } if (dev->pm_domain && dev->pm_domain->activate) { ret = dev->pm_domain->activate(dev); if (ret) goto probe_failed; } ret = call_driver_probe(dev, drv); if (ret) { /* * If fw_devlink_best_effort is active (denoted by -EAGAIN), the * device might actually probe properly once some of its missing * suppliers have probed. So, treat this as if the driver * returned -EPROBE_DEFER. */ if (link_ret == -EAGAIN) ret = -EPROBE_DEFER; /* * Return probe errors as positive values so that the callers * can distinguish them from other errors. */ ret = -ret; goto probe_failed; } ret = device_add_groups(dev, drv->dev_groups); if (ret) { dev_err(dev, "device_add_groups() failed\n"); goto dev_groups_failed; } if (dev_has_sync_state(dev)) { ret = device_create_file(dev, &dev_attr_state_synced); if (ret) { dev_err(dev, "state_synced sysfs add failed\n"); goto dev_sysfs_state_synced_failed; } } if (test_remove) { test_remove = false; device_remove(dev); driver_sysfs_remove(dev); if (dev->bus && dev->bus->dma_cleanup) dev->bus->dma_cleanup(dev); device_unbind_cleanup(dev); goto re_probe; } pinctrl_init_done(dev); if (dev->pm_domain && dev->pm_domain->sync) dev->pm_domain->sync(dev); driver_bound(dev); dev_dbg(dev, "bus: '%s': %s: bound device to driver %s\n", drv->bus->name, __func__, drv->name); goto done; dev_sysfs_state_synced_failed: dev_groups_failed: device_remove(dev); probe_failed: driver_sysfs_remove(dev); sysfs_failed: bus_notify(dev, BUS_NOTIFY_DRIVER_NOT_BOUND); if (dev->bus && dev->bus->dma_cleanup) dev->bus->dma_cleanup(dev); pinctrl_bind_failed: device_links_no_driver(dev); device_unbind_cleanup(dev); done: return ret; } /* * For initcall_debug, show the driver probe time. */ static int really_probe_debug(struct device *dev, const struct device_driver *drv) { ktime_t calltime, rettime; int ret; calltime = ktime_get(); ret = really_probe(dev, drv); rettime = ktime_get(); /* * Don't change this to pr_debug() because that requires * CONFIG_DYNAMIC_DEBUG and we want a simple 'initcall_debug' on the * kernel commandline to print this all the time at the debug level. */ printk(KERN_DEBUG "probe of %s returned %d after %lld usecs\n", dev_name(dev), ret, ktime_us_delta(rettime, calltime)); return ret; } /** * driver_probe_done * Determine if the probe sequence is finished or not. * * Should somehow figure out how to use a semaphore, not an atomic variable... */ bool __init driver_probe_done(void) { int local_probe_count = atomic_read(&probe_count); pr_debug("%s: probe_count = %d\n", __func__, local_probe_count); return !local_probe_count; } /** * wait_for_device_probe * Wait for device probing to be completed. */ void wait_for_device_probe(void) { /* wait for the deferred probe workqueue to finish */ flush_work(&deferred_probe_work); /* wait for the known devices to complete their probing */ wait_event(probe_waitqueue, atomic_read(&probe_count) == 0); async_synchronize_full(); } EXPORT_SYMBOL_GPL(wait_for_device_probe); static int __driver_probe_device(const struct device_driver *drv, struct device *dev) { int ret = 0; if (dev->p->dead || !device_is_registered(dev)) return -ENODEV; if (dev->driver) return -EBUSY; dev->can_match = true; dev_dbg(dev, "bus: '%s': %s: matched device with driver %s\n", drv->bus->name, __func__, drv->name); pm_runtime_get_suppliers(dev); if (dev->parent) pm_runtime_get_sync(dev->parent); pm_runtime_barrier(dev); if (initcall_debug) ret = really_probe_debug(dev, drv); else ret = really_probe(dev, drv); pm_request_idle(dev); if (dev->parent) pm_runtime_put(dev->parent); pm_runtime_put_suppliers(dev); return ret; } /** * driver_probe_device - attempt to bind device & driver together * @drv: driver to bind a device to * @dev: device to try to bind to the driver * * This function returns -ENODEV if the device is not registered, -EBUSY if it * already has a driver, 0 if the device is bound successfully and a positive * (inverted) error code for failures from the ->probe method. * * This function must be called with @dev lock held. When called for a * USB interface, @dev->parent lock must be held as well. * * If the device has a parent, runtime-resume the parent before driver probing. */ static int driver_probe_device(const struct device_driver *drv, struct device *dev) { int trigger_count = atomic_read(&deferred_trigger_count); int ret; atomic_inc(&probe_count); ret = __driver_probe_device(drv, dev); if (ret == -EPROBE_DEFER || ret == EPROBE_DEFER) { driver_deferred_probe_add(dev); /* * Did a trigger occur while probing? Need to re-trigger if yes */ if (trigger_count != atomic_read(&deferred_trigger_count) && !defer_all_probes) driver_deferred_probe_trigger(); } atomic_dec(&probe_count); wake_up_all(&probe_waitqueue); return ret; } static inline bool cmdline_requested_async_probing(const char *drv_name) { bool async_drv; async_drv = parse_option_str(async_probe_drv_names, drv_name); return (async_probe_default != async_drv); } /* The option format is "driver_async_probe=drv_name1,drv_name2,..." */ static int __init save_async_options(char *buf) { if (strlen(buf) >= ASYNC_DRV_NAMES_MAX_LEN) pr_warn("Too long list of driver names for 'driver_async_probe'!\n"); strscpy(async_probe_drv_names, buf, ASYNC_DRV_NAMES_MAX_LEN); async_probe_default = parse_option_str(async_probe_drv_names, "*"); return 1; } __setup("driver_async_probe=", save_async_options); static bool driver_allows_async_probing(const struct device_driver *drv) { switch (drv->probe_type) { case PROBE_PREFER_ASYNCHRONOUS: return true; case PROBE_FORCE_SYNCHRONOUS: return false; default: if (cmdline_requested_async_probing(drv->name)) return true; if (module_requested_async_probing(drv->owner)) return true; return false; } } struct device_attach_data { struct device *dev; /* * Indicates whether we are considering asynchronous probing or * not. Only initial binding after device or driver registration * (including deferral processing) may be done asynchronously, the * rest is always synchronous, as we expect it is being done by * request from userspace. */ bool check_async; /* * Indicates if we are binding synchronous or asynchronous drivers. * When asynchronous probing is enabled we'll execute 2 passes * over drivers: first pass doing synchronous probing and second * doing asynchronous probing (if synchronous did not succeed - * most likely because there was no driver requiring synchronous * probing - and we found asynchronous driver during first pass). * The 2 passes are done because we can't shoot asynchronous * probe for given device and driver from bus_for_each_drv() since * driver pointer is not guaranteed to stay valid once * bus_for_each_drv() iterates to the next driver on the bus. */ bool want_async; /* * We'll set have_async to 'true' if, while scanning for matching * driver, we'll encounter one that requests asynchronous probing. */ bool have_async; }; static int __device_attach_driver(struct device_driver *drv, void *_data) { struct device_attach_data *data = _data; struct device *dev = data->dev; bool async_allowed; int ret; ret = driver_match_device(drv, dev); if (ret == 0) { /* no match */ return 0; } else if (ret == -EPROBE_DEFER) { dev_dbg(dev, "Device match requests probe deferral\n"); dev->can_match = true; driver_deferred_probe_add(dev); /* * Device can't match with a driver right now, so don't attempt * to match or bind with other drivers on the bus. */ return ret; } else if (ret < 0) { dev_dbg(dev, "Bus failed to match device: %d\n", ret); return ret; } /* ret > 0 means positive match */ async_allowed = driver_allows_async_probing(drv); if (async_allowed) data->have_async = true; if (data->check_async && async_allowed != data->want_async) return 0; /* * Ignore errors returned by ->probe so that the next driver can try * its luck. */ ret = driver_probe_device(drv, dev); if (ret < 0) return ret; return ret == 0; } static void __device_attach_async_helper(void *_dev, async_cookie_t cookie) { struct device *dev = _dev; struct device_attach_data data = { .dev = dev, .check_async = true, .want_async = true, }; device_lock(dev); /* * Check if device has already been removed or claimed. This may * happen with driver loading, device discovery/registration, * and deferred probe processing happens all at once with * multiple threads. */ if (dev->p->dead || dev->driver) goto out_unlock; if (dev->parent) pm_runtime_get_sync(dev->parent); bus_for_each_drv(dev->bus, NULL, &data, __device_attach_driver); dev_dbg(dev, "async probe completed\n"); pm_request_idle(dev); if (dev->parent) pm_runtime_put(dev->parent); out_unlock: device_unlock(dev); put_device(dev); } static int __device_attach(struct device *dev, bool allow_async) { int ret = 0; bool async = false; device_lock(dev); if (dev->p->dead) { goto out_unlock; } else if (dev->driver) { if (device_is_bound(dev)) { ret = 1; goto out_unlock; } ret = device_bind_driver(dev); if (ret == 0) ret = 1; else { device_set_driver(dev, NULL); ret = 0; } } else { struct device_attach_data data = { .dev = dev, .check_async = allow_async, .want_async = false, }; if (dev->parent) pm_runtime_get_sync(dev->parent); ret = bus_for_each_drv(dev->bus, NULL, &data, __device_attach_driver); if (!ret && allow_async && data.have_async) { /* * If we could not find appropriate driver * synchronously and we are allowed to do * async probes and there are drivers that * want to probe asynchronously, we'll * try them. */ dev_dbg(dev, "scheduling asynchronous probe\n"); get_device(dev); async = true; } else { pm_request_idle(dev); } if (dev->parent) pm_runtime_put(dev->parent); } out_unlock: device_unlock(dev); if (async) async_schedule_dev(__device_attach_async_helper, dev); return ret; } /** * device_attach - try to attach device to a driver. * @dev: device. * * Walk the list of drivers that the bus has and call * driver_probe_device() for each pair. If a compatible * pair is found, break out and return. * * Returns 1 if the device was bound to a driver; * 0 if no matching driver was found; * -ENODEV if the device is not registered. * * When called for a USB interface, @dev->parent lock must be held. */ int device_attach(struct device *dev) { return __device_attach(dev, false); } EXPORT_SYMBOL_GPL(device_attach); void device_initial_probe(struct device *dev) { __device_attach(dev, true); } /* * __device_driver_lock - acquire locks needed to manipulate dev->drv * @dev: Device we will update driver info for * @parent: Parent device. Needed if the bus requires parent lock * * This function will take the required locks for manipulating dev->drv. * Normally this will just be the @dev lock, but when called for a USB * interface, @parent lock will be held as well. */ static void __device_driver_lock(struct device *dev, struct device *parent) { if (parent && dev->bus->need_parent_lock) device_lock(parent); device_lock(dev); } /* * __device_driver_unlock - release locks needed to manipulate dev->drv * @dev: Device we will update driver info for * @parent: Parent device. Needed if the bus requires parent lock * * This function will release the required locks for manipulating dev->drv. * Normally this will just be the @dev lock, but when called for a * USB interface, @parent lock will be released as well. */ static void __device_driver_unlock(struct device *dev, struct device *parent) { device_unlock(dev); if (parent && dev->bus->need_parent_lock) device_unlock(parent); } /** * device_driver_attach - attach a specific driver to a specific device * @drv: Driver to attach * @dev: Device to attach it to * * Manually attach driver to a device. Will acquire both @dev lock and * @dev->parent lock if needed. Returns 0 on success, -ERR on failure. */ int device_driver_attach(const struct device_driver *drv, struct device *dev) { int ret; __device_driver_lock(dev, dev->parent); ret = __driver_probe_device(drv, dev); __device_driver_unlock(dev, dev->parent); /* also return probe errors as normal negative errnos */ if (ret > 0) ret = -ret; if (ret == -EPROBE_DEFER) return -EAGAIN; return ret; } EXPORT_SYMBOL_GPL(device_driver_attach); static void __driver_attach_async_helper(void *_dev, async_cookie_t cookie) { struct device *dev = _dev; const struct device_driver *drv; int ret; __device_driver_lock(dev, dev->parent); drv = dev->p->async_driver; dev->p->async_driver = NULL; ret = driver_probe_device(drv, dev); __device_driver_unlock(dev, dev->parent); dev_dbg(dev, "driver %s async attach completed: %d\n", drv->name, ret); put_device(dev); } static int __driver_attach(struct device *dev, void *data) { const struct device_driver *drv = data; bool async = false; int ret; /* * Lock device and try to bind to it. We drop the error * here and always return 0, because we need to keep trying * to bind to devices and some drivers will return an error * simply if it didn't support the device. * * driver_probe_device() will spit a warning if there * is an error. */ ret = driver_match_device(drv, dev); if (ret == 0) { /* no match */ return 0; } else if (ret == -EPROBE_DEFER) { dev_dbg(dev, "Device match requests probe deferral\n"); dev->can_match = true; driver_deferred_probe_add(dev); /* * Driver could not match with device, but may match with * another device on the bus. */ return 0; } else if (ret < 0) { dev_dbg(dev, "Bus failed to match device: %d\n", ret); /* * Driver could not match with device, but may match with * another device on the bus. */ return 0; } /* ret > 0 means positive match */ if (driver_allows_async_probing(drv)) { /* * Instead of probing the device synchronously we will * probe it asynchronously to allow for more parallelism. * * We only take the device lock here in order to guarantee * that the dev->driver and async_driver fields are protected */ dev_dbg(dev, "probing driver %s asynchronously\n", drv->name); device_lock(dev); if (!dev->driver && !dev->p->async_driver) { get_device(dev); dev->p->async_driver = drv; async = true; } device_unlock(dev); if (async) async_schedule_dev(__driver_attach_async_helper, dev); return 0; } __device_driver_lock(dev, dev->parent); driver_probe_device(drv, dev); __device_driver_unlock(dev, dev->parent); return 0; } /** * driver_attach - try to bind driver to devices. * @drv: driver. * * Walk the list of devices that the bus has on it and try to * match the driver with each one. If driver_probe_device() * returns 0 and the @dev->driver is set, we've found a * compatible pair. */ int driver_attach(const struct device_driver *drv) { /* The (void *) will be put back to const * in __driver_attach() */ return bus_for_each_dev(drv->bus, NULL, (void *)drv, __driver_attach); } EXPORT_SYMBOL_GPL(driver_attach); /* * __device_release_driver() must be called with @dev lock held. * When called for a USB interface, @dev->parent lock must be held as well. */ static void __device_release_driver(struct device *dev, struct device *parent) { struct device_driver *drv; drv = dev->driver; if (drv) { pm_runtime_get_sync(dev); while (device_links_busy(dev)) { __device_driver_unlock(dev, parent); device_links_unbind_consumers(dev); __device_driver_lock(dev, parent); /* * A concurrent invocation of the same function might * have released the driver successfully while this one * was waiting, so check for that. */ if (dev->driver != drv) { pm_runtime_put(dev); return; } } driver_sysfs_remove(dev); bus_notify(dev, BUS_NOTIFY_UNBIND_DRIVER); pm_runtime_put_sync(dev); device_remove(dev); if (dev->bus && dev->bus->dma_cleanup) dev->bus->dma_cleanup(dev); device_unbind_cleanup(dev); device_links_driver_cleanup(dev); klist_remove(&dev->p->knode_driver); device_pm_check_callbacks(dev); bus_notify(dev, BUS_NOTIFY_UNBOUND_DRIVER); kobject_uevent(&dev->kobj, KOBJ_UNBIND); } } void device_release_driver_internal(struct device *dev, const struct device_driver *drv, struct device *parent) { __device_driver_lock(dev, parent); if (!drv || drv == dev->driver) __device_release_driver(dev, parent); __device_driver_unlock(dev, parent); } /** * device_release_driver - manually detach device from driver. * @dev: device. * * Manually detach device from driver. * When called for a USB interface, @dev->parent lock must be held. * * If this function is to be called with @dev->parent lock held, ensure that * the device's consumers are unbound in advance or that their locks can be * acquired under the @dev->parent lock. */ void device_release_driver(struct device *dev) { /* * If anyone calls device_release_driver() recursively from * within their ->remove callback for the same device, they * will deadlock right here. */ device_release_driver_internal(dev, NULL, NULL); } EXPORT_SYMBOL_GPL(device_release_driver); /** * device_driver_detach - detach driver from a specific device * @dev: device to detach driver from * * Detach driver from device. Will acquire both @dev lock and @dev->parent * lock if needed. */ void device_driver_detach(struct device *dev) { device_release_driver_internal(dev, NULL, dev->parent); } /** * driver_detach - detach driver from all devices it controls. * @drv: driver. */ void driver_detach(const struct device_driver *drv) { struct device_private *dev_prv; struct device *dev; if (driver_allows_async_probing(drv)) async_synchronize_full(); for (;;) { spin_lock(&drv->p->klist_devices.k_lock); if (list_empty(&drv->p->klist_devices.k_list)) { spin_unlock(&drv->p->klist_devices.k_lock); break; } dev_prv = list_last_entry(&drv->p->klist_devices.k_list, struct device_private, knode_driver.n_node); dev = dev_prv->device; get_device(dev); spin_unlock(&drv->p->klist_devices.k_lock); device_release_driver_internal(dev, drv, dev->parent); put_device(dev); } } |
| 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 | // SPDX-License-Identifier: GPL-2.0-only /* * The NFC Controller Interface is the communication protocol between an * NFC Controller (NFCC) and a Device Host (DH). * * Copyright (C) 2011 Texas Instruments, Inc. * Copyright (C) 2014 Marvell International Ltd. * * Written by Ilan Elias <ilane@ti.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__ #include <linux/types.h> #include <linux/interrupt.h> #include <linux/wait.h> #include <linux/bitops.h> #include <linux/skbuff.h> #include "../nfc.h" #include <net/nfc/nci.h> #include <net/nfc/nci_core.h> #include <linux/nfc.h> /* Complete data exchange transaction and forward skb to nfc core */ void nci_data_exchange_complete(struct nci_dev *ndev, struct sk_buff *skb, __u8 conn_id, int err) { const struct nci_conn_info *conn_info; data_exchange_cb_t cb; void *cb_context; conn_info = nci_get_conn_info_by_conn_id(ndev, conn_id); if (!conn_info) { kfree_skb(skb); goto exit; } cb = conn_info->data_exchange_cb; cb_context = conn_info->data_exchange_cb_context; pr_debug("len %d, err %d\n", skb ? skb->len : 0, err); /* data exchange is complete, stop the data timer */ timer_delete_sync(&ndev->data_timer); clear_bit(NCI_DATA_EXCHANGE_TO, &ndev->flags); if (cb) { /* forward skb to nfc core */ cb(cb_context, skb, err); } else if (skb) { pr_err("no rx callback, dropping rx data...\n"); /* no waiting callback, free skb */ kfree_skb(skb); } exit: clear_bit(NCI_DATA_EXCHANGE, &ndev->flags); } /* ----------------- NCI TX Data ----------------- */ static inline void nci_push_data_hdr(struct nci_dev *ndev, __u8 conn_id, struct sk_buff *skb, __u8 pbf) { struct nci_data_hdr *hdr; int plen = skb->len; hdr = skb_push(skb, NCI_DATA_HDR_SIZE); hdr->conn_id = conn_id; hdr->rfu = 0; hdr->plen = plen; nci_mt_set((__u8 *)hdr, NCI_MT_DATA_PKT); nci_pbf_set((__u8 *)hdr, pbf); } int nci_conn_max_data_pkt_payload_size(struct nci_dev *ndev, __u8 conn_id) { const struct nci_conn_info *conn_info; conn_info = nci_get_conn_info_by_conn_id(ndev, conn_id); if (!conn_info) return -EPROTO; return conn_info->max_pkt_payload_len; } EXPORT_SYMBOL(nci_conn_max_data_pkt_payload_size); static int nci_queue_tx_data_frags(struct nci_dev *ndev, __u8 conn_id, struct sk_buff *skb) { const struct nci_conn_info *conn_info; int total_len = skb->len; const unsigned char *data = skb->data; unsigned long flags; struct sk_buff_head frags_q; struct sk_buff *skb_frag; int frag_len; int rc = 0; pr_debug("conn_id 0x%x, total_len %d\n", conn_id, total_len); conn_info = nci_get_conn_info_by_conn_id(ndev, conn_id); if (!conn_info) { rc = -EPROTO; goto exit; } __skb_queue_head_init(&frags_q); while (total_len) { frag_len = min_t(int, total_len, conn_info->max_pkt_payload_len); skb_frag = nci_skb_alloc(ndev, (NCI_DATA_HDR_SIZE + frag_len), GFP_ATOMIC); if (skb_frag == NULL) { rc = -ENOMEM; goto free_exit; } skb_reserve(skb_frag, NCI_DATA_HDR_SIZE); /* first, copy the data */ skb_put_data(skb_frag, data, frag_len); /* second, set the header */ nci_push_data_hdr(ndev, conn_id, skb_frag, ((total_len == frag_len) ? (NCI_PBF_LAST) : (NCI_PBF_CONT))); __skb_queue_tail(&frags_q, skb_frag); data += frag_len; total_len -= frag_len; pr_debug("frag_len %d, remaining total_len %d\n", frag_len, total_len); } /* queue all fragments atomically */ spin_lock_irqsave(&ndev->tx_q.lock, flags); while ((skb_frag = __skb_dequeue(&frags_q)) != NULL) __skb_queue_tail(&ndev->tx_q, skb_frag); spin_unlock_irqrestore(&ndev->tx_q.lock, flags); /* free the original skb */ kfree_skb(skb); goto exit; free_exit: while ((skb_frag = __skb_dequeue(&frags_q)) != NULL) kfree_skb(skb_frag); exit: return rc; } /* Send NCI data */ int nci_send_data(struct nci_dev *ndev, __u8 conn_id, struct sk_buff *skb) { const struct nci_conn_info *conn_info; int rc = 0; pr_debug("conn_id 0x%x, plen %d\n", conn_id, skb->len); conn_info = nci_get_conn_info_by_conn_id(ndev, conn_id); if (!conn_info) { rc = -EPROTO; goto free_exit; } /* check if the packet need to be fragmented */ if (skb->len <= conn_info->max_pkt_payload_len) { /* no need to fragment packet */ nci_push_data_hdr(ndev, conn_id, skb, NCI_PBF_LAST); skb_queue_tail(&ndev->tx_q, skb); } else { /* fragment packet and queue the fragments */ rc = nci_queue_tx_data_frags(ndev, conn_id, skb); if (rc) { pr_err("failed to fragment tx data packet\n"); goto free_exit; } } ndev->cur_conn_id = conn_id; queue_work(ndev->tx_wq, &ndev->tx_work); goto exit; free_exit: kfree_skb(skb); exit: return rc; } EXPORT_SYMBOL(nci_send_data); /* ----------------- NCI RX Data ----------------- */ static void nci_add_rx_data_frag(struct nci_dev *ndev, struct sk_buff *skb, __u8 pbf, __u8 conn_id, __u8 status) { int reassembly_len; int err = 0; if (status) { err = status; goto exit; } if (ndev->rx_data_reassembly) { reassembly_len = ndev->rx_data_reassembly->len; /* first, make enough room for the already accumulated data */ if (skb_cow_head(skb, reassembly_len)) { pr_err("error adding room for accumulated rx data\n"); kfree_skb(skb); skb = NULL; kfree_skb(ndev->rx_data_reassembly); ndev->rx_data_reassembly = NULL; err = -ENOMEM; goto exit; } /* second, combine the two fragments */ memcpy(skb_push(skb, reassembly_len), ndev->rx_data_reassembly->data, reassembly_len); /* third, free old reassembly */ kfree_skb(ndev->rx_data_reassembly); ndev->rx_data_reassembly = NULL; } if (pbf == NCI_PBF_CONT) { /* need to wait for next fragment, store skb and exit */ ndev->rx_data_reassembly = skb; return; } exit: if (ndev->nfc_dev->rf_mode == NFC_RF_TARGET) { /* Data received in Target mode, forward to nfc core */ err = nfc_tm_data_received(ndev->nfc_dev, skb); if (err) pr_err("unable to handle received data\n"); } else { nci_data_exchange_complete(ndev, skb, conn_id, err); } } /* Rx Data packet */ void nci_rx_data_packet(struct nci_dev *ndev, struct sk_buff *skb) { __u8 pbf = nci_pbf(skb->data); __u8 status = 0; __u8 conn_id = nci_conn_id(skb->data); const struct nci_conn_info *conn_info; pr_debug("len %d\n", skb->len); pr_debug("NCI RX: MT=data, PBF=%d, conn_id=%d, plen=%d\n", nci_pbf(skb->data), nci_conn_id(skb->data), nci_plen(skb->data)); conn_info = nci_get_conn_info_by_conn_id(ndev, nci_conn_id(skb->data)); if (!conn_info) { kfree_skb(skb); return; } /* strip the nci data header */ skb_pull(skb, NCI_DATA_HDR_SIZE); if (ndev->target_active_prot == NFC_PROTO_MIFARE || ndev->target_active_prot == NFC_PROTO_JEWEL || ndev->target_active_prot == NFC_PROTO_FELICA || ndev->target_active_prot == NFC_PROTO_ISO15693) { /* frame I/F => remove the status byte */ pr_debug("frame I/F => remove the status byte\n"); status = skb->data[skb->len - 1]; skb_trim(skb, (skb->len - 1)); } nci_add_rx_data_frag(ndev, skb, pbf, conn_id, nci_to_errno(status)); } |
| 2 2 2 2 1 1 2 2 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 | /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/seqlock.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/dst_metadata.h> #include <net/ip_tunnels.h> #include <net/vxlan.h> #include <net/erspan.h> #include <net/geneve.h> struct nft_tunnel { enum nft_tunnel_keys key:8; u8 dreg; enum nft_tunnel_mode mode:8; u8 len; }; static void nft_tunnel_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_tunnel *priv = nft_expr_priv(expr); u32 *dest = ®s->data[priv->dreg]; struct ip_tunnel_info *tun_info; tun_info = skb_tunnel_info(pkt->skb); switch (priv->key) { case NFT_TUNNEL_PATH: if (!tun_info) { nft_reg_store8(dest, false); return; } if (priv->mode == NFT_TUNNEL_MODE_NONE || (priv->mode == NFT_TUNNEL_MODE_RX && !(tun_info->mode & IP_TUNNEL_INFO_TX)) || (priv->mode == NFT_TUNNEL_MODE_TX && (tun_info->mode & IP_TUNNEL_INFO_TX))) nft_reg_store8(dest, true); else nft_reg_store8(dest, false); break; case NFT_TUNNEL_ID: if (!tun_info) { regs->verdict.code = NFT_BREAK; return; } if (priv->mode == NFT_TUNNEL_MODE_NONE || (priv->mode == NFT_TUNNEL_MODE_RX && !(tun_info->mode & IP_TUNNEL_INFO_TX)) || (priv->mode == NFT_TUNNEL_MODE_TX && (tun_info->mode & IP_TUNNEL_INFO_TX))) *dest = ntohl(tunnel_id_to_key32(tun_info->key.tun_id)); else regs->verdict.code = NFT_BREAK; break; default: WARN_ON(1); regs->verdict.code = NFT_BREAK; } } static const struct nla_policy nft_tunnel_policy[NFTA_TUNNEL_MAX + 1] = { [NFTA_TUNNEL_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_TUNNEL_DREG] = { .type = NLA_U32 }, [NFTA_TUNNEL_MODE] = NLA_POLICY_MAX(NLA_BE32, 255), }; static int nft_tunnel_get_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_tunnel *priv = nft_expr_priv(expr); u32 len; if (!tb[NFTA_TUNNEL_KEY] || !tb[NFTA_TUNNEL_DREG]) return -EINVAL; priv->key = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY])); switch (priv->key) { case NFT_TUNNEL_PATH: len = sizeof(u8); break; case NFT_TUNNEL_ID: len = sizeof(u32); break; default: return -EOPNOTSUPP; } if (tb[NFTA_TUNNEL_MODE]) { priv->mode = ntohl(nla_get_be32(tb[NFTA_TUNNEL_MODE])); if (priv->mode > NFT_TUNNEL_MODE_MAX) return -EOPNOTSUPP; } else { priv->mode = NFT_TUNNEL_MODE_NONE; } priv->len = len; return nft_parse_register_store(ctx, tb[NFTA_TUNNEL_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, len); } static int nft_tunnel_get_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_tunnel *priv = nft_expr_priv(expr); if (nla_put_be32(skb, NFTA_TUNNEL_KEY, htonl(priv->key))) goto nla_put_failure; if (nft_dump_register(skb, NFTA_TUNNEL_DREG, priv->dreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_TUNNEL_MODE, htonl(priv->mode))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static bool nft_tunnel_get_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { const struct nft_tunnel *priv = nft_expr_priv(expr); const struct nft_tunnel *tunnel; if (!nft_reg_track_cmp(track, expr, priv->dreg)) { nft_reg_track_update(track, expr, priv->dreg, priv->len); return false; } tunnel = nft_expr_priv(track->regs[priv->dreg].selector); if (priv->key != tunnel->key || priv->dreg != tunnel->dreg || priv->mode != tunnel->mode) { nft_reg_track_update(track, expr, priv->dreg, priv->len); return false; } if (!track->regs[priv->dreg].bitwise) return true; return false; } static struct nft_expr_type nft_tunnel_type; static const struct nft_expr_ops nft_tunnel_get_ops = { .type = &nft_tunnel_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_tunnel)), .eval = nft_tunnel_get_eval, .init = nft_tunnel_get_init, .dump = nft_tunnel_get_dump, .reduce = nft_tunnel_get_reduce, }; static struct nft_expr_type nft_tunnel_type __read_mostly = { .name = "tunnel", .family = NFPROTO_NETDEV, .ops = &nft_tunnel_get_ops, .policy = nft_tunnel_policy, .maxattr = NFTA_TUNNEL_MAX, .owner = THIS_MODULE, }; struct nft_tunnel_opts { union { struct vxlan_metadata vxlan; struct erspan_metadata erspan; u8 data[IP_TUNNEL_OPTS_MAX]; } u; IP_TUNNEL_DECLARE_FLAGS(flags); u32 len; }; struct nft_tunnel_obj { struct metadata_dst *md; struct nft_tunnel_opts opts; }; static const struct nla_policy nft_tunnel_ip_policy[NFTA_TUNNEL_KEY_IP_MAX + 1] = { [NFTA_TUNNEL_KEY_IP_SRC] = { .type = NLA_U32 }, [NFTA_TUNNEL_KEY_IP_DST] = { .type = NLA_U32 }, }; static int nft_tunnel_obj_ip_init(const struct nft_ctx *ctx, const struct nlattr *attr, struct ip_tunnel_info *info) { struct nlattr *tb[NFTA_TUNNEL_KEY_IP_MAX + 1]; int err; err = nla_parse_nested_deprecated(tb, NFTA_TUNNEL_KEY_IP_MAX, attr, nft_tunnel_ip_policy, NULL); if (err < 0) return err; if (!tb[NFTA_TUNNEL_KEY_IP_DST]) return -EINVAL; if (tb[NFTA_TUNNEL_KEY_IP_SRC]) info->key.u.ipv4.src = nla_get_be32(tb[NFTA_TUNNEL_KEY_IP_SRC]); if (tb[NFTA_TUNNEL_KEY_IP_DST]) info->key.u.ipv4.dst = nla_get_be32(tb[NFTA_TUNNEL_KEY_IP_DST]); return 0; } static const struct nla_policy nft_tunnel_ip6_policy[NFTA_TUNNEL_KEY_IP6_MAX + 1] = { [NFTA_TUNNEL_KEY_IP6_SRC] = { .len = sizeof(struct in6_addr), }, [NFTA_TUNNEL_KEY_IP6_DST] = { .len = sizeof(struct in6_addr), }, [NFTA_TUNNEL_KEY_IP6_FLOWLABEL] = { .type = NLA_U32, } }; static int nft_tunnel_obj_ip6_init(const struct nft_ctx *ctx, const struct nlattr *attr, struct ip_tunnel_info *info) { struct nlattr *tb[NFTA_TUNNEL_KEY_IP6_MAX + 1]; int err; err = nla_parse_nested_deprecated(tb, NFTA_TUNNEL_KEY_IP6_MAX, attr, nft_tunnel_ip6_policy, NULL); if (err < 0) return err; if (!tb[NFTA_TUNNEL_KEY_IP6_DST]) return -EINVAL; if (tb[NFTA_TUNNEL_KEY_IP6_SRC]) { memcpy(&info->key.u.ipv6.src, nla_data(tb[NFTA_TUNNEL_KEY_IP6_SRC]), sizeof(struct in6_addr)); } if (tb[NFTA_TUNNEL_KEY_IP6_DST]) { memcpy(&info->key.u.ipv6.dst, nla_data(tb[NFTA_TUNNEL_KEY_IP6_DST]), sizeof(struct in6_addr)); } if (tb[NFTA_TUNNEL_KEY_IP6_FLOWLABEL]) info->key.label = nla_get_be32(tb[NFTA_TUNNEL_KEY_IP6_FLOWLABEL]); info->mode |= IP_TUNNEL_INFO_IPV6; return 0; } static const struct nla_policy nft_tunnel_opts_vxlan_policy[NFTA_TUNNEL_KEY_VXLAN_MAX + 1] = { [NFTA_TUNNEL_KEY_VXLAN_GBP] = { .type = NLA_U32 }, }; static int nft_tunnel_obj_vxlan_init(const struct nlattr *attr, struct nft_tunnel_opts *opts) { struct nlattr *tb[NFTA_TUNNEL_KEY_VXLAN_MAX + 1]; int err; err = nla_parse_nested_deprecated(tb, NFTA_TUNNEL_KEY_VXLAN_MAX, attr, nft_tunnel_opts_vxlan_policy, NULL); if (err < 0) return err; if (!tb[NFTA_TUNNEL_KEY_VXLAN_GBP]) return -EINVAL; opts->u.vxlan.gbp = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_VXLAN_GBP])); opts->len = sizeof(struct vxlan_metadata); ip_tunnel_flags_zero(opts->flags); __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, opts->flags); return 0; } static const struct nla_policy nft_tunnel_opts_erspan_policy[NFTA_TUNNEL_KEY_ERSPAN_MAX + 1] = { [NFTA_TUNNEL_KEY_ERSPAN_VERSION] = { .type = NLA_U32 }, [NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX] = { .type = NLA_U32 }, [NFTA_TUNNEL_KEY_ERSPAN_V2_DIR] = { .type = NLA_U8 }, [NFTA_TUNNEL_KEY_ERSPAN_V2_HWID] = { .type = NLA_U8 }, }; static int nft_tunnel_obj_erspan_init(const struct nlattr *attr, struct nft_tunnel_opts *opts) { struct nlattr *tb[NFTA_TUNNEL_KEY_ERSPAN_MAX + 1]; uint8_t hwid, dir; int err, version; err = nla_parse_nested_deprecated(tb, NFTA_TUNNEL_KEY_ERSPAN_MAX, attr, nft_tunnel_opts_erspan_policy, NULL); if (err < 0) return err; if (!tb[NFTA_TUNNEL_KEY_ERSPAN_VERSION]) return -EINVAL; version = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_ERSPAN_VERSION])); switch (version) { case ERSPAN_VERSION: if (!tb[NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX]) return -EINVAL; opts->u.erspan.u.index = nla_get_be32(tb[NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX]); break; case ERSPAN_VERSION2: if (!tb[NFTA_TUNNEL_KEY_ERSPAN_V2_DIR] || !tb[NFTA_TUNNEL_KEY_ERSPAN_V2_HWID]) return -EINVAL; hwid = nla_get_u8(tb[NFTA_TUNNEL_KEY_ERSPAN_V2_HWID]); dir = nla_get_u8(tb[NFTA_TUNNEL_KEY_ERSPAN_V2_DIR]); set_hwid(&opts->u.erspan.u.md2, hwid); opts->u.erspan.u.md2.dir = dir; break; default: return -EOPNOTSUPP; } opts->u.erspan.version = version; opts->len = sizeof(struct erspan_metadata); ip_tunnel_flags_zero(opts->flags); __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, opts->flags); return 0; } static const struct nla_policy nft_tunnel_opts_geneve_policy[NFTA_TUNNEL_KEY_GENEVE_MAX + 1] = { [NFTA_TUNNEL_KEY_GENEVE_CLASS] = { .type = NLA_U16 }, [NFTA_TUNNEL_KEY_GENEVE_TYPE] = { .type = NLA_U8 }, [NFTA_TUNNEL_KEY_GENEVE_DATA] = { .type = NLA_BINARY, .len = 127 }, }; static int nft_tunnel_obj_geneve_init(const struct nlattr *attr, struct nft_tunnel_opts *opts) { struct geneve_opt *opt = (struct geneve_opt *)(opts->u.data + opts->len); struct nlattr *tb[NFTA_TUNNEL_KEY_GENEVE_MAX + 1]; int err, data_len; err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_GENEVE_MAX, attr, nft_tunnel_opts_geneve_policy, NULL); if (err < 0) return err; if (!tb[NFTA_TUNNEL_KEY_GENEVE_CLASS] || !tb[NFTA_TUNNEL_KEY_GENEVE_TYPE] || !tb[NFTA_TUNNEL_KEY_GENEVE_DATA]) return -EINVAL; attr = tb[NFTA_TUNNEL_KEY_GENEVE_DATA]; data_len = nla_len(attr); if (data_len % 4) return -EINVAL; opts->len += sizeof(*opt) + data_len; if (opts->len > IP_TUNNEL_OPTS_MAX) return -EINVAL; memcpy(opt->opt_data, nla_data(attr), data_len); opt->length = data_len / 4; opt->opt_class = nla_get_be16(tb[NFTA_TUNNEL_KEY_GENEVE_CLASS]); opt->type = nla_get_u8(tb[NFTA_TUNNEL_KEY_GENEVE_TYPE]); ip_tunnel_flags_zero(opts->flags); __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, opts->flags); return 0; } static const struct nla_policy nft_tunnel_opts_policy[NFTA_TUNNEL_KEY_OPTS_MAX + 1] = { [NFTA_TUNNEL_KEY_OPTS_UNSPEC] = { .strict_start_type = NFTA_TUNNEL_KEY_OPTS_GENEVE }, [NFTA_TUNNEL_KEY_OPTS_VXLAN] = { .type = NLA_NESTED, }, [NFTA_TUNNEL_KEY_OPTS_ERSPAN] = { .type = NLA_NESTED, }, [NFTA_TUNNEL_KEY_OPTS_GENEVE] = { .type = NLA_NESTED, }, }; static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx, const struct nlattr *attr, struct ip_tunnel_info *info, struct nft_tunnel_opts *opts) { struct nlattr *nla; int err, rem; u32 type = 0; err = nla_validate_nested_deprecated(attr, NFTA_TUNNEL_KEY_OPTS_MAX, nft_tunnel_opts_policy, NULL); if (err < 0) return err; nla_for_each_attr(nla, nla_data(attr), nla_len(attr), rem) { switch (nla_type(nla)) { case NFTA_TUNNEL_KEY_OPTS_VXLAN: if (type) return -EINVAL; err = nft_tunnel_obj_vxlan_init(nla, opts); if (err) return err; type = IP_TUNNEL_VXLAN_OPT_BIT; break; case NFTA_TUNNEL_KEY_OPTS_ERSPAN: if (type) return -EINVAL; err = nft_tunnel_obj_erspan_init(nla, opts); if (err) return err; type = IP_TUNNEL_ERSPAN_OPT_BIT; break; case NFTA_TUNNEL_KEY_OPTS_GENEVE: if (type && type != IP_TUNNEL_GENEVE_OPT_BIT) return -EINVAL; err = nft_tunnel_obj_geneve_init(nla, opts); if (err) return err; type = IP_TUNNEL_GENEVE_OPT_BIT; break; default: return -EOPNOTSUPP; } } return err; } static const struct nla_policy nft_tunnel_key_policy[NFTA_TUNNEL_KEY_MAX + 1] = { [NFTA_TUNNEL_KEY_IP] = { .type = NLA_NESTED, }, [NFTA_TUNNEL_KEY_IP6] = { .type = NLA_NESTED, }, [NFTA_TUNNEL_KEY_ID] = { .type = NLA_U32, }, [NFTA_TUNNEL_KEY_FLAGS] = { .type = NLA_U32, }, [NFTA_TUNNEL_KEY_TOS] = { .type = NLA_U8, }, [NFTA_TUNNEL_KEY_TTL] = { .type = NLA_U8, }, [NFTA_TUNNEL_KEY_SPORT] = { .type = NLA_U16, }, [NFTA_TUNNEL_KEY_DPORT] = { .type = NLA_U16, }, [NFTA_TUNNEL_KEY_OPTS] = { .type = NLA_NESTED, }, }; static int nft_tunnel_obj_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) { struct nft_tunnel_obj *priv = nft_obj_data(obj); struct ip_tunnel_info info; struct metadata_dst *md; int err; if (!tb[NFTA_TUNNEL_KEY_ID]) return -EINVAL; memset(&info, 0, sizeof(info)); info.mode = IP_TUNNEL_INFO_TX; info.key.tun_id = key32_to_tunnel_id(nla_get_be32(tb[NFTA_TUNNEL_KEY_ID])); __set_bit(IP_TUNNEL_KEY_BIT, info.key.tun_flags); __set_bit(IP_TUNNEL_CSUM_BIT, info.key.tun_flags); __set_bit(IP_TUNNEL_NOCACHE_BIT, info.key.tun_flags); if (tb[NFTA_TUNNEL_KEY_IP]) { err = nft_tunnel_obj_ip_init(ctx, tb[NFTA_TUNNEL_KEY_IP], &info); if (err < 0) return err; } else if (tb[NFTA_TUNNEL_KEY_IP6]) { err = nft_tunnel_obj_ip6_init(ctx, tb[NFTA_TUNNEL_KEY_IP6], &info); if (err < 0) return err; } else { return -EINVAL; } if (tb[NFTA_TUNNEL_KEY_SPORT]) { info.key.tp_src = nla_get_be16(tb[NFTA_TUNNEL_KEY_SPORT]); } if (tb[NFTA_TUNNEL_KEY_DPORT]) { info.key.tp_dst = nla_get_be16(tb[NFTA_TUNNEL_KEY_DPORT]); } if (tb[NFTA_TUNNEL_KEY_FLAGS]) { u32 tun_flags; tun_flags = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_FLAGS])); if (tun_flags & ~NFT_TUNNEL_F_MASK) return -EOPNOTSUPP; if (tun_flags & NFT_TUNNEL_F_ZERO_CSUM_TX) __clear_bit(IP_TUNNEL_CSUM_BIT, info.key.tun_flags); if (tun_flags & NFT_TUNNEL_F_DONT_FRAGMENT) __set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info.key.tun_flags); if (tun_flags & NFT_TUNNEL_F_SEQ_NUMBER) __set_bit(IP_TUNNEL_SEQ_BIT, info.key.tun_flags); } if (tb[NFTA_TUNNEL_KEY_TOS]) info.key.tos = nla_get_u8(tb[NFTA_TUNNEL_KEY_TOS]); info.key.ttl = nla_get_u8_default(tb[NFTA_TUNNEL_KEY_TTL], U8_MAX); if (tb[NFTA_TUNNEL_KEY_OPTS]) { err = nft_tunnel_obj_opts_init(ctx, tb[NFTA_TUNNEL_KEY_OPTS], &info, &priv->opts); if (err < 0) return err; } md = metadata_dst_alloc(priv->opts.len, METADATA_IP_TUNNEL, GFP_KERNEL_ACCOUNT); if (!md) return -ENOMEM; memcpy(&md->u.tun_info, &info, sizeof(info)); #ifdef CONFIG_DST_CACHE err = dst_cache_init(&md->u.tun_info.dst_cache, GFP_KERNEL_ACCOUNT); if (err < 0) { metadata_dst_free(md); return err; } #endif ip_tunnel_info_opts_set(&md->u.tun_info, &priv->opts.u, priv->opts.len, priv->opts.flags); priv->md = md; return 0; } static inline void nft_tunnel_obj_eval(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_tunnel_obj *priv = nft_obj_data(obj); struct sk_buff *skb = pkt->skb; skb_dst_drop(skb); dst_hold((struct dst_entry *) priv->md); skb_dst_set(skb, (struct dst_entry *) priv->md); } static int nft_tunnel_ip_dump(struct sk_buff *skb, struct ip_tunnel_info *info) { struct nlattr *nest; if (info->mode & IP_TUNNEL_INFO_IPV6) { nest = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_IP6); if (!nest) return -1; if (nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_SRC, &info->key.u.ipv6.src) < 0 || nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_DST, &info->key.u.ipv6.dst) < 0 || nla_put_be32(skb, NFTA_TUNNEL_KEY_IP6_FLOWLABEL, info->key.label)) { nla_nest_cancel(skb, nest); return -1; } nla_nest_end(skb, nest); } else { nest = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_IP); if (!nest) return -1; if (nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_SRC, info->key.u.ipv4.src) < 0 || nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_DST, info->key.u.ipv4.dst) < 0) { nla_nest_cancel(skb, nest); return -1; } nla_nest_end(skb, nest); } return 0; } static int nft_tunnel_opts_dump(struct sk_buff *skb, struct nft_tunnel_obj *priv) { struct nft_tunnel_opts *opts = &priv->opts; struct nlattr *nest, *inner; nest = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS); if (!nest) return -1; if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, opts->flags)) { inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_VXLAN); if (!inner) goto failure; if (nla_put_be32(skb, NFTA_TUNNEL_KEY_VXLAN_GBP, htonl(opts->u.vxlan.gbp))) goto inner_failure; nla_nest_end(skb, inner); } else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, opts->flags)) { inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_ERSPAN); if (!inner) goto failure; if (nla_put_be32(skb, NFTA_TUNNEL_KEY_ERSPAN_VERSION, htonl(opts->u.erspan.version))) goto inner_failure; switch (opts->u.erspan.version) { case ERSPAN_VERSION: if (nla_put_be32(skb, NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX, opts->u.erspan.u.index)) goto inner_failure; break; case ERSPAN_VERSION2: if (nla_put_u8(skb, NFTA_TUNNEL_KEY_ERSPAN_V2_HWID, get_hwid(&opts->u.erspan.u.md2)) || nla_put_u8(skb, NFTA_TUNNEL_KEY_ERSPAN_V2_DIR, opts->u.erspan.u.md2.dir)) goto inner_failure; break; } nla_nest_end(skb, inner); } else if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, opts->flags)) { struct geneve_opt *opt; int offset = 0; while (opts->len > offset) { inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE); if (!inner) goto failure; opt = (struct geneve_opt *)(opts->u.data + offset); if (nla_put_be16(skb, NFTA_TUNNEL_KEY_GENEVE_CLASS, opt->opt_class) || nla_put_u8(skb, NFTA_TUNNEL_KEY_GENEVE_TYPE, opt->type) || nla_put(skb, NFTA_TUNNEL_KEY_GENEVE_DATA, opt->length * 4, opt->opt_data)) goto inner_failure; offset += sizeof(*opt) + opt->length * 4; nla_nest_end(skb, inner); } } nla_nest_end(skb, nest); return 0; inner_failure: nla_nest_cancel(skb, inner); failure: nla_nest_cancel(skb, nest); return -1; } static int nft_tunnel_ports_dump(struct sk_buff *skb, struct ip_tunnel_info *info) { if (nla_put_be16(skb, NFTA_TUNNEL_KEY_SPORT, info->key.tp_src) < 0 || nla_put_be16(skb, NFTA_TUNNEL_KEY_DPORT, info->key.tp_dst) < 0) return -1; return 0; } static int nft_tunnel_flags_dump(struct sk_buff *skb, struct ip_tunnel_info *info) { u32 flags = 0; if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags)) flags |= NFT_TUNNEL_F_DONT_FRAGMENT; if (!test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags)) flags |= NFT_TUNNEL_F_ZERO_CSUM_TX; if (test_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags)) flags |= NFT_TUNNEL_F_SEQ_NUMBER; if (nla_put_be32(skb, NFTA_TUNNEL_KEY_FLAGS, htonl(flags)) < 0) return -1; return 0; } static int nft_tunnel_obj_dump(struct sk_buff *skb, struct nft_object *obj, bool reset) { struct nft_tunnel_obj *priv = nft_obj_data(obj); struct ip_tunnel_info *info = &priv->md->u.tun_info; if (nla_put_be32(skb, NFTA_TUNNEL_KEY_ID, tunnel_id_to_key32(info->key.tun_id)) || nft_tunnel_ip_dump(skb, info) < 0 || nft_tunnel_ports_dump(skb, info) < 0 || nft_tunnel_flags_dump(skb, info) < 0 || nla_put_u8(skb, NFTA_TUNNEL_KEY_TOS, info->key.tos) || nla_put_u8(skb, NFTA_TUNNEL_KEY_TTL, info->key.ttl) || nft_tunnel_opts_dump(skb, priv) < 0) goto nla_put_failure; return 0; nla_put_failure: return -1; } static void nft_tunnel_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { struct nft_tunnel_obj *priv = nft_obj_data(obj); metadata_dst_free(priv->md); } static struct nft_object_type nft_tunnel_obj_type; static const struct nft_object_ops nft_tunnel_obj_ops = { .type = &nft_tunnel_obj_type, .size = sizeof(struct nft_tunnel_obj), .eval = nft_tunnel_obj_eval, .init = nft_tunnel_obj_init, .destroy = nft_tunnel_obj_destroy, .dump = nft_tunnel_obj_dump, }; static struct nft_object_type nft_tunnel_obj_type __read_mostly = { .type = NFT_OBJECT_TUNNEL, .family = NFPROTO_NETDEV, .ops = &nft_tunnel_obj_ops, .maxattr = NFTA_TUNNEL_KEY_MAX, .policy = nft_tunnel_key_policy, .owner = THIS_MODULE, }; static int __init nft_tunnel_module_init(void) { int err; err = nft_register_expr(&nft_tunnel_type); if (err < 0) return err; err = nft_register_obj(&nft_tunnel_obj_type); if (err < 0) nft_unregister_expr(&nft_tunnel_type); return err; } static void __exit nft_tunnel_module_exit(void) { nft_unregister_obj(&nft_tunnel_obj_type); nft_unregister_expr(&nft_tunnel_type); } module_init(nft_tunnel_module_init); module_exit(nft_tunnel_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_ALIAS_NFT_EXPR("tunnel"); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_TUNNEL); MODULE_DESCRIPTION("nftables tunnel expression support"); |
| 648 529 648 161 406 403 319 364 692 372 217 580 578 328 468 389 97 73 97 370 259 218 217 218 371 258 372 372 161 341 342 682 717 679 250 314 364 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Red Black Trees (C) 1999 Andrea Arcangeli <andrea@suse.de> (C) 2002 David Woodhouse <dwmw2@infradead.org> (C) 2012 Michel Lespinasse <walken@google.com> linux/include/linux/rbtree_augmented.h */ #ifndef _LINUX_RBTREE_AUGMENTED_H #define _LINUX_RBTREE_AUGMENTED_H #include <linux/compiler.h> #include <linux/rbtree.h> #include <linux/rcupdate.h> /* * Please note - only struct rb_augment_callbacks and the prototypes for * rb_insert_augmented() and rb_erase_augmented() are intended to be public. * The rest are implementation details you are not expected to depend on. * * See Documentation/core-api/rbtree.rst for documentation and samples. */ struct rb_augment_callbacks { void (*propagate)(struct rb_node *node, struct rb_node *stop); void (*copy)(struct rb_node *old, struct rb_node *new); void (*rotate)(struct rb_node *old, struct rb_node *new); }; extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); /* * Fixup the rbtree and update the augmented information when rebalancing. * * On insertion, the user must update the augmented information on the path * leading to the inserted node, then call rb_link_node() as usual and * rb_insert_augmented() instead of the usual rb_insert_color() call. * If rb_insert_augmented() rebalances the rbtree, it will callback into * a user provided function to update the augmented information on the * affected subtrees. */ static inline void rb_insert_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { __rb_insert_augmented(node, root, augment->rotate); } static inline void rb_insert_augmented_cached(struct rb_node *node, struct rb_root_cached *root, bool newleft, const struct rb_augment_callbacks *augment) { if (newleft) root->rb_leftmost = node; rb_insert_augmented(node, &root->rb_root, augment); } static __always_inline struct rb_node * rb_add_augmented_cached(struct rb_node *node, struct rb_root_cached *tree, bool (*less)(struct rb_node *, const struct rb_node *), const struct rb_augment_callbacks *augment) { struct rb_node **link = &tree->rb_root.rb_node; struct rb_node *parent = NULL; bool leftmost = true; while (*link) { parent = *link; if (less(node, parent)) { link = &parent->rb_left; } else { link = &parent->rb_right; leftmost = false; } } rb_link_node(node, parent, link); augment->propagate(parent, NULL); /* suboptimal */ rb_insert_augmented_cached(node, tree, leftmost, augment); return leftmost ? node : NULL; } /* * Template for declaring augmented rbtree callbacks (generic case) * * RBSTATIC: 'static' or empty * RBNAME: name of the rb_augment_callbacks structure * RBSTRUCT: struct type of the tree nodes * RBFIELD: name of struct rb_node field within RBSTRUCT * RBAUGMENTED: name of field within RBSTRUCT holding data for subtree * RBCOMPUTE: name of function that recomputes the RBAUGMENTED data */ #define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE) \ static inline void \ RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop) \ { \ while (rb != stop) { \ RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD); \ if (RBCOMPUTE(node, true)) \ break; \ rb = rb_parent(&node->RBFIELD); \ } \ } \ static inline void \ RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ new->RBAUGMENTED = old->RBAUGMENTED; \ } \ static void \ RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ new->RBAUGMENTED = old->RBAUGMENTED; \ RBCOMPUTE(old, false); \ } \ RBSTATIC const struct rb_augment_callbacks RBNAME = { \ .propagate = RBNAME ## _propagate, \ .copy = RBNAME ## _copy, \ .rotate = RBNAME ## _rotate \ }; /* * Template for declaring augmented rbtree callbacks, * computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes. * * RBSTATIC: 'static' or empty * RBNAME: name of the rb_augment_callbacks structure * RBSTRUCT: struct type of the tree nodes * RBFIELD: name of struct rb_node field within RBSTRUCT * RBTYPE: type of the RBAUGMENTED field * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree * RBCOMPUTE: name of function that returns the per-node RBTYPE scalar */ #define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ RBTYPE, RBAUGMENTED, RBCOMPUTE) \ static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit) \ { \ RBSTRUCT *child; \ RBTYPE max = RBCOMPUTE(node); \ if (node->RBFIELD.rb_left) { \ child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD); \ if (child->RBAUGMENTED > max) \ max = child->RBAUGMENTED; \ } \ if (node->RBFIELD.rb_right) { \ child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD); \ if (child->RBAUGMENTED > max) \ max = child->RBAUGMENTED; \ } \ if (exit && node->RBAUGMENTED == max) \ return true; \ node->RBAUGMENTED = max; \ return false; \ } \ RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max) #define RB_RED 0 #define RB_BLACK 1 #define __rb_parent(pc) ((struct rb_node *)(pc & ~3)) #define __rb_color(pc) ((pc) & 1) #define __rb_is_black(pc) __rb_color(pc) #define __rb_is_red(pc) (!__rb_color(pc)) #define rb_color(rb) __rb_color((rb)->__rb_parent_color) #define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color) #define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color) static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) { rb->__rb_parent_color = rb_color(rb) + (unsigned long)p; } static inline void rb_set_parent_color(struct rb_node *rb, struct rb_node *p, int color) { rb->__rb_parent_color = (unsigned long)p + color; } static inline void __rb_change_child(struct rb_node *old, struct rb_node *new, struct rb_node *parent, struct rb_root *root) { if (parent) { if (parent->rb_left == old) WRITE_ONCE(parent->rb_left, new); else WRITE_ONCE(parent->rb_right, new); } else WRITE_ONCE(root->rb_node, new); } static inline void __rb_change_child_rcu(struct rb_node *old, struct rb_node *new, struct rb_node *parent, struct rb_root *root) { if (parent) { if (parent->rb_left == old) rcu_assign_pointer(parent->rb_left, new); else rcu_assign_pointer(parent->rb_right, new); } else rcu_assign_pointer(root->rb_node, new); } extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); static __always_inline struct rb_node * __rb_erase_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { struct rb_node *child = node->rb_right; struct rb_node *tmp = node->rb_left; struct rb_node *parent, *rebalance; unsigned long pc; if (!tmp) { /* * Case 1: node to erase has no more than 1 child (easy!) * * Note that if there is one child it must be red due to 5) * and node must be black due to 4). We adjust colors locally * so as to bypass __rb_erase_color() later on. */ pc = node->__rb_parent_color; parent = __rb_parent(pc); __rb_change_child(node, child, parent, root); if (child) { child->__rb_parent_color = pc; rebalance = NULL; } else rebalance = __rb_is_black(pc) ? parent : NULL; tmp = parent; } else if (!child) { /* Still case 1, but this time the child is node->rb_left */ tmp->__rb_parent_color = pc = node->__rb_parent_color; parent = __rb_parent(pc); __rb_change_child(node, tmp, parent, root); rebalance = NULL; tmp = parent; } else { struct rb_node *successor = child, *child2; tmp = child->rb_left; if (!tmp) { /* * Case 2: node's successor is its right child * * (n) (s) * / \ / \ * (x) (s) -> (x) (c) * \ * (c) */ parent = successor; child2 = successor->rb_right; augment->copy(node, successor); } else { /* * Case 3: node's successor is leftmost under * node's right child subtree * * (n) (s) * / \ / \ * (x) (y) -> (x) (y) * / / * (p) (p) * / / * (s) (c) * \ * (c) */ do { parent = successor; successor = tmp; tmp = tmp->rb_left; } while (tmp); child2 = successor->rb_right; WRITE_ONCE(parent->rb_left, child2); WRITE_ONCE(successor->rb_right, child); rb_set_parent(child, successor); augment->copy(node, successor); augment->propagate(parent, successor); } tmp = node->rb_left; WRITE_ONCE(successor->rb_left, tmp); rb_set_parent(tmp, successor); pc = node->__rb_parent_color; tmp = __rb_parent(pc); __rb_change_child(node, successor, tmp, root); if (child2) { rb_set_parent_color(child2, parent, RB_BLACK); rebalance = NULL; } else { rebalance = rb_is_black(successor) ? parent : NULL; } successor->__rb_parent_color = pc; tmp = successor; } augment->propagate(tmp, NULL); return rebalance; } static __always_inline void rb_erase_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { struct rb_node *rebalance = __rb_erase_augmented(node, root, augment); if (rebalance) __rb_erase_color(rebalance, root, augment->rotate); } static __always_inline void rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root, const struct rb_augment_callbacks *augment) { if (root->rb_leftmost == node) root->rb_leftmost = rb_next(node); rb_erase_augmented(node, &root->rb_root, augment); } #endif /* _LINUX_RBTREE_AUGMENTED_H */ |
| 902 911 899 183 4 4 4 4 4 4 2 2 3 2 2 4 20 21 1 21 17 4 4 21 4 2 2 4 1 1 2 2 4 2 2 2 2 21 20 21 1 19 2 17 17 17 17 792 793 799 792 709 734 802 707 706 709 620 619 614 614 1 620 611 1 615 618 619 593 617 344 346 345 326 339 345 175 181 178 171 3 177 180 165 177 3 333 334 333 336 334 4 4 4 4 4 4 1 332 812 810 805 788 336 339 338 339 295 334 337 286 6 3 602 604 606 607 608 609 604 605 3 3 604 602 602 607 1 1 1 1 603 608 607 601 5 5 4 4 4 1 602 602 605 604 604 609 605 4 605 601 1 603 733 734 737 734 733 730 704 740 713 704 721 3 3 3 3 3 3 3 632 632 633 3 3 3 3 86 35 577 578 579 132 132 11 11 11 129 132 132 131 132 131 5 134 32 1 32 21 22 21 32 31 32 7 7 7 7 7 3 3 3 3 3 3 1 1 1 1 1 1 1 768 741 68 770 761 769 765 68 68 769 610 599 605 609 602 605 608 605 604 608 336 337 337 335 334 336 335 337 336 334 336 337 334 1 336 332 336 335 313 310 313 41 31 41 31 383 380 322 321 335 384 321 17 17 17 17 17 17 1 17 16 17 13 17 17 12 17 17 16 12 17 17 4 4 5 5 5 4 5 5 5 5 5 5 5 1 1 5 5 5 5 5 4 4 5 5 1 4 5 5 5 5 1 4 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 | // SPDX-License-Identifier: GPL-2.0-only /* * fs/kernfs/dir.c - kernfs directory implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> */ #include <linux/sched.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/idr.h> #include <linux/slab.h> #include <linux/security.h> #include <linux/hash.h> #include "kernfs-internal.h" /* * Don't use rename_lock to piggy back on pr_cont_buf. We don't want to * call pr_cont() while holding rename_lock. Because sometimes pr_cont() * will perform wakeups when releasing console_sem. Holding rename_lock * will introduce deadlock if the scheduler reads the kernfs_name in the * wakeup path. */ static DEFINE_SPINLOCK(kernfs_pr_cont_lock); static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by pr_cont_lock */ #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) static bool __kernfs_active(struct kernfs_node *kn) { return atomic_read(&kn->active) >= 0; } static bool kernfs_active(struct kernfs_node *kn) { lockdep_assert_held(&kernfs_root(kn)->kernfs_rwsem); return __kernfs_active(kn); } static bool kernfs_lockdep(struct kernfs_node *kn) { #ifdef CONFIG_DEBUG_LOCK_ALLOC return kn->flags & KERNFS_LOCKDEP; #else return false; #endif } /* kernfs_node_depth - compute depth from @from to @to */ static size_t kernfs_depth(struct kernfs_node *from, struct kernfs_node *to) { size_t depth = 0; while (rcu_dereference(to->__parent) && to != from) { depth++; to = rcu_dereference(to->__parent); } return depth; } static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a, struct kernfs_node *b) { size_t da, db; struct kernfs_root *ra = kernfs_root(a), *rb = kernfs_root(b); if (ra != rb) return NULL; da = kernfs_depth(ra->kn, a); db = kernfs_depth(rb->kn, b); while (da > db) { a = rcu_dereference(a->__parent); da--; } while (db > da) { b = rcu_dereference(b->__parent); db--; } /* worst case b and a will be the same at root */ while (b != a) { b = rcu_dereference(b->__parent); a = rcu_dereference(a->__parent); } return a; } /** * kernfs_path_from_node_locked - find a pseudo-absolute path to @kn_to, * where kn_from is treated as root of the path. * @kn_from: kernfs node which should be treated as root for the path * @kn_to: kernfs node to which path is needed * @buf: buffer to copy the path into * @buflen: size of @buf * * We need to handle couple of scenarios here: * [1] when @kn_from is an ancestor of @kn_to at some level * kn_from: /n1/n2/n3 * kn_to: /n1/n2/n3/n4/n5 * result: /n4/n5 * * [2] when @kn_from is on a different hierarchy and we need to find common * ancestor between @kn_from and @kn_to. * kn_from: /n1/n2/n3/n4 * kn_to: /n1/n2/n5 * result: /../../n5 * OR * kn_from: /n1/n2/n3/n4/n5 [depth=5] * kn_to: /n1/n2/n3 [depth=3] * result: /../.. * * [3] when @kn_to is %NULL result will be "(null)" * * Return: the length of the constructed path. If the path would have been * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. */ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, struct kernfs_node *kn_from, char *buf, size_t buflen) { struct kernfs_node *kn, *common; const char parent_str[] = "/.."; size_t depth_from, depth_to, len = 0; ssize_t copied; int i, j; if (!kn_to) return strscpy(buf, "(null)", buflen); if (!kn_from) kn_from = kernfs_root(kn_to)->kn; if (kn_from == kn_to) return strscpy(buf, "/", buflen); common = kernfs_common_ancestor(kn_from, kn_to); if (WARN_ON(!common)) return -EINVAL; depth_to = kernfs_depth(common, kn_to); depth_from = kernfs_depth(common, kn_from); buf[0] = '\0'; for (i = 0; i < depth_from; i++) { copied = strscpy(buf + len, parent_str, buflen - len); if (copied < 0) return copied; len += copied; } /* Calculate how many bytes we need for the rest */ for (i = depth_to - 1; i >= 0; i--) { const char *name; for (kn = kn_to, j = 0; j < i; j++) kn = rcu_dereference(kn->__parent); name = rcu_dereference(kn->name); len += scnprintf(buf + len, buflen - len, "/%s", name); } return len; } /** * kernfs_name - obtain the name of a given node * @kn: kernfs_node of interest * @buf: buffer to copy @kn's name into * @buflen: size of @buf * * Copies the name of @kn into @buf of @buflen bytes. The behavior is * similar to strscpy(). * * Fills buffer with "(null)" if @kn is %NULL. * * Return: the resulting length of @buf. If @buf isn't long enough, * it's filled up to @buflen-1 and nul terminated, and returns -E2BIG. * * This function can be called from any context. */ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) { struct kernfs_node *kn_parent; if (!kn) return strscpy(buf, "(null)", buflen); guard(rcu)(); /* * KERNFS_ROOT_INVARIANT_PARENT is ignored here. The name is RCU freed and * the parent is either existing or not. */ kn_parent = rcu_dereference(kn->__parent); return strscpy(buf, kn_parent ? rcu_dereference(kn->name) : "/", buflen); } /** * kernfs_path_from_node - build path of node @to relative to @from. * @from: parent kernfs_node relative to which we need to build the path * @to: kernfs_node of interest * @buf: buffer to copy @to's path into * @buflen: size of @buf * * Builds @to's path relative to @from in @buf. @from and @to must * be on the same kernfs-root. If @from is not parent of @to, then a relative * path (which includes '..'s) as needed to reach from @from to @to is * returned. * * Return: the length of the constructed path. If the path would have been * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. */ int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from, char *buf, size_t buflen) { struct kernfs_root *root; guard(rcu)(); if (to) { root = kernfs_root(to); if (!(root->flags & KERNFS_ROOT_INVARIANT_PARENT)) { guard(read_lock_irqsave)(&root->kernfs_rename_lock); return kernfs_path_from_node_locked(to, from, buf, buflen); } } return kernfs_path_from_node_locked(to, from, buf, buflen); } EXPORT_SYMBOL_GPL(kernfs_path_from_node); /** * pr_cont_kernfs_name - pr_cont name of a kernfs_node * @kn: kernfs_node of interest * * This function can be called from any context. */ void pr_cont_kernfs_name(struct kernfs_node *kn) { unsigned long flags; spin_lock_irqsave(&kernfs_pr_cont_lock, flags); kernfs_name(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); pr_cont("%s", kernfs_pr_cont_buf); spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags); } /** * pr_cont_kernfs_path - pr_cont path of a kernfs_node * @kn: kernfs_node of interest * * This function can be called from any context. */ void pr_cont_kernfs_path(struct kernfs_node *kn) { unsigned long flags; int sz; spin_lock_irqsave(&kernfs_pr_cont_lock, flags); sz = kernfs_path_from_node(kn, NULL, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); if (sz < 0) { if (sz == -E2BIG) pr_cont("(name too long)"); else pr_cont("(error)"); goto out; } pr_cont("%s", kernfs_pr_cont_buf); out: spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags); } /** * kernfs_get_parent - determine the parent node and pin it * @kn: kernfs_node of interest * * Determines @kn's parent, pins and returns it. This function can be * called from any context. * * Return: parent node of @kn */ struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) { struct kernfs_node *parent; struct kernfs_root *root; unsigned long flags; root = kernfs_root(kn); read_lock_irqsave(&root->kernfs_rename_lock, flags); parent = kernfs_parent(kn); kernfs_get(parent); read_unlock_irqrestore(&root->kernfs_rename_lock, flags); return parent; } /** * kernfs_name_hash - calculate hash of @ns + @name * @name: Null terminated string to hash * @ns: Namespace tag to hash * * Return: 31-bit hash of ns + name (so it fits in an off_t) */ static unsigned int kernfs_name_hash(const char *name, const void *ns) { unsigned long hash = init_name_hash(ns); unsigned int len = strlen(name); while (len--) hash = partial_name_hash(*name++, hash); hash = end_name_hash(hash); hash &= 0x7fffffffU; /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ if (hash < 2) hash += 2; if (hash >= INT_MAX) hash = INT_MAX - 1; return hash; } static int kernfs_name_compare(unsigned int hash, const char *name, const void *ns, const struct kernfs_node *kn) { if (hash < kn->hash) return -1; if (hash > kn->hash) return 1; if (ns < kn->ns) return -1; if (ns > kn->ns) return 1; return strcmp(name, kernfs_rcu_name(kn)); } static int kernfs_sd_compare(const struct kernfs_node *left, const struct kernfs_node *right) { return kernfs_name_compare(left->hash, kernfs_rcu_name(left), left->ns, right); } /** * kernfs_link_sibling - link kernfs_node into sibling rbtree * @kn: kernfs_node of interest * * Link @kn into its sibling rbtree which starts from * @kn->parent->dir.children. * * Locking: * kernfs_rwsem held exclusive * * Return: * %0 on success, -EEXIST on failure. */ static int kernfs_link_sibling(struct kernfs_node *kn) { struct rb_node *parent = NULL; struct kernfs_node *kn_parent; struct rb_node **node; kn_parent = kernfs_parent(kn); node = &kn_parent->dir.children.rb_node; while (*node) { struct kernfs_node *pos; int result; pos = rb_to_kn(*node); parent = *node; result = kernfs_sd_compare(kn, pos); if (result < 0) node = &pos->rb.rb_left; else if (result > 0) node = &pos->rb.rb_right; else return -EEXIST; } /* add new node and rebalance the tree */ rb_link_node(&kn->rb, parent, node); rb_insert_color(&kn->rb, &kn_parent->dir.children); /* successfully added, account subdir number */ down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); if (kernfs_type(kn) == KERNFS_DIR) kn_parent->dir.subdirs++; kernfs_inc_rev(kn_parent); up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); return 0; } /** * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree * @kn: kernfs_node of interest * * Try to unlink @kn from its sibling rbtree which starts from * kn->parent->dir.children. * * Return: %true if @kn was actually removed, * %false if @kn wasn't on the rbtree. * * Locking: * kernfs_rwsem held exclusive */ static bool kernfs_unlink_sibling(struct kernfs_node *kn) { struct kernfs_node *kn_parent; if (RB_EMPTY_NODE(&kn->rb)) return false; kn_parent = kernfs_parent(kn); down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); if (kernfs_type(kn) == KERNFS_DIR) kn_parent->dir.subdirs--; kernfs_inc_rev(kn_parent); up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); rb_erase(&kn->rb, &kn_parent->dir.children); RB_CLEAR_NODE(&kn->rb); return true; } /** * kernfs_get_active - get an active reference to kernfs_node * @kn: kernfs_node to get an active reference to * * Get an active reference of @kn. This function is noop if @kn * is %NULL. * * Return: * Pointer to @kn on success, %NULL on failure. */ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) { if (unlikely(!kn)) return NULL; if (!atomic_inc_unless_negative(&kn->active)) return NULL; if (kernfs_lockdep(kn)) rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_); return kn; } /** * kernfs_put_active - put an active reference to kernfs_node * @kn: kernfs_node to put an active reference to * * Put an active reference to @kn. This function is noop if @kn * is %NULL. */ void kernfs_put_active(struct kernfs_node *kn) { int v; if (unlikely(!kn)) return; if (kernfs_lockdep(kn)) rwsem_release(&kn->dep_map, _RET_IP_); v = atomic_dec_return(&kn->active); if (likely(v != KN_DEACTIVATED_BIAS)) return; wake_up_all(&kernfs_root(kn)->deactivate_waitq); } /** * kernfs_drain - drain kernfs_node * @kn: kernfs_node to drain * * Drain existing usages and nuke all existing mmaps of @kn. Multiple * removers may invoke this function concurrently on @kn and all will * return after draining is complete. */ static void kernfs_drain(struct kernfs_node *kn) __releases(&kernfs_root(kn)->kernfs_rwsem) __acquires(&kernfs_root(kn)->kernfs_rwsem) { struct kernfs_root *root = kernfs_root(kn); lockdep_assert_held_write(&root->kernfs_rwsem); WARN_ON_ONCE(kernfs_active(kn)); /* * Skip draining if already fully drained. This avoids draining and its * lockdep annotations for nodes which have never been activated * allowing embedding kernfs_remove() in create error paths without * worrying about draining. */ if (atomic_read(&kn->active) == KN_DEACTIVATED_BIAS && !kernfs_should_drain_open_files(kn)) return; up_write(&root->kernfs_rwsem); if (kernfs_lockdep(kn)) { rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS) lock_contended(&kn->dep_map, _RET_IP_); } wait_event(root->deactivate_waitq, atomic_read(&kn->active) == KN_DEACTIVATED_BIAS); if (kernfs_lockdep(kn)) { lock_acquired(&kn->dep_map, _RET_IP_); rwsem_release(&kn->dep_map, _RET_IP_); } if (kernfs_should_drain_open_files(kn)) kernfs_drain_open_files(kn); down_write(&root->kernfs_rwsem); } /** * kernfs_get - get a reference count on a kernfs_node * @kn: the target kernfs_node */ void kernfs_get(struct kernfs_node *kn) { if (kn) { WARN_ON(!atomic_read(&kn->count)); atomic_inc(&kn->count); } } EXPORT_SYMBOL_GPL(kernfs_get); static void kernfs_free_rcu(struct rcu_head *rcu) { struct kernfs_node *kn = container_of(rcu, struct kernfs_node, rcu); /* If the whole node goes away, then name can't be used outside */ kfree_const(rcu_access_pointer(kn->name)); if (kn->iattr) { simple_xattrs_free(&kn->iattr->xattrs, NULL); kmem_cache_free(kernfs_iattrs_cache, kn->iattr); } kmem_cache_free(kernfs_node_cache, kn); } /** * kernfs_put - put a reference count on a kernfs_node * @kn: the target kernfs_node * * Put a reference count of @kn and destroy it if it reached zero. */ void kernfs_put(struct kernfs_node *kn) { struct kernfs_node *parent; struct kernfs_root *root; if (!kn || !atomic_dec_and_test(&kn->count)) return; root = kernfs_root(kn); repeat: /* * Moving/renaming is always done while holding reference. * kn->parent won't change beneath us. */ parent = kernfs_parent(kn); WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS, "kernfs_put: %s/%s: released with incorrect active_ref %d\n", parent ? rcu_dereference(parent->name) : "", rcu_dereference(kn->name), atomic_read(&kn->active)); if (kernfs_type(kn) == KERNFS_LINK) kernfs_put(kn->symlink.target_kn); spin_lock(&root->kernfs_idr_lock); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); spin_unlock(&root->kernfs_idr_lock); call_rcu(&kn->rcu, kernfs_free_rcu); kn = parent; if (kn) { if (atomic_dec_and_test(&kn->count)) goto repeat; } else { /* just released the root kn, free @root too */ idr_destroy(&root->ino_idr); kfree_rcu(root, rcu); } } EXPORT_SYMBOL_GPL(kernfs_put); /** * kernfs_node_from_dentry - determine kernfs_node associated with a dentry * @dentry: the dentry in question * * Return: the kernfs_node associated with @dentry. If @dentry is not a * kernfs one, %NULL is returned. * * While the returned kernfs_node will stay accessible as long as @dentry * is accessible, the returned node can be in any state and the caller is * fully responsible for determining what's accessible. */ struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) { if (dentry->d_sb->s_op == &kernfs_sops) return kernfs_dentry_node(dentry); return NULL; } static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, unsigned flags) { struct kernfs_node *kn; u32 id_highbits; int ret; name = kstrdup_const(name, GFP_KERNEL); if (!name) return NULL; kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL); if (!kn) goto err_out1; idr_preload(GFP_KERNEL); spin_lock(&root->kernfs_idr_lock); ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); if (ret >= 0 && ret < root->last_id_lowbits) root->id_highbits++; id_highbits = root->id_highbits; root->last_id_lowbits = ret; spin_unlock(&root->kernfs_idr_lock); idr_preload_end(); if (ret < 0) goto err_out2; kn->id = (u64)id_highbits << 32 | ret; atomic_set(&kn->count, 1); atomic_set(&kn->active, KN_DEACTIVATED_BIAS); RB_CLEAR_NODE(&kn->rb); rcu_assign_pointer(kn->name, name); kn->mode = mode; kn->flags = flags; if (!uid_eq(uid, GLOBAL_ROOT_UID) || !gid_eq(gid, GLOBAL_ROOT_GID)) { struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, .ia_uid = uid, .ia_gid = gid, }; ret = __kernfs_setattr(kn, &iattr); if (ret < 0) goto err_out3; } if (parent) { ret = security_kernfs_init_security(parent, kn); if (ret) goto err_out3; } return kn; err_out3: spin_lock(&root->kernfs_idr_lock); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); spin_unlock(&root->kernfs_idr_lock); err_out2: kmem_cache_free(kernfs_node_cache, kn); err_out1: kfree_const(name); return NULL; } struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, unsigned flags) { struct kernfs_node *kn; if (parent->mode & S_ISGID) { /* this code block imitates inode_init_owner() for * kernfs */ if (parent->iattr) gid = parent->iattr->ia_gid; if (flags & KERNFS_DIR) mode |= S_ISGID; } kn = __kernfs_new_node(kernfs_root(parent), parent, name, mode, uid, gid, flags); if (kn) { kernfs_get(parent); rcu_assign_pointer(kn->__parent, parent); } return kn; } /* * kernfs_find_and_get_node_by_id - get kernfs_node from node id * @root: the kernfs root * @id: the target node id * * @id's lower 32bits encode ino and upper gen. If the gen portion is * zero, all generations are matched. * * Return: %NULL on failure, * otherwise a kernfs node with reference counter incremented. */ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, u64 id) { struct kernfs_node *kn; ino_t ino = kernfs_id_ino(id); u32 gen = kernfs_id_gen(id); rcu_read_lock(); kn = idr_find(&root->ino_idr, (u32)ino); if (!kn) goto err_unlock; if (sizeof(ino_t) >= sizeof(u64)) { /* we looked up with the low 32bits, compare the whole */ if (kernfs_ino(kn) != ino) goto err_unlock; } else { /* 0 matches all generations */ if (unlikely(gen && kernfs_gen(kn) != gen)) goto err_unlock; } /* * We should fail if @kn has never been activated and guarantee success * if the caller knows that @kn is active. Both can be achieved by * __kernfs_active() which tests @kn->active without kernfs_rwsem. */ if (unlikely(!__kernfs_active(kn) || !atomic_inc_not_zero(&kn->count))) goto err_unlock; rcu_read_unlock(); return kn; err_unlock: rcu_read_unlock(); return NULL; } /** * kernfs_add_one - add kernfs_node to parent without warning * @kn: kernfs_node to be added * * The caller must already have initialized @kn->parent. This * function increments nlink of the parent's inode if @kn is a * directory and link into the children list of the parent. * * Return: * %0 on success, -EEXIST if entry with the given name already * exists. */ int kernfs_add_one(struct kernfs_node *kn) { struct kernfs_root *root = kernfs_root(kn); struct kernfs_iattrs *ps_iattr; struct kernfs_node *parent; bool has_ns; int ret; down_write(&root->kernfs_rwsem); parent = kernfs_parent(kn); ret = -EINVAL; has_ns = kernfs_ns_enabled(parent); if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", has_ns ? "required" : "invalid", kernfs_rcu_name(parent), kernfs_rcu_name(kn))) goto out_unlock; if (kernfs_type(parent) != KERNFS_DIR) goto out_unlock; ret = -ENOENT; if (parent->flags & (KERNFS_REMOVING | KERNFS_EMPTY_DIR)) goto out_unlock; kn->hash = kernfs_name_hash(kernfs_rcu_name(kn), kn->ns); ret = kernfs_link_sibling(kn); if (ret) goto out_unlock; /* Update timestamps on the parent */ down_write(&root->kernfs_iattr_rwsem); ps_iattr = parent->iattr; if (ps_iattr) { ktime_get_real_ts64(&ps_iattr->ia_ctime); ps_iattr->ia_mtime = ps_iattr->ia_ctime; } up_write(&root->kernfs_iattr_rwsem); up_write(&root->kernfs_rwsem); /* * Activate the new node unless CREATE_DEACTIVATED is requested. * If not activated here, the kernfs user is responsible for * activating the node with kernfs_activate(). A node which hasn't * been activated is not visible to userland and its removal won't * trigger deactivation. */ if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED)) kernfs_activate(kn); return 0; out_unlock: up_write(&root->kernfs_rwsem); return ret; } /** * kernfs_find_ns - find kernfs_node with the given name * @parent: kernfs_node to search under * @name: name to look for * @ns: the namespace tag to use * * Look for kernfs_node with name @name under @parent. * * Return: pointer to the found kernfs_node on success, %NULL on failure. */ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, const unsigned char *name, const void *ns) { struct rb_node *node = parent->dir.children.rb_node; bool has_ns = kernfs_ns_enabled(parent); unsigned int hash; lockdep_assert_held(&kernfs_root(parent)->kernfs_rwsem); if (has_ns != (bool)ns) { WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", has_ns ? "required" : "invalid", kernfs_rcu_name(parent), name); return NULL; } hash = kernfs_name_hash(name, ns); while (node) { struct kernfs_node *kn; int result; kn = rb_to_kn(node); result = kernfs_name_compare(hash, name, ns, kn); if (result < 0) node = node->rb_left; else if (result > 0) node = node->rb_right; else return kn; } return NULL; } static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, const unsigned char *path, const void *ns) { ssize_t len; char *p, *name; lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem); spin_lock_irq(&kernfs_pr_cont_lock); len = strscpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf)); if (len < 0) { spin_unlock_irq(&kernfs_pr_cont_lock); return NULL; } p = kernfs_pr_cont_buf; while ((name = strsep(&p, "/")) && parent) { if (*name == '\0') continue; parent = kernfs_find_ns(parent, name, ns); } spin_unlock_irq(&kernfs_pr_cont_lock); return parent; } /** * kernfs_find_and_get_ns - find and get kernfs_node with the given name * @parent: kernfs_node to search under * @name: name to look for * @ns: the namespace tag to use * * Look for kernfs_node with name @name under @parent and get a reference * if found. This function may sleep. * * Return: pointer to the found kernfs_node on success, %NULL on failure. */ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns) { struct kernfs_node *kn; struct kernfs_root *root = kernfs_root(parent); down_read(&root->kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); kernfs_get(kn); up_read(&root->kernfs_rwsem); return kn; } EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns); /** * kernfs_walk_and_get_ns - find and get kernfs_node with the given path * @parent: kernfs_node to search under * @path: path to look for * @ns: the namespace tag to use * * Look for kernfs_node with path @path under @parent and get a reference * if found. This function may sleep. * * Return: pointer to the found kernfs_node on success, %NULL on failure. */ struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, const void *ns) { struct kernfs_node *kn; struct kernfs_root *root = kernfs_root(parent); down_read(&root->kernfs_rwsem); kn = kernfs_walk_ns(parent, path, ns); kernfs_get(kn); up_read(&root->kernfs_rwsem); return kn; } unsigned int kernfs_root_flags(struct kernfs_node *kn) { return kernfs_root(kn)->flags; } /** * kernfs_create_root - create a new kernfs hierarchy * @scops: optional syscall operations for the hierarchy * @flags: KERNFS_ROOT_* flags * @priv: opaque data associated with the new directory * * Return: the root of the new hierarchy on success, ERR_PTR() value on * failure. */ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv) { struct kernfs_root *root; struct kernfs_node *kn; root = kzalloc(sizeof(*root), GFP_KERNEL); if (!root) return ERR_PTR(-ENOMEM); idr_init(&root->ino_idr); spin_lock_init(&root->kernfs_idr_lock); init_rwsem(&root->kernfs_rwsem); init_rwsem(&root->kernfs_iattr_rwsem); init_rwsem(&root->ke |