// SPDX-License-Identifier: GPL-2.0-or-later
/* Key permission checking
 *
 * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/security.h>
#include "internal.h"

/**
 * key_task_permission - Check a key can be used
 * @key_ref: The key to check.
 * @cred: The credentials to use.
 * @need_perm: The permission required.
 *
 * Check to see whether permission is granted to use a key in the desired way,
 * but permit the security modules to override.
 *
 * The caller must hold either a ref on cred or must hold the RCU readlock.
 *
 * Returns 0 if successful, -EACCES if access is denied based on the
 * permissions bits or the LSM check.
 */
int key_task_permission(const key_ref_t key_ref, const struct cred *cred,
			enum key_need_perm need_perm)
{
	struct key *key;
	key_perm_t kperm, mask;
	int ret;

	switch (need_perm) {
	default:
		WARN_ON(1);
		return -EACCES;
	case KEY_NEED_UNLINK:
	case KEY_SYSADMIN_OVERRIDE:
	case KEY_AUTHTOKEN_OVERRIDE:
	case KEY_DEFER_PERM_CHECK:
		goto lsm;

	case KEY_NEED_VIEW:	mask = KEY_OTH_VIEW;	break;
	case KEY_NEED_READ:	mask = KEY_OTH_READ;	break;
	case KEY_NEED_WRITE:	mask = KEY_OTH_WRITE;	break;
	case KEY_NEED_SEARCH:	mask = KEY_OTH_SEARCH;	break;
	case KEY_NEED_LINK:	mask = KEY_OTH_LINK;	break;
	case KEY_NEED_SETATTR:	mask = KEY_OTH_SETATTR;	break;
	}

	key = key_ref_to_ptr(key_ref);

	/* use the second 8-bits of permissions for keys the caller owns */
	if (uid_eq(key->uid, cred->fsuid)) {
		kperm = key->perm >> 16;
		goto use_these_perms;
	}

	/* use the third 8-bits of permissions for keys the caller has a group
	 * membership in common with */
	if (gid_valid(key->gid) && key->perm & KEY_GRP_ALL) {
		if (gid_eq(key->gid, cred->fsgid)) {
			kperm = key->perm >> 8;
			goto use_these_perms;
		}

		ret = groups_search(cred->group_info, key->gid);
		if (ret) {
			kperm = key->perm >> 8;
			goto use_these_perms;
		}
	}

	/* otherwise use the least-significant 8-bits */
	kperm = key->perm;

use_these_perms:

	/* use the top 8-bits of permissions for keys the caller possesses
	 * - possessor permissions are additive with other permissions
	 */
	if (is_key_possessed(key_ref))
		kperm |= key->perm >> 24;

	if ((kperm & mask) != mask)
		return -EACCES;

	/* let LSM be the final arbiter */
lsm:
	return security_key_permission(key_ref, cred, need_perm);
}
EXPORT_SYMBOL(key_task_permission);

/**
 * key_validate - Validate a key.
 * @key: The key to be validated.
 *
 * Check that a key is valid, returning 0 if the key is okay, -ENOKEY if the
 * key is invalidated, -EKEYREVOKED if the key's type has been removed or if
 * the key has been revoked or -EKEYEXPIRED if the key has expired.
 */
int key_validate(const struct key *key)
{
	unsigned long flags = READ_ONCE(key->flags);
	time64_t expiry = READ_ONCE(key->expiry);

	if (flags & (1 << KEY_FLAG_INVALIDATED))
		return -ENOKEY;

	/* check it's still accessible */
	if (flags & ((1 << KEY_FLAG_REVOKED) |
		     (1 << KEY_FLAG_DEAD)))
		return -EKEYREVOKED;

	/* check it hasn't expired */
	if (expiry) {
		if (ktime_get_real_seconds() >= expiry)
			return -EKEYEXPIRED;
	}

	return 0;
}
EXPORT_SYMBOL(key_validate);
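/*
 * Illustrative sketch, not part of the file above: how a hypothetical
 * caller might combine the two exported checks before reading a key's
 * payload.  check_key_readable() is an invented name; key_validate(),
 * key_task_permission(), key_ref_to_ptr() and current_cred() are the
 * real interfaces used above.
 */
static int check_key_readable(key_ref_t key_ref)
{
	struct key *key = key_ref_to_ptr(key_ref);
	int ret;

	/* reject invalidated, revoked, dead or expired keys first */
	ret = key_validate(key);
	if (ret < 0)
		return ret;

	/* then test the permission mask and let the LSM arbitrate */
	return key_task_permission(key_ref, current_cred(), KEY_NEED_READ);
}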
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_DCACHE_H
#define __LINUX_DCACHE_H

#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/math.h>
#include <linux/rculist.h>
#include <linux/rculist_bl.h>
#include <linux/spinlock.h>
#include <linux/seqlock.h>
#include <linux/cache.h>
#include <linux/rcupdate.h>
#include <linux/lockref.h>
#include <linux/stringhash.h>
#include <linux/wait.h>

struct path;
struct file;
struct vfsmount;

/*
 * linux/include/linux/dcache.h
 *
 * Dirent cache data structures
 *
 * (C) Copyright 1997 Thomas Schoebel-Theuer,
 * with heavy changes by Linus Torvalds
 */

#define IS_ROOT(x) ((x) == (x)->d_parent)

/* The hash is always the low bits of hash_len */
#ifdef __LITTLE_ENDIAN
 #define HASH_LEN_DECLARE u32 hash; u32 len
 #define bytemask_from_count(cnt)	(~(~0ul << (cnt)*8))
#else
 #define HASH_LEN_DECLARE u32 len; u32 hash
 #define bytemask_from_count(cnt)	(~(~0ul >> (cnt)*8))
#endif

/*
 * "quick string" -- eases parameter passing, but more importantly
 * saves "metadata" about the string (ie length and the hash).
* * hash comes first so it snuggles against d_parent in the * dentry. */ struct qstr { union { struct { HASH_LEN_DECLARE; }; u64 hash_len; }; const unsigned char *name; }; #define QSTR_INIT(n,l) { { { .len = l } }, .name = n } #define QSTR_LEN(n,l) (struct qstr)QSTR_INIT(n,l) #define QSTR(n) QSTR_LEN(n, strlen(n)) extern const struct qstr empty_name; extern const struct qstr slash_name; extern const struct qstr dotdot_name; /* * Try to keep struct dentry aligned on 64 byte cachelines (this will * give reasonable cacheline footprint with larger lines without the * large memory footprint increase). */ #ifdef CONFIG_64BIT # define DNAME_INLINE_WORDS 5 /* 192 bytes */ #else # ifdef CONFIG_SMP # define DNAME_INLINE_WORDS 9 /* 128 bytes */ # else # define DNAME_INLINE_WORDS 11 /* 128 bytes */ # endif #endif #define DNAME_INLINE_LEN (DNAME_INLINE_WORDS*sizeof(unsigned long)) union shortname_store { unsigned char string[DNAME_INLINE_LEN]; unsigned long words[DNAME_INLINE_WORDS]; }; #define d_lock d_lockref.lock #define d_iname d_shortname.string struct dentry { /* RCU lookup touched fields */ unsigned int d_flags; /* protected by d_lock */ seqcount_spinlock_t d_seq; /* per dentry seqlock */ struct hlist_bl_node d_hash; /* lookup hash list */ struct dentry *d_parent; /* parent directory */ struct qstr d_name; struct inode *d_inode; /* Where the name belongs to - NULL is * negative */ union shortname_store d_shortname; /* --- cacheline 1 boundary (64 bytes) was 32 bytes ago --- */ /* Ref lookup also touches following */ const struct dentry_operations *d_op; struct super_block *d_sb; /* The root of the dentry tree */ unsigned long d_time; /* used by d_revalidate */ void *d_fsdata; /* fs-specific data */ /* --- cacheline 2 boundary (128 bytes) --- */ struct lockref d_lockref; /* per-dentry lock and refcount * keep separate from RCU lookup area if * possible! */ union { struct list_head d_lru; /* LRU list */ wait_queue_head_t *d_wait; /* in-lookup ones only */ }; struct hlist_node d_sib; /* child of parent list */ struct hlist_head d_children; /* our children */ /* * d_alias and d_rcu can share memory */ union { struct hlist_node d_alias; /* inode alias list */ struct hlist_bl_node d_in_lookup_hash; /* only for in-lookup ones */ struct rcu_head d_rcu; } d_u; }; /* * dentry->d_lock spinlock nesting subclasses: * * 0: normal * 1: nested */ enum dentry_d_lock_class { DENTRY_D_LOCK_NORMAL, /* implicitly used by plain spin_lock() APIs. */ DENTRY_D_LOCK_NESTED }; enum d_real_type { D_REAL_DATA, D_REAL_METADATA, }; struct dentry_operations { int (*d_revalidate)(struct inode *, const struct qstr *, struct dentry *, unsigned int); int (*d_weak_revalidate)(struct dentry *, unsigned int); int (*d_hash)(const struct dentry *, struct qstr *); int (*d_compare)(const struct dentry *, unsigned int, const char *, const struct qstr *); int (*d_delete)(const struct dentry *); int (*d_init)(struct dentry *); void (*d_release)(struct dentry *); void (*d_prune)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, enum d_real_type type); bool (*d_unalias_trylock)(const struct dentry *); void (*d_unalias_unlock)(const struct dentry *); } ____cacheline_aligned; /* * Locking rules for dentry_operations callbacks are to be found in * Documentation/filesystems/locking.rst. Keep it updated! 
 *
 * Further descriptions are found in Documentation/filesystems/vfs.rst.
 * Keep it updated too!
 */

/* d_flags entries */
enum dentry_flags {
	DCACHE_OP_HASH			= BIT(0),
	DCACHE_OP_COMPARE		= BIT(1),
	DCACHE_OP_REVALIDATE		= BIT(2),
	DCACHE_OP_DELETE		= BIT(3),
	DCACHE_OP_PRUNE			= BIT(4),
	/*
	 * This dentry is possibly not currently connected to the dcache tree,
	 * in which case its parent will either be itself, or will have this
	 * flag as well.  nfsd will not use a dentry with this bit set, but will
	 * first endeavour to clear the bit either by discovering that it is
	 * connected, or by performing lookup operations.  Any filesystem which
	 * supports nfsd_operations MUST have a lookup function which, if it
	 * finds a directory inode with a DCACHE_DISCONNECTED dentry, will
	 * d_move that dentry into place and return that dentry rather than the
	 * passed one, typically using d_splice_alias.
	 */
	DCACHE_DISCONNECTED		= BIT(5),
	DCACHE_REFERENCED		= BIT(6),	/* Recently used, don't discard. */
	DCACHE_DONTCACHE		= BIT(7),	/* Purge from memory on final dput() */
	DCACHE_CANT_MOUNT		= BIT(8),
	DCACHE_GENOCIDE			= BIT(9),
	DCACHE_SHRINK_LIST		= BIT(10),
	DCACHE_OP_WEAK_REVALIDATE	= BIT(11),
	/*
	 * this dentry has been "silly renamed" and has to be deleted on the
	 * last dput()
	 */
	DCACHE_NFSFS_RENAMED		= BIT(12),
	DCACHE_FSNOTIFY_PARENT_WATCHED	= BIT(13),	/* Parent inode is watched by some fsnotify listener */
	DCACHE_DENTRY_KILLED		= BIT(14),
	DCACHE_MOUNTED			= BIT(15),	/* is a mountpoint */
	DCACHE_NEED_AUTOMOUNT		= BIT(16),	/* handle automount on this dir */
	DCACHE_MANAGE_TRANSIT		= BIT(17),	/* manage transit from this dirent */
	DCACHE_LRU_LIST			= BIT(18),
	DCACHE_ENTRY_TYPE		= (7 << 19),	/* bits 19..21 are for storing type: */
	DCACHE_MISS_TYPE		= (0 << 19),	/* Negative dentry */
	DCACHE_WHITEOUT_TYPE		= (1 << 19),	/* Whiteout dentry (stop pathwalk) */
	DCACHE_DIRECTORY_TYPE		= (2 << 19),	/* Normal directory */
	DCACHE_AUTODIR_TYPE		= (3 << 19),	/* Lookupless directory (presumed automount) */
	DCACHE_REGULAR_TYPE		= (4 << 19),	/* Regular file type */
	DCACHE_SPECIAL_TYPE		= (5 << 19),	/* Other file type */
	DCACHE_SYMLINK_TYPE		= (6 << 19),	/* Symlink */
	DCACHE_NOKEY_NAME		= BIT(22),	/* Encrypted name encoded without key */
	DCACHE_OP_REAL			= BIT(23),
	DCACHE_PAR_LOOKUP		= BIT(24),	/* being looked up (with parent locked shared) */
	DCACHE_DENTRY_CURSOR		= BIT(25),
	DCACHE_NORCU			= BIT(26),	/* No RCU delay for freeing */
};

#define DCACHE_MANAGED_DENTRY \
	(DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT)

extern seqlock_t rename_lock;

/*
 * These are the low-level FS interfaces to the dcache..
 */
extern void d_instantiate(struct dentry *, struct inode *);
extern void d_instantiate_new(struct dentry *, struct inode *);
extern void __d_drop(struct dentry *dentry);
extern void d_drop(struct dentry *dentry);
extern void d_delete(struct dentry *);

/* allocate/de-allocate */
extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
extern struct dentry * d_alloc_anon(struct super_block *);
extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
					wait_queue_head_t *);
extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
/* weird procfs mess; *NOT* exported */
extern struct dentry * d_splice_alias_ops(struct inode *, struct dentry *,
					  const struct dentry_operations *);
extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
extern bool d_same_name(const struct dentry *dentry, const struct dentry *parent,
			const struct qstr *name);
extern struct dentry *d_find_any_alias(struct inode *inode);
extern struct dentry * d_obtain_alias(struct inode *);
extern struct dentry * d_obtain_root(struct inode *);
extern void shrink_dcache_sb(struct super_block *);
extern void shrink_dcache_parent(struct dentry *);
extern void d_invalidate(struct dentry *);

/* only used at mount-time */
extern struct dentry * d_make_root(struct inode *);

extern void d_mark_tmpfile(struct file *, struct inode *);
extern void d_tmpfile(struct file *, struct inode *);

extern struct dentry *d_find_alias(struct inode *);
extern void d_prune_aliases(struct inode *);

extern struct dentry *d_find_alias_rcu(struct inode *);

/* test whether we have any submounts in a subdir tree */
extern int path_has_submounts(const struct path *);

/*
 * This adds the entry to the hash queues.
 */
extern void d_rehash(struct dentry *);

extern void d_add(struct dentry *, struct inode *);

/* used for rename() and baskets */
extern void d_move(struct dentry *, struct dentry *);
extern void d_exchange(struct dentry *, struct dentry *);
extern struct dentry *d_ancestor(struct dentry *, struct dentry *);

extern struct dentry *d_lookup(const struct dentry *, const struct qstr *);

static inline unsigned d_count(const struct dentry *dentry)
{
	return dentry->d_lockref.count;
}

ino_t d_parent_ino(struct dentry *dentry);

/*
 * helper function for dentry_operations.d_dname() members
 */
extern __printf(3, 4)
char *dynamic_dname(char *, int, const char *, ...);

extern char *__d_path(const struct path *, const struct path *, char *, int);
extern char *d_absolute_path(const struct path *, char *, int);
extern char *d_path(const struct path *, char *, int);
extern char *dentry_path_raw(const struct dentry *, char *, int);
extern char *dentry_path(const struct dentry *, char *, int);

/* Allocation counts.. */

/**
 * dget_dlock - get a reference to a dentry
 * @dentry: dentry to get a reference to
 *
 * Given a live dentry, increment the reference count and return the dentry.
 * Caller must hold @dentry->d_lock.  Making sure that dentry is alive is
 * caller's responsibility.  There are many conditions sufficient to guarantee
 * that; e.g. anything with non-negative refcount is alive, so's anything
 * hashed, anything positive, anyone's parent, etc.
 */
static inline struct dentry *dget_dlock(struct dentry *dentry)
{
	dentry->d_lockref.count++;
	return dentry;
}

/**
 * dget - get a reference to a dentry
 * @dentry: dentry to get a reference to
 *
 * Given a dentry or %NULL pointer increment the reference count
 * if appropriate and return the dentry.  A dentry will not be
 * destroyed when it has references.
 * Conversely, a dentry with
 * no references can disappear for any number of reasons, starting
 * with memory pressure.  In other words, that primitive is
 * used to clone an existing reference; using it on something with
 * zero refcount is a bug.
 *
 * NOTE: it will spin if @dentry->d_lock is held.  From the deadlock
 * avoidance point of view it is equivalent to spin_lock()/increment
 * refcount/spin_unlock(), so calling it under @dentry->d_lock is
 * always a bug; so's calling it under ->d_lock on any of its
 * descendants.
 *
 */
static inline struct dentry *dget(struct dentry *dentry)
{
	if (dentry)
		lockref_get(&dentry->d_lockref);
	return dentry;
}

extern struct dentry *dget_parent(struct dentry *dentry);

/**
 * d_unhashed - is dentry hashed
 * @dentry: entry to check
 *
 * Returns true if the dentry passed is not currently hashed.
 */
static inline int d_unhashed(const struct dentry *dentry)
{
	return hlist_bl_unhashed(&dentry->d_hash);
}

static inline int d_unlinked(const struct dentry *dentry)
{
	return d_unhashed(dentry) && !IS_ROOT(dentry);
}

static inline int cant_mount(const struct dentry *dentry)
{
	return (dentry->d_flags & DCACHE_CANT_MOUNT);
}

static inline void dont_mount(struct dentry *dentry)
{
	spin_lock(&dentry->d_lock);
	dentry->d_flags |= DCACHE_CANT_MOUNT;
	spin_unlock(&dentry->d_lock);
}

extern void __d_lookup_unhash_wake(struct dentry *dentry);

static inline int d_in_lookup(const struct dentry *dentry)
{
	return dentry->d_flags & DCACHE_PAR_LOOKUP;
}

static inline void d_lookup_done(struct dentry *dentry)
{
	if (unlikely(d_in_lookup(dentry)))
		__d_lookup_unhash_wake(dentry);
}

extern void dput(struct dentry *);

static inline bool d_managed(const struct dentry *dentry)
{
	return dentry->d_flags & DCACHE_MANAGED_DENTRY;
}

static inline bool d_mountpoint(const struct dentry *dentry)
{
	return dentry->d_flags & DCACHE_MOUNTED;
}

/*
 * Directory cache entry type accessor functions.
 */
static inline unsigned __d_entry_type(const struct dentry *dentry)
{
	return dentry->d_flags & DCACHE_ENTRY_TYPE;
}

static inline bool d_is_miss(const struct dentry *dentry)
{
	return __d_entry_type(dentry) == DCACHE_MISS_TYPE;
}

static inline bool d_is_whiteout(const struct dentry *dentry)
{
	return __d_entry_type(dentry) == DCACHE_WHITEOUT_TYPE;
}

static inline bool d_can_lookup(const struct dentry *dentry)
{
	return __d_entry_type(dentry) == DCACHE_DIRECTORY_TYPE;
}

static inline bool d_is_autodir(const struct dentry *dentry)
{
	return __d_entry_type(dentry) == DCACHE_AUTODIR_TYPE;
}

static inline bool d_is_dir(const struct dentry *dentry)
{
	return d_can_lookup(dentry) || d_is_autodir(dentry);
}

static inline bool d_is_symlink(const struct dentry *dentry)
{
	return __d_entry_type(dentry) == DCACHE_SYMLINK_TYPE;
}

static inline bool d_is_reg(const struct dentry *dentry)
{
	return __d_entry_type(dentry) == DCACHE_REGULAR_TYPE;
}

static inline bool d_is_special(const struct dentry *dentry)
{
	return __d_entry_type(dentry) == DCACHE_SPECIAL_TYPE;
}

static inline bool d_is_file(const struct dentry *dentry)
{
	return d_is_reg(dentry) || d_is_special(dentry);
}

static inline bool d_is_negative(const struct dentry *dentry)
{
	// TODO: check d_is_whiteout(dentry) also.
return d_is_miss(dentry); } static inline bool d_flags_negative(unsigned flags) { return (flags & DCACHE_ENTRY_TYPE) == DCACHE_MISS_TYPE; } static inline bool d_is_positive(const struct dentry *dentry) { return !d_is_negative(dentry); } /** * d_really_is_negative - Determine if a dentry is really negative (ignoring fallthroughs) * @dentry: The dentry in question * * Returns true if the dentry represents either an absent name or a name that * doesn't map to an inode (ie. ->d_inode is NULL). The dentry could represent * a true miss, a whiteout that isn't represented by a 0,0 chardev or a * fallthrough marker in an opaque directory. * * Note! (1) This should be used *only* by a filesystem to examine its own * dentries. It should not be used to look at some other filesystem's * dentries. (2) It should also be used in combination with d_inode() to get * the inode. (3) The dentry may have something attached to ->d_lower and the * type field of the flags may be set to something other than miss or whiteout. */ static inline bool d_really_is_negative(const struct dentry *dentry) { return dentry->d_inode == NULL; } /** * d_really_is_positive - Determine if a dentry is really positive (ignoring fallthroughs) * @dentry: The dentry in question * * Returns true if the dentry represents a name that maps to an inode * (ie. ->d_inode is not NULL). The dentry might still represent a whiteout if * that is represented on medium as a 0,0 chardev. * * Note! (1) This should be used *only* by a filesystem to examine its own * dentries. It should not be used to look at some other filesystem's * dentries. (2) It should also be used in combination with d_inode() to get * the inode. */ static inline bool d_really_is_positive(const struct dentry *dentry) { return dentry->d_inode != NULL; } static inline int simple_positive(const struct dentry *dentry) { return d_really_is_positive(dentry) && !d_unhashed(dentry); } unsigned long vfs_pressure_ratio(unsigned long val); /** * d_inode - Get the actual inode of this dentry * @dentry: The dentry to query * * This is the helper normal filesystems should use to get at their own inodes * in their own dentries and ignore the layering superimposed upon them. */ static inline struct inode *d_inode(const struct dentry *dentry) { return dentry->d_inode; } /** * d_inode_rcu - Get the actual inode of this dentry with READ_ONCE() * @dentry: The dentry to query * * This is the helper normal filesystems should use to get at their own inodes * in their own dentries and ignore the layering superimposed upon them. */ static inline struct inode *d_inode_rcu(const struct dentry *dentry) { return READ_ONCE(dentry->d_inode); } /** * d_backing_inode - Get upper or lower inode we should be using * @upper: The upper layer * * This is the helper that should be used to get at the inode that will be used * if this dentry were to be opened as a file. The inode may be on the upper * dentry or it may be on a lower dentry pinned by the upper. * * Normal filesystems should not use this to access their own inodes. */ static inline struct inode *d_backing_inode(const struct dentry *upper) { struct inode *inode = upper->d_inode; return inode; } /** * d_real - Return the real dentry * @dentry: the dentry to query * @type: the type of real dentry (data or metadata) * * If dentry is on a union/overlay, then return the underlying, real dentry. * Otherwise return the dentry itself. 
 *
 * See also: Documentation/filesystems/vfs.rst
 */
static inline struct dentry *d_real(struct dentry *dentry,
				    enum d_real_type type)
{
	if (unlikely(dentry->d_flags & DCACHE_OP_REAL))
		return dentry->d_op->d_real(dentry, type);
	else
		return dentry;
}

/**
 * d_real_inode - Return the real inode hosting the data
 * @dentry: The dentry to query
 *
 * If dentry is on a union/overlay, then return the underlying, real inode.
 * Otherwise return d_inode().
 */
static inline struct inode *d_real_inode(const struct dentry *dentry)
{
	/* This usage of d_real() results in const dentry */
	return d_inode(d_real((struct dentry *) dentry, D_REAL_DATA));
}

struct name_snapshot {
	struct qstr name;
	union shortname_store inline_name;
};
void take_dentry_name_snapshot(struct name_snapshot *, struct dentry *);
void release_dentry_name_snapshot(struct name_snapshot *);

static inline struct dentry *d_first_child(const struct dentry *dentry)
{
	return hlist_entry_safe(dentry->d_children.first, struct dentry, d_sib);
}

static inline struct dentry *d_next_sibling(const struct dentry *dentry)
{
	return hlist_entry_safe(dentry->d_sib.next, struct dentry, d_sib);
}

void set_default_d_op(struct super_block *, const struct dentry_operations *);

#endif /* __LINUX_DCACHE_H */
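/*
 * Illustrative sketch, not part of the header above: the usual
 * take/use/drop pattern for the refcounting helpers declared here.
 * example_is_dir() is an invented name; dget(), d_is_dir() and dput()
 * are the real helpers.
 */
static inline bool example_is_dir(struct dentry *dentry)
{
	struct dentry *ref = dget(dentry);	/* clone an existing reference */
	bool ret = d_is_dir(ref);		/* safe to use while pinned */

	dput(ref);				/* release our reference */
	return ret;
}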
// SPDX-License-Identifier: GPL-2.0
/*
 * USB 7 Segment Driver
 *
 * Copyright (C) 2008 Harrison Metzger <harrisonmetz@gmail.com>
 * Based on usbled.c by Greg Kroah-Hartman (greg@kroah.com)
 */

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/usb.h>

#define DRIVER_AUTHOR "Harrison Metzger <harrisonmetz@gmail.com>"
#define DRIVER_DESC "USB 7 Segment Driver"

#define VENDOR_ID	0x0fc5
#define PRODUCT_ID	0x1227
#define MAXLEN		8

/* table of devices that work with this driver */
static const struct usb_device_id id_table[] = {
	{ USB_DEVICE(VENDOR_ID, PRODUCT_ID) },
	{ },
};
MODULE_DEVICE_TABLE(usb, id_table);

/* the different text display modes the device is capable of */
static const char *display_textmodes[] = {"raw", "hex", "ascii"};

struct usb_sevsegdev {
	struct usb_device *udev;
	struct usb_interface *intf;

	u8 powered;
	u8 mode_msb;
	u8 mode_lsb;
	u8 decimals[MAXLEN];
	u8 textmode;
	u8 text[MAXLEN];
	u16 textlength;

	u8 shadow_power; /* for PM */
	u8 has_interface_pm;
};

/* sysfs_streq can't replace this completely
 * If the device was in hex mode, and the user wanted a 0,
 * if str commands are used, we would assume the end of string
 * so mem commands are used.
 */
static inline size_t my_memlen(const char *buf, size_t count)
{
	if (count > 0 && buf[count-1] == '\n')
		return count - 1;
	else
		return count;
}

static void update_display_powered(struct usb_sevsegdev *mydev)
{
	int rc;

	if (mydev->powered && !mydev->has_interface_pm) {
		rc = usb_autopm_get_interface(mydev->intf);
		if (rc < 0)
			return;
		mydev->has_interface_pm = 1;
	}

	if (mydev->shadow_power != 1)
		return;

	rc = usb_control_msg_send(mydev->udev, 0, 0x12, 0x48,
				  (80 * 0x100) + 10, /* (power mode) */
				  (0x00 * 0x100) + (mydev->powered ?
				  1 : 0),
				  NULL, 0, 2000, GFP_KERNEL);
	if (rc < 0)
		dev_dbg(&mydev->udev->dev, "power retval = %d\n", rc);

	if (!mydev->powered && mydev->has_interface_pm) {
		usb_autopm_put_interface(mydev->intf);
		mydev->has_interface_pm = 0;
	}
}

static void update_display_mode(struct usb_sevsegdev *mydev)
{
	int rc;

	if (mydev->shadow_power != 1)
		return;

	rc = usb_control_msg_send(mydev->udev, 0, 0x12, 0x48,
				  (82 * 0x100) + 10, /* (set mode) */
				  (mydev->mode_msb * 0x100) + mydev->mode_lsb,
				  NULL, 0, 2000, GFP_NOIO);

	if (rc < 0)
		dev_dbg(&mydev->udev->dev, "mode retval = %d\n", rc);
}

static void update_display_visual(struct usb_sevsegdev *mydev, gfp_t mf)
{
	int rc;
	int i;
	unsigned char buffer[MAXLEN] = {0};
	u8 decimals = 0;

	if (mydev->shadow_power != 1)
		return;

	/* The device is right to left, whereas you write left to right */
	for (i = 0; i < mydev->textlength; i++)
		buffer[i] = mydev->text[mydev->textlength-1-i];

	rc = usb_control_msg_send(mydev->udev, 0, 0x12, 0x48,
				  (85 * 0x100) + 10, /* (write text) */
				  (0 * 0x100) + mydev->textmode, /* mode */
				  &buffer, mydev->textlength, 2000, mf);

	if (rc < 0)
		dev_dbg(&mydev->udev->dev, "write retval = %d\n", rc);

	/* The device is right to left, whereas you write left to right */
	for (i = 0; i < sizeof(mydev->decimals); i++)
		decimals |= mydev->decimals[i] << i;

	rc = usb_control_msg_send(mydev->udev, 0, 0x12, 0x48,
				  (86 * 0x100) + 10, /* (set decimal) */
				  (0 * 0x100) + decimals, /* decimals */
				  NULL, 0, 2000, mf);

	if (rc < 0)
		dev_dbg(&mydev->udev->dev, "decimal retval = %d\n", rc);
}

#define MYDEV_ATTR_SIMPLE_UNSIGNED(name, update_fcn) \
static ssize_t name##_show(struct device *dev, \
	struct device_attribute *attr, char *buf) \
{ \
	struct usb_interface *intf = to_usb_interface(dev); \
	struct usb_sevsegdev *mydev = usb_get_intfdata(intf); \
 \
	return sprintf(buf, "%u\n", mydev->name); \
} \
 \
static ssize_t name##_store(struct device *dev, \
	struct device_attribute *attr, const char *buf, size_t count) \
{ \
	struct usb_interface *intf = to_usb_interface(dev); \
	struct usb_sevsegdev *mydev = usb_get_intfdata(intf); \
 \
	mydev->name = simple_strtoul(buf, NULL, 10); \
	update_fcn(mydev); \
 \
	return count; \
} \
static DEVICE_ATTR_RW(name);

static ssize_t text_show(struct device *dev,
	struct device_attribute *attr, char *buf)
{
	struct usb_interface *intf = to_usb_interface(dev);
	struct usb_sevsegdev *mydev = usb_get_intfdata(intf);

	return sysfs_emit(buf, "%s\n", mydev->text);
}

static ssize_t text_store(struct device *dev,
	struct device_attribute *attr, const char *buf, size_t count)
{
	struct usb_interface *intf = to_usb_interface(dev);
	struct usb_sevsegdev *mydev = usb_get_intfdata(intf);
	size_t end = my_memlen(buf, count);

	if (end > sizeof(mydev->text))
		return -EINVAL;

	memset(mydev->text, 0, sizeof(mydev->text));
	mydev->textlength = end;

	if (end > 0)
		memcpy(mydev->text, buf, end);

	update_display_visual(mydev, GFP_KERNEL);
	return count;
}
static DEVICE_ATTR_RW(text);

static ssize_t decimals_show(struct device *dev,
	struct device_attribute *attr, char *buf)
{
	struct usb_interface *intf = to_usb_interface(dev);
	struct usb_sevsegdev *mydev = usb_get_intfdata(intf);
	int i;
	int pos;

	for (i = 0; i < sizeof(mydev->decimals); i++) {
		pos = sizeof(mydev->decimals) - 1 - i;
		if (mydev->decimals[i] == 0)
			buf[pos] = '0';
		else if (mydev->decimals[i] == 1)
			buf[pos] = '1';
		else
			buf[pos] = 'x';
	}
	buf[sizeof(mydev->decimals)] = '\n';
	return sizeof(mydev->decimals) + 1;
}

static ssize_t decimals_store(struct device *dev,
	struct device_attribute *attr, const char *buf, size_t count)
{
	struct usb_interface *intf =
to_usb_interface(dev); struct usb_sevsegdev *mydev = usb_get_intfdata(intf); size_t end = my_memlen(buf, count); int i; if (end > sizeof(mydev->decimals)) return -EINVAL; for (i = 0; i < end; i++) if (buf[i] != '0' && buf[i] != '1') return -EINVAL; memset(mydev->decimals, 0, sizeof(mydev->decimals)); for (i = 0; i < end; i++) if (buf[i] == '1') mydev->decimals[end-1-i] = 1; update_display_visual(mydev, GFP_KERNEL); return count; } static DEVICE_ATTR_RW(decimals); static ssize_t textmode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_interface *intf = to_usb_interface(dev); struct usb_sevsegdev *mydev = usb_get_intfdata(intf); int i; buf[0] = 0; for (i = 0; i < ARRAY_SIZE(display_textmodes); i++) { if (mydev->textmode == i) { strcat(buf, " ["); strcat(buf, display_textmodes[i]); strcat(buf, "] "); } else { strcat(buf, " "); strcat(buf, display_textmodes[i]); strcat(buf, " "); } } strcat(buf, "\n"); return strlen(buf); } static ssize_t textmode_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct usb_interface *intf = to_usb_interface(dev); struct usb_sevsegdev *mydev = usb_get_intfdata(intf); int i; i = sysfs_match_string(display_textmodes, buf); if (i < 0) return i; mydev->textmode = i; update_display_visual(mydev, GFP_KERNEL); return count; } static DEVICE_ATTR_RW(textmode); MYDEV_ATTR_SIMPLE_UNSIGNED(powered, update_display_powered); MYDEV_ATTR_SIMPLE_UNSIGNED(mode_msb, update_display_mode); MYDEV_ATTR_SIMPLE_UNSIGNED(mode_lsb, update_display_mode); static struct attribute *sevseg_attrs[] = { &dev_attr_powered.attr, &dev_attr_text.attr, &dev_attr_textmode.attr, &dev_attr_decimals.attr, &dev_attr_mode_msb.attr, &dev_attr_mode_lsb.attr, NULL }; ATTRIBUTE_GROUPS(sevseg); static int sevseg_probe(struct usb_interface *interface, const struct usb_device_id *id) { struct usb_device *udev = interface_to_usbdev(interface); struct usb_sevsegdev *mydev; int rc = -ENOMEM; mydev = kzalloc(sizeof(struct usb_sevsegdev), GFP_KERNEL); if (!mydev) goto error_mem; mydev->udev = usb_get_dev(udev); mydev->intf = interface; usb_set_intfdata(interface, mydev); /* PM */ mydev->shadow_power = 1; /* currently active */ mydev->has_interface_pm = 0; /* have not issued autopm_get */ /*set defaults */ mydev->textmode = 0x02; /* ascii mode */ mydev->mode_msb = 0x06; /* 6 characters */ mydev->mode_lsb = 0x3f; /* scanmode for 6 chars */ dev_info(&interface->dev, "USB 7 Segment device now attached\n"); return 0; error_mem: return rc; } static void sevseg_disconnect(struct usb_interface *interface) { struct usb_sevsegdev *mydev; mydev = usb_get_intfdata(interface); usb_set_intfdata(interface, NULL); usb_put_dev(mydev->udev); kfree(mydev); dev_info(&interface->dev, "USB 7 Segment now disconnected\n"); } static int sevseg_suspend(struct usb_interface *intf, pm_message_t message) { struct usb_sevsegdev *mydev; mydev = usb_get_intfdata(intf); mydev->shadow_power = 0; return 0; } static int sevseg_resume(struct usb_interface *intf) { struct usb_sevsegdev *mydev; mydev = usb_get_intfdata(intf); mydev->shadow_power = 1; update_display_mode(mydev); update_display_visual(mydev, GFP_NOIO); return 0; } static int sevseg_reset_resume(struct usb_interface *intf) { struct usb_sevsegdev *mydev; mydev = usb_get_intfdata(intf); mydev->shadow_power = 1; update_display_mode(mydev); update_display_visual(mydev, GFP_NOIO); return 0; } static struct usb_driver sevseg_driver = { .name = "usbsevseg", .probe = sevseg_probe, .disconnect = sevseg_disconnect, .suspend = 
sevseg_suspend,
	.resume = sevseg_resume,
	.reset_resume = sevseg_reset_resume,
	.id_table = id_table,
	.dev_groups = sevseg_groups,
	.supports_autosuspend = 1,
};

module_usb_driver(sevseg_driver);

MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_LICENSE("GPL");
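/*
 * Illustrative sketch, not part of the driver above: a minimal user-space
 * program driving the sysfs attributes this driver registers.  The sysfs
 * path is hypothetical and depends on where the interface is bound.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* hypothetical path of a bound usbsevseg interface */
	const char *base = "/sys/bus/usb/devices/1-1:1.0";
	char path[256];
	int fd;

	/* power the display on via the "powered" attribute */
	snprintf(path, sizeof(path), "%s/powered", base);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return 1;
	write(fd, "1", 1);
	close(fd);

	/* show "HELLO" (textmode defaults to ascii in sevseg_probe()) */
	snprintf(path, sizeof(path), "%s/text", base);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return 1;
	write(fd, "HELLO", strlen("HELLO"));
	close(fd);
	return 0;
}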
// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/kernel/power/user.c
 *
 * This file provides the user space interface for software suspend/resume.
 *
 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
 */

#include <linux/suspend.h>
#include <linux/reboot.h>
#include <linux/string.h>
#include <linux/device.h>
#include <linux/miscdevice.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pm.h>
#include <linux/fs.h>
#include <linux/compat.h>
#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/freezer.h>
#include <linux/uaccess.h>

#include "power.h"

static bool need_wait;

static struct snapshot_data {
	struct snapshot_handle handle;
	int swap;
	int mode;
	bool frozen;
	bool ready;
	bool platform_support;
	bool free_bitmaps;
	dev_t dev;
} snapshot_state;

int is_hibernate_resume_dev(dev_t dev)
{
	return hibernation_available() && snapshot_state.dev == dev;
}

static int snapshot_open(struct inode *inode, struct file *filp)
{
	struct snapshot_data *data;
	unsigned int sleep_flags;
	int error;

	if (!hibernation_available())
		return -EPERM;

	sleep_flags = lock_system_sleep();

	if (!hibernate_acquire()) {
		error = -EBUSY;
		goto Unlock;
	}

	if ((filp->f_flags & O_ACCMODE) == O_RDWR) {
		hibernate_release();
		error = -ENOSYS;
		goto Unlock;
	}
	nonseekable_open(inode, filp);
	data = &snapshot_state;
	filp->private_data = data;
	memset(&data->handle, 0, sizeof(struct snapshot_handle));
	if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
		/* Hibernating.  The image device should be accessible.
*/ data->swap = swap_type_of(swsusp_resume_device, 0); data->mode = O_RDONLY; data->free_bitmaps = false; error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION); } else { /* * Resuming. We may need to wait for the image device to * appear. */ need_wait = true; data->swap = -1; data->mode = O_WRONLY; error = pm_notifier_call_chain_robust(PM_RESTORE_PREPARE, PM_POST_RESTORE); if (!error) { error = create_basic_memory_bitmaps(); data->free_bitmaps = !error; } } if (error) hibernate_release(); data->frozen = false; data->ready = false; data->platform_support = false; data->dev = 0; Unlock: unlock_system_sleep(sleep_flags); return error; } static int snapshot_release(struct inode *inode, struct file *filp) { struct snapshot_data *data; unsigned int sleep_flags; sleep_flags = lock_system_sleep(); swsusp_free(); data = filp->private_data; data->dev = 0; free_all_swap_pages(data->swap); if (data->frozen) { pm_restore_gfp_mask(); free_basic_memory_bitmaps(); thaw_processes(); } else if (data->free_bitmaps) { free_basic_memory_bitmaps(); } pm_notifier_call_chain(data->mode == O_RDONLY ? PM_POST_HIBERNATION : PM_POST_RESTORE); hibernate_release(); unlock_system_sleep(sleep_flags); return 0; } static ssize_t snapshot_read(struct file *filp, char __user *buf, size_t count, loff_t *offp) { loff_t pg_offp = *offp & ~PAGE_MASK; struct snapshot_data *data; unsigned int sleep_flags; ssize_t res; sleep_flags = lock_system_sleep(); data = filp->private_data; if (!data->ready) { res = -ENODATA; goto Unlock; } if (!pg_offp) { /* on page boundary? */ res = snapshot_read_next(&data->handle); if (res <= 0) goto Unlock; } else { res = PAGE_SIZE - pg_offp; } res = simple_read_from_buffer(buf, count, &pg_offp, data_of(data->handle), res); if (res > 0) *offp += res; Unlock: unlock_system_sleep(sleep_flags); return res; } static ssize_t snapshot_write(struct file *filp, const char __user *buf, size_t count, loff_t *offp) { loff_t pg_offp = *offp & ~PAGE_MASK; struct snapshot_data *data; unsigned long sleep_flags; ssize_t res; if (need_wait) { wait_for_device_probe(); need_wait = false; } sleep_flags = lock_system_sleep(); data = filp->private_data; if (!pg_offp) { res = snapshot_write_next(&data->handle); if (res <= 0) goto unlock; } else { res = PAGE_SIZE; } if (!data_of(data->handle)) { res = -EINVAL; goto unlock; } res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp, buf, count); if (res > 0) *offp += res; unlock: unlock_system_sleep(sleep_flags); return res; } struct compat_resume_swap_area { compat_loff_t offset; u32 dev; } __packed; static int snapshot_set_swap_area(struct snapshot_data *data, void __user *argp) { sector_t offset; dev_t swdev; if (swsusp_swap_in_use()) return -EPERM; if (in_compat_syscall()) { struct compat_resume_swap_area swap_area; if (copy_from_user(&swap_area, argp, sizeof(swap_area))) return -EFAULT; swdev = new_decode_dev(swap_area.dev); offset = swap_area.offset; } else { struct resume_swap_area swap_area; if (copy_from_user(&swap_area, argp, sizeof(swap_area))) return -EFAULT; swdev = new_decode_dev(swap_area.dev); offset = swap_area.offset; } /* * User space encodes device types as two-byte values, * so we need to recode them */ data->swap = swap_type_of(swdev, offset); if (data->swap < 0) return swdev ? 
-ENODEV : -EINVAL; data->dev = swdev; return 0; } static long snapshot_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int error = 0; struct snapshot_data *data; loff_t size; sector_t offset; if (need_wait) { wait_for_device_probe(); need_wait = false; } if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) return -ENOTTY; if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR) return -ENOTTY; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!mutex_trylock(&system_transition_mutex)) return -EBUSY; lock_device_hotplug(); data = filp->private_data; switch (cmd) { case SNAPSHOT_FREEZE: if (data->frozen) break; ksys_sync_helper(); error = freeze_processes(); if (error) break; error = create_basic_memory_bitmaps(); if (error) thaw_processes(); else data->frozen = true; break; case SNAPSHOT_UNFREEZE: if (!data->frozen || data->ready) break; pm_restore_gfp_mask(); free_basic_memory_bitmaps(); data->free_bitmaps = false; thaw_processes(); data->frozen = false; break; case SNAPSHOT_CREATE_IMAGE: if (data->mode != O_RDONLY || !data->frozen || data->ready) { error = -EPERM; break; } pm_restore_gfp_mask(); error = hibernation_snapshot(data->platform_support); if (!error) { error = put_user(in_suspend, (int __user *)arg); data->ready = !freezer_test_done && !error; freezer_test_done = false; } break; case SNAPSHOT_ATOMIC_RESTORE: error = snapshot_write_finalize(&data->handle); if (error) break; if (data->mode != O_WRONLY || !data->frozen || !snapshot_image_loaded(&data->handle)) { error = -EPERM; break; } error = hibernation_restore(data->platform_support); break; case SNAPSHOT_FREE: swsusp_free(); memset(&data->handle, 0, sizeof(struct snapshot_handle)); data->ready = false; /* * It is necessary to thaw kernel threads here, because * SNAPSHOT_CREATE_IMAGE may be invoked directly after * SNAPSHOT_FREE. In that case, if kernel threads were not * thawed, the preallocation of memory carried out by * hibernation_snapshot() might run into problems (i.e. it * might fail or even deadlock). 
 */
		thaw_kernel_threads();
		break;

	case SNAPSHOT_PREF_IMAGE_SIZE:
		image_size = arg;
		break;

	case SNAPSHOT_GET_IMAGE_SIZE:
		if (!data->ready) {
			error = -ENODATA;
			break;
		}
		size = snapshot_get_image_size();
		size <<= PAGE_SHIFT;
		error = put_user(size, (loff_t __user *)arg);
		break;

	case SNAPSHOT_AVAIL_SWAP_SIZE:
		size = count_swap_pages(data->swap, 1);
		size <<= PAGE_SHIFT;
		error = put_user(size, (loff_t __user *)arg);
		break;

	case SNAPSHOT_ALLOC_SWAP_PAGE:
		if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
			error = -ENODEV;
			break;
		}
		offset = alloc_swapdev_block(data->swap);
		if (offset) {
			offset <<= PAGE_SHIFT;
			error = put_user(offset, (loff_t __user *)arg);
		} else {
			error = -ENOSPC;
		}
		break;

	case SNAPSHOT_FREE_SWAP_PAGES:
		if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
			error = -ENODEV;
			break;
		}
		free_all_swap_pages(data->swap);
		break;

	case SNAPSHOT_S2RAM:
		if (!data->frozen) {
			error = -EPERM;
			break;
		}
		/*
		 * Tasks are frozen and the notifiers have been called with
		 * PM_HIBERNATION_PREPARE
		 */
		error = suspend_devices_and_enter(PM_SUSPEND_MEM);
		data->ready = false;
		break;

	case SNAPSHOT_PLATFORM_SUPPORT:
		data->platform_support = !!arg;
		break;

	case SNAPSHOT_POWER_OFF:
		if (data->platform_support)
			error = hibernation_platform_enter();
		break;

	case SNAPSHOT_SET_SWAP_AREA:
		error = snapshot_set_swap_area(data, (void __user *)arg);
		break;

	default:
		error = -ENOTTY;
	}

	unlock_device_hotplug();
	mutex_unlock(&system_transition_mutex);

	return error;
}

#ifdef CONFIG_COMPAT
static long
snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	BUILD_BUG_ON(sizeof(loff_t) != sizeof(compat_loff_t));

	switch (cmd) {
	case SNAPSHOT_GET_IMAGE_SIZE:
	case SNAPSHOT_AVAIL_SWAP_SIZE:
	case SNAPSHOT_ALLOC_SWAP_PAGE:
	case SNAPSHOT_CREATE_IMAGE:
	case SNAPSHOT_SET_SWAP_AREA:
		return snapshot_ioctl(file, cmd,
				      (unsigned long)compat_ptr(arg));
	default:
		return snapshot_ioctl(file, cmd, arg);
	}
}
#endif /* CONFIG_COMPAT */

static const struct file_operations snapshot_fops = {
	.open = snapshot_open,
	.release = snapshot_release,
	.read = snapshot_read,
	.write = snapshot_write,
	.unlocked_ioctl = snapshot_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = snapshot_compat_ioctl,
#endif
};

static struct miscdevice snapshot_device = {
	.minor = SNAPSHOT_MINOR,
	.name = "snapshot",
	.fops = &snapshot_fops,
};

static int __init snapshot_device_init(void)
{
	return misc_register(&snapshot_device);
}

device_initcall(snapshot_device_init);
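/*
 * Illustrative sketch, not part of the file above: the skeleton of a
 * user-space hibernation tool driving the /dev/snapshot ioctls handled
 * by snapshot_ioctl().  Error handling is elided; the SNAPSHOT_* numbers
 * come from <linux/suspend_ioctls.h>.
 */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/suspend_ioctls.h>

int main(void)
{
	int in_suspend = 0;
	/* O_RDONLY means "hibernating" to snapshot_open() */
	int dev = open("/dev/snapshot", O_RDONLY);

	if (dev < 0)
		return 1;
	ioctl(dev, SNAPSHOT_FREEZE, 0);			/* freeze user space */
	ioctl(dev, SNAPSHOT_CREATE_IMAGE, &in_suspend);	/* snapshot memory */
	/* ... write the image out via read(dev, ...) and swap ... */
	ioctl(dev, SNAPSHOT_UNFREEZE, 0);
	close(dev);
	return 0;
}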
// SPDX-License-Identifier: GPL-2.0
/*
  USB Driver for Sierra Wireless

  Copyright (C) 2006, 2007, 2008  Kevin Lloyd <klloyd@sierrawireless.com>,
  Copyright (C) 2008, 2009  Elina Pasheva, Matthew Safar, Rory Filer
			<linux@sierrawireless.com>

  IMPORTANT DISCLAIMER: This driver is not commercially supported by
  Sierra Wireless. Use at your own risk.

  Portions based on the option driver by Matthias Urlichs <smurf@smurf.noris.de>
  who based his on the Keyspan driver by Hugh Blemings <hugh@blemings.org>
*/
/* Uncomment to log function calls */
/* #define DEBUG */

#define DRIVER_AUTHOR "Kevin Lloyd, Elina Pasheva, Matthew Safar, Rory Filer"
#define DRIVER_DESC "USB Driver for Sierra Wireless USB modems"

#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/errno.h>
#include <linux/tty.h>
#include <linux/slab.h>
#include <linux/tty_flip.h>
#include <linux/module.h>
#include <linux/usb.h>
#include <linux/usb/serial.h>

#define SWIMS_USB_REQUEST_SetPower	0x00
#define SWIMS_USB_REQUEST_SetNmea	0x07

#define N_IN_URB_HM	8
#define N_OUT_URB_HM	64
#define N_IN_URB	4
#define N_OUT_URB	4
#define IN_BUFLEN	4096

#define MAX_TRANSFER		(PAGE_SIZE - 512)
/* MAX_TRANSFER is chosen so that the VM is not stressed by
   allocations > PAGE_SIZE and the number of packets in a page
   is an integer; 512 is the largest possible packet on EHCI */

static bool nmea;

struct sierra_iface_list {
	const u8 *nums;		/* array of interface numbers */
	size_t count;		/* number of elements in array */
};

struct sierra_intf_private {
	spinlock_t susp_lock;
	unsigned int suspended:1;
	int in_flight;
	unsigned int open_ports;
};

static int sierra_set_power_state(struct usb_device *udev, __u16 swiState)
{
	return usb_control_msg(udev, usb_sndctrlpipe(udev, 0),
			SWIMS_USB_REQUEST_SetPower,	/* __u8 request */
			USB_TYPE_VENDOR,		/* __u8 request type */
			swiState,			/* __u16 value */
			0,				/* __u16 index */
			NULL,				/* void *data */
			0,				/* __u16 size */
			USB_CTRL_SET_TIMEOUT);		/* int timeout */
}

static int sierra_vsc_set_nmea(struct usb_device *udev, __u16 enable)
{
	return usb_control_msg(udev, usb_sndctrlpipe(udev, 0),
			SWIMS_USB_REQUEST_SetNmea,	/* __u8 request */
			USB_TYPE_VENDOR,		/* __u8 request type */
			enable,				/* __u16 value */
			0x0000,				/* __u16 index */
			NULL,				/* void *data */
			0,				/* __u16 size */
			USB_CTRL_SET_TIMEOUT);		/* int timeout */
}

static int sierra_calc_num_ports(struct usb_serial *serial,
					struct usb_serial_endpoints *epds)
{
	int num_ports = 0;
	u8 ifnum, numendpoints;

	ifnum = serial->interface->cur_altsetting->desc.bInterfaceNumber;
	numendpoints = serial->interface->cur_altsetting->desc.bNumEndpoints;

	/* Dummy interface present on some SKUs should be ignored */
	if (ifnum == 0x99)
		num_ports = 0;
	else if (numendpoints <= 3)
		num_ports = 1;
	else
		num_ports = (numendpoints-1)/2;
	return num_ports;
}

static bool is_listed(const u8 ifnum, const struct sierra_iface_list *list)
{
int i; if (!list) return false; for (i = 0; i < list->count; i++) { if (list->nums[i] == ifnum) return true; } return false; } static u8 sierra_interface_num(struct usb_serial *serial) { return serial->interface->cur_altsetting->desc.bInterfaceNumber; } static int sierra_probe(struct usb_serial *serial, const struct usb_device_id *id) { const struct sierra_iface_list *ignore_list; int result = 0; struct usb_device *udev; u8 ifnum; udev = serial->dev; ifnum = sierra_interface_num(serial); /* * If this interface supports more than 1 alternate * select the 2nd one */ if (serial->interface->num_altsetting == 2) { dev_dbg(&udev->dev, "Selecting alt setting for interface %d\n", ifnum); /* We know the alternate setting is 1 for the MC8785 */ usb_set_interface(udev, ifnum, 1); } ignore_list = (const struct sierra_iface_list *)id->driver_info; if (is_listed(ifnum, ignore_list)) { dev_dbg(&serial->dev->dev, "Ignoring interface #%d\n", ifnum); return -ENODEV; } return result; } /* interfaces with higher memory requirements */ static const u8 hi_memory_typeA_ifaces[] = { 0, 2 }; static const struct sierra_iface_list typeA_interface_list = { .nums = hi_memory_typeA_ifaces, .count = ARRAY_SIZE(hi_memory_typeA_ifaces), }; static const u8 hi_memory_typeB_ifaces[] = { 3, 4, 5, 6 }; static const struct sierra_iface_list typeB_interface_list = { .nums = hi_memory_typeB_ifaces, .count = ARRAY_SIZE(hi_memory_typeB_ifaces), }; /* 'ignorelist' of interfaces not served by this driver */ static const u8 direct_ip_non_serial_ifaces[] = { 7, 8, 9, 10, 11, 19, 20 }; static const struct sierra_iface_list direct_ip_interface_ignore = { .nums = direct_ip_non_serial_ifaces, .count = ARRAY_SIZE(direct_ip_non_serial_ifaces), }; static const struct usb_device_id id_table[] = { { USB_DEVICE(0x0F3D, 0x0112) }, /* Airprime/Sierra PC 5220 */ { USB_DEVICE(0x03F0, 0x1B1D) }, /* HP ev2200 a.k.a MC5720 */ { USB_DEVICE(0x03F0, 0x211D) }, /* HP ev2210 a.k.a MC5725 */ { USB_DEVICE(0x03F0, 0x1E1D) }, /* HP hs2300 a.k.a MC8775 */ { USB_DEVICE(0x1199, 0x0017) }, /* Sierra Wireless EM5625 */ { USB_DEVICE(0x1199, 0x0018) }, /* Sierra Wireless MC5720 */ { USB_DEVICE(0x1199, 0x0218) }, /* Sierra Wireless MC5720 */ { USB_DEVICE(0x1199, 0x0020) }, /* Sierra Wireless MC5725 */ { USB_DEVICE(0x1199, 0x0220) }, /* Sierra Wireless MC5725 */ { USB_DEVICE(0x1199, 0x0022) }, /* Sierra Wireless EM5725 */ { USB_DEVICE(0x1199, 0x0024) }, /* Sierra Wireless MC5727 */ { USB_DEVICE(0x1199, 0x0224) }, /* Sierra Wireless MC5727 */ { USB_DEVICE(0x1199, 0x0019) }, /* Sierra Wireless AirCard 595 */ { USB_DEVICE(0x1199, 0x0021) }, /* Sierra Wireless AirCard 597E */ { USB_DEVICE(0x1199, 0x0112) }, /* Sierra Wireless AirCard 580 */ { USB_DEVICE(0x1199, 0x0120) }, /* Sierra Wireless USB Dongle 595U */ { USB_DEVICE(0x1199, 0x0301) }, /* Sierra Wireless USB Dongle 250U */ /* Sierra Wireless C597 */ { USB_DEVICE_AND_INTERFACE_INFO(0x1199, 0x0023, 0xFF, 0xFF, 0xFF) }, /* Sierra Wireless T598 */ { USB_DEVICE_AND_INTERFACE_INFO(0x1199, 0x0025, 0xFF, 0xFF, 0xFF) }, { USB_DEVICE(0x1199, 0x0026) }, /* Sierra Wireless T11 */ { USB_DEVICE(0x1199, 0x0027) }, /* Sierra Wireless AC402 */ { USB_DEVICE(0x1199, 0x0028) }, /* Sierra Wireless MC5728 */ { USB_DEVICE(0x1199, 0x0029) }, /* Sierra Wireless Device */ { USB_DEVICE(0x1199, 0x6802) }, /* Sierra Wireless MC8755 */ { USB_DEVICE(0x1199, 0x6803) }, /* Sierra Wireless MC8765 */ { USB_DEVICE(0x1199, 0x6804) }, /* Sierra Wireless MC8755 */ { USB_DEVICE(0x1199, 0x6805) }, /* Sierra Wireless MC8765 */ { USB_DEVICE(0x1199, 0x6808) }, 
/* Sierra Wireless MC8755 */ { USB_DEVICE(0x1199, 0x6809) }, /* Sierra Wireless MC8765 */ { USB_DEVICE(0x1199, 0x6812) }, /* Sierra Wireless MC8775 & AC 875U */ { USB_DEVICE(0x1199, 0x6813) }, /* Sierra Wireless MC8775 */ { USB_DEVICE(0x1199, 0x6815) }, /* Sierra Wireless MC8775 */ { USB_DEVICE(0x1199, 0x6816) }, /* Sierra Wireless MC8775 */ { USB_DEVICE(0x1199, 0x6820) }, /* Sierra Wireless AirCard 875 */ { USB_DEVICE(0x1199, 0x6821) }, /* Sierra Wireless AirCard 875U */ { USB_DEVICE(0x1199, 0x6822) }, /* Sierra Wireless AirCard 875E */ { USB_DEVICE(0x1199, 0x6832) }, /* Sierra Wireless MC8780 */ { USB_DEVICE(0x1199, 0x6833) }, /* Sierra Wireless MC8781 */ { USB_DEVICE(0x1199, 0x6834) }, /* Sierra Wireless MC8780 */ { USB_DEVICE(0x1199, 0x6835) }, /* Sierra Wireless MC8781 */ { USB_DEVICE(0x1199, 0x6838) }, /* Sierra Wireless MC8780 */ { USB_DEVICE(0x1199, 0x6839) }, /* Sierra Wireless MC8781 */ { USB_DEVICE(0x1199, 0x683A) }, /* Sierra Wireless MC8785 */ { USB_DEVICE(0x1199, 0x683B) }, /* Sierra Wireless MC8785 Composite */ /* Sierra Wireless MC8790, MC8791, MC8792 Composite */ { USB_DEVICE(0x1199, 0x683C) }, { USB_DEVICE(0x1199, 0x683D) }, /* Sierra Wireless MC8791 Composite */ /* Sierra Wireless MC8790, MC8791, MC8792 */ { USB_DEVICE(0x1199, 0x683E) }, { USB_DEVICE(0x1199, 0x6850) }, /* Sierra Wireless AirCard 880 */ { USB_DEVICE(0x1199, 0x6851) }, /* Sierra Wireless AirCard 881 */ { USB_DEVICE(0x1199, 0x6852) }, /* Sierra Wireless AirCard 880 E */ { USB_DEVICE(0x1199, 0x6853) }, /* Sierra Wireless AirCard 881 E */ { USB_DEVICE(0x1199, 0x6855) }, /* Sierra Wireless AirCard 880 U */ { USB_DEVICE(0x1199, 0x6856) }, /* Sierra Wireless AirCard 881 U */ { USB_DEVICE(0x1199, 0x6859) }, /* Sierra Wireless AirCard 885 E */ { USB_DEVICE(0x1199, 0x685A) }, /* Sierra Wireless AirCard 885 E */ /* Sierra Wireless C885 */ { USB_DEVICE_AND_INTERFACE_INFO(0x1199, 0x6880, 0xFF, 0xFF, 0xFF)}, /* Sierra Wireless C888, Air Card 501, USB 303, USB 304 */ { USB_DEVICE_AND_INTERFACE_INFO(0x1199, 0x6890, 0xFF, 0xFF, 0xFF)}, /* Sierra Wireless C22/C33 */ { USB_DEVICE_AND_INTERFACE_INFO(0x1199, 0x6891, 0xFF, 0xFF, 0xFF)}, /* Sierra Wireless HSPA Non-Composite Device */ { USB_DEVICE_AND_INTERFACE_INFO(0x1199, 0x6892, 0xFF, 0xFF, 0xFF)}, { USB_DEVICE(0x1199, 0x6893) }, /* Sierra Wireless Device */ /* Sierra Wireless Direct IP modems */ { USB_DEVICE_AND_INTERFACE_INFO(0x1199, 0x68A3, 0xFF, 0xFF, 0xFF), .driver_info = (kernel_ulong_t)&direct_ip_interface_ignore }, { USB_DEVICE_AND_INTERFACE_INFO(0x1199, 0x68AA, 0xFF, 0xFF, 0xFF), .driver_info = (kernel_ulong_t)&direct_ip_interface_ignore }, { USB_DEVICE(0x1199, 0x68AB) }, /* Sierra Wireless AR8550 */ /* AT&T Direct IP LTE modems */ { USB_DEVICE_AND_INTERFACE_INFO(0x0F3D, 0x68AA, 0xFF, 0xFF, 0xFF), .driver_info = (kernel_ulong_t)&direct_ip_interface_ignore }, /* Airprime/Sierra Wireless Direct IP modems */ { USB_DEVICE_AND_INTERFACE_INFO(0x0F3D, 0x68A3, 0xFF, 0xFF, 0xFF), .driver_info = (kernel_ulong_t)&direct_ip_interface_ignore }, { } }; MODULE_DEVICE_TABLE(usb, id_table); struct sierra_port_private { spinlock_t lock; /* lock the structure */ int outstanding_urbs; /* number of out urbs in flight */ struct usb_anchor active; struct usb_anchor delayed; int num_out_urbs; int num_in_urbs; /* Input endpoints and buffers for this port */ struct urb *in_urbs[N_IN_URB_HM]; /* Settings for the port */ int rts_state; /* Handshaking pins (outputs) */ int dtr_state; int cts_state; /* Handshaking pins (inputs) */ int dsr_state; int dcd_state; int ri_state; }; static int 
sierra_send_setup(struct usb_serial_port *port)
{
	struct usb_serial *serial = port->serial;
	struct sierra_port_private *portdata;
	__u16 interface = 0;
	int val = 0;
	int do_send = 0;
	int retval;

	portdata = usb_get_serial_port_data(port);

	if (portdata->dtr_state)
		val |= 0x01;
	if (portdata->rts_state)
		val |= 0x02;

	/* If composite device then properly report interface */
	if (serial->num_ports == 1) {
		interface = sierra_interface_num(serial);
		/* Control message is sent only to interfaces with
		 * interrupt_in endpoints
		 */
		if (port->interrupt_in_urb) {
			/* send control message */
			do_send = 1;
		}
	}
	/* Otherwise we need to do the non-composite mapping */
	else {
		if (port->bulk_out_endpointAddress == 2)
			interface = 0;
		else if (port->bulk_out_endpointAddress == 4)
			interface = 1;
		else if (port->bulk_out_endpointAddress == 5)
			interface = 2;
		do_send = 1;
	}
	if (!do_send)
		return 0;

	retval = usb_autopm_get_interface(serial->interface);
	if (retval < 0)
		return retval;

	retval = usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0),
		0x22, 0x21, val, interface, NULL, 0, USB_CTRL_SET_TIMEOUT);
	usb_autopm_put_interface(serial->interface);

	return retval;
}

static int sierra_tiocmget(struct tty_struct *tty)
{
	struct usb_serial_port *port = tty->driver_data;
	unsigned int value;
	struct sierra_port_private *portdata;

	portdata = usb_get_serial_port_data(port);

	value = ((portdata->rts_state) ? TIOCM_RTS : 0) |
		((portdata->dtr_state) ? TIOCM_DTR : 0) |
		((portdata->cts_state) ? TIOCM_CTS : 0) |
		((portdata->dsr_state) ? TIOCM_DSR : 0) |
		((portdata->dcd_state) ? TIOCM_CAR : 0) |
		((portdata->ri_state) ? TIOCM_RNG : 0);

	return value;
}

static int sierra_tiocmset(struct tty_struct *tty,
			unsigned int set, unsigned int clear)
{
	struct usb_serial_port *port = tty->driver_data;
	struct sierra_port_private *portdata;

	portdata = usb_get_serial_port_data(port);

	if (set & TIOCM_RTS)
		portdata->rts_state = 1;
	if (set & TIOCM_DTR)
		portdata->dtr_state = 1;

	if (clear & TIOCM_RTS)
		portdata->rts_state = 0;
	if (clear & TIOCM_DTR)
		portdata->dtr_state = 0;
	return sierra_send_setup(port);
}

static void sierra_release_urb(struct urb *urb)
{
	if (urb) {
		kfree(urb->transfer_buffer);
		usb_free_urb(urb);
	}
}

static void sierra_outdat_callback(struct urb *urb)
{
	struct usb_serial_port *port = urb->context;
	struct sierra_port_private *portdata = usb_get_serial_port_data(port);
	struct sierra_intf_private *intfdata;
	int status = urb->status;
	unsigned long flags;

	intfdata = usb_get_serial_data(port->serial);

	/* free up the transfer buffer, as usb_free_urb() does not do this */
	kfree(urb->transfer_buffer);
	usb_autopm_put_interface_async(port->serial->interface);
	if (status)
		dev_dbg(&port->dev, "%s - nonzero write bulk status "
			"received: %d\n", __func__, status);

	spin_lock_irqsave(&portdata->lock, flags);
	--portdata->outstanding_urbs;
	spin_unlock_irqrestore(&portdata->lock, flags);
	spin_lock_irqsave(&intfdata->susp_lock, flags);
	--intfdata->in_flight;
	spin_unlock_irqrestore(&intfdata->susp_lock, flags);

	usb_serial_port_softint(port);
}

/* Write */
static int sierra_write(struct tty_struct *tty, struct usb_serial_port *port,
					const unsigned char *buf, int count)
{
	struct sierra_port_private *portdata;
	struct sierra_intf_private *intfdata;
	struct usb_serial *serial = port->serial;
	unsigned long flags;
	unsigned char *buffer;
	struct urb *urb;
	size_t writesize = min_t(size_t, count, MAX_TRANSFER);
	int retval = 0;

	/* verify that we actually have some data to write */
	if (count == 0)
		return 0;

	portdata = usb_get_serial_port_data(port);
	intfdata =
usb_get_serial_data(serial); dev_dbg(&port->dev, "%s: write (%zd bytes)\n", __func__, writesize); spin_lock_irqsave(&portdata->lock, flags); dev_dbg(&port->dev, "%s - outstanding_urbs: %d\n", __func__, portdata->outstanding_urbs); if (portdata->outstanding_urbs > portdata->num_out_urbs) { spin_unlock_irqrestore(&portdata->lock, flags); dev_dbg(&port->dev, "%s - write limit hit\n", __func__); return 0; } portdata->outstanding_urbs++; dev_dbg(&port->dev, "%s - 1, outstanding_urbs: %d\n", __func__, portdata->outstanding_urbs); spin_unlock_irqrestore(&portdata->lock, flags); retval = usb_autopm_get_interface_async(serial->interface); if (retval < 0) { spin_lock_irqsave(&portdata->lock, flags); portdata->outstanding_urbs--; spin_unlock_irqrestore(&portdata->lock, flags); goto error_simple; } buffer = kmemdup(buf, writesize, GFP_ATOMIC); if (!buffer) { retval = -ENOMEM; goto error_no_buffer; } urb = usb_alloc_urb(0, GFP_ATOMIC); if (!urb) { retval = -ENOMEM; goto error_no_urb; } usb_serial_debug_data(&port->dev, __func__, writesize, buffer); usb_fill_bulk_urb(urb, serial->dev, usb_sndbulkpipe(serial->dev, port->bulk_out_endpointAddress), buffer, writesize, sierra_outdat_callback, port); /* Handle the need to send a zero length packet */ urb->transfer_flags |= URB_ZERO_PACKET; spin_lock_irqsave(&intfdata->susp_lock, flags); if (intfdata->suspended) { usb_anchor_urb(urb, &portdata->delayed); spin_unlock_irqrestore(&intfdata->susp_lock, flags); goto skip_power; } else { usb_anchor_urb(urb, &portdata->active); } /* send it down the pipe */ retval = usb_submit_urb(urb, GFP_ATOMIC); if (retval) { usb_unanchor_urb(urb); spin_unlock_irqrestore(&intfdata->susp_lock, flags); dev_err(&port->dev, "%s - usb_submit_urb(write bulk) failed " "with status = %d\n", __func__, retval); goto error; } else { intfdata->in_flight++; spin_unlock_irqrestore(&intfdata->susp_lock, flags); } skip_power: /* we are done with this urb, so let the host driver * really free it when it is finished with it */ usb_free_urb(urb); return writesize; error: usb_free_urb(urb); error_no_urb: kfree(buffer); error_no_buffer: spin_lock_irqsave(&portdata->lock, flags); --portdata->outstanding_urbs; dev_dbg(&port->dev, "%s - 2. outstanding_urbs: %d\n", __func__, portdata->outstanding_urbs); spin_unlock_irqrestore(&portdata->lock, flags); usb_autopm_put_interface_async(serial->interface); error_simple: return retval; } static void sierra_indat_callback(struct urb *urb) { int err; int endpoint; struct usb_serial_port *port; unsigned char *data = urb->transfer_buffer; int status = urb->status; endpoint = usb_pipeendpoint(urb->pipe); port = urb->context; if (status) { dev_dbg(&port->dev, "%s: nonzero status: %d on" " endpoint %02x\n", __func__, status, endpoint); } else { if (urb->actual_length) { tty_insert_flip_string(&port->port, data, urb->actual_length); tty_flip_buffer_push(&port->port); usb_serial_debug_data(&port->dev, __func__, urb->actual_length, data); } else { dev_dbg(&port->dev, "%s: empty read urb" " received\n", __func__); } } /* Resubmit urb so we continue receiving */ if (status != -ESHUTDOWN && status != -EPERM) { usb_mark_last_busy(port->serial->dev); err = usb_submit_urb(urb, GFP_ATOMIC); if (err && err != -EPERM) dev_err(&port->dev, "resubmit read urb failed." 
"(%d)\n", err); } } static void sierra_instat_callback(struct urb *urb) { int err; int status = urb->status; struct usb_serial_port *port = urb->context; struct sierra_port_private *portdata = usb_get_serial_port_data(port); struct usb_serial *serial = port->serial; dev_dbg(&port->dev, "%s: urb %p port %p has data %p\n", __func__, urb, port, portdata); if (status == 0) { struct usb_ctrlrequest *req_pkt = urb->transfer_buffer; if (!req_pkt) { dev_dbg(&port->dev, "%s: NULL req_pkt\n", __func__); return; } if ((req_pkt->bRequestType == 0xA1) && (req_pkt->bRequest == 0x20)) { int old_dcd_state; unsigned char signals = *((unsigned char *) urb->transfer_buffer + sizeof(struct usb_ctrlrequest)); dev_dbg(&port->dev, "%s: signal x%x\n", __func__, signals); old_dcd_state = portdata->dcd_state; portdata->cts_state = 1; portdata->dcd_state = ((signals & 0x01) ? 1 : 0); portdata->dsr_state = ((signals & 0x02) ? 1 : 0); portdata->ri_state = ((signals & 0x08) ? 1 : 0); if (old_dcd_state && !portdata->dcd_state) tty_port_tty_hangup(&port->port, true); } else { dev_dbg(&port->dev, "%s: type %x req %x\n", __func__, req_pkt->bRequestType, req_pkt->bRequest); } } else dev_dbg(&port->dev, "%s: error %d\n", __func__, status); /* Resubmit urb so we continue receiving IRQ data */ if (status != -ESHUTDOWN && status != -ENOENT) { usb_mark_last_busy(serial->dev); err = usb_submit_urb(urb, GFP_ATOMIC); if (err && err != -EPERM) dev_err(&port->dev, "%s: resubmit intr urb " "failed. (%d)\n", __func__, err); } } static unsigned int sierra_write_room(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct sierra_port_private *portdata = usb_get_serial_port_data(port); unsigned long flags; /* try to give a good number back based on if we have any free urbs at * this point in time */ spin_lock_irqsave(&portdata->lock, flags); if (portdata->outstanding_urbs > (portdata->num_out_urbs * 2) / 3) { spin_unlock_irqrestore(&portdata->lock, flags); dev_dbg(&port->dev, "%s - write limit hit\n", __func__); return 0; } spin_unlock_irqrestore(&portdata->lock, flags); return 2048; } static unsigned int sierra_chars_in_buffer(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct sierra_port_private *portdata = usb_get_serial_port_data(port); unsigned long flags; unsigned int chars; /* NOTE: This overcounts somewhat. 
*/ spin_lock_irqsave(&portdata->lock, flags); chars = portdata->outstanding_urbs * MAX_TRANSFER; spin_unlock_irqrestore(&portdata->lock, flags); dev_dbg(&port->dev, "%s - %u\n", __func__, chars); return chars; } static void sierra_stop_rx_urbs(struct usb_serial_port *port) { int i; struct sierra_port_private *portdata = usb_get_serial_port_data(port); for (i = 0; i < portdata->num_in_urbs; i++) usb_kill_urb(portdata->in_urbs[i]); usb_kill_urb(port->interrupt_in_urb); } static int sierra_submit_rx_urbs(struct usb_serial_port *port, gfp_t mem_flags) { int ok_cnt; int err = -EINVAL; int i; struct urb *urb; struct sierra_port_private *portdata = usb_get_serial_port_data(port); ok_cnt = 0; for (i = 0; i < portdata->num_in_urbs; i++) { urb = portdata->in_urbs[i]; if (!urb) continue; err = usb_submit_urb(urb, mem_flags); if (err) { dev_err(&port->dev, "%s: submit urb failed: %d\n", __func__, err); } else { ok_cnt++; } } if (ok_cnt && port->interrupt_in_urb) { err = usb_submit_urb(port->interrupt_in_urb, mem_flags); if (err) { dev_err(&port->dev, "%s: submit intr urb failed: %d\n", __func__, err); } } if (ok_cnt > 0) /* at least one rx urb submitted */ return 0; else return err; } static struct urb *sierra_setup_urb(struct usb_serial *serial, int endpoint, int dir, void *ctx, int len, gfp_t mem_flags, usb_complete_t callback) { struct urb *urb; u8 *buf; urb = usb_alloc_urb(0, mem_flags); if (!urb) return NULL; buf = kmalloc(len, mem_flags); if (buf) { /* Fill URB using supplied data */ usb_fill_bulk_urb(urb, serial->dev, usb_sndbulkpipe(serial->dev, endpoint) | dir, buf, len, callback, ctx); dev_dbg(&serial->dev->dev, "%s %c u : %p d:%p\n", __func__, dir == USB_DIR_IN ? 'i' : 'o', urb, buf); } else { sierra_release_urb(urb); urb = NULL; } return urb; } static void sierra_close(struct usb_serial_port *port) { int i; struct usb_serial *serial = port->serial; struct sierra_port_private *portdata; struct sierra_intf_private *intfdata = usb_get_serial_data(serial); struct urb *urb; portdata = usb_get_serial_port_data(port); /* * Need to take susp_lock to make sure port is not already being * resumed, but no need to hold it due to the tty-port initialized * flag. 
*/ spin_lock_irq(&intfdata->susp_lock); if (--intfdata->open_ports == 0) serial->interface->needs_remote_wakeup = 0; spin_unlock_irq(&intfdata->susp_lock); for (;;) { urb = usb_get_from_anchor(&portdata->delayed); if (!urb) break; kfree(urb->transfer_buffer); usb_free_urb(urb); usb_autopm_put_interface_async(serial->interface); spin_lock_irq(&portdata->lock); portdata->outstanding_urbs--; spin_unlock_irq(&portdata->lock); } sierra_stop_rx_urbs(port); usb_kill_anchored_urbs(&portdata->active); for (i = 0; i < portdata->num_in_urbs; i++) { sierra_release_urb(portdata->in_urbs[i]); portdata->in_urbs[i] = NULL; } usb_autopm_get_interface_no_resume(serial->interface); } static int sierra_open(struct tty_struct *tty, struct usb_serial_port *port) { struct sierra_port_private *portdata; struct usb_serial *serial = port->serial; struct sierra_intf_private *intfdata = usb_get_serial_data(serial); int i; int err; int endpoint; struct urb *urb; portdata = usb_get_serial_port_data(port); endpoint = port->bulk_in_endpointAddress; for (i = 0; i < portdata->num_in_urbs; i++) { urb = sierra_setup_urb(serial, endpoint, USB_DIR_IN, port, IN_BUFLEN, GFP_KERNEL, sierra_indat_callback); portdata->in_urbs[i] = urb; } /* clear halt condition */ usb_clear_halt(serial->dev, usb_sndbulkpipe(serial->dev, endpoint) | USB_DIR_IN); err = sierra_submit_rx_urbs(port, GFP_KERNEL); if (err) goto err_submit; spin_lock_irq(&intfdata->susp_lock); if (++intfdata->open_ports == 1) serial->interface->needs_remote_wakeup = 1; spin_unlock_irq(&intfdata->susp_lock); usb_autopm_put_interface(serial->interface); return 0; err_submit: sierra_stop_rx_urbs(port); for (i = 0; i < portdata->num_in_urbs; i++) { sierra_release_urb(portdata->in_urbs[i]); portdata->in_urbs[i] = NULL; } return err; } static void sierra_dtr_rts(struct usb_serial_port *port, int on) { struct sierra_port_private *portdata; portdata = usb_get_serial_port_data(port); portdata->rts_state = on; portdata->dtr_state = on; sierra_send_setup(port); } static int sierra_startup(struct usb_serial *serial) { struct sierra_intf_private *intfdata; intfdata = kzalloc(sizeof(*intfdata), GFP_KERNEL); if (!intfdata) return -ENOMEM; spin_lock_init(&intfdata->susp_lock); usb_set_serial_data(serial, intfdata); /* Set Device mode to D0 */ sierra_set_power_state(serial->dev, 0x0000); /* Check NMEA and set */ if (nmea) sierra_vsc_set_nmea(serial->dev, 1); return 0; } static void sierra_release(struct usb_serial *serial) { struct sierra_intf_private *intfdata; intfdata = usb_get_serial_data(serial); kfree(intfdata); } static int sierra_port_probe(struct usb_serial_port *port) { struct usb_serial *serial = port->serial; struct sierra_port_private *portdata; const struct sierra_iface_list *himemory_list; u8 ifnum; portdata = kzalloc(sizeof(*portdata), GFP_KERNEL); if (!portdata) return -ENOMEM; spin_lock_init(&portdata->lock); init_usb_anchor(&portdata->active); init_usb_anchor(&portdata->delayed); /* Assume low memory requirements */ portdata->num_out_urbs = N_OUT_URB; portdata->num_in_urbs = N_IN_URB; /* Determine actual memory requirements */ if (serial->num_ports == 1) { /* Get interface number for composite device */ ifnum = sierra_interface_num(serial); himemory_list = &typeB_interface_list; } else { /* This is really the usb-serial port number of the interface * rather than the interface number. 
*/ ifnum = port->port_number; himemory_list = &typeA_interface_list; } if (is_listed(ifnum, himemory_list)) { portdata->num_out_urbs = N_OUT_URB_HM; portdata->num_in_urbs = N_IN_URB_HM; } dev_dbg(&port->dev, "Memory usage (urbs) interface #%d, in=%d, out=%d\n", ifnum, portdata->num_in_urbs, portdata->num_out_urbs); usb_set_serial_port_data(port, portdata); return 0; } static void sierra_port_remove(struct usb_serial_port *port) { struct sierra_port_private *portdata; portdata = usb_get_serial_port_data(port); usb_set_serial_port_data(port, NULL); kfree(portdata); } #ifdef CONFIG_PM static void stop_read_write_urbs(struct usb_serial *serial) { int i; struct usb_serial_port *port; struct sierra_port_private *portdata; /* Stop reading/writing urbs */ for (i = 0; i < serial->num_ports; ++i) { port = serial->port[i]; portdata = usb_get_serial_port_data(port); if (!portdata) continue; sierra_stop_rx_urbs(port); usb_kill_anchored_urbs(&portdata->active); } } static int sierra_suspend(struct usb_serial *serial, pm_message_t message) { struct sierra_intf_private *intfdata = usb_get_serial_data(serial); spin_lock_irq(&intfdata->susp_lock); if (PMSG_IS_AUTO(message)) { if (intfdata->in_flight) { spin_unlock_irq(&intfdata->susp_lock); return -EBUSY; } } intfdata->suspended = 1; spin_unlock_irq(&intfdata->susp_lock); stop_read_write_urbs(serial); return 0; } /* Caller must hold susp_lock. */ static int sierra_submit_delayed_urbs(struct usb_serial_port *port) { struct sierra_port_private *portdata = usb_get_serial_port_data(port); struct sierra_intf_private *intfdata; struct urb *urb; int ec = 0; int err; intfdata = usb_get_serial_data(port->serial); for (;;) { urb = usb_get_from_anchor(&portdata->delayed); if (!urb) break; usb_anchor_urb(urb, &portdata->active); intfdata->in_flight++; err = usb_submit_urb(urb, GFP_ATOMIC); if (err) { dev_err(&port->dev, "%s - submit urb failed: %d", __func__, err); ec++; intfdata->in_flight--; usb_unanchor_urb(urb); kfree(urb->transfer_buffer); usb_free_urb(urb); spin_lock(&portdata->lock); portdata->outstanding_urbs--; spin_unlock(&portdata->lock); } } if (ec) return -EIO; return 0; } static int sierra_resume(struct usb_serial *serial) { struct usb_serial_port *port; struct sierra_intf_private *intfdata = usb_get_serial_data(serial); int ec = 0; int i, err; spin_lock_irq(&intfdata->susp_lock); for (i = 0; i < serial->num_ports; i++) { port = serial->port[i]; if (!tty_port_initialized(&port->port)) continue; err = sierra_submit_delayed_urbs(port); if (err) ec++; err = sierra_submit_rx_urbs(port, GFP_ATOMIC); if (err) ec++; } intfdata->suspended = 0; spin_unlock_irq(&intfdata->susp_lock); return ec ? 
-EIO : 0;
}

#else
#define sierra_suspend NULL
#define sierra_resume NULL
#endif

static struct usb_serial_driver sierra_device = {
	.driver = {
		.name = "sierra",
	},
	.description       = "Sierra USB modem",
	.id_table          = id_table,
	.calc_num_ports	   = sierra_calc_num_ports,
	.probe		   = sierra_probe,
	.open              = sierra_open,
	.close             = sierra_close,
	.dtr_rts	   = sierra_dtr_rts,
	.write             = sierra_write,
	.write_room        = sierra_write_room,
	.chars_in_buffer   = sierra_chars_in_buffer,
	.tiocmget          = sierra_tiocmget,
	.tiocmset          = sierra_tiocmset,
	.attach            = sierra_startup,
	.release           = sierra_release,
	.port_probe        = sierra_port_probe,
	.port_remove       = sierra_port_remove,
	.suspend           = sierra_suspend,
	.resume            = sierra_resume,
	.read_int_callback = sierra_instat_callback,
};

static struct usb_serial_driver * const serial_drivers[] = {
	&sierra_device, NULL
};

module_usb_serial_driver(serial_drivers, id_table);

MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_LICENSE("GPL v2");

module_param(nmea, bool, 0644);
MODULE_PARM_DESC(nmea, "NMEA streaming");
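/*
 * Illustrative sketch (editor's addition, not part of the driver):
 * sierra_write() above throttles writes by counting in-flight write
 * URBs under portdata->lock and reporting "no room" once the limit is
 * reached, while sierra_outdat_callback() decrements the counter on
 * completion. The same accounting shape in portable userspace C, with
 * a pthread mutex standing in for the spinlock; all names and the
 * exact limit check are simplified inventions for the example.
 */
#if 0
#include <pthread.h>
#include <stdbool.h>

struct example_port {
	pthread_mutex_t lock;	/* init with pthread_mutex_init() */
	int outstanding;	/* writes submitted but not yet completed */
	int limit;		/* cf. portdata->num_out_urbs above */
};

/* submit path: claim a slot, or report "no room" so the caller retries */
static bool example_claim_slot(struct example_port *p)
{
	bool ok;

	pthread_mutex_lock(&p->lock);
	ok = p->outstanding < p->limit;
	if (ok)
		p->outstanding++;
	pthread_mutex_unlock(&p->lock);
	return ok;
}

/* completion path: counterpart of sierra_outdat_callback()'s decrement */
static void example_release_slot(struct example_port *p)
{
	pthread_mutex_lock(&p->lock);
	p->outstanding--;
	pthread_mutex_unlock(&p->lock);
}
#endif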
/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */
#ifndef _LINUX_BPF_VERIFIER_H
#define _LINUX_BPF_VERIFIER_H 1

#include <linux/bpf.h> /* for enum bpf_reg_type */
#include <linux/btf.h> /* for struct btf and btf_id() */
#include <linux/filter.h> /* for MAX_BPF_STACK */
#include <linux/tnum.h>

/* Maximum variable offset umax_value permitted when resolving memory accesses.
 * In practice this is far bigger than any realistic pointer offset; this limit
 * ensures that umax_value + (int)off + (int)size cannot overflow a u64.
 */
#define BPF_MAX_VAR_OFF	(1 << 29)
/* Maximum variable size permitted for ARG_CONST_SIZE[_OR_ZERO].  This ensures
 * that converting umax_value to int cannot overflow.
 */
#define BPF_MAX_VAR_SIZ	(1 << 29)
/* size of tmp_str_buf in bpf_verifier.
 * we need at least 306 bytes to fit full stack mask representation
 * (in the "-8,-16,...,-512" form)
 */
#define TMP_STR_BUF_LEN 320
/* Patch buffer size */
#define INSN_BUF_SIZE 32

/* Liveness marks, used for registers and spilled-regs (in stack slots).
 * Read marks propagate upwards until they find a write mark; they record that
 * "one of this state's descendants read this reg" (and therefore the reg is
 * relevant for states_equal() checks).
 * Write marks collect downwards and do not propagate; they record that "the
 * straight-line code that reached this state (from its parent) wrote this reg"
 * (and therefore that reads propagated from this state or its descendants
 * should not propagate to its parent).
 * A state with a write mark can receive read marks; it just won't propagate
 * them to its parent, since the write mark is a property, not of the state,
 * but of the link between it and its parent.  See mark_reg_read() and
 * mark_stack_slot_read() in kernel/bpf/verifier.c.
 */
enum bpf_reg_liveness {
	REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */
	REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial value */
	REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */
	REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64,
	REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */
	REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */
};

#define ITER_PREFIX "bpf_iter_"

enum bpf_iter_state {
	BPF_ITER_STATE_INVALID, /* for non-first slot */
	BPF_ITER_STATE_ACTIVE,
	BPF_ITER_STATE_DRAINED,
};

struct bpf_reg_state {
	/* Ordering of fields matters.  See states_equal() */
	enum bpf_reg_type type;
	/*
	 * Fixed part of pointer offset, pointer types only.
	 * Or constant delta between "linked" scalars with the same ID.
*/ s32 off; union { /* valid when type == PTR_TO_PACKET */ int range; /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | * PTR_TO_MAP_VALUE_OR_NULL */ struct { struct bpf_map *map_ptr; /* To distinguish map lookups from outer map * the map_uid is non-zero for registers * pointing to inner maps. */ u32 map_uid; }; /* for PTR_TO_BTF_ID */ struct { struct btf *btf; u32 btf_id; }; struct { /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */ u32 mem_size; u32 dynptr_id; /* for dynptr slices */ }; /* For dynptr stack slots */ struct { enum bpf_dynptr_type type; /* A dynptr is 16 bytes so it takes up 2 stack slots. * We need to track which slot is the first slot * to protect against cases where the user may try to * pass in an address starting at the second slot of the * dynptr. */ bool first_slot; } dynptr; /* For bpf_iter stack slots */ struct { /* BTF container and BTF type ID describing * struct bpf_iter_<type> of an iterator state */ struct btf *btf; u32 btf_id; /* packing following two fields to fit iter state into 16 bytes */ enum bpf_iter_state state:2; int depth:30; } iter; /* For irq stack slots */ struct { enum { IRQ_NATIVE_KFUNC, IRQ_LOCK_KFUNC, } kfunc_class; } irq; /* Max size from any of the above. */ struct { unsigned long raw1; unsigned long raw2; } raw; u32 subprogno; /* for PTR_TO_FUNC */ }; /* For scalar types (SCALAR_VALUE), this represents our knowledge of * the actual value. * For pointer types, this represents the variable part of the offset * from the pointed-to object, and is shared with all bpf_reg_states * with the same id as us. */ struct tnum var_off; /* Used to determine if any memory access using this register will * result in a bad access. * These refer to the same value as var_off, not necessarily the actual * contents of the register. */ s64 smin_value; /* minimum possible (s64)value */ s64 smax_value; /* maximum possible (s64)value */ u64 umin_value; /* minimum possible (u64)value */ u64 umax_value; /* maximum possible (u64)value */ s32 s32_min_value; /* minimum possible (s32)value */ s32 s32_max_value; /* maximum possible (s32)value */ u32 u32_min_value; /* minimum possible (u32)value */ u32 u32_max_value; /* maximum possible (u32)value */ /* For PTR_TO_PACKET, used to find other pointers with the same variable * offset, so they can share range knowledge. * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we * came from, when one is tested for != NULL. * For PTR_TO_MEM_OR_NULL this is used to identify memory allocation * for the purpose of tracking that it's freed. * For PTR_TO_SOCKET this is used to share which pointers retain the * same reference to the socket, to determine proper reference freeing. * For stack slots that are dynptrs, this is used to track references to * the dynptr to determine proper reference freeing. * Similarly to dynptrs, we use ID to track "belonging" of a reference * to a specific instance of bpf_iter. */ /* * Upper bit of ID is used to remember relationship between "linked" * registers. Example: * r1 = r2; both will have r1->id == r2->id == N * r1 += 10; r1->id == N | BPF_ADD_CONST and r1->off == 10 */ #define BPF_ADD_CONST (1U << 31) u32 id; /* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned * from a pointer-cast helper, bpf_sk_fullsock() and * bpf_tcp_sock(). 
* * Consider the following where "sk" is a reference counted * pointer returned from "sk = bpf_sk_lookup_tcp();": * * 1: sk = bpf_sk_lookup_tcp(); * 2: if (!sk) { return 0; } * 3: fullsock = bpf_sk_fullsock(sk); * 4: if (!fullsock) { bpf_sk_release(sk); return 0; } * 5: tp = bpf_tcp_sock(fullsock); * 6: if (!tp) { bpf_sk_release(sk); return 0; } * 7: bpf_sk_release(sk); * 8: snd_cwnd = tp->snd_cwnd; // verifier will complain * * After bpf_sk_release(sk) at line 7, both "fullsock" ptr and * "tp" ptr should be invalidated also. In order to do that, * the reg holding "fullsock" and "sk" need to remember * the original refcounted ptr id (i.e. sk_reg->id) in ref_obj_id * such that the verifier can reset all regs which have * ref_obj_id matching the sk_reg->id. * * sk_reg->ref_obj_id is set to sk_reg->id at line 1. * sk_reg->id will stay as NULL-marking purpose only. * After NULL-marking is done, sk_reg->id can be reset to 0. * * After "fullsock = bpf_sk_fullsock(sk);" at line 3, * fullsock_reg->ref_obj_id is set to sk_reg->ref_obj_id. * * After "tp = bpf_tcp_sock(fullsock);" at line 5, * tp_reg->ref_obj_id is set to fullsock_reg->ref_obj_id * which is the same as sk_reg->ref_obj_id. * * From the verifier perspective, if sk, fullsock and tp * are not NULL, they are the same ptr with different * reg->type. In particular, bpf_sk_release(tp) is also * allowed and has the same effect as bpf_sk_release(sk). */ u32 ref_obj_id; /* parentage chain for liveness checking */ struct bpf_reg_state *parent; /* Inside the callee two registers can be both PTR_TO_STACK like * R1=fp-8 and R2=fp-8, but one of them points to this function stack * while another to the caller's stack. To differentiate them 'frameno' * is used which is an index in bpf_verifier_state->frame[] array * pointing to bpf_func_state. */ u32 frameno; /* Tracks subreg definition. The stored value is the insn_idx of the * writing insn. This is safe because subreg_def is used before any insn * patching which only happens after main verification finished. */ s32 subreg_def; enum bpf_reg_liveness live; /* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */ bool precise; }; enum bpf_stack_slot_type { STACK_INVALID, /* nothing was stored in this stack slot */ STACK_SPILL, /* register spilled into stack */ STACK_MISC, /* BPF program wrote some data into this slot */ STACK_ZERO, /* BPF program wrote constant zero */ /* A dynptr is stored in this stack slot. The type of dynptr * is stored in bpf_stack_state->spilled_ptr.dynptr.type */ STACK_DYNPTR, STACK_ITER, STACK_IRQ_FLAG, }; #define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ #define BPF_REGMASK_ARGS ((1 << BPF_REG_1) | (1 << BPF_REG_2) | \ (1 << BPF_REG_3) | (1 << BPF_REG_4) | \ (1 << BPF_REG_5)) #define BPF_DYNPTR_SIZE sizeof(struct bpf_dynptr_kern) #define BPF_DYNPTR_NR_SLOTS (BPF_DYNPTR_SIZE / BPF_REG_SIZE) struct bpf_stack_state { struct bpf_reg_state spilled_ptr; u8 slot_type[BPF_REG_SIZE]; }; struct bpf_reference_state { /* Each reference object has a type. Ensure REF_TYPE_PTR is zero to * default to pointer reference on zero initialization of a state. */ enum ref_state_type { REF_TYPE_PTR = (1 << 1), REF_TYPE_IRQ = (1 << 2), REF_TYPE_LOCK = (1 << 3), REF_TYPE_RES_LOCK = (1 << 4), REF_TYPE_RES_LOCK_IRQ = (1 << 5), REF_TYPE_LOCK_MASK = REF_TYPE_LOCK | REF_TYPE_RES_LOCK | REF_TYPE_RES_LOCK_IRQ, } type; /* Track each reference created with a unique id, even if the same * instruction creates the reference multiple times (eg, via CALL). 
*/ int id; /* Instruction where the allocation of this reference occurred. This * is used purely to inform the user of a reference leak. */ int insn_idx; /* Use to keep track of the source object of a lock, to ensure * it matches on unlock. */ void *ptr; }; struct bpf_retval_range { s32 minval; s32 maxval; }; /* state of the program: * type of all registers and stack info */ struct bpf_func_state { struct bpf_reg_state regs[MAX_BPF_REG]; /* index of call instruction that called into this func */ int callsite; /* stack frame number of this function state from pov of * enclosing bpf_verifier_state. * 0 = main function, 1 = first callee. */ u32 frameno; /* subprog number == index within subprog_info * zero == main subprog */ u32 subprogno; /* Every bpf_timer_start will increment async_entry_cnt. * It's used to distinguish: * void foo(void) { for(;;); } * void foo(void) { bpf_timer_set_callback(,foo); } */ u32 async_entry_cnt; struct bpf_retval_range callback_ret_range; bool in_callback_fn; bool in_async_callback_fn; bool in_exception_callback_fn; /* For callback calling functions that limit number of possible * callback executions (e.g. bpf_loop) keeps track of current * simulated iteration number. * Value in frame N refers to number of times callback with frame * N+1 was simulated, e.g. for the following call: * * bpf_loop(..., fn, ...); | suppose current frame is N * | fn would be simulated in frame N+1 * | number of simulations is tracked in frame N */ u32 callback_depth; /* The following fields should be last. See copy_func_state() */ /* The state of the stack. Each element of the array describes BPF_REG_SIZE * (i.e. 8) bytes worth of stack memory. * stack[0] represents bytes [*(r10-8)..*(r10-1)] * stack[1] represents bytes [*(r10-16)..*(r10-9)] * ... * stack[allocated_stack/8 - 1] represents [*(r10-allocated_stack)..*(r10-allocated_stack+7)] */ struct bpf_stack_state *stack; /* Size of the current stack, in bytes. The stack state is tracked below, in * `stack`. allocated_stack is always a multiple of BPF_REG_SIZE. */ int allocated_stack; }; #define MAX_CALL_FRAMES 8 /* instruction history flags, used in bpf_jmp_history_entry.flags field */ enum { /* instruction references stack slot through PTR_TO_STACK register; * we also store stack's frame number in lower 3 bits (MAX_CALL_FRAMES is 8) * and accessed stack slot's index in next 6 bits (MAX_BPF_STACK is 512, * 8 bytes per slot, so slot index (spi) is [0, 63]) */ INSN_F_FRAMENO_MASK = 0x7, /* 3 bits */ INSN_F_SPI_MASK = 0x3f, /* 6 bits */ INSN_F_SPI_SHIFT = 3, /* shifted 3 bits to the left */ INSN_F_STACK_ACCESS = BIT(9), INSN_F_DST_REG_STACK = BIT(10), /* dst_reg is PTR_TO_STACK */ INSN_F_SRC_REG_STACK = BIT(11), /* src_reg is PTR_TO_STACK */ /* total 12 bits are used now. 
*/ }; static_assert(INSN_F_FRAMENO_MASK + 1 >= MAX_CALL_FRAMES); static_assert(INSN_F_SPI_MASK + 1 >= MAX_BPF_STACK / 8); struct bpf_jmp_history_entry { u32 idx; /* insn idx can't be bigger than 1 million */ u32 prev_idx : 20; /* special INSN_F_xxx flags */ u32 flags : 12; /* additional registers that need precision tracking when this * jump is backtracked, vector of six 10-bit records */ u64 linked_regs; }; /* Maximum number of register states that can exist at once */ #define BPF_ID_MAP_SIZE ((MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) * MAX_CALL_FRAMES) struct bpf_verifier_state { /* call stack tracking */ struct bpf_func_state *frame[MAX_CALL_FRAMES]; struct bpf_verifier_state *parent; /* Acquired reference states */ struct bpf_reference_state *refs; /* * 'branches' field is the number of branches left to explore: * 0 - all possible paths from this state reached bpf_exit or * were safely pruned * 1 - at least one path is being explored. * This state hasn't reached bpf_exit * 2 - at least two paths are being explored. * This state is an immediate parent of two children. * One is fallthrough branch with branches==1 and another * state is pushed into stack (to be explored later) also with * branches==1. The parent of this state has branches==1. * The verifier state tree connected via 'parent' pointer looks like: * 1 * 1 * 2 -> 1 (first 'if' pushed into stack) * 1 * 2 -> 1 (second 'if' pushed into stack) * 1 * 1 * 1 bpf_exit. * * Once do_check() reaches bpf_exit, it calls update_branch_counts() * and the verifier state tree will look: * 1 * 1 * 2 -> 1 (first 'if' pushed into stack) * 1 * 1 -> 1 (second 'if' pushed into stack) * 0 * 0 * 0 bpf_exit. * After pop_stack() the do_check() will resume at second 'if'. * * If is_state_visited() sees a state with branches > 0 it means * there is a loop. If such state is exactly equal to the current state * it's an infinite loop. Note states_equal() checks for states * equivalency, so two states being 'states_equal' does not mean * infinite loop. The exact comparison is provided by * states_maybe_looping() function. It's a stronger pre-check and * much faster than states_equal(). * * This algorithm may not find all possible infinite loops or * loop iteration count may be too high. * In such cases BPF_COMPLEXITY_LIMIT_INSNS limit kicks in. */ u32 branches; u32 insn_idx; u32 curframe; u32 acquired_refs; u32 active_locks; u32 active_preempt_locks; u32 active_irq_id; u32 active_lock_id; void *active_lock_ptr; bool active_rcu_lock; bool speculative; bool in_sleepable; /* first and last insn idx of this verifier state */ u32 first_insn_idx; u32 last_insn_idx; /* if this state is a backedge state then equal_state * records cached state to which this state is equal. */ struct bpf_verifier_state *equal_state; /* jmp history recorded from first to last. * backtracking is using it to go from last to first. * For most states jmp_history_cnt is [0-3]. * For loops can go up to ~40. */ struct bpf_jmp_history_entry *jmp_history; u32 jmp_history_cnt; u32 dfs_depth; u32 callback_unroll_depth; u32 may_goto_depth; }; #define bpf_get_spilled_reg(slot, frame, mask) \ (((slot < frame->allocated_stack / BPF_REG_SIZE) && \ ((1 << frame->stack[slot].slot_type[BPF_REG_SIZE - 1]) & (mask))) \ ? &frame->stack[slot].spilled_ptr : NULL) /* Iterate over 'frame', setting 'reg' to either NULL or a spilled register. 
	 */
#define bpf_for_each_spilled_reg(iter, frame, reg, mask)		\
	for (iter = 0, reg = bpf_get_spilled_reg(iter, frame, mask);	\
	     iter < frame->allocated_stack / BPF_REG_SIZE;		\
	     iter++, reg = bpf_get_spilled_reg(iter, frame, mask))

#define bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, __mask, __expr) \
	({								\
		struct bpf_verifier_state *___vstate = __vst;		\
		int ___i, ___j;						\
		for (___i = 0; ___i <= ___vstate->curframe; ___i++) {	\
			struct bpf_reg_state *___regs;			\
			__state = ___vstate->frame[___i];		\
			___regs = __state->regs;			\
			for (___j = 0; ___j < MAX_BPF_REG; ___j++) {	\
				__reg = &___regs[___j];			\
				(void)(__expr);				\
			}						\
			bpf_for_each_spilled_reg(___j, __state, __reg, __mask) { \
				if (!__reg)				\
					continue;			\
				(void)(__expr);				\
			}						\
		}							\
	})

/* Invoke __expr over registers in __vst, setting __state and __reg */
#define bpf_for_each_reg_in_vstate(__vst, __state, __reg, __expr) \
	bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, 1 << STACK_SPILL, __expr)

/* linked list of verifier states used to prune search */
struct bpf_verifier_state_list {
	struct bpf_verifier_state state;
	struct list_head node;
	u32 miss_cnt;
	u32 hit_cnt:31;
	u32 in_free_list:1;
};

struct bpf_loop_inline_state {
	unsigned int initialized:1; /* set to true upon first entry */
	unsigned int fit_for_inline:1; /* true if callback function is the same
					* at each call and flags are always zero
					*/
	u32 callback_subprogno; /* valid when fit_for_inline is true */
};

/* pointer and state for maps */
struct bpf_map_ptr_state {
	struct bpf_map *map_ptr;
	bool poison;
	bool unpriv;
};

/* Possible states for alu_state member. */
#define BPF_ALU_SANITIZE_SRC		(1U << 0)
#define BPF_ALU_SANITIZE_DST		(1U << 1)
#define BPF_ALU_NEG_VALUE		(1U << 2)
#define BPF_ALU_NON_POINTER		(1U << 3)
#define BPF_ALU_IMMEDIATE		(1U << 4)
#define BPF_ALU_SANITIZE		(BPF_ALU_SANITIZE_SRC | \
					 BPF_ALU_SANITIZE_DST)

struct bpf_insn_aux_data {
	union {
		enum bpf_reg_type ptr_type;	/* pointer type for load/store insns */
		struct bpf_map_ptr_state map_ptr_state;
		s32 call_imm;			/* saved imm field of call insn */
		u32 alu_limit;			/* limit for add/sub register with pointer */
		struct {
			u32 map_index;		/* index into used_maps[] */
			u32 map_off;		/* offset from value base address */
		};
		struct {
			enum bpf_reg_type reg_type;	/* type of pseudo_btf_id */
			union {
				struct {
					struct btf *btf;
					u32 btf_id;	/* btf_id for struct typed var */
				};
				u32 mem_size;	/* mem_size for non-struct typed var */
			};
		} btf_var;
		/* if instruction is a call to bpf_loop this field tracks
		 * the state of the relevant registers to make decision about inlining
		 */
		struct bpf_loop_inline_state loop_inline_state;
	};
	union {
		/* remember the size of type passed to bpf_obj_new to rewrite R1 */
		u64 obj_new_size;
		/* remember the offset of node field within type to rewrite */
		u64 insert_off;
	};
	struct btf_struct_meta *kptr_struct_meta;
	u64 map_key_state; /* constant (32 bit) key tracking for maps */
	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
	u32 seen; /* this insn was processed by the verifier at env->pass_cnt */
	bool nospec; /* do not execute this instruction speculatively */
	bool nospec_result; /* result is unsafe under speculation, nospec must follow */
	bool zext_dst; /* this insn zero extends dst reg */
	bool needs_zext; /* alu op needs to clear upper bits */
	bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */
	bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */
	bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */
	u8 alu_state; /* used in combination
with alu_limit */ /* true if STX or LDX instruction is a part of a spill/fill * pattern for a bpf_fastcall call. */ u8 fastcall_pattern:1; /* for CALL instructions, a number of spill/fill pairs in the * bpf_fastcall pattern. */ u8 fastcall_spills_num:3; u8 arg_prog:4; /* below fields are initialized once */ unsigned int orig_idx; /* original instruction index */ bool jmp_point; bool prune_point; /* ensure we check state equivalence and save state checkpoint and * this instruction, regardless of any heuristics */ bool force_checkpoint; /* true if instruction is a call to a helper function that * accepts callback function as a parameter. */ bool calls_callback; /* * CFG strongly connected component this instruction belongs to, * zero if it is a singleton SCC. */ u32 scc; /* registers alive before this instruction. */ u16 live_regs_before; }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ #define MAX_USED_BTFS 64 /* max number of BTFs accessed by one BPF program */ #define BPF_VERIFIER_TMP_LOG_SIZE 1024 struct bpf_verifier_log { /* Logical start and end positions of a "log window" of the verifier log. * start_pos == 0 means we haven't truncated anything. * Once truncation starts to happen, start_pos + len_total == end_pos, * except during log reset situations, in which (end_pos - start_pos) * might get smaller than len_total (see bpf_vlog_reset()). * Generally, (end_pos - start_pos) gives number of useful data in * user log buffer. */ u64 start_pos; u64 end_pos; char __user *ubuf; u32 level; u32 len_total; u32 len_max; char kbuf[BPF_VERIFIER_TMP_LOG_SIZE]; }; #define BPF_LOG_LEVEL1 1 #define BPF_LOG_LEVEL2 2 #define BPF_LOG_STATS 4 #define BPF_LOG_FIXED 8 #define BPF_LOG_LEVEL (BPF_LOG_LEVEL1 | BPF_LOG_LEVEL2) #define BPF_LOG_MASK (BPF_LOG_LEVEL | BPF_LOG_STATS | BPF_LOG_FIXED) #define BPF_LOG_KERNEL (BPF_LOG_MASK + 1) /* kernel internal flag */ #define BPF_LOG_MIN_ALIGNMENT 8U #define BPF_LOG_ALIGNMENT 40U static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) { return log && log->level; } #define BPF_MAX_SUBPROGS 256 struct bpf_subprog_arg_info { enum bpf_arg_type arg_type; union { u32 mem_size; u32 btf_id; }; }; enum priv_stack_mode { PRIV_STACK_UNKNOWN, NO_PRIV_STACK, PRIV_STACK_ADAPTIVE, }; struct bpf_subprog_info { /* 'start' has to be the first field otherwise find_subprog() won't work */ u32 start; /* insn idx of function entry point */ u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u16 stack_depth; /* max. stack depth used by this function */ u16 stack_extra; /* offsets in range [stack_depth .. fastcall_stack_off) * are used for bpf_fastcall spills and fills. 
	 */
	s16 fastcall_stack_off;
	bool has_tail_call: 1;
	bool tail_call_reachable: 1;
	bool has_ld_abs: 1;
	bool is_cb: 1;
	bool is_async_cb: 1;
	bool is_exception_cb: 1;
	bool args_cached: 1;
	/* true if bpf_fastcall stack region is used by functions that can't be inlined */
	bool keep_fastcall_stack: 1;
	bool changes_pkt_data: 1;
	bool might_sleep: 1;

	enum priv_stack_mode priv_stack_mode;
	u8 arg_cnt;
	struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
};

struct bpf_verifier_env;

struct backtrack_state {
	struct bpf_verifier_env *env;
	u32 frame;
	u32 reg_masks[MAX_CALL_FRAMES];
	u64 stack_masks[MAX_CALL_FRAMES];
};

struct bpf_id_pair {
	u32 old;
	u32 cur;
};

struct bpf_idmap {
	u32 tmp_id_gen;
	struct bpf_id_pair map[BPF_ID_MAP_SIZE];
};

struct bpf_idset {
	u32 count;
	u32 ids[BPF_ID_MAP_SIZE];
};

/* see verifier.c:compute_scc_callchain() */
struct bpf_scc_callchain {
	/* call sites from bpf_verifier_state->frame[*]->callsite leading to this SCC */
	u32 callsites[MAX_CALL_FRAMES - 1];
	/* last frame in a chain is identified by SCC id */
	u32 scc;
};

/* verifier state waiting for propagate_backedges() */
struct bpf_scc_backedge {
	struct bpf_scc_backedge *next;
	struct bpf_verifier_state state;
};

struct bpf_scc_visit {
	struct bpf_scc_callchain callchain;
	/* first state in current verification path that entered SCC
	 * identified by the callchain
	 */
	struct bpf_verifier_state *entry_state;
	struct bpf_scc_backedge *backedges; /* list of backedges */
	u32 num_backedges;
};

/* An array of bpf_scc_visit structs sharing the same bpf_scc_callchain->scc
 * but having different bpf_scc_callchain->callsites.
 */
struct bpf_scc_info {
	u32 num_visits;
	struct bpf_scc_visit visits[];
};

/* single container for all structs
 * one verifier_env per bpf_check() call
 */
struct bpf_verifier_env {
	u32 insn_idx;
	u32 prev_insn_idx;
	struct bpf_prog *prog;		/* eBPF program being verified */
	const struct bpf_verifier_ops *ops;
	struct module *attach_btf_mod;	/* The owner module of prog->aux->attach_btf */
	struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */
	int stack_size;			/* number of states to be processed */
	bool strict_alignment;		/* perform strict pointer alignment checks */
	bool test_state_freq;		/* test verifier with different pruning frequency */
	bool test_reg_invariants;	/* fail verification on register invariants violations */
	struct bpf_verifier_state *cur_state; /* current verifier state */
	/* Search pruning optimization, array of list_heads for
	 * lists of struct bpf_verifier_state_list.
	 */
	struct list_head *explored_states;
	struct list_head free_list;	/* list of struct bpf_verifier_state_list */
	struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
	struct btf_mod_pair used_btfs[MAX_USED_BTFS]; /* array of BTF's used by BPF program */
	u32 used_map_cnt;		/* number of used maps */
	u32 used_btf_cnt;		/* number of used BTF objects */
	u32 id_gen;			/* used to generate unique reg IDs */
	u32 hidden_subprog_cnt;		/* number of hidden subprogs */
	int exception_callback_subprog;
	bool explore_alu_limits;
	bool allow_ptr_leaks;
	/* Allow access to uninitialized stack memory. Writes with fixed offset are
	 * always allowed, so this refers to reads (with fixed or variable offset),
	 * to writes with variable offset and to indirect (helper) accesses.
	 */
	bool allow_uninit_stack;
	bool bpf_capable;
	bool bypass_spec_v1;
	bool bypass_spec_v4;
	bool seen_direct_write;
	bool seen_exception;
	struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
	const struct bpf_line_info *prev_linfo;
	struct bpf_verifier_log log;
	struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 2]; /* max + 2 for the fake and exception subprogs */
	union {
		struct bpf_idmap idmap_scratch;
		struct bpf_idset idset_scratch;
	};
	struct {
		int *insn_state;
		int *insn_stack;
		/* vector of instruction indexes sorted in post-order */
		int *insn_postorder;
		int cur_stack;
		/* current position in the insn_postorder vector */
		int cur_postorder;
	} cfg;
	struct backtrack_state bt;
	struct bpf_jmp_history_entry *cur_hist_ent;
	u32 pass_cnt; /* number of times do_check() was called */
	u32 subprog_cnt;
	/* number of instructions analyzed by the verifier */
	u32 prev_insn_processed, insn_processed;
	/* number of jmps, calls, exits analyzed so far */
	u32 prev_jmps_processed, jmps_processed;
	/* total verification time */
	u64 verification_time;
	/* maximum number of verifier states kept in 'branching' instructions */
	u32 max_states_per_insn;
	/* total number of allocated verifier states */
	u32 total_states;
	/* some states are freed during program analysis.
	 * this is peak number of states. this number dominates kernel
	 * memory consumption during verification
	 */
	u32 peak_states;
	/* longest register parentage chain walked for liveness marking */
	u32 longest_mark_read_walk;
	u32 free_list_size;
	u32 explored_states_size;
	u32 num_backedges;
	bpfptr_t fd_array;

	/* bit mask to keep track of whether a register has been accessed
	 * since the last time the function state was printed
	 */
	u32 scratched_regs;
	/* Same as scratched_regs but for stack slots */
	u64 scratched_stack_slots;
	u64 prev_log_pos, prev_insn_print_pos;
	/* buffer used to temporarily hold constants as scalar registers */
	struct bpf_reg_state fake_reg[2];
	/* buffer used to generate temporary string representations,
	 * e.g., in reg_type_str() to generate reg_type string
	 */
	char tmp_str_buf[TMP_STR_BUF_LEN];
	struct bpf_insn insn_buf[INSN_BUF_SIZE];
	struct bpf_insn epilogue_buf[INSN_BUF_SIZE];
	struct bpf_scc_callchain callchain_buf;
	/* array of pointers to bpf_scc_info indexed by SCC id */
	struct bpf_scc_info **scc_info;
	u32 scc_cnt;
};

static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env,
						    int subprog)
{
	return &env->prog->aux->func_info_aux[subprog];
}

static inline struct bpf_subprog_info *subprog_info(struct bpf_verifier_env *env,
						    int subprog)
{
	return &env->subprog_info[subprog];
}

__printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log,
				      const char *fmt, va_list args);
__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
					   const char *fmt, ...);
__printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
			    const char *fmt, ...);
int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level,
		  char __user *log_buf, u32 log_size);
void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos);
int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual);
__printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env,
				  u32 insn_off,
				  const char *prefix_fmt, ...);

#define verifier_bug_if(cond, env, fmt, args...)			\
	({								\
		bool __cond = (cond);					\
		if (unlikely(__cond)) {					\
			BPF_WARN_ONCE(1, "verifier bug: " fmt "(" #cond ")\n", ##args); \
			bpf_log(&env->log, "verifier bug: " fmt "(" #cond ")\n", ##args); \
		}							\
		(__cond);						\
	})
#define verifier_bug(env, fmt, args...) \
verifier_bug_if(1, env, fmt, ##args) static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { struct bpf_verifier_state *cur = env->cur_state; return cur->frame[cur->curframe]; } static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) { return cur_func(env)->regs; } int bpf_prog_offload_verifier_prep(struct bpf_prog *prog); int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx); int bpf_prog_offload_finalize(struct bpf_verifier_env *env); void bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off, struct bpf_insn *insn); void bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt); /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, struct btf *btf, u32 btf_id) { if (tgt_prog) return ((u64)tgt_prog->aux->id << 32) | btf_id; else return ((u64)btf_obj_id(btf) << 32) | 0x80000000 | btf_id; } /* unpack the IDs from the key as constructed above */ static inline void bpf_trampoline_unpack_key(u64 key, u32 *obj_id, u32 *btf_id) { if (obj_id) *obj_id = key >> 32; if (btf_id) *btf_id = key & 0x7FFFFFFF; } int bpf_check_attach_target(struct bpf_verifier_log *log, const struct bpf_prog *prog, const struct bpf_prog *tgt_prog, u32 btf_id, struct bpf_attach_target_info *tgt_info); void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab); int mark_chain_precision(struct bpf_verifier_env *env, int regno); #define BPF_BASE_TYPE_MASK GENMASK(BPF_BASE_TYPE_BITS - 1, 0) /* extract base type from bpf_{arg, return, reg}_type. */ static inline u32 base_type(u32 type) { return type & BPF_BASE_TYPE_MASK; } /* extract flags from an extended type. See bpf_type_flag in bpf.h. */ static inline u32 type_flag(u32 type) { return type & ~BPF_BASE_TYPE_MASK; } /* only use after check_attach_btf_id() */ static inline enum bpf_prog_type resolve_prog_type(const struct bpf_prog *prog) { return (prog->type == BPF_PROG_TYPE_EXT && prog->aux->saved_dst_prog_type) ? 
        prog->aux->saved_dst_prog_type : prog->type;
}

static inline bool bpf_prog_check_recur(const struct bpf_prog *prog)
{
    switch (resolve_prog_type(prog)) {
    case BPF_PROG_TYPE_TRACING:
        return prog->expected_attach_type != BPF_TRACE_ITER;
    case BPF_PROG_TYPE_STRUCT_OPS:
        return prog->aux->jits_use_priv_stack;
    case BPF_PROG_TYPE_LSM:
        return false;
    default:
        return true;
    }
}

#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | PTR_TRUSTED | NON_OWN_REF)

static inline bool bpf_type_has_unsafe_modifiers(u32 type)
{
    return type_flag(type) & ~BPF_REG_TRUSTED_MODIFIERS;
}

static inline bool type_is_ptr_alloc_obj(u32 type)
{
    return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC;
}

static inline bool type_is_non_owning_ref(u32 type)
{
    return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF;
}

static inline bool type_is_pkt_pointer(enum bpf_reg_type type)
{
    type = base_type(type);
    return type == PTR_TO_PACKET || type == PTR_TO_PACKET_META;
}

static inline bool type_is_sk_pointer(enum bpf_reg_type type)
{
    return type == PTR_TO_SOCKET || type == PTR_TO_SOCK_COMMON ||
           type == PTR_TO_TCP_SOCK || type == PTR_TO_XDP_SOCK;
}

static inline bool type_may_be_null(u32 type)
{
    return type & PTR_MAYBE_NULL;
}

static inline void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno)
{
    env->scratched_regs |= 1U << regno;
}

static inline void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi)
{
    env->scratched_stack_slots |= 1ULL << spi;
}

static inline bool reg_scratched(const struct bpf_verifier_env *env, u32 regno)
{
    return (env->scratched_regs >> regno) & 1;
}

static inline bool stack_slot_scratched(const struct bpf_verifier_env *env, u64 regno)
{
    return (env->scratched_stack_slots >> regno) & 1;
}

static inline bool verifier_state_scratched(const struct bpf_verifier_env *env)
{
    return env->scratched_regs || env->scratched_stack_slots;
}

static inline void mark_verifier_state_clean(struct bpf_verifier_env *env)
{
    env->scratched_regs = 0U;
    env->scratched_stack_slots = 0ULL;
}

/* Used for printing the entire verifier state. */
static inline void mark_verifier_state_scratched(struct bpf_verifier_env *env)
{
    env->scratched_regs = ~0U;
    env->scratched_stack_slots = ~0ULL;
}

static inline bool bpf_stack_narrow_access_ok(int off, int fill_size, int spill_size)
{
#ifdef __BIG_ENDIAN
    off -= spill_size - fill_size;
#endif
    return !(off % BPF_REG_SIZE);
}

const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type);
const char *dynptr_type_str(enum bpf_dynptr_type type);
const char *iter_type_str(const struct btf *btf, u32 btf_id);
const char *iter_state_str(enum bpf_iter_state state);
void print_verifier_state(struct bpf_verifier_env *env,
                          const struct bpf_verifier_state *vstate,
                          u32 frameno, bool print_all);
void print_insn_state(struct bpf_verifier_env *env,
                      const struct bpf_verifier_state *vstate, u32 frameno);

#endif /* _LINUX_BPF_VERIFIER_H */
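/*
 * A minimal, self-contained sketch (not part of the header) of the
 * key-packing scheme used by bpf_trampoline_compute_key() and
 * bpf_trampoline_unpack_key() above: the object id lives in the high 32
 * bits, the BTF type id in the low 31 bits, and bit 31 flags a kernel-BTF
 * (no target prog) key. The values below are made up for illustration.
 */
#include <assert.h>
#include <stdint.h>

static uint64_t pack_key(uint32_t obj_id, uint32_t btf_id, int kernel_btf)
{
    return ((uint64_t)obj_id << 32) | (kernel_btf ? 0x80000000u : 0) | btf_id;
}

static void unpack_key(uint64_t key, uint32_t *obj_id, uint32_t *btf_id)
{
    *obj_id = key >> 32;          /* high word: prog or BTF object id */
    *btf_id = key & 0x7FFFFFFF;   /* low 31 bits: BTF type id */
}

int main(void)
{
    uint32_t obj, btf;

    unpack_key(pack_key(42, 7, 1), &obj, &btf);
    assert(obj == 42 && btf == 7);
    return 0;
}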
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RCULIST_BL_H
#define _LINUX_RCULIST_BL_H

/*
 * RCU-protected bl list version. See include/linux/list_bl.h.
 */
#include <linux/list_bl.h>
#include <linux/rcupdate.h>

static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h,
                                          struct hlist_bl_node *n)
{
    LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK);
    LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) !=
                                                LIST_BL_LOCKMASK);
    rcu_assign_pointer(h->first,
        (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK));
}

static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h)
{
    return (struct hlist_bl_node *)
        ((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) &
         ~LIST_BL_LOCKMASK);
}

/**
 * hlist_bl_del_rcu - deletes entry from hash list without re-initialization
 * @n: the element to delete from the hash list.
 *
 * Note: hlist_bl_unhashed() on entry does not return true after this,
 * the entry is in an undefined state. It is useful for RCU based
 * lockfree traversal.
 *
 * In particular, it means that we can not poison the forward
 * pointers that may still be used for walking the hash list.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_bl_add_head_rcu()
 * or hlist_bl_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_bl_for_each_entry().
 */
static inline void hlist_bl_del_rcu(struct hlist_bl_node *n)
{
    __hlist_bl_del(n);
    n->pprev = LIST_POISON2;
}

/**
 * hlist_bl_add_head_rcu
 * @n: the element to add to the hash list.
 * @h: the list to add to.
 *
 * Description:
 * Adds the specified element to the specified hlist_bl,
 * while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_bl_add_head_rcu()
 * or hlist_bl_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_bl_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs. Regardless of the type of CPU, the
 * list-traversal primitive must be guarded by rcu_read_lock().
 */
static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n,
                                         struct hlist_bl_head *h)
{
    struct hlist_bl_node *first;

    /* don't need hlist_bl_first_rcu because we're under lock */
    first = hlist_bl_first(h);

    n->next = first;
    if (first)
        first->pprev = &n->next;
    n->pprev = &h->first;

    /* need _rcu because we can have concurrent lock free readers */
    hlist_bl_set_first_rcu(h, n);
}

/**
 * hlist_bl_for_each_entry_rcu - iterate over rcu list of given type
 * @tpos: the type * to use as a loop cursor.
 * @pos: the &struct hlist_bl_node to use as a loop cursor.
 * @head: the head for your list.
 * @member: the name of the hlist_bl_node within the struct.
 *
 */
#define hlist_bl_for_each_entry_rcu(tpos, pos, head, member)            \
    for (pos = hlist_bl_first_rcu(head);                                \
        pos &&                                                          \
        ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; });    \
        pos = rcu_dereference_raw(pos->next))

#endif
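/*
 * A standalone sketch (not kernel code) of the trick the helpers above rely
 * on: hlist_bl stores a spinlock bit in bit 0 of the head's first pointer,
 * which is legal because list nodes are at least 2-byte aligned. LOCKMASK
 * below is a stand-in for LIST_BL_LOCKMASK, for illustration only.
 */
#include <assert.h>
#include <stdint.h>

#define LOCKMASK 1UL

static void *tag_locked(void *p)      /* set the lock bit */
{
    return (void *)((uintptr_t)p | LOCKMASK);
}

static void *strip_lock(void *p)      /* recover the real pointer */
{
    return (void *)((uintptr_t)p & ~LOCKMASK);
}

int main(void)
{
    int node;
    void *tagged = tag_locked(&node);

    assert(((uintptr_t)tagged & LOCKMASK) == LOCKMASK);
    assert(strip_lock(tagged) == (void *)&node);
    return 0;
}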
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  UDPLITE     An implementation of the UDP-Lite protocol (RFC 3828).
 *
 *  Authors:    Gerrit Renker <gerrit@erg.abdn.ac.uk>
 *
 *  Changes:
 *  Fixes:
 */

#define pr_fmt(fmt) "UDPLite: " fmt

#include <linux/export.h>
#include <linux/proc_fs.h>
#include "udp_impl.h"

struct udp_table udplite_table __read_mostly;
EXPORT_SYMBOL(udplite_table);

/* Designate sk as UDP-Lite socket */
static int udplite_sk_init(struct sock *sk)
{
    udp_init_sock(sk);
    pr_warn_once("UDP-Lite is deprecated and scheduled to be removed in 2025, "
                 "please contact the netdev mailing list\n");
    return 0;
}

static int udplite_rcv(struct sk_buff *skb)
{
    return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
}

static int udplite_err(struct sk_buff *skb, u32 info)
{
    return __udp4_lib_err(skb, info, &udplite_table);
}

static const struct net_protocol udplite_protocol = {
    .handler     = udplite_rcv,
    .err_handler = udplite_err,
    .no_policy   = 1,
};

struct proto udplite_prot = {
    .name               = "UDP-Lite",
    .owner              = THIS_MODULE,
    .close              = udp_lib_close,
    .connect            = ip4_datagram_connect,
    .disconnect         = udp_disconnect,
    .ioctl              = udp_ioctl,
    .init               = udplite_sk_init,
    .destroy            = udp_destroy_sock,
    .setsockopt         = udp_setsockopt,
    .getsockopt         = udp_getsockopt,
    .sendmsg            = udp_sendmsg,
    .recvmsg            = udp_recvmsg,
    .hash               = udp_lib_hash,
    .unhash             = udp_lib_unhash,
    .rehash             = udp_v4_rehash,
    .get_port           = udp_v4_get_port,
    .memory_allocated   = &net_aligned_data.udp_memory_allocated,
    .per_cpu_fw_alloc   = &udp_memory_per_cpu_fw_alloc,
    .sysctl_mem         = sysctl_udp_mem,
    .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
    .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min),
    .obj_size           = sizeof(struct udp_sock),
    .h.udp_table        = &udplite_table,
};
EXPORT_SYMBOL(udplite_prot);

static struct inet_protosw udplite4_protosw = {
    .type     = SOCK_DGRAM,
    .protocol = IPPROTO_UDPLITE,
    .prot     = &udplite_prot,
    .ops      = &inet_dgram_ops,
    .flags    = INET_PROTOSW_PERMANENT,
};

#ifdef CONFIG_PROC_FS
static struct udp_seq_afinfo udplite4_seq_afinfo = {
    .family    = AF_INET,
    .udp_table = &udplite_table,
};

static int __net_init udplite4_proc_init_net(struct net *net)
{
    if (!proc_create_net_data("udplite", 0444, net->proc_net, &udp_seq_ops,
                sizeof(struct udp_iter_state), &udplite4_seq_afinfo))
        return -ENOMEM;
    return 0;
}

static void __net_exit udplite4_proc_exit_net(struct net *net)
{
    remove_proc_entry("udplite", net->proc_net);
}

static struct pernet_operations udplite4_net_ops = {
    .init = udplite4_proc_init_net,
    .exit = udplite4_proc_exit_net,
};

static __init int udplite4_proc_init(void)
{
    return register_pernet_subsys(&udplite4_net_ops);
}
#else
static inline int udplite4_proc_init(void)
{
    return 0;
}
#endif

void __init udplite4_register(void)
{
    udp_table_init(&udplite_table, "UDP-Lite");
    if (proto_register(&udplite_prot, 1))
        goto out_register_err;

    if (inet_add_protocol(&udplite_protocol, IPPROTO_UDPLITE) < 0)
        goto out_unregister_proto;

    inet_register_protosw(&udplite4_protosw);

    if (udplite4_proc_init())
        pr_err("%s: Cannot register /proc!\n", __func__);
    return;

out_unregister_proto:
    proto_unregister(&udplite_prot);
out_register_err:
    pr_crit("%s: Cannot add UDP-Lite protocol\n", __func__);
}
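/*
 * The registration path above uses the common kernel goto-unwind idiom:
 * each failure label undoes exactly the steps that succeeded before it,
 * in reverse order. A minimal userspace sketch of the same shape, with
 * hypothetical setup_a()/setup_b() helpers standing in for the real
 * registrations:
 */
#include <stdio.h>

static int setup_a(void) { return 0; }      /* pretend registration: ok */
static void teardown_a(void) { }
static int setup_b(void) { return -1; }     /* pretend registration: fails */

static int register_all(void)
{
    if (setup_a())
        goto out_err;
    if (setup_b())
        goto out_undo_a;                    /* unwind only what succeeded */
    return 0;

out_undo_a:
    teardown_a();
out_err:
    fprintf(stderr, "registration failed\n");
    return -1;
}

int main(void)
{
    return register_all() ? 1 : 0;
}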
// SPDX-License-Identifier: GPL-2.0
#include <linux/module.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_ipv4.h>
#include <net/netfilter/nf_tables_ipv6.h>

static unsigned int nft_nat_do_chain(void *priv, struct sk_buff *skb,
                                     const struct nf_hook_state *state)
{
    struct nft_pktinfo pkt;

    nft_set_pktinfo(&pkt, skb, state);

    switch (state->pf) {
#ifdef CONFIG_NF_TABLES_IPV4
    case NFPROTO_IPV4:
        nft_set_pktinfo_ipv4(&pkt);
        break;
#endif
#ifdef CONFIG_NF_TABLES_IPV6
    case NFPROTO_IPV6:
        nft_set_pktinfo_ipv6(&pkt);
        break;
#endif
    default:
        break;
    }

    return nft_do_chain(&pkt, priv);
}

#ifdef CONFIG_NF_TABLES_IPV4
static const struct nft_chain_type nft_chain_nat_ipv4 = {
    .name      = "nat",
    .type      = NFT_CHAIN_T_NAT,
    .family    = NFPROTO_IPV4,
    .owner     = THIS_MODULE,
    .hook_mask = (1 << NF_INET_PRE_ROUTING) |
                 (1 << NF_INET_POST_ROUTING) |
                 (1 << NF_INET_LOCAL_OUT) |
                 (1 << NF_INET_LOCAL_IN),
    .hooks     = {
        [NF_INET_PRE_ROUTING]  = nft_nat_do_chain,
        [NF_INET_POST_ROUTING] = nft_nat_do_chain,
        [NF_INET_LOCAL_OUT]    = nft_nat_do_chain,
        [NF_INET_LOCAL_IN]     = nft_nat_do_chain,
    },
    .ops_register   = nf_nat_ipv4_register_fn,
    .ops_unregister = nf_nat_ipv4_unregister_fn,
};
#endif

#ifdef CONFIG_NF_TABLES_IPV6
static const struct nft_chain_type nft_chain_nat_ipv6 = {
    .name      = "nat",
    .type      = NFT_CHAIN_T_NAT,
    .family    = NFPROTO_IPV6,
    .owner     = THIS_MODULE,
    .hook_mask = (1 << NF_INET_PRE_ROUTING) |
                 (1 << NF_INET_POST_ROUTING) |
                 (1 << NF_INET_LOCAL_OUT) |
                 (1 << NF_INET_LOCAL_IN),
    .hooks     = {
        [NF_INET_PRE_ROUTING]  = nft_nat_do_chain,
        [NF_INET_POST_ROUTING] = nft_nat_do_chain,
        [NF_INET_LOCAL_OUT]    = nft_nat_do_chain,
        [NF_INET_LOCAL_IN]     = nft_nat_do_chain,
    },
    .ops_register   = nf_nat_ipv6_register_fn,
    .ops_unregister = nf_nat_ipv6_unregister_fn,
};
#endif

#ifdef CONFIG_NF_TABLES_INET
static int nft_nat_inet_reg(struct net *net, const struct nf_hook_ops *ops)
{
    return nf_nat_inet_register_fn(net, ops);
}

static void nft_nat_inet_unreg(struct net *net, const struct nf_hook_ops *ops)
{
    nf_nat_inet_unregister_fn(net, ops);
}

static const struct nft_chain_type nft_chain_nat_inet = {
    .name      = "nat",
    .type      = NFT_CHAIN_T_NAT,
    .family    = NFPROTO_INET,
    .owner     = THIS_MODULE,
    .hook_mask = (1 << NF_INET_PRE_ROUTING) |
                 (1 << NF_INET_LOCAL_IN) |
                 (1 << NF_INET_LOCAL_OUT) |
                 (1 << NF_INET_POST_ROUTING),
    .hooks     = {
        [NF_INET_PRE_ROUTING]  = nft_nat_do_chain,
        [NF_INET_LOCAL_IN]     = nft_nat_do_chain,
        [NF_INET_LOCAL_OUT]    = nft_nat_do_chain,
        [NF_INET_POST_ROUTING] = nft_nat_do_chain,
    },
    .ops_register   = nft_nat_inet_reg,
    .ops_unregister = nft_nat_inet_unreg,
};
#endif

static int __init nft_chain_nat_init(void)
{
#ifdef CONFIG_NF_TABLES_IPV6
    nft_register_chain_type(&nft_chain_nat_ipv6);
#endif
#ifdef CONFIG_NF_TABLES_IPV4
    nft_register_chain_type(&nft_chain_nat_ipv4);
#endif
#ifdef CONFIG_NF_TABLES_INET
    nft_register_chain_type(&nft_chain_nat_inet);
#endif

    return 0;
}

static void __exit nft_chain_nat_exit(void)
{
#ifdef CONFIG_NF_TABLES_IPV4
    nft_unregister_chain_type(&nft_chain_nat_ipv4);
#endif
#ifdef CONFIG_NF_TABLES_IPV6
    nft_unregister_chain_type(&nft_chain_nat_ipv6);
#endif
#ifdef CONFIG_NF_TABLES_INET
    nft_unregister_chain_type(&nft_chain_nat_inet);
#endif
}

module_init(nft_chain_nat_init);
module_exit(nft_chain_nat_exit);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("nftables network address translation support");
#ifdef CONFIG_NF_TABLES_IPV4
MODULE_ALIAS_NFT_CHAIN(AF_INET, "nat");
#endif
#ifdef CONFIG_NF_TABLES_IPV6
MODULE_ALIAS_NFT_CHAIN(AF_INET6, "nat");
#endif
#ifdef CONFIG_NF_TABLES_INET
MODULE_ALIAS_NFT_CHAIN(1, "nat"); /* NFPROTO_INET */
#endif
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Integer base 2 logarithm calculation
 *
 * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#ifndef _LINUX_LOG2_H
#define _LINUX_LOG2_H

#include <linux/types.h>
#include <linux/bitops.h>

/*
 * non-constant log of base 2 calculators
 * - the arch may override these in asm/bitops.h if they can be implemented
 *   more efficiently than using fls() and fls64()
 * - the arch is not required to handle n==0 if implementing the fallback
 */
#ifndef CONFIG_ARCH_HAS_ILOG2_U32
static __always_inline __attribute__((const))
int __ilog2_u32(u32 n)
{
    return fls(n) - 1;
}
#endif

#ifndef CONFIG_ARCH_HAS_ILOG2_U64
static __always_inline __attribute__((const))
int __ilog2_u64(u64 n)
{
    return fls64(n) - 1;
}
#endif

/**
 * is_power_of_2() - check if a value is a power of two
 * @n: the value to check
 *
 * Determine whether some value is a power of two, where zero is
 * *not* considered a power of two.
 * Return: true if @n is a power of 2, otherwise false.
 */
static __always_inline __attribute__((const))
bool is_power_of_2(unsigned long n)
{
    return (n != 0 && ((n & (n - 1)) == 0));
}

/**
 * __roundup_pow_of_two() - round up to nearest power of two
 * @n: value to round up
 */
static inline __attribute__((const))
unsigned long __roundup_pow_of_two(unsigned long n)
{
    return 1UL << fls_long(n - 1);
}

/**
 * __rounddown_pow_of_two() - round down to nearest power of two
 * @n: value to round down
 */
static inline __attribute__((const))
unsigned long __rounddown_pow_of_two(unsigned long n)
{
    return 1UL << (fls_long(n) - 1);
}

/**
 * const_ilog2 - log base 2 of 32-bit or a 64-bit constant unsigned value
 * @n: parameter
 *
 * Use this where sparse expects a true constant expression, e.g. for array
 * indices.
 */
#define const_ilog2(n)                  \
(                                       \
    __builtin_constant_p(n) ? (         \
        (n) < 2 ? 0 :                   \
        (n) & (1ULL << 63) ? 63 :       \
        (n) & (1ULL << 62) ? 62 :       \
        (n) & (1ULL << 61) ? 61 :       \
        (n) & (1ULL << 60) ? 60 :       \
        (n) & (1ULL << 59) ? 59 :       \
        (n) & (1ULL << 58) ? 58 :       \
        (n) & (1ULL << 57) ? 57 :       \
        (n) & (1ULL << 56) ? 56 :       \
        (n) & (1ULL << 55) ? 55 :       \
        (n) & (1ULL << 54) ? 54 :       \
        (n) & (1ULL << 53) ? 53 :       \
        (n) & (1ULL << 52) ? 52 :       \
        (n) & (1ULL << 51) ? 51 :       \
        (n) & (1ULL << 50) ? 50 :       \
        (n) & (1ULL << 49) ? 49 :       \
        (n) & (1ULL << 48) ? 48 :       \
        (n) & (1ULL << 47) ? 47 :       \
        (n) & (1ULL << 46) ? 46 :       \
        (n) & (1ULL << 45) ? 45 :       \
        (n) & (1ULL << 44) ? 44 :       \
        (n) & (1ULL << 43) ? 43 :       \
        (n) & (1ULL << 42) ? 42 :       \
        (n) & (1ULL << 41) ? 41 :       \
        (n) & (1ULL << 40) ? 40 :       \
        (n) & (1ULL << 39) ? 39 :       \
        (n) & (1ULL << 38) ? 38 :       \
        (n) & (1ULL << 37) ? 37 :       \
        (n) & (1ULL << 36) ? 36 :       \
        (n) & (1ULL << 35) ? 35 :       \
        (n) & (1ULL << 34) ? 34 :       \
        (n) & (1ULL << 33) ? 33 :       \
        (n) & (1ULL << 32) ? 32 :       \
        (n) & (1ULL << 31) ? 31 :       \
        (n) & (1ULL << 30) ? 30 :       \
        (n) & (1ULL << 29) ? 29 :       \
        (n) & (1ULL << 28) ? 28 :       \
        (n) & (1ULL << 27) ? 27 :       \
        (n) & (1ULL << 26) ? 26 :       \
        (n) & (1ULL << 25) ? 25 :       \
        (n) & (1ULL << 24) ? 24 :       \
        (n) & (1ULL << 23) ? 23 :       \
        (n) & (1ULL << 22) ? 22 :       \
        (n) & (1ULL << 21) ? 21 :       \
        (n) & (1ULL << 20) ? 20 :       \
        (n) & (1ULL << 19) ? 19 :       \
        (n) & (1ULL << 18) ? 18 :       \
        (n) & (1ULL << 17) ? 17 :       \
        (n) & (1ULL << 16) ? 16 :       \
        (n) & (1ULL << 15) ? 15 :       \
        (n) & (1ULL << 14) ? 14 :       \
        (n) & (1ULL << 13) ? 13 :       \
        (n) & (1ULL << 12) ? 12 :       \
        (n) & (1ULL << 11) ? 11 :       \
        (n) & (1ULL << 10) ? 10 :       \
        (n) & (1ULL <<  9) ?  9 :       \
        (n) & (1ULL <<  8) ?  8 :       \
        (n) & (1ULL <<  7) ?  7 :       \
        (n) & (1ULL <<  6) ?  6 :       \
        (n) & (1ULL <<  5) ?  5 :       \
        (n) & (1ULL <<  4) ?  4 :       \
        (n) & (1ULL <<  3) ?  3 :       \
        (n) & (1ULL <<  2) ?  2 :       \
        1) :                            \
    -1)

/**
 * ilog2 - log base 2 of 32-bit or a 64-bit unsigned value
 * @n: parameter
 *
 * constant-capable log of base 2 calculation
 * - this can be used to initialise global variables from constant data, hence
 *   the massive ternary operator construction
 *
 * selects the appropriately-sized optimised version depending on sizeof(n)
 */
#define ilog2(n)                        \
(                                       \
    __builtin_constant_p(n) ?           \
    ((n) < 2 ? 0 :                      \
     63 - __builtin_clzll(n)) :         \
    (sizeof(n) <= 4) ?                  \
    __ilog2_u32(n) :                    \
    __ilog2_u64(n)                      \
)

/**
 * roundup_pow_of_two - round the given value up to nearest power of two
 * @n: parameter
 *
 * round the given value up to the nearest power of two
 * - the result is undefined when n == 0
 * - this can be used to initialise global variables from constant data
 */
#define roundup_pow_of_two(n)           \
(                                       \
    __builtin_constant_p(n) ? (         \
        ((n) == 1) ? 1 :                \
        (1UL << (ilog2((n) - 1) + 1))   \
                   ) :                  \
    __roundup_pow_of_two(n)             \
)

/**
 * rounddown_pow_of_two - round the given value down to nearest power of two
 * @n: parameter
 *
 * round the given value down to the nearest power of two
 * - the result is undefined when n == 0
 * - this can be used to initialise global variables from constant data
 */
#define rounddown_pow_of_two(n)         \
(                                       \
    __builtin_constant_p(n) ? (         \
        (1UL << ilog2(n))) :            \
    __rounddown_pow_of_two(n)           \
)

static inline __attribute_const__
int __order_base_2(unsigned long n)
{
    return n > 1 ? ilog2(n - 1) + 1 : 0;
}

/**
 * order_base_2 - calculate the (rounded up) base 2 order of the argument
 * @n: parameter
 *
 * The first few values calculated by this routine:
 *  ob2(0) = 0
 *  ob2(1) = 0
 *  ob2(2) = 1
 *  ob2(3) = 2
 *  ob2(4) = 2
 *  ob2(5) = 3
 *  ... and so on.
 */
#define order_base_2(n)                 \
(                                       \
    __builtin_constant_p(n) ? (         \
        ((n) == 0 || (n) == 1) ? 0 :    \
        ilog2((n) - 1) + 1) :           \
    __order_base_2(n)                   \
)

static inline __attribute__((const))
int __bits_per(unsigned long n)
{
    if (n < 2)
        return 1;
    if (is_power_of_2(n))
        return order_base_2(n) + 1;
    return order_base_2(n);
}

/**
 * bits_per - calculate the number of bits required for the argument
 * @n: parameter
 *
 * This is constant-capable and can be used for compile time
 * initializations, e.g bitfields.
 *
 * The first few values calculated by this routine:
 *  bf(0) = 1
 *  bf(1) = 1
 *  bf(2) = 2
 *  bf(3) = 2
 *  bf(4) = 3
 *  ... and so on.
 */
#define bits_per(n)                     \
(                                       \
    __builtin_constant_p(n) ? (         \
        ((n) == 0 || (n) == 1)          \
            ? 1 : ilog2(n) + 1          \
    ) :                                 \
    __bits_per(n)                       \
)

/**
 * max_pow_of_two_factor - return highest power-of-2 factor
 * @n: parameter
 *
 * find highest power-of-2 which is evenly divisible into n.
 * 0 is returned for n == 0.
 */
static inline __attribute__((const))
unsigned int max_pow_of_two_factor(unsigned int n)
{
    return n & -n;
}

#endif /* _LINUX_LOG2_H */
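/*
 * A small userspace sketch of the bit tricks above, using the GCC/Clang
 * builtins the kernel helpers reduce to. Not kernel code; it only
 * demonstrates the semantics of ilog2(), roundup_pow_of_two() and
 * max_pow_of_two_factor().
 */
#include <assert.h>

static int ilog2_demo(unsigned long n)           /* floor(log2(n)), n > 0 */
{
    return (int)(sizeof(n) * 8 - 1) - __builtin_clzl(n);
}

static unsigned long roundup_p2(unsigned long n) /* undefined for n == 0 */
{
    return n == 1 ? 1 : 1UL << (ilog2_demo(n - 1) + 1);
}

int main(void)
{
    assert(ilog2_demo(1) == 0 && ilog2_demo(5) == 2);
    assert(roundup_p2(5) == 8 && roundup_p2(8) == 8);
    /* n & -n isolates the lowest set bit: the largest power of two
     * dividing n evenly, e.g. 12 = 0b1100 -> 4 */
    assert((12u & -12u) == 4);
    return 0;
}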
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SEQ_FILE_H
#define _LINUX_SEQ_FILE_H

#include <linux/types.h>
#include <linux/string.h>
#include <linux/string_helpers.h>
#include <linux/bug.h>
#include <linux/mutex.h>
#include <linux/nodemask.h>
#include <linux/fs.h>
#include <linux/cred.h>

struct seq_operations;

struct seq_file {
    char *buf;
    size_t size;
    size_t from;
    size_t count;
    size_t pad_until;
    loff_t index;
    loff_t read_pos;
    struct mutex lock;
    const struct seq_operations *op;
    int poll_event;
    const struct file *file;
    void *private;
};

struct seq_operations {
    void * (*start) (struct seq_file *m, loff_t *pos);
    void (*stop) (struct seq_file *m, void *v);
    void * (*next) (struct seq_file *m, void *v, loff_t *pos);
    int (*show) (struct seq_file *m, void *v);
};

#define SEQ_SKIP 1

/**
 * seq_has_overflowed - check if the buffer has overflowed
 * @m: the seq_file handle
 *
 * seq_files have a buffer which may overflow. When this happens a larger
 * buffer is reallocated and all the data will be printed again.
 * The overflow state is true when m->count == m->size.
 *
 * Returns true if the buffer received more than it can hold.
 */
static inline bool seq_has_overflowed(struct seq_file *m)
{
    return m->count == m->size;
}

/**
 * seq_get_buf - get buffer to write arbitrary data to
 * @m: the seq_file handle
 * @bufp: the beginning of the buffer is stored here
 *
 * Return the number of bytes available in the buffer, or zero if
 * there's no space.
 */
static inline size_t seq_get_buf(struct seq_file *m, char **bufp)
{
    BUG_ON(m->count > m->size);
    if (m->count < m->size)
        *bufp = m->buf + m->count;
    else
        *bufp = NULL;

    return m->size - m->count;
}

/**
 * seq_commit - commit data to the buffer
 * @m: the seq_file handle
 * @num: the number of bytes to commit
 *
 * Commit @num bytes of data written to a buffer previously acquired
 * by seq_get_buf(). To signal an error condition, or that the data
 * didn't fit in the available space, pass a negative @num value.
 */
static inline void seq_commit(struct seq_file *m, int num)
{
    if (num < 0) {
        m->count = m->size;
    } else {
        BUG_ON(m->count + num > m->size);
        m->count += num;
    }
}

/**
 * seq_setwidth - set padding width
 * @m: the seq_file handle
 * @size: the max number of bytes to pad.
 *
 * Call seq_setwidth() for setting max width, then call seq_printf() etc. and
 * finally call seq_pad() to pad the remaining bytes.
 */
static inline void seq_setwidth(struct seq_file *m, size_t size)
{
    m->pad_until = m->count + size;
}
void seq_pad(struct seq_file *m, char c);

char *mangle_path(char *s, const char *p, const char *esc);
int seq_open(struct file *, const struct seq_operations *);
ssize_t seq_read(struct file *, char __user *, size_t, loff_t *);
ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter);
loff_t seq_lseek(struct file *, loff_t, int);
int seq_release(struct inode *, struct file *);
int seq_write(struct seq_file *seq, const void *data, size_t len);

__printf(2, 0) void seq_vprintf(struct seq_file *m, const char *fmt, va_list args);
__printf(2, 3) void seq_printf(struct seq_file *m, const char *fmt, ...);

void seq_putc(struct seq_file *m, char c);
void __seq_puts(struct seq_file *m, const char *s);

static __always_inline void seq_puts(struct seq_file *m, const char *s)
{
    if (!__builtin_constant_p(*s))
        __seq_puts(m, s);
    else if (s[0] && !s[1])
        seq_putc(m, s[0]);
    else
        seq_write(m, s, __builtin_strlen(s));
}

void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter,
                               unsigned long long num, unsigned int width);
void seq_put_decimal_ull(struct seq_file *m, const char *delimiter,
                         unsigned long long num);
void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num);
void seq_put_hex_ll(struct seq_file *m, const char *delimiter,
                    unsigned long long v, unsigned int width);

void seq_escape_mem(struct seq_file *m, const char *src, size_t len,
                    unsigned int flags, const char *esc);

static inline void seq_escape_str(struct seq_file *m, const char *src,
                                  unsigned int flags, const char *esc)
{
    seq_escape_mem(m, src, strlen(src), flags, esc);
}

/**
 * seq_escape - print string into buffer, escaping some characters
 * @m: target buffer
 * @s: NULL-terminated string
 * @esc: set of characters that need escaping
 *
 * Puts string into buffer, replacing each occurrence of character from
 * @esc with usual octal escape.
 *
 * Use seq_has_overflowed() to check for errors.
 */
static inline void seq_escape(struct seq_file *m, const char *s, const char *esc)
{
    seq_escape_str(m, s, ESCAPE_OCTAL, esc);
}

void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
                  int rowsize, int groupsize, const void *buf, size_t len,
                  bool ascii);

int seq_path(struct seq_file *, const struct path *, const char *);
int seq_file_path(struct seq_file *, struct file *, const char *);
int seq_dentry(struct seq_file *, struct dentry *, const char *);
int seq_path_root(struct seq_file *m, const struct path *path,
                  const struct path *root, const char *esc);

void *single_start(struct seq_file *, loff_t *);
int single_open(struct file *, int (*)(struct seq_file *, void *), void *);
int single_open_size(struct file *, int (*)(struct seq_file *, void *), void *, size_t);
int single_release(struct inode *, struct file *);
void *__seq_open_private(struct file *, const struct seq_operations *, int);
int seq_open_private(struct file *, const struct seq_operations *, int);
int seq_release_private(struct inode *, struct file *);

#ifdef CONFIG_BINARY_PRINTF
__printf(2, 0)
void seq_bprintf(struct seq_file *m, const char *f, const u32 *binary);
#endif

#define DEFINE_SEQ_ATTRIBUTE(__name)                                    \
static int __name ## _open(struct inode *inode, struct file *file)      \
{                                                                       \
    int ret = seq_open(file, &__name ## _sops);                         \
    if (!ret && inode->i_private) {                                     \
        struct seq_file *seq_f = file->private_data;                    \
        seq_f->private = inode->i_private;                              \
    }                                                                   \
    return ret;                                                         \
}                                                                       \
                                                                        \
static const struct file_operations __name ## _fops = {                 \
    .owner   = THIS_MODULE,                                             \
    .open    = __name ## _open,                                         \
    .read    = seq_read,                                                \
    .llseek  = seq_lseek,                                               \
    .release = seq_release,                                             \
}

#define DEFINE_SHOW_ATTRIBUTE(__name)                                   \
static int __name ## _open(struct inode *inode, struct file *file)      \
{                                                                       \
    return single_open(file, __name ## _show, inode->i_private);        \
}                                                                       \
                                                                        \
static const struct file_operations __name ## _fops = {                 \
    .owner   = THIS_MODULE,                                             \
    .open    = __name ## _open,                                         \
    .read    = seq_read,                                                \
    .llseek  = seq_lseek,                                               \
    .release = single_release,                                          \
}

#define DEFINE_SHOW_STORE_ATTRIBUTE(__name)                             \
static int __name ## _open(struct inode *inode, struct file *file)      \
{                                                                       \
    return single_open(file, __name ## _show, inode->i_private);        \
}                                                                       \
                                                                        \
static const struct file_operations __name ## _fops = {                 \
    .owner   = THIS_MODULE,                                             \
    .open    = __name ## _open,                                         \
    .read    = seq_read,                                                \
    .write   = __name ## _write,                                        \
    .llseek  = seq_lseek,                                               \
    .release = single_release,                                          \
}

#define DEFINE_PROC_SHOW_ATTRIBUTE(__name)                              \
static int __name ## _open(struct inode *inode, struct file *file)      \
{                                                                       \
    return single_open(file, __name ## _show, pde_data(inode));         \
}                                                                       \
                                                                        \
static const struct proc_ops __name ## _proc_ops = {                    \
    .proc_open    = __name ## _open,                                    \
    .proc_read    = seq_read,                                           \
    .proc_lseek   = seq_lseek,                                          \
    .proc_release = single_release,                                     \
}

static inline struct user_namespace *seq_user_ns(struct seq_file *seq)
{
#ifdef CONFIG_USER_NS
    return seq->file->f_cred->user_ns;
#else
    extern struct user_namespace init_user_ns;
    return &init_user_ns;
#endif
}

/**
 * seq_show_option - display mount options with appropriate escapes.
 * @m: the seq_file handle
 * @name: the mount option name
 * @value: the mount option name's value, can be NULL
 */
static inline void seq_show_option(struct seq_file *m, const char *name,
                                   const char *value)
{
    seq_putc(m, ',');
    seq_escape(m, name, ",= \t\n\\");
    if (value) {
        seq_putc(m, '=');
        seq_escape(m, value, ", \t\n\\");
    }
}

/**
 * seq_show_option_n - display mount options with appropriate escapes
 *                     where @value must be a specific length (i.e.
 *                     not NUL-terminated).
 * @m: the seq_file handle
 * @name: the mount option name
 * @value: the mount option name's value, cannot be NULL
 * @length: the exact length of @value to display, must be constant expression
 *
 * This is a macro since this uses "length" to define the size of the
 * stack buffer.
 */
#define seq_show_option_n(m, name, value, length) {     \
    char val_buf[length + 1];                           \
    memcpy(val_buf, value, length);                     \
    val_buf[length] = '\0';                             \
    seq_show_option(m, name, val_buf);                  \
}

#define SEQ_START_TOKEN ((void *)1)

/*
 * Helpers for iteration over list_head-s in seq_files
 */
extern struct list_head *seq_list_start(struct list_head *head, loff_t pos);
extern struct list_head *seq_list_start_head(struct list_head *head, loff_t pos);
extern struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos);
extern struct list_head *seq_list_start_rcu(struct list_head *head, loff_t pos);
extern struct list_head *seq_list_start_head_rcu(struct list_head *head, loff_t pos);
extern struct list_head *seq_list_next_rcu(void *v, struct list_head *head, loff_t *ppos);

/*
 * Helpers for iteration over hlist_head-s in seq_files
 */
extern struct hlist_node *seq_hlist_start(struct hlist_head *head, loff_t pos);
extern struct hlist_node *seq_hlist_start_head(struct hlist_head *head, loff_t pos);
extern struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head, loff_t *ppos);
extern struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head, loff_t pos);
extern struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head, loff_t pos);
extern struct hlist_node *seq_hlist_next_rcu(void *v, struct hlist_head *head, loff_t *ppos);

/* Helpers for iterating over per-cpu hlist_head-s in seq_files */
extern struct hlist_node *seq_hlist_start_percpu(struct hlist_head __percpu *head, int *cpu, loff_t pos);
extern struct hlist_node *seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head, int *cpu, loff_t *pos);

void seq_file_init(void);
#endif
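/*
 * A hedged usage sketch of the single_open()/DEFINE_SHOW_ATTRIBUTE pattern
 * above, as it might appear in a hypothetical driver. "demo" and demo_init
 * are illustrative names, not part of seq_file.h.
 */
#include <linux/debugfs.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/seq_file.h>

static int demo_show(struct seq_file *m, void *v)
{
    seq_puts(m, "hello from seq_file\n");   /* write into m's buffer */
    seq_printf(m, "overflowed: %d\n", seq_has_overflowed(m));
    return 0;
}
DEFINE_SHOW_ATTRIBUTE(demo);    /* generates demo_open() and demo_fops */

static int __init demo_init(void)
{
    /* one read-only debugfs file backed by demo_show() */
    debugfs_create_file("demo", 0444, NULL, NULL, &demo_fops);
    return 0;
}
module_init(demo_init);
MODULE_LICENSE("GPL");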
/*
 * Copyright 2017 Red Hat
 * Parts ported from amdgpu (fence wait code).
 * Copyright 2016 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *
 */

/**
 * DOC: Overview
 *
 * DRM synchronisation objects (syncobj, see struct &drm_syncobj) provide a
 * container for a synchronization primitive which can be used by userspace
 * to explicitly synchronize GPU commands, can be shared between userspace
 * processes, and can be shared between different DRM drivers.
 * Their primary use-case is to implement Vulkan fences and semaphores.
 * The syncobj userspace API provides ioctls for several operations:
 *
 *  - Creation and destruction of syncobjs
 *  - Import and export of syncobjs to/from a syncobj file descriptor
 *  - Import and export a syncobj's underlying fence to/from a sync file
 *  - Reset a syncobj (set its fence to NULL)
 *  - Signal a syncobj (set a trivially signaled fence)
 *  - Wait for a syncobj's fence to appear and be signaled
 *
 * The syncobj userspace API also provides operations to manipulate a syncobj
 * in terms of a timeline of struct &dma_fence_chain rather than a single
 * struct &dma_fence, through the following operations:
 *
 *   - Signal a given point on the timeline
 *   - Wait for a given point to appear and/or be signaled
 *   - Import and export from/to a given point of a timeline
 *
 * At its core, a syncobj is simply a wrapper around a pointer to a struct
 * &dma_fence which may be NULL.
 * When a syncobj is first created, its pointer is either NULL or a pointer
 * to an already signaled fence depending on whether the
 * &DRM_SYNCOBJ_CREATE_SIGNALED flag is passed to
 * &DRM_IOCTL_SYNCOBJ_CREATE.
 *
 * If the syncobj is considered as a binary (its state is either signaled or
 * unsignaled) primitive, when GPU work is enqueued in a DRM driver to signal
 * the syncobj, the syncobj's fence is replaced with a fence which will be
 * signaled by the completion of that work.
 * If the syncobj is considered as a timeline primitive, when GPU work is
 * enqueued in a DRM driver to signal a given point of the syncobj, a new
 * struct &dma_fence_chain is created, pointing to the DRM driver's fence and
 * also pointing to the previous fence that was in the syncobj. The new struct
 * &dma_fence_chain fence replaces the syncobj's fence and will be signaled by
 * completion of the DRM driver's work and also any work associated with the
 * fence previously in the syncobj.
 *
 * When GPU work which waits on a syncobj is enqueued in a DRM driver, at the
 * time the work is enqueued, it waits on the syncobj's fence before
 * submitting the work to hardware. That fence is either:
 *
 *    - The syncobj's current fence if the syncobj is considered as a binary
 *      primitive.
 *    - The struct &dma_fence associated with a given point if the syncobj is
 *      considered as a timeline primitive.
 *
 * If the syncobj's fence is NULL or not present in the syncobj's timeline,
 * the enqueue operation is expected to fail.
 *
 * With binary syncobjs, all manipulation of the syncobj's fence happens in
 * terms of the current fence at the time the ioctl is called by userspace
 * regardless of whether that operation is an immediate host-side operation
 * (signal or reset) or an operation which is enqueued in some driver
 * queue. &DRM_IOCTL_SYNCOBJ_RESET and &DRM_IOCTL_SYNCOBJ_SIGNAL can be used
 * to manipulate a syncobj from the host by resetting its pointer to NULL or
 * setting its pointer to a fence which is already signaled.
 *
 * With a timeline syncobj, all manipulation of the syncobj's fence happens in
 * terms of a u64 value referring to a point in the timeline. See
 * dma_fence_chain_find_seqno() to see how a given point is found in the
 * timeline.
 *
 * Note that applications should be careful to always use the timeline set of
 * ioctls when dealing with a syncobj considered as a timeline. Using the
 * binary set of ioctls with a syncobj considered as a timeline could result
 * in incorrect synchronization. The use of binary syncobjs is supported
 * through the timeline set of ioctls by using a point value of 0; this
 * reproduces the behavior of the binary set of ioctls (for example, replacing
 * the syncobj's fence when signaling).
 *
 *
 * Host-side wait on syncobjs
 * --------------------------
 *
 * &DRM_IOCTL_SYNCOBJ_WAIT takes an array of syncobj handles and does a
 * host-side wait on all of the syncobj fences simultaneously.
 * If &DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL is set, the wait ioctl will wait on
 * all of the syncobj fences to be signaled before it returns.
 * Otherwise, it returns once at least one syncobj fence has been signaled
 * and the index of a signaled fence is written back to the client.
 *
 * Unlike the enqueued GPU work dependencies which fail if they see a NULL
 * fence in a syncobj, if &DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT is set,
 * the host-side wait will first wait for the syncobj to receive a non-NULL
 * fence and then wait on that fence.
 * If &DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT is not set and any one of the
 * syncobjs in the array has a NULL fence, -EINVAL will be returned.
 * Assuming the syncobj starts off with a NULL fence, this allows a client
 * to do a host wait in one thread (or process) which waits on GPU work
 * submitted in another thread (or process) without having to manually
 * synchronize between the two.
 * This requirement is inherited from the Vulkan fence API.
 *
 * If &DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE is set, the ioctl will also set
 * a fence deadline hint on the backing fences before waiting, to provide the
 * fence signaler with an appropriate sense of urgency. The deadline is
 * specified as an absolute &CLOCK_MONOTONIC value in units of ns.
 *
 * Similarly, &DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT takes an array of syncobj
 * handles as well as an array of u64 points and does a host-side wait on all
 * of syncobj fences at the given points simultaneously.
 *
 * &DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT also adds the ability to wait for a given
 * fence to materialize on the timeline without waiting for the fence to be
 * signaled by using the &DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE flag. This
 * requirement is inherited from the wait-before-signal behavior required by
 * the Vulkan timeline semaphore API.
 *
 * Alternatively, &DRM_IOCTL_SYNCOBJ_EVENTFD can be used to wait without
 * blocking: an eventfd will be signaled when the syncobj is. This is useful to
 * integrate the wait in an event loop.
 *
 *
 * Import/export of syncobjs
 * -------------------------
 *
 * &DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE and &DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD
 * provide two mechanisms for import/export of syncobjs.
 *
 * The first lets the client import or export an entire syncobj to a file
 * descriptor.
 * These fd's are opaque and have no other use case, except passing the
 * syncobj between processes.
 * All exported file descriptors and any syncobj handles created as a
 * result of importing those file descriptors own a reference to the
 * same underlying struct &drm_syncobj and the syncobj can be used
 * persistently across all the processes with which it is shared.
 * The syncobj is freed only once the last reference is dropped.
 * Unlike dma-buf, importing a syncobj creates a new handle (with its own
 * reference) for every import instead of de-duplicating.
 * The primary use-case of this persistent import/export is for shared
 * Vulkan fences and semaphores.
 *
 * The second import/export mechanism, which is indicated by
 * &DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_IMPORT_SYNC_FILE or
 * &DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE lets the client
 * import/export the syncobj's current fence from/to a &sync_file.
 * When a syncobj is exported to a sync file, that sync file wraps the
 * syncobj's fence at the time of export and any later signal or reset
 * operations on the syncobj will not affect the exported sync file.
 * When a sync file is imported into a syncobj, the syncobj's fence is set
 * to the fence wrapped by that sync file.
 * Because sync files are immutable, resetting or signaling the syncobj
 * will not affect any sync files whose fences have been imported into the
 * syncobj.
 *
 *
 * Import/export of timeline points in timeline syncobjs
 * ------------------------------------------------------
 *
 * &DRM_IOCTL_SYNCOBJ_TRANSFER provides a mechanism to transfer a struct
 * &dma_fence_chain of a syncobj at a given u64 point to another u64 point
 * into another syncobj.
 *
 * Note that if you want to transfer a struct &dma_fence_chain from a given
 * point on a timeline syncobj from/into a binary syncobj, you can use the
 * point 0 to mean take/replace the fence in the syncobj.
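 *
 * Userspace usage sketch
 * ----------------------
 *
 * A hedged illustration (added here for this overview, not a normative
 * example) of the host-side path described above, using the uapi structures
 * from <drm/drm.h>. Error handling is elided; fd is assumed to be an open
 * DRM device file descriptor and deadline_ns a caller-chosen absolute
 * &CLOCK_MONOTONIC timeout:
 *
 *     struct drm_syncobj_create create = { .flags = 0 };
 *     ioctl(fd, DRM_IOCTL_SYNCOBJ_CREATE, &create);
 *
 *     struct drm_syncobj_wait wait = {
 *             .handles = (uintptr_t)&create.handle,
 *             .count_handles = 1,
 *             .timeout_nsec = deadline_ns,
 *             .flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT,
 *     };
 *     ioctl(fd, DRM_IOCTL_SYNCOBJ_WAIT, &wait);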
 */

#include <linux/anon_inodes.h>
#include <linux/dma-fence-unwrap.h>
#include <linux/eventfd.h>
#include <linux/export.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include <linux/sync_file.h>
#include <linux/uaccess.h>

#include <drm/drm.h>
#include <drm/drm_drv.h>
#include <drm/drm_file.h>
#include <drm/drm_gem.h>
#include <drm/drm_print.h>
#include <drm/drm_syncobj.h>
#include <drm/drm_utils.h>

#include "drm_internal.h"

struct syncobj_wait_entry {
    struct list_head node;
    struct task_struct *task;
    struct dma_fence *fence;
    struct dma_fence_cb fence_cb;
    u64 point;
};

static void syncobj_wait_syncobj_func(struct drm_syncobj *syncobj,
                                      struct syncobj_wait_entry *wait);

struct syncobj_eventfd_entry {
    struct list_head node;
    struct dma_fence *fence;
    struct dma_fence_cb fence_cb;
    struct drm_syncobj *syncobj;
    struct eventfd_ctx *ev_fd_ctx;
    u64 point;
    u32 flags;
};

static void
syncobj_eventfd_entry_func(struct drm_syncobj *syncobj,
                           struct syncobj_eventfd_entry *entry);

/**
 * drm_syncobj_find - lookup and reference a sync object.
 * @file_private: drm file private pointer
 * @handle: sync object handle to lookup.
 *
 * Returns a reference to the syncobj pointed to by handle or NULL. The
 * reference must be released by calling drm_syncobj_put().
 */
struct drm_syncobj *drm_syncobj_find(struct drm_file *file_private,
                                     u32 handle)
{
    struct drm_syncobj *syncobj;

    spin_lock(&file_private->syncobj_table_lock);

    /* Check if we currently have a reference on the object */
    syncobj = idr_find(&file_private->syncobj_idr, handle);
    if (syncobj)
        drm_syncobj_get(syncobj);

    spin_unlock(&file_private->syncobj_table_lock);

    return syncobj;
}
EXPORT_SYMBOL(drm_syncobj_find);

static void drm_syncobj_fence_add_wait(struct drm_syncobj *syncobj,
                                       struct syncobj_wait_entry *wait)
{
    struct dma_fence *fence;

    if (wait->fence)
        return;

    spin_lock(&syncobj->lock);
    /* We've already tried once to get a fence and failed. Now that we
     * have the lock, try one more time just to be sure we don't add a
     * callback when a fence has already been set.
     */
    fence = dma_fence_get(rcu_dereference_protected(syncobj->fence, 1));
    if (!fence || dma_fence_chain_find_seqno(&fence, wait->point)) {
        dma_fence_put(fence);
        list_add_tail(&wait->node, &syncobj->cb_list);
    } else if (!fence) {
        wait->fence = dma_fence_get_stub();
    } else {
        wait->fence = fence;
    }
    spin_unlock(&syncobj->lock);
}

static void drm_syncobj_remove_wait(struct drm_syncobj *syncobj,
                                    struct syncobj_wait_entry *wait)
{
    if (!wait->node.next)
        return;

    spin_lock(&syncobj->lock);
    list_del_init(&wait->node);
    spin_unlock(&syncobj->lock);
}

static void
syncobj_eventfd_entry_free(struct syncobj_eventfd_entry *entry)
{
    eventfd_ctx_put(entry->ev_fd_ctx);
    dma_fence_put(entry->fence);
    /* This happens either inside the syncobj lock, or after the node has
     * already been removed from the list.
     */
    list_del(&entry->node);
    kfree(entry);
}

static void
drm_syncobj_add_eventfd(struct drm_syncobj *syncobj,
                        struct syncobj_eventfd_entry *entry)
{
    spin_lock(&syncobj->lock);
    list_add_tail(&entry->node, &syncobj->ev_fd_list);
    syncobj_eventfd_entry_func(syncobj, entry);
    spin_unlock(&syncobj->lock);
}

/**
 * drm_syncobj_add_point - add new timeline point to the syncobj
 * @syncobj: sync object to add timeline point to
 * @chain: chain node to use to add the point
 * @fence: fence to encapsulate in the chain node
 * @point: sequence number to use for the point
 *
 * Add the chain node as new timeline point to the syncobj.
 */
void drm_syncobj_add_point(struct drm_syncobj *syncobj,
                           struct dma_fence_chain *chain,
                           struct dma_fence *fence,
                           uint64_t point)
{
    struct syncobj_wait_entry *wait_cur, *wait_tmp;
    struct syncobj_eventfd_entry *ev_fd_cur, *ev_fd_tmp;
    struct dma_fence *prev;

    dma_fence_get(fence);

    spin_lock(&syncobj->lock);

    prev = drm_syncobj_fence_get(syncobj);
    /* Adding an out-of-order point to the timeline can cause the payload
     * returned from query_ioctl to be 0!
     */
    if (prev && prev->seqno >= point)
        DRM_DEBUG("You are adding an unordered point to timeline!\n");
    dma_fence_chain_init(chain, prev, fence, point);
    rcu_assign_pointer(syncobj->fence, &chain->base);

    list_for_each_entry_safe(wait_cur, wait_tmp, &syncobj->cb_list, node)
        syncobj_wait_syncobj_func(syncobj, wait_cur);
    list_for_each_entry_safe(ev_fd_cur, ev_fd_tmp, &syncobj->ev_fd_list, node)
        syncobj_eventfd_entry_func(syncobj, ev_fd_cur);
    spin_unlock(&syncobj->lock);

    /* Walk the chain once to trigger garbage collection */
    dma_fence_chain_for_each(fence, prev);
    dma_fence_put(prev);
}
EXPORT_SYMBOL(drm_syncobj_add_point);

/**
 * drm_syncobj_replace_fence - replace fence in a sync object.
 * @syncobj: Sync object to replace fence in
 * @fence: fence to install in sync file.
 *
 * This replaces the fence on a sync object.
 */
void drm_syncobj_replace_fence(struct drm_syncobj *syncobj,
                               struct dma_fence *fence)
{
    struct dma_fence *old_fence;
    struct syncobj_wait_entry *wait_cur, *wait_tmp;
    struct syncobj_eventfd_entry *ev_fd_cur, *ev_fd_tmp;

    if (fence)
        dma_fence_get(fence);

    spin_lock(&syncobj->lock);

    old_fence = rcu_dereference_protected(syncobj->fence,
                                          lockdep_is_held(&syncobj->lock));
    rcu_assign_pointer(syncobj->fence, fence);

    if (fence != old_fence) {
        list_for_each_entry_safe(wait_cur, wait_tmp, &syncobj->cb_list, node)
            syncobj_wait_syncobj_func(syncobj, wait_cur);
        list_for_each_entry_safe(ev_fd_cur, ev_fd_tmp, &syncobj->ev_fd_list, node)
            syncobj_eventfd_entry_func(syncobj, ev_fd_cur);
    }

    spin_unlock(&syncobj->lock);

    dma_fence_put(old_fence);
}
EXPORT_SYMBOL(drm_syncobj_replace_fence);

/**
 * drm_syncobj_assign_null_handle - assign a stub fence to the sync object
 * @syncobj: sync object to assign the fence on
 *
 * Assign an already signaled stub fence to the sync object.
 */
static int drm_syncobj_assign_null_handle(struct drm_syncobj *syncobj)
{
    struct dma_fence *fence = dma_fence_allocate_private_stub(ktime_get());

    if (!fence)
        return -ENOMEM;

    drm_syncobj_replace_fence(syncobj, fence);
    dma_fence_put(fence);
    return 0;
}

/* 5s default for wait submission */
#define DRM_SYNCOBJ_WAIT_FOR_SUBMIT_TIMEOUT 5000000000ULL
/**
 * drm_syncobj_find_fence - lookup and reference the fence in a sync object
 * @file_private: drm file private pointer
 * @handle: sync object handle to lookup.
 * @point: timeline point
 * @flags: DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT or not
 * @fence: out parameter for the fence
 *
 * This is just a convenience function that combines drm_syncobj_find() and
 * drm_syncobj_fence_get().
 *
 * Returns 0 on success or a negative error value on failure. On success @fence
 * contains a reference to the fence, which must be released by calling
 * dma_fence_put().
 */
int drm_syncobj_find_fence(struct drm_file *file_private,
                           u32 handle, u64 point, u64 flags,
                           struct dma_fence **fence)
{
    struct drm_syncobj *syncobj = drm_syncobj_find(file_private, handle);
    struct syncobj_wait_entry wait;
    u64 timeout = nsecs_to_jiffies64(DRM_SYNCOBJ_WAIT_FOR_SUBMIT_TIMEOUT);
    int ret;

    if (flags & ~DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT)
        return -EINVAL;

    if (!syncobj)
        return -ENOENT;

    /* Waiting for userspace while holding locks is illegal because it
     * can trivially deadlock with page faults, for example. Make lockdep
     * complain about it early on.
     */
    if (flags & DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT) {
        might_sleep();
        lockdep_assert_none_held_once();
    }

    *fence = drm_syncobj_fence_get(syncobj);

    if (*fence) {
        ret = dma_fence_chain_find_seqno(fence, point);
        if (!ret) {
            /* If the requested seqno is already signaled
             * drm_syncobj_find_fence may return a NULL
             * fence. To make sure the recipient gets
             * signalled, use a new fence instead.
             */
            if (!*fence)
                *fence = dma_fence_get_stub();

            goto out;
        }
        dma_fence_put(*fence);
    } else {
        ret = -EINVAL;
    }

    if (!(flags & DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT))
        goto out;

    memset(&wait, 0, sizeof(wait));
    wait.task = current;
    wait.point = point;
    drm_syncobj_fence_add_wait(syncobj, &wait);

    do {
        set_current_state(TASK_INTERRUPTIBLE);
        if (wait.fence) {
            ret = 0;
            break;
        }
        if (timeout == 0) {
            ret = -ETIME;
            break;
        }

        if (signal_pending(current)) {
            ret = -ERESTARTSYS;
            break;
        }

        timeout = schedule_timeout(timeout);
    } while (1);

    __set_current_state(TASK_RUNNING);
    *fence = wait.fence;

    if (wait.node.next)
        drm_syncobj_remove_wait(syncobj, &wait);

out:
    drm_syncobj_put(syncobj);

    return ret;
}
EXPORT_SYMBOL(drm_syncobj_find_fence);

/**
 * drm_syncobj_free - free a sync object.
 * @kref: kref to free.
 *
 * Only to be called from kref_put in drm_syncobj_put.
 */
void drm_syncobj_free(struct kref *kref)
{
    struct drm_syncobj *syncobj = container_of(kref,
                                               struct drm_syncobj,
                                               refcount);
    struct syncobj_eventfd_entry *ev_fd_cur, *ev_fd_tmp;

    drm_syncobj_replace_fence(syncobj, NULL);

    list_for_each_entry_safe(ev_fd_cur, ev_fd_tmp, &syncobj->ev_fd_list, node)
        syncobj_eventfd_entry_free(ev_fd_cur);

    kfree(syncobj);
}
EXPORT_SYMBOL(drm_syncobj_free);

/**
 * drm_syncobj_create - create a new syncobj
 * @out_syncobj: returned syncobj
 * @flags: DRM_SYNCOBJ_* flags
 * @fence: if non-NULL, the syncobj will represent this fence
 *
 * This is the first function to create a sync object. After creating, drivers
 * probably want to make it available to userspace, either through
 * drm_syncobj_get_handle() or drm_syncobj_get_fd().
 *
 * Returns 0 on success or a negative error value on failure.
 */
int drm_syncobj_create(struct drm_syncobj **out_syncobj, uint32_t flags,
                       struct dma_fence *fence)
{
    int ret;
    struct drm_syncobj *syncobj;

    syncobj = kzalloc(sizeof(struct drm_syncobj), GFP_KERNEL);
    if (!syncobj)
        return -ENOMEM;

    kref_init(&syncobj->refcount);
    INIT_LIST_HEAD(&syncobj->cb_list);
    INIT_LIST_HEAD(&syncobj->ev_fd_list);
    spin_lock_init(&syncobj->lock);

    if (flags & DRM_SYNCOBJ_CREATE_SIGNALED) {
        ret = drm_syncobj_assign_null_handle(syncobj);
        if (ret < 0) {
            drm_syncobj_put(syncobj);
            return ret;
        }
    }

    if (fence)
        drm_syncobj_replace_fence(syncobj, fence);

    *out_syncobj = syncobj;
    return 0;
}
EXPORT_SYMBOL(drm_syncobj_create);

/**
 * drm_syncobj_get_handle - get a handle from a syncobj
 * @file_private: drm file private pointer
 * @syncobj: Sync object to export
 * @handle: out parameter with the new handle
 *
 * Exports a sync object created with drm_syncobj_create() as a handle on
 * @file_private to userspace.
* * Returns 0 on success or a negative error value on failure. */ int drm_syncobj_get_handle(struct drm_file *file_private, struct drm_syncobj *syncobj, u32 *handle) { int ret; /* take a reference to put in the idr */ drm_syncobj_get(syncobj); idr_preload(GFP_KERNEL); spin_lock(&file_private->syncobj_table_lock); ret = idr_alloc(&file_private->syncobj_idr, syncobj, 1, 0, GFP_NOWAIT); spin_unlock(&file_private->syncobj_table_lock); idr_preload_end(); if (ret < 0) { drm_syncobj_put(syncobj); return ret; } *handle = ret; return 0; } EXPORT_SYMBOL(drm_syncobj_get_handle); static int drm_syncobj_create_as_handle(struct drm_file *file_private, u32 *handle, uint32_t flags) { int ret; struct drm_syncobj *syncobj; ret = drm_syncobj_create(&syncobj, flags, NULL); if (ret) return ret; ret = drm_syncobj_get_handle(file_private, syncobj, handle); drm_syncobj_put(syncobj); return ret; } static int drm_syncobj_destroy(struct drm_file *file_private, u32 handle) { struct drm_syncobj *syncobj; spin_lock(&file_private->syncobj_table_lock); syncobj = idr_remove(&file_private->syncobj_idr, handle); spin_unlock(&file_private->syncobj_table_lock); if (!syncobj) return -EINVAL; drm_syncobj_put(syncobj); return 0; } static int drm_syncobj_file_release(struct inode *inode, struct file *file) { struct drm_syncobj *syncobj = file->private_data; drm_syncobj_put(syncobj); return 0; } static const struct file_operations drm_syncobj_file_fops = { .release = drm_syncobj_file_release, }; /** * drm_syncobj_get_fd - get a file descriptor from a syncobj * @syncobj: Sync object to export * @p_fd: out parameter with the new file descriptor * * Exports a sync object created with drm_syncobj_create() as a file descriptor. * * Returns 0 on success or a negative error value on failure. */ int drm_syncobj_get_fd(struct drm_syncobj *syncobj, int *p_fd) { struct file *file; int fd; fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) return fd; file = anon_inode_getfile("syncobj_file", &drm_syncobj_file_fops, syncobj, 0); if (IS_ERR(file)) { put_unused_fd(fd); return PTR_ERR(file); } drm_syncobj_get(syncobj); fd_install(fd, file); *p_fd = fd; return 0; } EXPORT_SYMBOL(drm_syncobj_get_fd); static int drm_syncobj_handle_to_fd(struct drm_file *file_private, u32 handle, int *p_fd) { struct drm_syncobj *syncobj = drm_syncobj_find(file_private, handle); int ret; if (!syncobj) return -EINVAL; ret = drm_syncobj_get_fd(syncobj, p_fd); drm_syncobj_put(syncobj); return ret; } static int drm_syncobj_fd_to_handle(struct drm_file *file_private, int fd, u32 *handle) { struct drm_syncobj *syncobj; CLASS(fd, f)(fd); int ret; if (fd_empty(f)) return -EINVAL; if (fd_file(f)->f_op != &drm_syncobj_file_fops) return -EINVAL; /* take a reference to put in the idr */ syncobj = fd_file(f)->private_data; drm_syncobj_get(syncobj); idr_preload(GFP_KERNEL); spin_lock(&file_private->syncobj_table_lock); ret = idr_alloc(&file_private->syncobj_idr, syncobj, 1, 0, GFP_NOWAIT); spin_unlock(&file_private->syncobj_table_lock); idr_preload_end(); if (ret > 0) { *handle = ret; ret = 0; } else drm_syncobj_put(syncobj); return ret; } static int drm_syncobj_import_sync_file_fence(struct drm_file *file_private, int fd, int handle, u64 point) { struct dma_fence *fence = sync_file_get_fence(fd); struct drm_syncobj *syncobj; if (!fence) return -EINVAL; syncobj = drm_syncobj_find(file_private, handle); if (!syncobj) { dma_fence_put(fence); return -ENOENT; } if (point) { struct dma_fence_chain *chain = dma_fence_chain_alloc(); if (!chain) { dma_fence_put(fence); drm_syncobj_put(syncobj); return -ENOMEM; } drm_syncobj_add_point(syncobj,
chain, fence, point); } else { drm_syncobj_replace_fence(syncobj, fence); } dma_fence_put(fence); drm_syncobj_put(syncobj); return 0; } static int drm_syncobj_export_sync_file(struct drm_file *file_private, int handle, u64 point, int *p_fd) { int ret; struct dma_fence *fence; struct sync_file *sync_file; int fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) return fd; ret = drm_syncobj_find_fence(file_private, handle, point, 0, &fence); if (ret) goto err_put_fd; sync_file = sync_file_create(fence); dma_fence_put(fence); if (!sync_file) { ret = -EINVAL; goto err_put_fd; } fd_install(fd, sync_file->file); *p_fd = fd; return 0; err_put_fd: put_unused_fd(fd); return ret; } /** * drm_syncobj_open - initializes syncobj file-private structures at devnode open time * @file_private: drm file-private structure to set up * * Called at device open time, sets up the structure for handling refcounting * of sync objects. */ void drm_syncobj_open(struct drm_file *file_private) { idr_init_base(&file_private->syncobj_idr, 1); spin_lock_init(&file_private->syncobj_table_lock); } static int drm_syncobj_release_handle(int id, void *ptr, void *data) { struct drm_syncobj *syncobj = ptr; drm_syncobj_put(syncobj); return 0; } /** * drm_syncobj_release - release file-private sync object resources * @file_private: drm file-private structure to clean up * * Called at close time when the filp is going away. * * Releases any remaining references on objects by this filp. */ void drm_syncobj_release(struct drm_file *file_private) { idr_for_each(&file_private->syncobj_idr, &drm_syncobj_release_handle, file_private); idr_destroy(&file_private->syncobj_idr); } int drm_syncobj_create_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private) { struct drm_syncobj_create *args = data; if (!drm_core_check_feature(dev, DRIVER_SYNCOBJ)) return -EOPNOTSUPP; /* no valid flags yet */ if (args->flags & ~DRM_SYNCOBJ_CREATE_SIGNALED) return -EINVAL; return drm_syncobj_create_as_handle(file_private, &args->handle, args->flags); } int drm_syncobj_destroy_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private) { struct drm_syncobj_destroy *args = data; if (!drm_core_check_feature(dev, DRIVER_SYNCOBJ)) return -EOPNOTSUPP; /* make sure padding is empty */ if (args->pad) return -EINVAL; return drm_syncobj_destroy(file_private, args->handle); } int drm_syncobj_handle_to_fd_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private) { struct drm_syncobj_handle *args = data; unsigned int valid_flags = DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_TIMELINE | DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE; u64 point = 0; if (!drm_core_check_feature(dev, DRIVER_SYNCOBJ)) return -EOPNOTSUPP; if (args->pad) return -EINVAL; if (args->flags & ~valid_flags) return -EINVAL; if (args->flags & DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_TIMELINE) point = args->point; if (args->flags & DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE) return drm_syncobj_export_sync_file(file_private, args->handle, point, &args->fd); if (args->point) return -EINVAL; return drm_syncobj_handle_to_fd(file_private, args->handle, &args->fd); } int drm_syncobj_fd_to_handle_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private) { struct drm_syncobj_handle *args = data; unsigned int valid_flags = DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_TIMELINE | DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_IMPORT_SYNC_FILE; u64 point = 0; if (!drm_core_check_feature(dev, DRIVER_SYNCOBJ)) return -EOPNOTSUPP; if (args->pad) return -EINVAL; if (args->flags & ~valid_flags) return -EINVAL; if 
(args->flags & DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_TIMELINE) point = args->point; if (args->flags & DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_IMPORT_SYNC_FILE) return drm_syncobj_import_sync_file_fence(file_private, args->fd, args->handle, point); if (args->point) return -EINVAL; return drm_syncobj_fd_to_handle(file_private, args->fd, &args->handle); } static int drm_syncobj_transfer_to_timeline(struct drm_file *file_private, struct drm_syncobj_transfer *args) { struct drm_syncobj *timeline_syncobj = NULL; struct dma_fence *fence, *tmp; struct dma_fence_chain *chain; int ret; timeline_syncobj = drm_syncobj_find(file_private, args->dst_handle); if (!timeline_syncobj) { return -ENOENT; } ret = drm_syncobj_find_fence(file_private, args->src_handle, args->src_point, args->flags, &tmp); if (ret) goto err_put_timeline; fence = dma_fence_unwrap_merge(tmp); dma_fence_put(tmp); if (!fence) { ret = -ENOMEM; goto err_put_timeline; } chain = dma_fence_chain_alloc(); if (!chain) { ret = -ENOMEM; goto err_free_fence; } drm_syncobj_add_point(timeline_syncobj, chain, fence, args->dst_point); err_free_fence: dma_fence_put(fence); err_put_timeline: drm_syncobj_put(timeline_syncobj); return ret; } static int drm_syncobj_transfer_to_binary(struct drm_file *file_private, struct drm_syncobj_transfer *args) { struct drm_syncobj *binary_syncobj = NULL; struct dma_fence *fence; int ret; binary_syncobj = drm_syncobj_find(file_private, args->dst_handle); if (!binary_syncobj) return -ENOENT; ret = drm_syncobj_find_fence(file_private, args->src_handle, args->src_point, args->flags, &fence); if (ret) goto err; drm_syncobj_replace_fence(binary_syncobj, fence); dma_fence_put(fence); err: drm_syncobj_put(binary_syncobj); return ret; } int drm_syncobj_transfer_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private) { struct drm_syncobj_transfer *args = data; int ret; if (!drm_core_check_feature(dev, DRIVER_SYNCOBJ_TIMELINE)) return -EOPNOTSUPP; if (args->pad) return -EINVAL; if (args->dst_point) ret = drm_syncobj_transfer_to_timeline(file_private, args); else ret = drm_syncobj_transfer_to_binary(file_private, args); return ret; } static void syncobj_wait_fence_func(struct dma_fence *fence, struct dma_fence_cb *cb) { struct syncobj_wait_entry *wait = container_of(cb, struct syncobj_wait_entry, fence_cb); wake_up_process(wait->task); } static void syncobj_wait_syncobj_func(struct drm_syncobj *syncobj, struct syncobj_wait_entry *wait) { struct dma_fence *fence; /* This happens inside the syncobj lock */ fence = rcu_dereference_protected(syncobj->fence, lockdep_is_held(&syncobj->lock)); dma_fence_get(fence); if (!fence || dma_fence_chain_find_seqno(&fence, wait->point)) { dma_fence_put(fence); return; } else if (!fence) { wait->fence = dma_fence_get_stub(); } else { wait->fence = fence; } wake_up_process(wait->task); list_del_init(&wait->node); } static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs, void __user *user_points, uint32_t count, uint32_t flags, signed long timeout, uint32_t *idx, ktime_t *deadline) { struct syncobj_wait_entry *entries; struct dma_fence *fence; uint64_t *points; uint32_t signaled_count, i; if (flags & (DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT | DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE)) { might_sleep(); lockdep_assert_none_held_once(); } points = kmalloc_array(count, sizeof(*points), GFP_KERNEL); if (points == NULL) return -ENOMEM; if (!user_points) { memset(points, 0, count * sizeof(uint64_t)); } else if (copy_from_user(points, user_points, sizeof(uint64_t) * count)) { 
timeout = -EFAULT; goto err_free_points; } entries = kcalloc(count, sizeof(*entries), GFP_KERNEL); if (!entries) { timeout = -ENOMEM; goto err_free_points; } /* Walk the list of sync objects and initialize entries. We do * this up-front so that we can properly return -EINVAL if there is * a syncobj with a missing fence and then never have the chance of * returning -EINVAL again. */ signaled_count = 0; for (i = 0; i < count; ++i) { struct dma_fence *fence; entries[i].task = current; entries[i].point = points[i]; fence = drm_syncobj_fence_get(syncobjs[i]); if (!fence || dma_fence_chain_find_seqno(&fence, points[i])) { dma_fence_put(fence); if (flags & (DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT | DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE)) { continue; } else { timeout = -EINVAL; goto cleanup_entries; } } if (fence) entries[i].fence = fence; else entries[i].fence = dma_fence_get_stub(); if ((flags & DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE) || dma_fence_is_signaled(entries[i].fence)) { if (signaled_count == 0 && idx) *idx = i; signaled_count++; } } if (signaled_count == count || (signaled_count > 0 && !(flags & DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL))) goto cleanup_entries; /* There's a very annoying laxness in the dma_fence API here, in * that backends are not required to automatically report when a * fence is signaled prior to fence->ops->enable_signaling() being * called. So here if we fail to match signaled_count, we need to * fall through and try a 0 timeout wait! */ if (flags & (DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT | DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE)) { for (i = 0; i < count; ++i) drm_syncobj_fence_add_wait(syncobjs[i], &entries[i]); } if (deadline) { for (i = 0; i < count; ++i) { fence = entries[i].fence; if (!fence) continue; dma_fence_set_deadline(fence, *deadline); } } do { set_current_state(TASK_INTERRUPTIBLE); signaled_count = 0; for (i = 0; i < count; ++i) { fence = entries[i].fence; if (!fence) continue; if ((flags & DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE) || dma_fence_is_signaled(fence) || (!entries[i].fence_cb.func && dma_fence_add_callback(fence, &entries[i].fence_cb, syncobj_wait_fence_func))) { /* The fence has been signaled */ if (flags & DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL) { signaled_count++; } else { if (idx) *idx = i; goto done_waiting; } } } if (signaled_count == count) goto done_waiting; if (timeout == 0) { timeout = -ETIME; goto done_waiting; } if (signal_pending(current)) { timeout = -ERESTARTSYS; goto done_waiting; } timeout = schedule_timeout(timeout); } while (1); done_waiting: __set_current_state(TASK_RUNNING); cleanup_entries: for (i = 0; i < count; ++i) { drm_syncobj_remove_wait(syncobjs[i], &entries[i]); if (entries[i].fence_cb.func) dma_fence_remove_callback(entries[i].fence, &entries[i].fence_cb); dma_fence_put(entries[i].fence); } kfree(entries); err_free_points: kfree(points); return timeout; } /** * drm_timeout_abs_to_jiffies - calculate jiffies timeout from absolute value * * @timeout_nsec: timeout nsec component in ns, 0 for poll * * Calculate the timeout in jiffies from an absolute time in sec/nsec.
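 *
 * For example (a sketch, assuming HZ=1000 so one jiffy is 1ms): an absolute
 * timeout lying 2500000ns (2.5ms) in the future truncates to 2 jiffies via
 * nsecs_to_jiffies64() and is then bumped to 3 by the final "+ 1", so the
 * wait never expires early; an absolute time already in the past maps to 0,
 * i.e. poll.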
*/ signed long drm_timeout_abs_to_jiffies(int64_t timeout_nsec) { ktime_t abs_timeout, now; u64 timeout_ns, timeout_jiffies64; /* make a 0 timeout mean poll - absolute 0 doesn't seem valid */ if (timeout_nsec == 0) return 0; abs_timeout = ns_to_ktime(timeout_nsec); now = ktime_get(); if (!ktime_after(abs_timeout, now)) return 0; timeout_ns = ktime_to_ns(ktime_sub(abs_timeout, now)); timeout_jiffies64 = nsecs_to_jiffies64(timeout_ns); /* clamp timeout to avoid infinite timeout */ if (timeout_jiffies64 >= MAX_SCHEDULE_TIMEOUT - 1) return MAX_SCHEDULE_TIMEOUT - 1; return timeout_jiffies64 + 1; } EXPORT_SYMBOL(drm_timeout_abs_to_jiffies); static int drm_syncobj_array_wait(struct drm_device *dev, struct drm_file *file_private, struct drm_syncobj_wait *wait, struct drm_syncobj_timeline_wait *timeline_wait, struct drm_syncobj **syncobjs, bool timeline, ktime_t *deadline) { signed long timeout = 0; uint32_t first = ~0; if (!timeline) { timeout = drm_timeout_abs_to_jiffies(wait->timeout_nsec); timeout = drm_syncobj_array_wait_timeout(syncobjs, NULL, wait->count_handles, wait->flags, timeout, &first, deadline); if (timeout < 0) return timeout; wait->first_signaled = first; } else { timeout = drm_timeout_abs_to_jiffies(timeline_wait->timeout_nsec); timeout = drm_syncobj_array_wait_timeout(syncobjs, u64_to_user_ptr(timeline_wait->points), timeline_wait->count_handles, timeline_wait->flags, timeout, &first, deadline); if (timeout < 0) return timeout; timeline_wait->first_signaled = first; } return 0; } static int drm_syncobj_array_find(struct drm_file *file_private, void __user *user_handles, uint32_t count_handles, struct drm_syncobj ***syncobjs_out) { uint32_t i, *handles; struct drm_syncobj **syncobjs; int ret; handles = kmalloc_array(count_handles, sizeof(*handles), GFP_KERNEL); if (handles == NULL) return -ENOMEM; if (copy_from_user(handles, user_handles, sizeof(uint32_t) * count_handles)) { ret = -EFAULT; goto err_free_handles; } syncobjs = kmalloc_array(count_handles, sizeof(*syncobjs), GFP_KERNEL); if (syncobjs == NULL) { ret = -ENOMEM; goto err_free_handles; } for (i = 0; i < count_handles; i++) { syncobjs[i] = drm_syncobj_find(file_private, handles[i]); if (!syncobjs[i]) { ret = -ENOENT; goto err_put_syncobjs; } } kfree(handles); *syncobjs_out = syncobjs; return 0; err_put_syncobjs: while (i-- > 0) drm_syncobj_put(syncobjs[i]); kfree(syncobjs); err_free_handles: kfree(handles); return ret; } static void drm_syncobj_array_free(struct drm_syncobj **syncobjs, uint32_t count) { uint32_t i; for (i = 0; i < count; i++) drm_syncobj_put(syncobjs[i]); kfree(syncobjs); } int drm_syncobj_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private) { struct drm_syncobj_wait *args = data; struct drm_syncobj **syncobjs; unsigned int possible_flags; ktime_t t, *tp = NULL; int ret = 0; if (!drm_core_check_feature(dev, DRIVER_SYNCOBJ)) return -EOPNOTSUPP; possible_flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL | DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT | DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE; if (args->flags & ~possible_flags) return -EINVAL; if (args->count_handles == 0) return 0; ret = drm_syncobj_array_find(file_private, u64_to_user_ptr(args->handles), args->count_handles, &syncobjs); if (ret < 0) return ret; if (args->flags & DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE) { t = ns_to_ktime(args->deadline_nsec); tp = &t; } ret = drm_syncobj_array_wait(dev, file_private, args, NULL, syncobjs, false, tp); drm_syncobj_array_free(syncobjs, args->count_handles); return ret; } int
drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private) { struct drm_syncobj_timeline_wait *args = data; struct drm_syncobj **syncobjs; unsigned int possible_flags; ktime_t t, *tp = NULL; int ret = 0; if (!drm_core_check_feature(dev, DRIVER_SYNCOBJ_TIMELINE)) return -EOPNOTSUPP; possible_flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL | DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT | DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE | DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE; if (args->flags & ~possible_flags) return -EINVAL; if (args->count_handles == 0) return 0; ret = drm_syncobj_array_find(file_private, u64_to_user_ptr(args->handles), args->count_handles, &syncobjs); if (ret < 0) return ret; if (args->flags & DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE) { t = ns_to_ktime(args->deadline_nsec); tp = &t; } ret = drm_syncobj_array_wait(dev, file_private, NULL, args, syncobjs, true, tp); drm_syncobj_array_free(syncobjs, args->count_handles); return ret; } static void syncobj_eventfd_entry_fence_func(struct dma_fence *fence, struct dma_fence_cb *cb) { struct syncobj_eventfd_entry *entry = container_of(cb, struct syncobj_eventfd_entry, fence_cb); eventfd_signal(entry->ev_fd_ctx); syncobj_eventfd_entry_free(entry); } static void syncobj_eventfd_entry_func(struct drm_syncobj *syncobj, struct syncobj_eventfd_entry *entry) { int ret; struct dma_fence *fence; /* This happens inside the syncobj lock */ fence = dma_fence_get(rcu_dereference_protected(syncobj->fence, 1)); if (!fence) return; ret = dma_fence_chain_find_seqno(&fence, entry->point); if (ret != 0) { /* The given seqno has not been submitted yet. */ dma_fence_put(fence); return; } else if (!fence) { /* If dma_fence_chain_find_seqno returns 0 but sets the fence * to NULL, it implies that the given seqno is signaled and a * later seqno has already been submitted. Assign a stub fence * so that the eventfd still gets signaled below. 
*/ fence = dma_fence_get_stub(); } list_del_init(&entry->node); entry->fence = fence; if (entry->flags & DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE) { eventfd_signal(entry->ev_fd_ctx); syncobj_eventfd_entry_free(entry); } else { ret = dma_fence_add_callback(fence, &entry->fence_cb, syncobj_eventfd_entry_fence_func); if (ret == -ENOENT) { eventfd_signal(entry->ev_fd_ctx); syncobj_eventfd_entry_free(entry); } } } int drm_syncobj_eventfd_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private) { struct drm_syncobj_eventfd *args = data; struct drm_syncobj *syncobj; struct eventfd_ctx *ev_fd_ctx; struct syncobj_eventfd_entry *entry; int ret; if (!drm_core_check_feature(dev, DRIVER_SYNCOBJ_TIMELINE)) return -EOPNOTSUPP; if (args->flags & ~DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE) return -EINVAL; if (args->pad) return -EINVAL; syncobj = drm_syncobj_find(file_private, args->handle); if (!syncobj) return -ENOENT; ev_fd_ctx = eventfd_ctx_fdget(args->fd); if (IS_ERR(ev_fd_ctx)) { ret = PTR_ERR(ev_fd_ctx); goto err_fdget; } entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) { ret = -ENOMEM; goto err_kzalloc; } entry->syncobj = syncobj; entry->ev_fd_ctx = ev_fd_ctx; entry->point = args->point; entry->flags = args->flags; drm_syncobj_add_eventfd(syncobj, entry); drm_syncobj_put(syncobj); return 0; err_kzalloc: eventfd_ctx_put(ev_fd_ctx); err_fdget: drm_syncobj_put(syncobj); return ret; } int drm_syncobj_reset_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private) { struct drm_syncobj_array *args = data; struct drm_syncobj **syncobjs; uint32_t i; int ret; if (!drm_core_check_feature(dev, DRIVER_SYNCOBJ)) return -EOPNOTSUPP; if (args->pad != 0) return -EINVAL; if (args->count_handles == 0) return -EINVAL; ret = drm_syncobj_array_find(file_private, u64_to_user_ptr(args->handles), args->count_handles, &syncobjs); if (ret < 0) return ret; for (i = 0; i < args->count_handles; i++) drm_syncobj_replace_fence(syncobjs[i], NULL); drm_syncobj_array_free(syncobjs, args->count_handles); return 0; } int drm_syncobj_signal_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private) { struct drm_syncobj_array *args = data; struct drm_syncobj **syncobjs; uint32_t i; int ret; if (!drm_core_check_feature(dev, DRIVER_SYNCOBJ)) return -EOPNOTSUPP; if (args->pad != 0) return -EINVAL; if (args->count_handles == 0) return -EINVAL; ret = drm_syncobj_array_find(file_private, u64_to_user_ptr(args->handles), args->count_handles, &syncobjs); if (ret < 0) return ret; for (i = 0; i < args->count_handles; i++) { ret = drm_syncobj_assign_null_handle(syncobjs[i]); if (ret < 0) break; } drm_syncobj_array_free(syncobjs, args->count_handles); return ret; } int drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private) { struct drm_syncobj_timeline_array *args = data; struct drm_syncobj **syncobjs; struct dma_fence_chain **chains; uint64_t *points; uint32_t i, j; int ret; if (!drm_core_check_feature(dev, DRIVER_SYNCOBJ_TIMELINE)) return -EOPNOTSUPP; if (args->flags != 0) return -EINVAL; if (args->count_handles == 0) return -EINVAL; ret = drm_syncobj_array_find(file_private, u64_to_user_ptr(args->handles), args->count_handles, &syncobjs); if (ret < 0) return ret; points = kmalloc_array(args->count_handles, sizeof(*points), GFP_KERNEL); if (!points) { ret = -ENOMEM; goto out; } if (!u64_to_user_ptr(args->points)) { memset(points, 0, args->count_handles * sizeof(uint64_t)); } else if (copy_from_user(points, u64_to_user_ptr(args->points), sizeof(uint64_t) * 
args->count_handles)) { ret = -EFAULT; goto err_points; } chains = kmalloc_array(args->count_handles, sizeof(void *), GFP_KERNEL); if (!chains) { ret = -ENOMEM; goto err_points; } for (i = 0; i < args->count_handles; i++) { chains[i] = dma_fence_chain_alloc(); if (!chains[i]) { for (j = 0; j < i; j++) dma_fence_chain_free(chains[j]); ret = -ENOMEM; goto err_chains; } } for (i = 0; i < args->count_handles; i++) { struct dma_fence *fence = dma_fence_get_stub(); drm_syncobj_add_point(syncobjs[i], chains[i], fence, points[i]); dma_fence_put(fence); } err_chains: kfree(chains); err_points: kfree(points); out: drm_syncobj_array_free(syncobjs, args->count_handles); return ret; } int drm_syncobj_query_ioctl(struct drm_device *dev, void *data, struct drm_file *file_private) { struct drm_syncobj_timeline_array *args = data; struct drm_syncobj **syncobjs; uint64_t __user *points = u64_to_user_ptr(args->points); uint32_t i; int ret; if (!drm_core_check_feature(dev, DRIVER_SYNCOBJ_TIMELINE)) return -EOPNOTSUPP; if (args->flags & ~DRM_SYNCOBJ_QUERY_FLAGS_LAST_SUBMITTED) return -EINVAL; if (args->count_handles == 0) return -EINVAL; ret = drm_syncobj_array_find(file_private, u64_to_user_ptr(args->handles), args->count_handles, &syncobjs); if (ret < 0) return ret; for (i = 0; i < args->count_handles; i++) { struct dma_fence_chain *chain; struct dma_fence *fence; uint64_t point; fence = drm_syncobj_fence_get(syncobjs[i]); chain = to_dma_fence_chain(fence); if (chain) { struct dma_fence *iter, *last_signaled = dma_fence_get(fence); if (args->flags & DRM_SYNCOBJ_QUERY_FLAGS_LAST_SUBMITTED) { point = fence->seqno; } else { dma_fence_chain_for_each(iter, fence) { if (iter->context != fence->context) { dma_fence_put(iter); /* It is most likely that the timeline has * unordered points. */ break; } dma_fence_put(last_signaled); last_signaled = dma_fence_get(iter); } point = dma_fence_is_signaled(last_signaled) ? last_signaled->seqno : to_dma_fence_chain(last_signaled)->prev_seqno; } dma_fence_put(last_signaled); } else { point = 0; } dma_fence_put(fence); ret = copy_to_user(&points[i], &point, sizeof(uint64_t)); ret = ret ? -EFAULT : 0; if (ret) break; } drm_syncobj_array_free(syncobjs, args->count_handles); return ret; }
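/*
 * A minimal user-space sketch of the timeline ioctls above, via the libdrm
 * wrappers (assumes a DRM fd whose driver advertises DRIVER_SYNCOBJ_TIMELINE;
 * error handling elided):
 *
 *	#include <stdint.h>
 *	#include <xf86drm.h>
 *
 *	void timeline_example(int drm_fd)
 *	{
 *		uint32_t handle;
 *		uint64_t point = 1, value;
 *
 *		drmSyncobjCreate(drm_fd, 0, &handle);
 *		// attach an already signaled stub fence as point 1
 *		drmSyncobjTimelineSignal(drm_fd, &handle, &point, 1);
 *		// WAIT_FOR_SUBMIT tolerates not-yet-submitted points
 *		drmSyncobjTimelineWait(drm_fd, &handle, &point, 1, INT64_MAX,
 *				       DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT,
 *				       NULL);
 *		// query the last signaled point; reports >= 1 here
 *		drmSyncobjQuery(drm_fd, &handle, &value, 1);
 *		drmSyncobjDestroy(drm_fd, handle);
 *	}
 */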
#include <linux/bpf.h> #include <linux/btf.h> #include <linux/err.h> #include <linux/irq_work.h> #include <linux/slab.h> #include <linux/filter.h> #include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/wait.h> #include <linux/poll.h> #include <linux/kmemleak.h>
#include <uapi/linux/btf.h> #include <linux/btf_ids.h> #include <asm/rqspinlock.h> #define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE) /* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */ #define RINGBUF_PGOFF \ (offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT) /* consumer page and producer page */ #define RINGBUF_POS_PAGES 2 #define RINGBUF_NR_META_PAGES (RINGBUF_PGOFF + RINGBUF_POS_PAGES) #define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4) struct bpf_ringbuf { wait_queue_head_t waitq; struct irq_work work; u64 mask; struct page **pages; int nr_pages; rqspinlock_t spinlock ____cacheline_aligned_in_smp; /* For user-space producer ring buffers, an atomic_t busy bit is used * to synchronize access to the ring buffers in the kernel, rather than * the spinlock that is used for kernel-producer ring buffers. This is * done because the ring buffer must hold a lock across a BPF program's * callback: * * __bpf_user_ringbuf_peek() // lock acquired * -> program callback_fn() * -> __bpf_user_ringbuf_sample_release() // lock released * * It is unsafe and incorrect to hold an IRQ spinlock across what could * be a long execution window, so we instead simply disallow concurrent * access to the ring buffer by kernel consumers, and return -EBUSY from * __bpf_user_ringbuf_peek() if the busy bit is held by another task. */ atomic_t busy ____cacheline_aligned_in_smp; /* Consumer and producer counters are put into separate pages to * allow each position to be mapped with different permissions. * This prevents a user-space application from modifying the * position and ruining in-kernel tracking. The permissions of the * pages depend on who is producing samples: user-space or the * kernel. Note that the pending counter is placed in the same * page as the producer, so that it shares the same cache line. * * Kernel-producer * --------------- * The producer position and data pages are mapped as r/o in * userspace. For this approach, bits in the header of samples are * used to signal to user-space, and to other producers, whether a * sample is currently being written. * * User-space producer * ------------------- * Only the page containing the consumer position is mapped r/o in * user-space. User-space producers also use bits of the header to * communicate to the kernel, but the kernel must carefully check and * validate each sample to ensure that they're correctly formatted, and * fully contained within the ring buffer. 
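 *
 * A sketch of the user-space producer side of this protocol, using the
 * libbpf user ring buffer API that wraps it (names illustrative; error
 * handling elided):
 *
 *	struct user_ring_buffer *urb = user_ring_buffer__new(map_fd, NULL);
 *	struct my_msg *m = user_ring_buffer__reserve(urb, sizeof(*m));
 *
 *	if (m) {
 *		m->value = 42;
 *		user_ring_buffer__submit(urb, m); // clears the busy bit
 *	}
 *	user_ring_buffer__free(urb);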
*/ unsigned long consumer_pos __aligned(PAGE_SIZE); unsigned long producer_pos __aligned(PAGE_SIZE); unsigned long pending_pos; char data[] __aligned(PAGE_SIZE); }; struct bpf_ringbuf_map { struct bpf_map map; struct bpf_ringbuf *rb; }; /* 8-byte ring buffer record header structure */ struct bpf_ringbuf_hdr { u32 len; u32 pg_off; }; static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) { const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | __GFP_ZERO; int nr_meta_pages = RINGBUF_NR_META_PAGES; int nr_data_pages = data_sz >> PAGE_SHIFT; int nr_pages = nr_meta_pages + nr_data_pages; struct page **pages, *page; struct bpf_ringbuf *rb; size_t array_size; int i; /* Each data page is mapped twice to allow a "virtually" * contiguous read of samples wrapping around the end of the ring * buffer area: * ------------------------------------------------------ * | meta pages | real data pages | same data pages | * ------------------------------------------------------ * | | 1 2 3 4 5 6 7 8 9 | 1 2 3 4 5 6 7 8 9 | * ------------------------------------------------------ * | | TA DA | TA DA | * ------------------------------------------------------ * ^^^^^^^ * | * Here, there is no need to worry about special handling of wrapped-around * data due to the double-mapped data pages. This works both in the kernel and * when mmap()'ed in user-space, simplifying both kernel and * user-space implementations significantly. */ array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages); pages = bpf_map_area_alloc(array_size, numa_node); if (!pages) return NULL; for (i = 0; i < nr_pages; i++) { page = alloc_pages_node(numa_node, flags, 0); if (!page) { nr_pages = i; goto err_free_pages; } pages[i] = page; if (i >= nr_meta_pages) pages[nr_data_pages + i] = page; } rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages, VM_MAP | VM_USERMAP, PAGE_KERNEL); if (rb) { kmemleak_not_leak(pages); rb->pages = pages; rb->nr_pages = nr_pages; return rb; } err_free_pages: for (i = 0; i < nr_pages; i++) __free_page(pages[i]); bpf_map_area_free(pages); return NULL; } static void bpf_ringbuf_notify(struct irq_work *work) { struct bpf_ringbuf *rb = container_of(work, struct bpf_ringbuf, work); wake_up_all(&rb->waitq); } /* Maximum size of ring buffer area is limited by 32-bit page offset within * record header, counted in pages. Reserve 8 bits for extensibility, and * take into account a few extra pages for consumer/producer pages and * non-mmap()'able parts; the current maximum size would be: * * (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE) * * This gives a 64GB limit, which seems plenty for a single ring buffer. Now * considering that the maximum value of data_sz is (4GB - 1), there * will be no overflow, so just note the size limit in the comments.
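 *
 * Worked through (a sketch, assuming 4KB pages): 2^24 pages * 2^12
 * bytes/page = 2^36 bytes = 64GB; the few meta pages subtracted via
 * RINGBUF_POS_PAGES and RINGBUF_PGOFF are negligible against that bound,
 * and data_sz itself is already capped well below it at (4GB - 1).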
*/ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) { struct bpf_ringbuf *rb; rb = bpf_ringbuf_area_alloc(data_sz, numa_node); if (!rb) return NULL; raw_res_spin_lock_init(&rb->spinlock); atomic_set(&rb->busy, 0); init_waitqueue_head(&rb->waitq); init_irq_work(&rb->work, bpf_ringbuf_notify); rb->mask = data_sz - 1; rb->consumer_pos = 0; rb->producer_pos = 0; rb->pending_pos = 0; return rb; } static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) { struct bpf_ringbuf_map *rb_map; if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); if (attr->key_size || attr->value_size || !is_power_of_2(attr->max_entries) || !PAGE_ALIGNED(attr->max_entries)) return ERR_PTR(-EINVAL); rb_map = bpf_map_area_alloc(sizeof(*rb_map), NUMA_NO_NODE); if (!rb_map) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&rb_map->map, attr); rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node); if (!rb_map->rb) { bpf_map_area_free(rb_map); return ERR_PTR(-ENOMEM); } return &rb_map->map; } static void bpf_ringbuf_free(struct bpf_ringbuf *rb) { /* copy pages pointer and nr_pages to local variable, as we are going * to unmap rb itself with vunmap() below */ struct page **pages = rb->pages; int i, nr_pages = rb->nr_pages; vunmap(rb); for (i = 0; i < nr_pages; i++) __free_page(pages[i]); bpf_map_area_free(pages); } static void ringbuf_map_free(struct bpf_map *map) { struct bpf_ringbuf_map *rb_map; rb_map = container_of(map, struct bpf_ringbuf_map, map); bpf_ringbuf_free(rb_map->rb); bpf_map_area_free(rb_map); } static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key) { return ERR_PTR(-ENOTSUPP); } static long ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { return -ENOTSUPP; } static long ringbuf_map_delete_elem(struct bpf_map *map, void *key) { return -ENOTSUPP; } static int ringbuf_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { return -ENOTSUPP; } static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma) { struct bpf_ringbuf_map *rb_map; rb_map = container_of(map, struct bpf_ringbuf_map, map); if (vma->vm_flags & VM_WRITE) { /* allow writable mapping for the consumer_pos only */ if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE) return -EPERM; } /* remap_vmalloc_range() checks size and offset constraints */ return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); } static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma) { struct bpf_ringbuf_map *rb_map; rb_map = container_of(map, struct bpf_ringbuf_map, map); if (vma->vm_flags & VM_WRITE) { if (vma->vm_pgoff == 0) /* Disallow writable mappings to the consumer pointer, * and allow writable mappings to both the producer * position, and the ring buffer data itself. 
*/ return -EPERM; } /* remap_vmalloc_range() checks size and offset constraints */ return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); } static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) { unsigned long cons_pos, prod_pos; cons_pos = smp_load_acquire(&rb->consumer_pos); prod_pos = smp_load_acquire(&rb->producer_pos); return prod_pos - cons_pos; } static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb) { return rb->mask + 1; } static __poll_t ringbuf_map_poll_kern(struct bpf_map *map, struct file *filp, struct poll_table_struct *pts) { struct bpf_ringbuf_map *rb_map; rb_map = container_of(map, struct bpf_ringbuf_map, map); poll_wait(filp, &rb_map->rb->waitq, pts); if (ringbuf_avail_data_sz(rb_map->rb)) return EPOLLIN | EPOLLRDNORM; return 0; } static __poll_t ringbuf_map_poll_user(struct bpf_map *map, struct file *filp, struct poll_table_struct *pts) { struct bpf_ringbuf_map *rb_map; rb_map = container_of(map, struct bpf_ringbuf_map, map); poll_wait(filp, &rb_map->rb->waitq, pts); if (ringbuf_avail_data_sz(rb_map->rb) < ringbuf_total_data_sz(rb_map->rb)) return EPOLLOUT | EPOLLWRNORM; return 0; } static u64 ringbuf_map_mem_usage(const struct bpf_map *map) { struct bpf_ringbuf *rb; int nr_data_pages; int nr_meta_pages; u64 usage = sizeof(struct bpf_ringbuf_map); rb = container_of(map, struct bpf_ringbuf_map, map)->rb; usage += (u64)rb->nr_pages << PAGE_SHIFT; nr_meta_pages = RINGBUF_NR_META_PAGES; nr_data_pages = map->max_entries >> PAGE_SHIFT; usage += (nr_meta_pages + 2 * nr_data_pages) * sizeof(struct page *); return usage; } BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map) const struct bpf_map_ops ringbuf_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = ringbuf_map_alloc, .map_free = ringbuf_map_free, .map_mmap = ringbuf_map_mmap_kern, .map_poll = ringbuf_map_poll_kern, .map_lookup_elem = ringbuf_map_lookup_elem, .map_update_elem = ringbuf_map_update_elem, .map_delete_elem = ringbuf_map_delete_elem, .map_get_next_key = ringbuf_map_get_next_key, .map_mem_usage = ringbuf_map_mem_usage, .map_btf_id = &ringbuf_map_btf_ids[0], }; BTF_ID_LIST_SINGLE(user_ringbuf_map_btf_ids, struct, bpf_ringbuf_map) const struct bpf_map_ops user_ringbuf_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = ringbuf_map_alloc, .map_free = ringbuf_map_free, .map_mmap = ringbuf_map_mmap_user, .map_poll = ringbuf_map_poll_user, .map_lookup_elem = ringbuf_map_lookup_elem, .map_update_elem = ringbuf_map_update_elem, .map_delete_elem = ringbuf_map_delete_elem, .map_get_next_key = ringbuf_map_get_next_key, .map_mem_usage = ringbuf_map_mem_usage, .map_btf_id = &user_ringbuf_map_btf_ids[0], }; /* Given a pointer to ring buffer record metadata and the struct bpf_ringbuf itself, * calculate the offset from the record metadata to the ring buffer in pages, rounded * down. This page offset is stored as part of the record metadata and allows * the struct bpf_ringbuf * to be restored from a record pointer. This page offset is * stored at offset 4 of the record metadata header.
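 *
 * A worked example (sketch, assuming 4KB pages and a page-aligned rb): a
 * record header at rb + 0x6010 gets pg_off = 0x6010 >> 12 = 6; restoring
 * computes (addr & PAGE_MASK) - (6 << 12) = (rb + 0x6000) - 0x6000 = rb,
 * which is exact precisely because rb is page-aligned.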
*/ static size_t bpf_ringbuf_rec_pg_off(struct bpf_ringbuf *rb, struct bpf_ringbuf_hdr *hdr) { return ((void *)hdr - (void *)rb) >> PAGE_SHIFT; } /* Given a pointer to a ring buffer record header, restore the pointer to the struct * bpf_ringbuf itself by using the page offset stored at offset 4 */ static struct bpf_ringbuf * bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr) { unsigned long addr = (unsigned long)(void *)hdr; unsigned long off = (unsigned long)hdr->pg_off << PAGE_SHIFT; return (void *)((addr & PAGE_MASK) - off); } static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) { unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, flags; struct bpf_ringbuf_hdr *hdr; u32 len, pg_off, tmp_size, hdr_len; if (unlikely(size > RINGBUF_MAX_RECORD_SZ)) return NULL; len = round_up(size + BPF_RINGBUF_HDR_SZ, 8); if (len > ringbuf_total_data_sz(rb)) return NULL; cons_pos = smp_load_acquire(&rb->consumer_pos); if (raw_res_spin_lock_irqsave(&rb->spinlock, flags)) return NULL; pend_pos = rb->pending_pos; prod_pos = rb->producer_pos; new_prod_pos = prod_pos + len; while (pend_pos < prod_pos) { hdr = (void *)rb->data + (pend_pos & rb->mask); hdr_len = READ_ONCE(hdr->len); if (hdr_len & BPF_RINGBUF_BUSY_BIT) break; tmp_size = hdr_len & ~BPF_RINGBUF_DISCARD_BIT; tmp_size = round_up(tmp_size + BPF_RINGBUF_HDR_SZ, 8); pend_pos += tmp_size; } rb->pending_pos = pend_pos; /* check for out of ringbuf space: * - by ensuring the producer position doesn't advance more than * (ringbuf_size - 1) ahead * - by ensuring that the span from the oldest not-yet-committed * record to the newest record does not exceed (ringbuf_size - 1) */ if (new_prod_pos - cons_pos > rb->mask || new_prod_pos - pend_pos > rb->mask) { raw_res_spin_unlock_irqrestore(&rb->spinlock, flags); return NULL; } hdr = (void *)rb->data + (prod_pos & rb->mask); pg_off = bpf_ringbuf_rec_pg_off(rb, hdr); hdr->len = size | BPF_RINGBUF_BUSY_BIT; hdr->pg_off = pg_off; /* pairs with consumer's smp_load_acquire() */ smp_store_release(&rb->producer_pos, new_prod_pos); raw_res_spin_unlock_irqrestore(&rb->spinlock, flags); return (void *)hdr + BPF_RINGBUF_HDR_SZ; } BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags) { struct bpf_ringbuf_map *rb_map; if (unlikely(flags)) return 0; rb_map = container_of(map, struct bpf_ringbuf_map, map); return (unsigned long)__bpf_ringbuf_reserve(rb_map->rb, size); } const struct bpf_func_proto bpf_ringbuf_reserve_proto = { .func = bpf_ringbuf_reserve, .ret_type = RET_PTR_TO_RINGBUF_MEM_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard) { unsigned long rec_pos, cons_pos; struct bpf_ringbuf_hdr *hdr; struct bpf_ringbuf *rb; u32 new_len; hdr = sample - BPF_RINGBUF_HDR_SZ; rb = bpf_ringbuf_restore_from_rec(hdr); new_len = hdr->len ^ BPF_RINGBUF_BUSY_BIT; if (discard) new_len |= BPF_RINGBUF_DISCARD_BIT; /* update record header with correct final size prefix */ xchg(&hdr->len, new_len); /* if the consumer has caught up and is waiting for our record, notify it about new data availability */ rec_pos = (void *)hdr - (void *)rb->data; cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask; if (flags & BPF_RB_FORCE_WAKEUP) irq_work_queue(&rb->work); else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP)) irq_work_queue(&rb->work); } BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags) { bpf_ringbuf_commit(sample, flags, false /* discard */); return 0; } const struct bpf_func_proto
bpf_ringbuf_submit_proto = { .func = bpf_ringbuf_submit, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags) { bpf_ringbuf_commit(sample, flags, true /* discard */); return 0; } const struct bpf_func_proto bpf_ringbuf_discard_proto = { .func = bpf_ringbuf_discard, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_ringbuf_output, struct bpf_map *, map, void *, data, u64, size, u64, flags) { struct bpf_ringbuf_map *rb_map; void *rec; if (unlikely(flags & ~(BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP))) return -EINVAL; rb_map = container_of(map, struct bpf_ringbuf_map, map); rec = __bpf_ringbuf_reserve(rb_map->rb, size); if (!rec) return -EAGAIN; memcpy(rec, data, size); bpf_ringbuf_commit(rec, flags, false /* discard */); return 0; } const struct bpf_func_proto bpf_ringbuf_output_proto = { .func = bpf_ringbuf_output, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags) { struct bpf_ringbuf *rb; rb = container_of(map, struct bpf_ringbuf_map, map)->rb; switch (flags) { case BPF_RB_AVAIL_DATA: return ringbuf_avail_data_sz(rb); case BPF_RB_RING_SIZE: return ringbuf_total_data_sz(rb); case BPF_RB_CONS_POS: return smp_load_acquire(&rb->consumer_pos); case BPF_RB_PROD_POS: return smp_load_acquire(&rb->producer_pos); default: return 0; } } const struct bpf_func_proto bpf_ringbuf_query_proto = { .func = bpf_ringbuf_query, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_ringbuf_reserve_dynptr, struct bpf_map *, map, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr) { struct bpf_ringbuf_map *rb_map; void *sample; int err; if (unlikely(flags)) { bpf_dynptr_set_null(ptr); return -EINVAL; } err = bpf_dynptr_check_size(size); if (err) { bpf_dynptr_set_null(ptr); return err; } rb_map = container_of(map, struct bpf_ringbuf_map, map); sample = __bpf_ringbuf_reserve(rb_map->rb, size); if (!sample) { bpf_dynptr_set_null(ptr); return -EINVAL; } bpf_dynptr_init(ptr, sample, BPF_DYNPTR_TYPE_RINGBUF, 0, size); return 0; } const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = { .func = bpf_ringbuf_reserve_dynptr, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags) { if (!ptr->data) return 0; bpf_ringbuf_commit(ptr->data, flags, false /* discard */); bpf_dynptr_set_null(ptr); return 0; } const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto = { .func = bpf_ringbuf_submit_dynptr, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_ringbuf_discard_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags) { if (!ptr->data) return 0; bpf_ringbuf_commit(ptr->data, flags, true /* discard */); bpf_dynptr_set_null(ptr); return 0; } const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = { .func = bpf_ringbuf_discard_dynptr, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; static int __bpf_user_ringbuf_peek(struct bpf_ringbuf *rb, void 
**sample, u32 *size) { int err; u32 hdr_len, sample_len, total_len, flags, *hdr; u64 cons_pos, prod_pos; /* Synchronizes with smp_store_release() in user-space producer. */ prod_pos = smp_load_acquire(&rb->producer_pos); if (prod_pos % 8) return -EINVAL; /* Synchronizes with smp_store_release() in __bpf_user_ringbuf_sample_release() */ cons_pos = smp_load_acquire(&rb->consumer_pos); if (cons_pos >= prod_pos) return -ENODATA; hdr = (u32 *)((uintptr_t)rb->data + (uintptr_t)(cons_pos & rb->mask)); /* Synchronizes with smp_store_release() in user-space producer. */ hdr_len = smp_load_acquire(hdr); flags = hdr_len & (BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT); sample_len = hdr_len & ~flags; total_len = round_up(sample_len + BPF_RINGBUF_HDR_SZ, 8); /* The sample must fit within the region advertised by the producer position. */ if (total_len > prod_pos - cons_pos) return -EINVAL; /* The sample must fit within the data region of the ring buffer. */ if (total_len > ringbuf_total_data_sz(rb)) return -E2BIG; /* The sample must fit into a struct bpf_dynptr. */ err = bpf_dynptr_check_size(sample_len); if (err) return -E2BIG; if (flags & BPF_RINGBUF_DISCARD_BIT) { /* If the discard bit is set, the sample should be skipped. * * Update the consumer pos, and return -EAGAIN so the caller * knows to skip this sample and try to read the next one. */ smp_store_release(&rb->consumer_pos, cons_pos + total_len); return -EAGAIN; } if (flags & BPF_RINGBUF_BUSY_BIT) return -ENODATA; *sample = (void *)((uintptr_t)rb->data + (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask)); *size = sample_len; return 0; } static void __bpf_user_ringbuf_sample_release(struct bpf_ringbuf *rb, size_t size, u64 flags) { u64 consumer_pos; u32 rounded_size = round_up(size + BPF_RINGBUF_HDR_SZ, 8); /* Using smp_load_acquire() is unnecessary here, as the busy-bit * prevents another task from writing to consumer_pos after it was read * by this task with smp_load_acquire() in __bpf_user_ringbuf_peek(). */ consumer_pos = rb->consumer_pos; /* Synchronizes with smp_load_acquire() in user-space producer. */ smp_store_release(&rb->consumer_pos, consumer_pos + rounded_size); } BPF_CALL_4(bpf_user_ringbuf_drain, struct bpf_map *, map, void *, callback_fn, void *, callback_ctx, u64, flags) { struct bpf_ringbuf *rb; long samples, discarded_samples = 0, ret = 0; bpf_callback_t callback = (bpf_callback_t)callback_fn; u64 wakeup_flags = BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP; int busy = 0; if (unlikely(flags & ~wakeup_flags)) return -EINVAL; rb = container_of(map, struct bpf_ringbuf_map, map)->rb; /* If another consumer is already consuming a sample, wait for them to finish. */ if (!atomic_try_cmpxchg(&rb->busy, &busy, 1)) return -EBUSY; for (samples = 0; samples < BPF_MAX_USER_RINGBUF_SAMPLES && ret == 0; samples++) { int err; u32 size; void *sample; struct bpf_dynptr_kern dynptr; err = __bpf_user_ringbuf_peek(rb, &sample, &size); if (err) { if (err == -ENODATA) { break; } else if (err == -EAGAIN) { discarded_samples++; continue; } else { ret = err; goto schedule_work_return; } } bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size); ret = callback((uintptr_t)&dynptr, (uintptr_t)callback_ctx, 0, 0, 0); __bpf_user_ringbuf_sample_release(rb, size, flags); } ret = samples - discarded_samples; schedule_work_return: /* Prevent the clearing of the busy-bit from being reordered before the * storing of any rb consumer or producer positions. 
*/ atomic_set_release(&rb->busy, 0); if (flags & BPF_RB_FORCE_WAKEUP) irq_work_queue(&rb->work); else if (!(flags & BPF_RB_NO_WAKEUP) && samples > 0) irq_work_queue(&rb->work); return ret; } const struct bpf_func_proto bpf_user_ringbuf_drain_proto = { .func = bpf_user_ringbuf_drain, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_FUNC, .arg3_type = ARG_PTR_TO_STACK_OR_NULL, .arg4_type = ARG_ANYTHING, };
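/*
 * A minimal BPF-side producer sketch for the kernel-producer ring buffer
 * above (assumes a libbpf build environment; map, section and event names
 * are illustrative):
 *
 *	#include <linux/bpf.h>
 *	#include <bpf/bpf_helpers.h>
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_RINGBUF);
 *		__uint(max_entries, 256 * 1024); // power-of-2, page-aligned
 *	} rb SEC(".maps");
 *
 *	struct event { int pid; };
 *
 *	SEC("tracepoint/syscalls/sys_enter_execve")
 *	int handle_exec(void *ctx)
 *	{
 *		// NULL when the buffer is full, i.e. when
 *		// __bpf_ringbuf_reserve() above bails out
 *		struct event *e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
 *
 *		if (!e)
 *			return 0;
 *		e->pid = bpf_get_current_pid_tgid() >> 32;
 *		bpf_ringbuf_submit(e, 0); // clears BUSY, may wake consumers
 *		return 0;
 *	}
 *
 *	char LICENSE[] SEC("license") = "GPL";
 */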
// SPDX-License-Identifier: GPL-2.0 /* * This file contains the base functions to manage periodic tick * related events. * * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner */ #include <linux/compiler.h> #include <linux/cpu.h> #include <linux/err.h> #include <linux/hrtimer.h> #include <linux/interrupt.h> #include <linux/nmi.h> #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> #include <linux/module.h> #include <trace/events/power.h> #include <asm/irq_regs.h> #include "tick-internal.h" /* * Tick devices */ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); /* * Tick next event: keeps track of the tick time. It's updated by the * CPU which handles the tick and protected by jiffies_lock. There is * no requirement to write-hold the jiffies seqcount for it. */ ktime_t tick_next_period; /* * tick_do_timer_cpu is a timer core internal variable which holds the CPU NR * which is responsible for calling do_timer(), i.e. the timekeeping stuff. This * variable has two functions: * * 1) Prevent a thundering herd issue of a gazillion of CPUs trying to grab the * timekeeping lock all at once.
Only the CPU which is assigned to do the * update is handling it. * * 2) Hand off the duty in the NOHZ idle case by setting the value to * TICK_DO_TIMER_NONE, i.e. a non-existing CPU. So the next CPU which looks * at it will take over and keep the timekeeping alive. The handover * procedure also covers CPU hotplug. */ int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; #ifdef CONFIG_NO_HZ_FULL /* * tick_do_timer_boot_cpu indicates the boot CPU temporarily owns * tick_do_timer_cpu and it should be taken over by an eligible secondary * when one comes online. */ static int tick_do_timer_boot_cpu __read_mostly = -1; #endif /* * Debugging: see timer_list.c */ struct tick_device *tick_get_device(int cpu) { return &per_cpu(tick_cpu_device, cpu); } /** * tick_is_oneshot_available - check for a oneshot capable event device */ int tick_is_oneshot_available(void) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT)) return 0; if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) return 1; return tick_broadcast_oneshot_available(); } /* * Periodic tick */ static void tick_periodic(int cpu) { if (READ_ONCE(tick_do_timer_cpu) == cpu) { raw_spin_lock(&jiffies_lock); write_seqcount_begin(&jiffies_seq); /* Keep track of the next tick event */ tick_next_period = ktime_add_ns(tick_next_period, TICK_NSEC); do_timer(1); write_seqcount_end(&jiffies_seq); raw_spin_unlock(&jiffies_lock); update_wall_time(); } update_process_times(user_mode(get_irq_regs())); profile_tick(CPU_PROFILING); } /* * Event handler for periodic ticks */ void tick_handle_periodic(struct clock_event_device *dev) { int cpu = smp_processor_id(); ktime_t next = dev->next_event; tick_periodic(cpu); /* * The CPU might have transitioned to HIGHRES or NOHZ mode via * update_process_times() -> run_local_timers() -> * hrtimer_run_queues(). */ if (IS_ENABLED(CONFIG_TICK_ONESHOT) && dev->event_handler != tick_handle_periodic) return; if (!clockevent_state_oneshot(dev)) return; for (;;) { /* * Set up the next period for devices which do not have * periodic mode: */ next = ktime_add_ns(next, TICK_NSEC); if (!clockevents_program_event(dev, next, false)) return; /* * Have to be careful here. If we're in oneshot mode, * before we call tick_periodic() in a loop, we need * to be sure we're using a real hardware clocksource. * Otherwise we could get trapped in an infinite * loop, as the tick_periodic() increments jiffies, * which then will increment time, possibly causing * the loop to trigger again and again. */ if (timekeeping_valid_for_hres()) tick_periodic(cpu); } } /* * Setup the device for a periodic tick */ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) { tick_set_periodic_handler(dev, broadcast); /* Broadcast setup ?
*/ if (!tick_device_is_functional(dev)) return; if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && !tick_broadcast_oneshot_active()) { clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC); } else { unsigned int seq; ktime_t next; do { seq = read_seqcount_begin(&jiffies_seq); next = tick_next_period; } while (read_seqcount_retry(&jiffies_seq, seq)); clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); for (;;) { if (!clockevents_program_event(dev, next, false)) return; next = ktime_add_ns(next, TICK_NSEC); } } } /* * Setup the tick device */ static void tick_setup_device(struct tick_device *td, struct clock_event_device *newdev, int cpu, const struct cpumask *cpumask) { void (*handler)(struct clock_event_device *) = NULL; ktime_t next_event = 0; /* * First device setup ? */ if (!td->evtdev) { /* * If no cpu took the do_timer update, assign it to * this cpu: */ if (READ_ONCE(tick_do_timer_cpu) == TICK_DO_TIMER_BOOT) { WRITE_ONCE(tick_do_timer_cpu, cpu); tick_next_period = ktime_get(); #ifdef CONFIG_NO_HZ_FULL /* * The boot CPU may be nohz_full, in which case the * first housekeeping secondary will take do_timer() * from it. */ if (tick_nohz_full_cpu(cpu)) tick_do_timer_boot_cpu = cpu; } else if (tick_do_timer_boot_cpu != -1 && !tick_nohz_full_cpu(cpu)) { tick_do_timer_boot_cpu = -1; /* * The boot CPU will stay in periodic (NOHZ disabled) * mode until clocksource_done_booting() called after * smp_init() selects a high resolution clocksource and * timekeeping_notify() kicks the NOHZ stuff alive. * * So this WRITE_ONCE can only race with the READ_ONCE * check in tick_periodic() but this race is harmless. */ WRITE_ONCE(tick_do_timer_cpu, cpu); #endif } /* * Startup in periodic mode first. */ td->mode = TICKDEV_MODE_PERIODIC; } else { handler = td->evtdev->event_handler; next_event = td->evtdev->next_event; td->evtdev->event_handler = clockevents_handle_noop; } td->evtdev = newdev; /* * When the device is not per cpu, pin the interrupt to the * current cpu: */ if (!cpumask_equal(newdev->cpumask, cpumask)) irq_set_affinity(newdev->irq, cpumask); /* * When global broadcasting is active, check if the current * device is registered as a placeholder for broadcast mode. * This allows us to handle this x86 misfeature in a generic * way. This function also returns !=0 when we keep the * current active broadcast state for this CPU. 
*/ if (tick_device_uses_broadcast(newdev, cpu)) return; if (td->mode == TICKDEV_MODE_PERIODIC) tick_setup_periodic(newdev, 0); else tick_setup_oneshot(newdev, handler, next_event); } void tick_install_replacement(struct clock_event_device *newdev) { struct tick_device *td = this_cpu_ptr(&tick_cpu_device); int cpu = smp_processor_id(); clockevents_exchange_device(td->evtdev, newdev); tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) tick_oneshot_notify(); } static bool tick_check_percpu(struct clock_event_device *curdev, struct clock_event_device *newdev, int cpu) { if (!cpumask_test_cpu(cpu, newdev->cpumask)) return false; if (cpumask_equal(newdev->cpumask, cpumask_of(cpu))) return true; /* Check if irq affinity can be set */ if (newdev->irq >= 0 && !irq_can_set_affinity(newdev->irq)) return false; /* Prefer an existing cpu local device */ if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) return false; return true; } static bool tick_check_preferred(struct clock_event_device *curdev, struct clock_event_device *newdev) { /* Prefer oneshot capable device */ if (!(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) { if (curdev && (curdev->features & CLOCK_EVT_FEAT_ONESHOT)) return false; if (tick_oneshot_mode_active()) return false; } /* * Use the higher rated one, but prefer a CPU local device with a lower * rating than a non-CPU local device */ return !curdev || newdev->rating > curdev->rating || !cpumask_equal(curdev->cpumask, newdev->cpumask); } /* * Check whether the new device is a better fit than curdev. curdev * can be NULL! */ bool tick_check_replacement(struct clock_event_device *curdev, struct clock_event_device *newdev) { if (!tick_check_percpu(curdev, newdev, smp_processor_id())) return false; return tick_check_preferred(curdev, newdev); } /* * Check if the newly registered device should be used. Called with * clockevents_lock held and interrupts disabled. */ void tick_check_new_device(struct clock_event_device *newdev) { struct clock_event_device *curdev; struct tick_device *td; int cpu; cpu = smp_processor_id(); td = &per_cpu(tick_cpu_device, cpu); curdev = td->evtdev; if (!tick_check_replacement(curdev, newdev)) goto out_bc; if (!try_module_get(newdev->owner)) return; /* * Replace the possibly existing device with the new * device. If the current device is the broadcast device, do * not give it back to the clockevents layer! */ if (tick_is_broadcast_device(curdev)) { clockevents_shutdown(curdev); curdev = NULL; } clockevents_exchange_device(curdev, newdev); tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) tick_oneshot_notify(); return; out_bc: /* * Can the new device be used as a broadcast device? */ tick_install_broadcast_device(newdev, cpu); } /** * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode * @state: The target state (enter/exit) * * The system enters/leaves a state where affected devices might stop. * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. * * Called with interrupts disabled, so clockevents_lock is not * required here because the local clock event device cannot go away * under us.
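 *
 * A minimal usage sketch (editor's illustration, not part of this file):
 * an idle driver entering a power state that stops the CPU-local tick
 * device would typically reach this function through the
 * tick_broadcast_enter() / tick_broadcast_exit() wrappers, with
 * enter_deep_idle() below standing in for a hypothetical hardware idle
 * entry:
 *
 *	if (tick_broadcast_enter())
 *		return -EBUSY;		// CPU is needed for broadcast wakeups
 *	enter_deep_idle();
 *	tick_broadcast_exit();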
*/ int tick_broadcast_oneshot_control(enum tick_broadcast_state state) { struct tick_device *td = this_cpu_ptr(&tick_cpu_device); if (!(td->evtdev->features & CLOCK_EVT_FEAT_C3STOP)) return 0; return __tick_broadcast_oneshot_control(state); } EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control); #ifdef CONFIG_HOTPLUG_CPU void tick_assert_timekeeping_handover(void) { WARN_ON_ONCE(tick_do_timer_cpu == smp_processor_id()); } /* * Stop the tick and transfer the timekeeping job away from a dying cpu. */ int tick_cpu_dying(unsigned int dying_cpu) { /* * If the current CPU is the timekeeper, it's the only one that can * safely hand over its duty. Also all online CPUs are in stop * machine, guaranteed not to be idle, therefore there is no * concurrency and it's safe to pick any online successor. */ if (tick_do_timer_cpu == dying_cpu) tick_do_timer_cpu = cpumask_first(cpu_online_mask); /* Make sure the CPU won't try to retake the timekeeping duty */ tick_sched_timer_dying(dying_cpu); /* Remove CPU from timer broadcasting */ tick_offline_cpu(dying_cpu); return 0; } /* * Shutdown an event device on a given cpu: * * This is called on a live CPU, when a CPU is dead. So we cannot * access the hardware device itself. * We just set the mode and remove it from the lists. */ void tick_shutdown(unsigned int cpu) { struct tick_device *td = &per_cpu(tick_cpu_device, cpu); struct clock_event_device *dev = td->evtdev; td->mode = TICKDEV_MODE_PERIODIC; if (dev) { /* * Prevent the clock events layer from trying to call * the set mode function! */ clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); clockevents_exchange_device(dev, NULL); dev->event_handler = clockevents_handle_noop; td->evtdev = NULL; } } #endif /** * tick_suspend_local - Suspend the local tick device * * Called from the local cpu for freeze with interrupts disabled. * * No locks required. Nothing can change the per cpu device. */ void tick_suspend_local(void) { struct tick_device *td = this_cpu_ptr(&tick_cpu_device); clockevents_shutdown(td->evtdev); } /** * tick_resume_local - Resume the local tick device * * Called from the local CPU for unfreeze or XEN resume magic. * * No locks required. Nothing can change the per cpu device. */ void tick_resume_local(void) { struct tick_device *td = this_cpu_ptr(&tick_cpu_device); bool broadcast = tick_resume_check_broadcast(); clockevents_tick_resume(td->evtdev); if (!broadcast) { if (td->mode == TICKDEV_MODE_PERIODIC) tick_setup_periodic(td->evtdev, 0); else tick_resume_oneshot(); } /* * Ensure that hrtimers are up to date and the clockevents device * is reprogrammed correctly when high resolution timers are * enabled. */ hrtimers_resume_local(); } /** * tick_suspend - Suspend the tick and the broadcast device * * Called from syscore_suspend() via timekeeping_suspend with only one * CPU online and interrupts disabled or from tick_freeze() under * tick_freeze_lock. * * No locks required. Nothing can change the per cpu device. */ void tick_suspend(void) { tick_suspend_local(); tick_suspend_broadcast(); } /** * tick_resume - Resume the tick and the broadcast device * * Called from syscore_resume() via timekeeping_resume with only one * CPU online and interrupts disabled. * * No locks required. Nothing can change the per cpu device.
*/ void tick_resume(void) { tick_resume_broadcast(); tick_resume_local(); } #ifdef CONFIG_SUSPEND static DEFINE_RAW_SPINLOCK(tick_freeze_lock); static DEFINE_WAIT_OVERRIDE_MAP(tick_freeze_map, LD_WAIT_SLEEP); static unsigned int tick_freeze_depth; /** * tick_freeze - Suspend the local tick and (possibly) timekeeping. * * Check if this is the last online CPU executing the function and if so, * suspend timekeeping. Otherwise suspend the local tick. * * Call with interrupts disabled. Must be balanced with %tick_unfreeze(). * Interrupts must not be enabled before the subsequent %tick_unfreeze(). */ void tick_freeze(void) { raw_spin_lock(&tick_freeze_lock); tick_freeze_depth++; if (tick_freeze_depth == num_online_cpus()) { trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), true); /* * All other CPUs have their interrupts disabled and are * suspended to idle. Other tasks have been frozen so there * is no scheduling happening. This means that there is no * concurrency in the system at this point. Therefore it is * okay to acquire a sleeping lock on PREEMPT_RT, such as a * spinlock, because the lock cannot be held by other CPUs * or threads and acquiring it cannot block. * * Inform lockdep about the situation. */ lock_map_acquire_try(&tick_freeze_map); system_state = SYSTEM_SUSPEND; sched_clock_suspend(); timekeeping_suspend(); lock_map_release(&tick_freeze_map); } else { tick_suspend_local(); } raw_spin_unlock(&tick_freeze_lock); } /** * tick_unfreeze - Resume the local tick and (possibly) timekeeping. * * Check if this is the first CPU executing the function and if so, resume * timekeeping. Otherwise resume the local tick. * * Call with interrupts disabled. Must be balanced with %tick_freeze(). * Interrupts must not be enabled after the preceding %tick_freeze(). */ void tick_unfreeze(void) { raw_spin_lock(&tick_freeze_lock); if (tick_freeze_depth == num_online_cpus()) { /* * Similar to tick_freeze(). On resumption the first CPU may * acquire uncontended sleeping locks while other CPUs block on * tick_freeze_lock. */ lock_map_acquire_try(&tick_freeze_map); timekeeping_resume(); sched_clock_resume(); lock_map_release(&tick_freeze_map); system_state = SYSTEM_RUNNING; trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), false); } else { touch_softlockup_watchdog(); tick_resume_local(); } tick_freeze_depth--; raw_spin_unlock(&tick_freeze_lock); } #endif /* CONFIG_SUSPEND */ /** * tick_init - initialize the tick control */ void __init tick_init(void) { tick_broadcast_init(); tick_nohz_init(); }
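/*
 * Editor's illustrative sketch (an addition, not part of the kernel sources
 * above): a standalone user-space model of the for (;;) reprogramming loop
 * in tick_handle_periodic(). now_ns and program() are assumptions standing
 * in for the clock and for clockevents_program_event(), which fails when
 * asked to program an expiry that is not in the future; the loop therefore
 * keeps adding TICK_NSEC until the event lands ahead of "now".
 */
#include <stdint.h>
#include <stdio.h>

#define TICK_NSEC 1000000ULL			/* assumed 1 ms tick period */

static uint64_t now_ns = 5500000;		/* pretend "current time" */

/* Stand-in for clockevents_program_event(): -1 if expiry not in the future. */
static int program(uint64_t expires)
{
	return expires > now_ns ? 0 : -1;
}

int main(void)
{
	uint64_t next = 1000000;	/* stale next_event, far in the past */
	int retries = 0;

	while (program(next)) {		/* mirrors the kernel's for (;;) loop */
		next += TICK_NSEC;
		retries++;
	}
	printf("programmed expiry %llu ns after %d retries\n",
	       (unsigned long long)next, retries);
	return 0;
}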
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include "timers.h" #include "device.h" #include "peer.h" #include "queueing.h" #include "socket.h" /* * - Timer for retransmitting the handshake if we don't hear back after * `REKEY_TIMEOUT + jitter` ms. * * - Timer for sending an empty packet if we have received a packet but then * have not sent one for `KEEPALIVE_TIMEOUT` ms. * * - Timer for initiating a new handshake if we have sent a packet but then * have not received one (even an empty one) for `(KEEPALIVE_TIMEOUT + * REKEY_TIMEOUT) + jitter` ms. * * - Timer for zeroing out all ephemeral keys after `(REJECT_AFTER_TIME * 3)` ms * if no new keys have been received. * * - Timer for, if enabled, sending an empty authenticated packet every user- * specified number of seconds. */ static inline void mod_peer_timer(struct wg_peer *peer, struct timer_list *timer, unsigned long expires) { rcu_read_lock_bh(); if (likely(netif_running(peer->device->dev) && !READ_ONCE(peer->is_dead))) mod_timer(timer, expires); rcu_read_unlock_bh(); } static void wg_expired_retransmit_handshake(struct timer_list *timer) { struct wg_peer *peer = timer_container_of(peer, timer, timer_retransmit_handshake); if (peer->timer_handshake_attempts > MAX_TIMER_HANDSHAKES) { pr_debug("%s: Handshake for peer %llu (%pISpfsc) did not complete after %d attempts, giving up\n", peer->device->dev->name, peer->internal_id, &peer->endpoint.addr, (int)MAX_TIMER_HANDSHAKES + 2); timer_delete(&peer->timer_send_keepalive); /* We drop all packets without a keypair and don't try again if we * have been trying unsuccessfully for too long to make a handshake. */ wg_packet_purge_staged_packets(peer); /* We set a timer for destroying any residue that might be left * of a partial exchange. */ if (!timer_pending(&peer->timer_zero_key_material)) mod_peer_timer(peer, &peer->timer_zero_key_material, jiffies + REJECT_AFTER_TIME * 3 * HZ); } else { ++peer->timer_handshake_attempts; pr_debug("%s: Handshake for peer %llu (%pISpfsc) did not complete after %d seconds, retrying (try %d)\n", peer->device->dev->name, peer->internal_id, &peer->endpoint.addr, (int)REKEY_TIMEOUT, peer->timer_handshake_attempts + 1); /* We clear the endpoint src address, in case this is * the cause of trouble.
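* (As an assumed illustration: a multi-homed host whose cached source address went stale after a route change would keep missing handshake replies; clearing the src lets the next transmit re-derive a working one.)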
*/ wg_socket_clear_peer_endpoint_src(peer); wg_packet_send_queued_handshake_initiation(peer, true); } } static void wg_expired_send_keepalive(struct timer_list *timer) { struct wg_peer *peer = timer_container_of(peer, timer, timer_send_keepalive); wg_packet_send_keepalive(peer); if (peer->timer_need_another_keepalive) { peer->timer_need_another_keepalive = false; mod_peer_timer(peer, &peer->timer_send_keepalive, jiffies + KEEPALIVE_TIMEOUT * HZ); } } static void wg_expired_new_handshake(struct timer_list *timer) { struct wg_peer *peer = timer_container_of(peer, timer, timer_new_handshake); pr_debug("%s: Retrying handshake with peer %llu (%pISpfsc) because we stopped hearing back after %d seconds\n", peer->device->dev->name, peer->internal_id, &peer->endpoint.addr, (int)(KEEPALIVE_TIMEOUT + REKEY_TIMEOUT)); /* We clear the endpoint src address, in case this is the cause * of trouble. */ wg_socket_clear_peer_endpoint_src(peer); wg_packet_send_queued_handshake_initiation(peer, false); } static void wg_expired_zero_key_material(struct timer_list *timer) { struct wg_peer *peer = timer_container_of(peer, timer, timer_zero_key_material); rcu_read_lock_bh(); if (!READ_ONCE(peer->is_dead)) { wg_peer_get(peer); if (!queue_work(peer->device->handshake_send_wq, &peer->clear_peer_work)) /* If the work was already on the queue, we want to drop * the extra reference. */ wg_peer_put(peer); } rcu_read_unlock_bh(); } static void wg_queued_expired_zero_key_material(struct work_struct *work) { struct wg_peer *peer = container_of(work, struct wg_peer, clear_peer_work); pr_debug("%s: Zeroing out all keys for peer %llu (%pISpfsc), since we haven't received a new one in %d seconds\n", peer->device->dev->name, peer->internal_id, &peer->endpoint.addr, (int)REJECT_AFTER_TIME * 3); wg_noise_handshake_clear(&peer->handshake); wg_noise_keypairs_clear(&peer->keypairs); wg_peer_put(peer); } static void wg_expired_send_persistent_keepalive(struct timer_list *timer) { struct wg_peer *peer = timer_container_of(peer, timer, timer_persistent_keepalive); if (likely(peer->persistent_keepalive_interval)) wg_packet_send_keepalive(peer); } /* Should be called after an authenticated data packet is sent. */ void wg_timers_data_sent(struct wg_peer *peer) { if (!timer_pending(&peer->timer_new_handshake)) mod_peer_timer(peer, &peer->timer_new_handshake, jiffies + (KEEPALIVE_TIMEOUT + REKEY_TIMEOUT) * HZ + get_random_u32_below(REKEY_TIMEOUT_JITTER_MAX_JIFFIES)); } /* Should be called after an authenticated data packet is received. */ void wg_timers_data_received(struct wg_peer *peer) { if (likely(netif_running(peer->device->dev))) { if (!timer_pending(&peer->timer_send_keepalive)) mod_peer_timer(peer, &peer->timer_send_keepalive, jiffies + KEEPALIVE_TIMEOUT * HZ); else peer->timer_need_another_keepalive = true; } } /* Should be called after any type of authenticated packet is sent, whether * keepalive, data, or handshake. */ void wg_timers_any_authenticated_packet_sent(struct wg_peer *peer) { timer_delete(&peer->timer_send_keepalive); } /* Should be called after any type of authenticated packet is received, whether * keepalive, data, or handshake. */ void wg_timers_any_authenticated_packet_received(struct wg_peer *peer) { timer_delete(&peer->timer_new_handshake); } /* Should be called after a handshake initiation message is sent.
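* As a worked example, assuming the protocol constants REKEY_TIMEOUT = 5 s and jitter of up to 1/3 s: an initiation sent at t = 0 arms timer_retransmit_handshake to fire somewhere in [5 s, 5.33 s); if no response has arrived by then, wg_expired_retransmit_handshake() resends and re-arms, giving up after MAX_TIMER_HANDSHAKES attempts.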
*/ void wg_timers_handshake_initiated(struct wg_peer *peer) { mod_peer_timer(peer, &peer->timer_retransmit_handshake, jiffies + REKEY_TIMEOUT * HZ + get_random_u32_below(REKEY_TIMEOUT_JITTER_MAX_JIFFIES)); } /* Should be called after a handshake response message is received and processed * or when getting key confirmation via the first data message. */ void wg_timers_handshake_complete(struct wg_peer *peer) { timer_delete(&peer->timer_retransmit_handshake); peer->timer_handshake_attempts = 0; peer->sent_lastminute_handshake = false; ktime_get_real_ts64(&peer->walltime_last_handshake); } /* Should be called after an ephemeral key is created, which is before sending a * handshake response or after receiving a handshake response. */ void wg_timers_session_derived(struct wg_peer *peer) { mod_peer_timer(peer, &peer->timer_zero_key_material, jiffies + REJECT_AFTER_TIME * 3 * HZ); } /* Should be called before a packet with authentication, whether * keepalive, data, or handshake, is sent, or after one is received. */ void wg_timers_any_authenticated_packet_traversal(struct wg_peer *peer) { if (peer->persistent_keepalive_interval) mod_peer_timer(peer, &peer->timer_persistent_keepalive, jiffies + peer->persistent_keepalive_interval * HZ); } void wg_timers_init(struct wg_peer *peer) { timer_setup(&peer->timer_retransmit_handshake, wg_expired_retransmit_handshake, 0); timer_setup(&peer->timer_send_keepalive, wg_expired_send_keepalive, 0); timer_setup(&peer->timer_new_handshake, wg_expired_new_handshake, 0); timer_setup(&peer->timer_zero_key_material, wg_expired_zero_key_material, 0); timer_setup(&peer->timer_persistent_keepalive, wg_expired_send_persistent_keepalive, 0); INIT_WORK(&peer->clear_peer_work, wg_queued_expired_zero_key_material); peer->timer_handshake_attempts = 0; peer->sent_lastminute_handshake = false; peer->timer_need_another_keepalive = false; } void wg_timers_stop(struct wg_peer *peer) { timer_delete_sync(&peer->timer_retransmit_handshake); timer_delete_sync(&peer->timer_send_keepalive); timer_delete_sync(&peer->timer_new_handshake); timer_delete_sync(&peer->timer_zero_key_material); timer_delete_sync(&peer->timer_persistent_keepalive); flush_work(&peer->clear_peer_work); }
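/*
 * Editor's illustrative sketch (an addition, not part of the WireGuard
 * sources above): a minimal user-space model of the passive-keepalive rule
 * implemented by wg_timers_data_received() and wg_expired_send_keepalive().
 * While the timer is pending, further received data only sets
 * need_another_keepalive, so at most one extra keepalive is queued instead
 * of re-arming the timer on every packet.
 */
#include <stdbool.h>
#include <stdio.h>

struct peer_model {
	bool keepalive_pending;		/* models timer_pending(&timer_send_keepalive) */
	bool need_another_keepalive;
};

/* Mirrors wg_timers_data_received(): arm the timer once, then only flag. */
static void data_received(struct peer_model *p)
{
	if (!p->keepalive_pending)
		p->keepalive_pending = true;	/* mod_peer_timer(...) */
	else
		p->need_another_keepalive = true;
}

/* Mirrors wg_expired_send_keepalive(): send, then re-arm at most once. */
static void keepalive_expired(struct peer_model *p)
{
	p->keepalive_pending = false;
	printf("send keepalive\n");
	if (p->need_another_keepalive) {
		p->need_another_keepalive = false;
		p->keepalive_pending = true;	/* one re-arm, not one per packet */
	}
}

int main(void)
{
	struct peer_model p = { false, false };

	data_received(&p);	/* arms the keepalive timer */
	data_received(&p);	/* timer already pending: flag only */
	data_received(&p);	/* still only the flag */
	keepalive_expired(&p);	/* sends, re-arms exactly once */
	keepalive_expired(&p);	/* sends the single queued keepalive */
	printf("pending=%d\n", (int)p.keepalive_pending);
	return 0;
}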
// SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2009 Red Hat, Inc. * Author: Michael S. Tsirkin <mst@redhat.com> * * virtio-net server in host kernel. */ #include <linux/compat.h> #include <linux/eventfd.h> #include <linux/vhost.h> #include <linux/virtio_net.h> #include <linux/miscdevice.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/mutex.h> #include <linux/workqueue.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/sched/clock.h> #include <linux/sched/signal.h> #include <linux/vmalloc.h> #include <linux/net.h> #include <linux/if_packet.h> #include <linux/if_arp.h> #include <linux/if_tun.h> #include <linux/if_macvlan.h> #include <linux/if_tap.h> #include <linux/if_vlan.h> #include <linux/skb_array.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/xdp.h> #include "vhost.h" static int experimental_zcopytx = 0; module_param(experimental_zcopytx, int, 0444); MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;" " 1 - Enable; 0 - Disable"); /* Max number of bytes transferred before requeueing the job. * Using this limit prevents one virtqueue from starving others. */ #define VHOST_NET_WEIGHT 0x80000 /* Max number of packets transferred before requeueing the job. * Using this limit prevents one virtqueue from starving others with small * pkts. */ #define VHOST_NET_PKT_WEIGHT 256 /* MAX number of TX used buffers for outstanding zerocopy */ #define VHOST_MAX_PEND 128 #define VHOST_GOODCOPY_LEN 256 /* * For transmit, used buffer len is unused; we override it to track buffer * status internally; used for zerocopy tx only.
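* An illustrative walk-through of the encoding (values defined just below): the TX path seeds heads[i].len with VHOST_DMA_IN_PROGRESS (1) when a zerocopy buffer is handed to the lower device, the completion callback overwrites it with VHOST_DMA_DONE_LEN (2) on success or VHOST_DMA_FAILED_LEN (3) on failure, VHOST_DMA_IS_DONE() then treats any value >= 2 as finished, and the slot is recycled back to VHOST_DMA_CLEAR_LEN (0).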
*/ /* Lower device DMA failed */ #define VHOST_DMA_FAILED_LEN ((__force __virtio32)3) /* Lower device DMA done */ #define VHOST_DMA_DONE_LEN ((__force __virtio32)2) /* Lower device DMA in progress */ #define VHOST_DMA_IN_PROGRESS ((__force __virtio32)1) /* Buffer unused */ #define VHOST_DMA_CLEAR_LEN ((__force __virtio32)0) #define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force u32)VHOST_DMA_DONE_LEN) static const u64 vhost_net_features[VIRTIO_FEATURES_DWORDS] = { VHOST_FEATURES | (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) | (1ULL << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_ACCESS_PLATFORM) | (1ULL << VIRTIO_F_RING_RESET) | (1ULL << VIRTIO_F_IN_ORDER), VIRTIO_BIT(VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO) | VIRTIO_BIT(VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO), }; enum { VHOST_NET_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2) }; enum { VHOST_NET_VQ_RX = 0, VHOST_NET_VQ_TX = 1, VHOST_NET_VQ_MAX = 2, }; struct vhost_net_ubuf_ref { /* refcount follows semantics similar to kref: * 0: object is released * 1: no outstanding ubufs * >1: outstanding ubufs */ atomic_t refcount; wait_queue_head_t wait; struct vhost_virtqueue *vq; }; #define VHOST_NET_BATCH 64 struct vhost_net_buf { void **queue; int tail; int head; }; struct vhost_net_virtqueue { struct vhost_virtqueue vq; size_t vhost_hlen; size_t sock_hlen; /* vhost zerocopy support fields below: */ /* last used idx for outstanding DMA zerocopy buffers */ int upend_idx; /* For TX, first used idx for DMA done zerocopy buffers * For RX, number of batched heads */ int done_idx; /* Number of XDP frames batched */ int batched_xdp; /* an array of userspace buffers info */ struct ubuf_info_msgzc *ubuf_info; /* Reference counting for outstanding ubufs. * Protected by vq mutex. Writers must also take device mutex. */ struct vhost_net_ubuf_ref *ubufs; struct ptr_ring *rx_ring; struct vhost_net_buf rxq; /* Batched XDP buffs */ struct xdp_buff *xdp; }; struct vhost_net { struct vhost_dev dev; struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX]; struct vhost_poll poll[VHOST_NET_VQ_MAX]; /* Number of TX recently submitted. * Protected by tx vq lock. */ unsigned tx_packets; /* Number of times zerocopy TX recently failed. * Protected by tx vq lock. */ unsigned tx_zcopy_err; /* Flush in progress. Protected by tx vq lock. 
*/ bool tx_flush; /* Private page frag cache */ struct page_frag_cache pf_cache; }; static unsigned vhost_net_zcopy_mask __read_mostly; static void *vhost_net_buf_get_ptr(struct vhost_net_buf *rxq) { if (rxq->tail != rxq->head) return rxq->queue[rxq->head]; else return NULL; } static int vhost_net_buf_get_size(struct vhost_net_buf *rxq) { return rxq->tail - rxq->head; } static int vhost_net_buf_is_empty(struct vhost_net_buf *rxq) { return rxq->tail == rxq->head; } static void *vhost_net_buf_consume(struct vhost_net_buf *rxq) { void *ret = vhost_net_buf_get_ptr(rxq); ++rxq->head; return ret; } static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq) { struct vhost_net_buf *rxq = &nvq->rxq; rxq->head = 0; rxq->tail = ptr_ring_consume_batched(nvq->rx_ring, rxq->queue, VHOST_NET_BATCH); return rxq->tail; } static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq) { struct vhost_net_buf *rxq = &nvq->rxq; if (nvq->rx_ring && !vhost_net_buf_is_empty(rxq)) { ptr_ring_unconsume(nvq->rx_ring, rxq->queue + rxq->head, vhost_net_buf_get_size(rxq), tun_ptr_free); rxq->head = rxq->tail = 0; } } static int vhost_net_buf_peek_len(void *ptr) { if (tun_is_xdp_frame(ptr)) { struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); return xdpf->len; } return __skb_array_len_with_tag(ptr); } static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq) { struct vhost_net_buf *rxq = &nvq->rxq; if (!vhost_net_buf_is_empty(rxq)) goto out; if (!vhost_net_buf_produce(nvq)) return 0; out: return vhost_net_buf_peek_len(vhost_net_buf_get_ptr(rxq)); } static void vhost_net_buf_init(struct vhost_net_buf *rxq) { rxq->head = rxq->tail = 0; } static void vhost_net_enable_zcopy(int vq) { vhost_net_zcopy_mask |= 0x1 << vq; } static struct vhost_net_ubuf_ref * vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy) { struct vhost_net_ubuf_ref *ubufs; /* No zero copy backend? Nothing to count. 
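* Refcount walk-through (cf. the struct vhost_net_ubuf_ref comment above): allocation sets the count to 1, each in-flight zerocopy packet adds one, each completion drops one, and vhost_net_ubuf_put_and_wait() drops the initial reference and then waits for the count to reach 0, i.e. for every outstanding lower-device DMA to finish.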
*/ if (!zcopy) return NULL; ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL); if (!ubufs) return ERR_PTR(-ENOMEM); atomic_set(&ubufs->refcount, 1); init_waitqueue_head(&ubufs->wait); ubufs->vq = vq; return ubufs; } static int vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs) { int r = atomic_sub_return(1, &ubufs->refcount); if (unlikely(!r)) wake_up(&ubufs->wait); return r; } static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs) { vhost_net_ubuf_put(ubufs); wait_event(ubufs->wait, !atomic_read(&ubufs->refcount)); } static void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs) { vhost_net_ubuf_put_and_wait(ubufs); kfree(ubufs); } static void vhost_net_clear_ubuf_info(struct vhost_net *n) { int i; for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { kfree(n->vqs[i].ubuf_info); n->vqs[i].ubuf_info = NULL; } } static int vhost_net_set_ubuf_info(struct vhost_net *n) { bool zcopy; int i; for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { zcopy = vhost_net_zcopy_mask & (0x1 << i); if (!zcopy) continue; n->vqs[i].ubuf_info = kmalloc_array(UIO_MAXIOV, sizeof(*n->vqs[i].ubuf_info), GFP_KERNEL); if (!n->vqs[i].ubuf_info) goto err; } return 0; err: vhost_net_clear_ubuf_info(n); return -ENOMEM; } static void vhost_net_vq_reset(struct vhost_net *n) { int i; vhost_net_clear_ubuf_info(n); for (i = 0; i < VHOST_NET_VQ_MAX; i++) { n->vqs[i].done_idx = 0; n->vqs[i].upend_idx = 0; n->vqs[i].ubufs = NULL; n->vqs[i].vhost_hlen = 0; n->vqs[i].sock_hlen = 0; vhost_net_buf_init(&n->vqs[i].rxq); } } static void vhost_net_tx_packet(struct vhost_net *net) { ++net->tx_packets; if (net->tx_packets < 1024) return; net->tx_packets = 0; net->tx_zcopy_err = 0; } static void vhost_net_tx_err(struct vhost_net *net) { ++net->tx_zcopy_err; } static bool vhost_net_tx_select_zcopy(struct vhost_net *net) { /* TX flush waits for outstanding DMAs to be done. * Don't start new DMAs. */ return !net->tx_flush && net->tx_packets / 64 >= net->tx_zcopy_err; } static bool vhost_sock_zcopy(struct socket *sock) { return unlikely(experimental_zcopytx) && sock_flag(sock->sk, SOCK_ZEROCOPY); } static bool vhost_sock_xdp(struct socket *sock) { return sock_flag(sock->sk, SOCK_XDP); } /* In case of DMA done not in order in lower device driver for some reason. * upend_idx is used to track end of used idx, done_idx is used to track head * of used idx. Once lower device DMA done contiguously, we will signal KVM * guest used idx. */ static void vhost_zerocopy_signal_used(struct vhost_net *net, struct vhost_virtqueue *vq) { struct vhost_net_virtqueue *nvq = container_of(vq, struct vhost_net_virtqueue, vq); int i, add; int j = 0; for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) { if (vq->heads[i].len == VHOST_DMA_FAILED_LEN) vhost_net_tx_err(net); if (VHOST_DMA_IS_DONE(vq->heads[i].len)) { vq->heads[i].len = VHOST_DMA_CLEAR_LEN; ++j; } else break; } while (j) { add = min(UIO_MAXIOV - nvq->done_idx, j); vhost_add_used_and_signal_n(vq->dev, vq, &vq->heads[nvq->done_idx], NULL, add); nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV; j -= add; } } static void vhost_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *ubuf_base, bool success) { struct ubuf_info_msgzc *ubuf = uarg_to_msgzc(ubuf_base); struct vhost_net_ubuf_ref *ubufs = ubuf->ctx; struct vhost_virtqueue *vq = ubufs->vq; int cnt; rcu_read_lock_bh(); /* set len to mark this desc buffers done DMA */ vq->heads[ubuf->desc].len = success ? 
VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN; cnt = vhost_net_ubuf_put(ubufs); /* * Trigger polling thread if guest stopped submitting new buffers: * in this case, the refcount after decrement will eventually reach 1. * We also trigger polling periodically after each 16 packets * (the value 16 here is more or less arbitrary, it's tuned to trigger * less than 10% of times). */ if (cnt <= 1 || !(cnt % 16)) vhost_poll_queue(&vq->poll); rcu_read_unlock_bh(); } static const struct ubuf_info_ops vhost_ubuf_ops = { .complete = vhost_zerocopy_complete, }; static inline unsigned long busy_clock(void) { return local_clock() >> 10; } static bool vhost_can_busy_poll(unsigned long endtime) { return likely(!need_resched() && !time_after(busy_clock(), endtime) && !signal_pending(current)); } static void vhost_net_disable_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { struct vhost_net_virtqueue *nvq = container_of(vq, struct vhost_net_virtqueue, vq); struct vhost_poll *poll = n->poll + (nvq - n->vqs); if (!vhost_vq_get_backend(vq)) return; vhost_poll_stop(poll); } static int vhost_net_enable_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { struct vhost_net_virtqueue *nvq = container_of(vq, struct vhost_net_virtqueue, vq); struct vhost_poll *poll = n->poll + (nvq - n->vqs); struct socket *sock; sock = vhost_vq_get_backend(vq); if (!sock) return 0; return vhost_poll_start(poll, sock->file); } static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq, unsigned int count) { struct vhost_virtqueue *vq = &nvq->vq; struct vhost_dev *dev = vq->dev; if (!nvq->done_idx) return; vhost_add_used_and_signal_n(dev, vq, vq->heads, vq->nheads, count); nvq->done_idx = 0; } static void vhost_tx_batch(struct vhost_net *net, struct vhost_net_virtqueue *nvq, struct socket *sock, struct msghdr *msghdr) { struct vhost_virtqueue *vq = &nvq->vq; bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER); struct tun_msg_ctl ctl = { .type = TUN_MSG_PTR, .num = nvq->batched_xdp, .ptr = nvq->xdp, }; int i, err; if (in_order) { vq->heads[0].len = 0; vq->nheads[0] = nvq->done_idx; } if (nvq->batched_xdp == 0) goto signal_used; msghdr->msg_control = &ctl; msghdr->msg_controllen = sizeof(ctl); err = sock->ops->sendmsg(sock, msghdr, 0); if (unlikely(err < 0)) { vq_err(&nvq->vq, "Fail to batch sending packets\n"); /* free pages owned by XDP; since this is an unlikely error path, * keep it simple and avoid more complex bulk update for the * used pages */ for (i = 0; i < nvq->batched_xdp; ++i) put_page(virt_to_head_page(nvq->xdp[i].data)); nvq->batched_xdp = 0; nvq->done_idx = 0; return; } signal_used: vhost_net_signal_used(nvq, in_order ? 1 : nvq->done_idx); nvq->batched_xdp = 0; } static int sock_has_rx_data(struct socket *sock) { if (unlikely(!sock)) return 0; if (sock->ops->peek_len) return sock->ops->peek_len(sock); return skb_queue_empty(&sock->sk->sk_receive_queue); } static void vhost_net_busy_poll_try_queue(struct vhost_net *net, struct vhost_virtqueue *vq) { if (!vhost_vq_avail_empty(&net->dev, vq)) { vhost_poll_queue(&vq->poll); } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { vhost_disable_notify(&net->dev, vq); vhost_poll_queue(&vq->poll); } } static void vhost_net_busy_poll(struct vhost_net *net, struct vhost_virtqueue *rvq, struct vhost_virtqueue *tvq, bool *busyloop_intr, bool poll_rx) { unsigned long busyloop_timeout; unsigned long endtime; struct socket *sock; struct vhost_virtqueue *vq = poll_rx ? tvq : rvq; /* Try to hold the vq mutex of the paired virtqueue. 
We can't * use mutex_lock() here since we could not guarantee a * consistent lock ordering. */ if (!mutex_trylock(&vq->mutex)) return; vhost_disable_notify(&net->dev, vq); sock = vhost_vq_get_backend(rvq); busyloop_timeout = poll_rx ? rvq->busyloop_timeout : tvq->busyloop_timeout; preempt_disable(); endtime = busy_clock() + busyloop_timeout; while (vhost_can_busy_poll(endtime)) { if (vhost_vq_has_work(vq)) { *busyloop_intr = true; break; } if ((sock_has_rx_data(sock) && !vhost_vq_avail_empty(&net->dev, rvq)) || !vhost_vq_avail_empty(&net->dev, tvq)) break; cpu_relax(); } preempt_enable(); if (poll_rx || sock_has_rx_data(sock)) vhost_net_busy_poll_try_queue(net, vq); else if (!poll_rx) /* On tx here, sock has no rx data. */ vhost_enable_notify(&net->dev, rvq); mutex_unlock(&vq->mutex); } static int vhost_net_tx_get_vq_desc(struct vhost_net *net, struct vhost_net_virtqueue *tnvq, unsigned int *out_num, unsigned int *in_num, struct msghdr *msghdr, bool *busyloop_intr) { struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX]; struct vhost_virtqueue *rvq = &rnvq->vq; struct vhost_virtqueue *tvq = &tnvq->vq; int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov), out_num, in_num, NULL, NULL); if (r == tvq->num && tvq->busyloop_timeout) { /* Flush batched packets first */ if (!vhost_sock_zcopy(vhost_vq_get_backend(tvq))) vhost_tx_batch(net, tnvq, vhost_vq_get_backend(tvq), msghdr); vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false); r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov), out_num, in_num, NULL, NULL); } return r; } static bool vhost_exceeds_maxpend(struct vhost_net *net) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *vq = &nvq->vq; return (nvq->upend_idx + UIO_MAXIOV - nvq->done_idx) % UIO_MAXIOV > min_t(unsigned int, VHOST_MAX_PEND, vq->num >> 2); } static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter, size_t hdr_size, int out) { /* Skip header. TODO: support TSO. */ size_t len = iov_length(vq->iov, out); iov_iter_init(iter, ITER_SOURCE, vq->iov, out, len); iov_iter_advance(iter, hdr_size); return iov_iter_count(iter); } static int get_tx_bufs(struct vhost_net *net, struct vhost_net_virtqueue *nvq, struct msghdr *msg, unsigned int *out, unsigned int *in, size_t *len, bool *busyloop_intr) { struct vhost_virtqueue *vq = &nvq->vq; int ret; ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr); if (ret < 0 || ret == vq->num) return ret; if (*in) { vq_err(vq, "Unexpected descriptor format for TX: out %d, in %d\n", *out, *in); return -EFAULT; } /* Sanity check */ *len = init_iov_iter(vq, &msg->msg_iter, nvq->vhost_hlen, *out); if (*len == 0) { vq_err(vq, "Unexpected header len for TX: %zd expected %zd\n", *len, nvq->vhost_hlen); return -EFAULT; } return ret; } static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len) { return total_len < VHOST_NET_WEIGHT && !vhost_vq_avail_empty(vq->dev, vq); } #define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq, struct iov_iter *from) { struct vhost_virtqueue *vq = &nvq->vq; struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); struct socket *sock = vhost_vq_get_backend(vq); struct virtio_net_hdr *gso; struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp]; size_t len = iov_iter_count(from); int headroom = vhost_sock_xdp(sock) ?
XDP_PACKET_HEADROOM : 0; int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + headroom + nvq->sock_hlen); int sock_hlen = nvq->sock_hlen; void *buf; int copied; int ret; if (unlikely(len < nvq->sock_hlen)) return -EFAULT; if (SKB_DATA_ALIGN(len + pad) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE) return -ENOSPC; buflen += SKB_DATA_ALIGN(len + pad); buf = page_frag_alloc_align(&net->pf_cache, buflen, GFP_KERNEL, SMP_CACHE_BYTES); if (unlikely(!buf)) return -ENOMEM; copied = copy_from_iter(buf + pad - sock_hlen, len, from); if (copied != len) { ret = -EFAULT; goto err; } gso = buf + pad - sock_hlen; if (!sock_hlen) memset(buf, 0, pad); if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && vhost16_to_cpu(vq, gso->csum_start) + vhost16_to_cpu(vq, gso->csum_offset) + 2 > vhost16_to_cpu(vq, gso->hdr_len)) { gso->hdr_len = cpu_to_vhost16(vq, vhost16_to_cpu(vq, gso->csum_start) + vhost16_to_cpu(vq, gso->csum_offset) + 2); if (vhost16_to_cpu(vq, gso->hdr_len) > len) { ret = -EINVAL; goto err; } } /* pad contains sock_hlen */ memcpy(buf, buf + pad - sock_hlen, sock_hlen); xdp_init_buff(xdp, buflen, NULL); xdp_prepare_buff(xdp, buf, pad, len - sock_hlen, true); ++nvq->batched_xdp; return 0; err: page_frag_free(buf); return ret; } static void handle_tx_copy(struct vhost_net *net, struct socket *sock) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *vq = &nvq->vq; unsigned out, in; int head; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, .msg_control = NULL, .msg_controllen = 0, .msg_flags = MSG_DONTWAIT, }; size_t len, total_len = 0; int err; int sent_pkts = 0; bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX); bool busyloop_intr; bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER); do { busyloop_intr = false; if (nvq->done_idx == VHOST_NET_BATCH) vhost_tx_batch(net, nvq, sock, &msg); head = get_tx_bufs(net, nvq, &msg, &out, &in, &len, &busyloop_intr); /* On error, stop handling until the next kick. */ if (unlikely(head < 0)) break; /* Nothing new? Wait for eventfd to tell us they refilled. */ if (head == vq->num) { /* Kicks are disabled at this point, break loop and * process any remaining batched packets. Queue will * be re-enabled afterwards. */ break; } total_len += len; /* For simplicity, TX batching is only enabled if * sndbuf is unlimited. */ if (sock_can_batch) { err = vhost_net_build_xdp(nvq, &msg.msg_iter); if (!err) { goto done; } else if (unlikely(err != -ENOSPC)) { vhost_tx_batch(net, nvq, sock, &msg); vhost_discard_vq_desc(vq, 1); vhost_net_enable_vq(net, vq); break; } if (nvq->batched_xdp) { /* We can't build XDP buff, go for single * packet path but let's flush batched * packets. 
*/ vhost_tx_batch(net, nvq, sock, &msg); } msg.msg_control = NULL; } else { if (tx_can_batch(vq, total_len)) msg.msg_flags |= MSG_MORE; else msg.msg_flags &= ~MSG_MORE; } err = sock->ops->sendmsg(sock, &msg, len); if (unlikely(err < 0)) { if (err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS) { vhost_discard_vq_desc(vq, 1); vhost_net_enable_vq(net, vq); break; } pr_debug("Fail to send packet: err %d", err); } else if (unlikely(err != len)) pr_debug("Truncated TX packet: len %d != %zd\n", err, len); done: if (in_order) { vq->heads[0].id = cpu_to_vhost32(vq, head); } else { vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head); vq->heads[nvq->done_idx].len = 0; } ++nvq->done_idx; } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len))); /* Kicks are still disabled, dispatch any remaining batched msgs. */ vhost_tx_batch(net, nvq, sock, &msg); if (unlikely(busyloop_intr)) /* If interrupted while doing busy polling, requeue the * handler to be fair to handle_rx as well as other tasks * waiting on cpu. */ vhost_poll_queue(&vq->poll); else /* All of our work has been completed; however, before * leaving the TX handler, do one last check for work, * and requeue handler if necessary. If there is no work, * queue will be reenabled. */ vhost_net_busy_poll_try_queue(net, vq); } static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *vq = &nvq->vq; unsigned out, in; int head; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, .msg_control = NULL, .msg_controllen = 0, .msg_flags = MSG_DONTWAIT, }; struct tun_msg_ctl ctl; size_t len, total_len = 0; int err; struct vhost_net_ubuf_ref *ubufs; struct ubuf_info_msgzc *ubuf; bool zcopy_used; int sent_pkts = 0; do { bool busyloop_intr; /* Release the buffers whose DMAs have completed first */ vhost_zerocopy_signal_used(net, vq); busyloop_intr = false; head = get_tx_bufs(net, nvq, &msg, &out, &in, &len, &busyloop_intr); /* On error, stop handling until the next kick. */ if (unlikely(head < 0)) break; /* Nothing new? Wait for eventfd to tell us they refilled.
*/ if (head == vq->num) { if (unlikely(busyloop_intr)) { vhost_poll_queue(&vq->poll); } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { vhost_disable_notify(&net->dev, vq); continue; } break; } zcopy_used = len >= VHOST_GOODCOPY_LEN && !vhost_exceeds_maxpend(net) && vhost_net_tx_select_zcopy(net); /* use msg_control to pass vhost zerocopy ubuf info to skb */ if (zcopy_used) { ubuf = nvq->ubuf_info + nvq->upend_idx; vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head); vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS; ubuf->ctx = nvq->ubufs; ubuf->desc = nvq->upend_idx; ubuf->ubuf.ops = &vhost_ubuf_ops; ubuf->ubuf.flags = SKBFL_ZEROCOPY_FRAG; refcount_set(&ubuf->ubuf.refcnt, 1); msg.msg_control = &ctl; ctl.type = TUN_MSG_UBUF; ctl.ptr = &ubuf->ubuf; msg.msg_controllen = sizeof(ctl); ubufs = nvq->ubufs; atomic_inc(&ubufs->refcount); nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV; } else { msg.msg_control = NULL; ubufs = NULL; } total_len += len; if (tx_can_batch(vq, total_len) && likely(!vhost_exceeds_maxpend(net))) { msg.msg_flags |= MSG_MORE; } else { msg.msg_flags &= ~MSG_MORE; } err = sock->ops->sendmsg(sock, &msg, len); if (unlikely(err < 0)) { bool retry = err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS; if (zcopy_used) { if (vq->heads[ubuf->desc].len == VHOST_DMA_IN_PROGRESS) vhost_net_ubuf_put(ubufs); if (retry) nvq->upend_idx = ((unsigned)nvq->upend_idx - 1) % UIO_MAXIOV; else vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN; } if (retry) { vhost_discard_vq_desc(vq, 1); vhost_net_enable_vq(net, vq); break; } pr_debug("Fail to send packet: err %d", err); } else if (unlikely(err != len)) pr_debug("Truncated TX packet: " " len %d != %zd\n", err, len); if (!zcopy_used) vhost_add_used_and_signal(&net->dev, vq, head, 0); else vhost_zerocopy_signal_used(net, vq); vhost_net_tx_packet(net); } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len))); } /* Expects to be always run from workqueue - which acts as * read-side critical section for our kind of RCU.
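* (Roughly: each worker invocation is the read-side section, and vhost_dev_flush() plays the part of synchronize_rcu(); once a flush returns, no handler can still be running against a backend that was swapped out before the flush.)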
*/ static void handle_tx(struct vhost_net *net) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *vq = &nvq->vq; struct socket *sock; mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX); sock = vhost_vq_get_backend(vq); if (!sock) goto out; if (!vq_meta_prefetch(vq)) goto out; vhost_disable_notify(&net->dev, vq); vhost_net_disable_vq(net, vq); if (vhost_sock_zcopy(sock)) handle_tx_zerocopy(net, sock); else handle_tx_copy(net, sock); out: mutex_unlock(&vq->mutex); } static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk) { struct sk_buff *head; int len = 0; unsigned long flags; if (rvq->rx_ring) return vhost_net_buf_peek(rvq); spin_lock_irqsave(&sk->sk_receive_queue.lock, flags); head = skb_peek(&sk->sk_receive_queue); if (likely(head)) { len = head->len; if (skb_vlan_tag_present(head)) len += VLAN_HLEN; } spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags); return len; } static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk, bool *busyloop_intr, unsigned int count) { struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX]; struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *rvq = &rnvq->vq; struct vhost_virtqueue *tvq = &tnvq->vq; int len = peek_head_len(rnvq, sk); if (!len && rvq->busyloop_timeout) { /* Flush batched heads first */ vhost_net_signal_used(rnvq, count); /* Both tx vq and rx socket were polled here */ vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, true); len = peek_head_len(rnvq, sk); } return len; } /* This is a multi-buffer version of vhost_get_vq_desc(), which works if * vq has read descriptors only. * @nvq - the relevant vhost_net virtqueue * @datalen - data length we'll be reading * @iovcount - returned count of io vectors we fill * @log - vhost log * @log_num - log offset * @quota - headcount quota, 1 for big buffer * returns number of buffer heads allocated, negative on error */ static int get_rx_bufs(struct vhost_net_virtqueue *nvq, struct vring_used_elem *heads, u16 *nheads, int datalen, unsigned *iovcount, struct vhost_log *log, unsigned *log_num, unsigned int quota) { struct vhost_virtqueue *vq = &nvq->vq; bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER); unsigned int out, in; int seg = 0; int headcount = 0; unsigned d; int r, nlogs = 0; /* len is always initialized before use since we are always called with * datalen > 0.
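* Worked example for the loop below: with datalen = 4000 and three 1500-byte guest buffers, headcount reaches 3 and datalen ends at -500, so the last recorded len is trimmed to len + datalen = 1000, the number of bytes actually used in the final buffer.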
*/ u32 len; while (datalen > 0 && headcount < quota) { if (unlikely(seg >= UIO_MAXIOV)) { r = -ENOBUFS; goto err; } r = vhost_get_vq_desc(vq, vq->iov + seg, ARRAY_SIZE(vq->iov) - seg, &out, &in, log, log_num); if (unlikely(r < 0)) goto err; d = r; if (d == vq->num) { r = 0; goto err; } if (unlikely(out || in <= 0)) { vq_err(vq, "unexpected descriptor format for RX: " "out %d, in %d\n", out, in); r = -EINVAL; goto err; } if (unlikely(log)) { nlogs += *log_num; log += *log_num; } len = iov_length(vq->iov + seg, in); if (!in_order) { heads[headcount].id = cpu_to_vhost32(vq, d); heads[headcount].len = cpu_to_vhost32(vq, len); } ++headcount; datalen -= len; seg += in; } *iovcount = seg; if (unlikely(log)) *log_num = nlogs; /* Detect overrun */ if (unlikely(datalen > 0)) { r = UIO_MAXIOV + 1; goto err; } if (!in_order) heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen); else { heads[0].len = cpu_to_vhost32(vq, len + datalen); heads[0].id = cpu_to_vhost32(vq, d); nheads[0] = headcount; } return headcount; err: vhost_discard_vq_desc(vq, headcount); return r; } /* Expects to be always run from workqueue - which acts as * read-side critical section for our kind of RCU. */ static void handle_rx(struct vhost_net *net) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX]; struct vhost_virtqueue *vq = &nvq->vq; bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER); unsigned int count = 0; unsigned in, log; struct vhost_log *vq_log; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, .msg_control = NULL, /* FIXME: get and handle RX aux data. */ .msg_controllen = 0, .msg_flags = MSG_DONTWAIT, }; struct virtio_net_hdr hdr = { .flags = 0, .gso_type = VIRTIO_NET_HDR_GSO_NONE }; size_t total_len = 0; int err, mergeable; s16 headcount; size_t vhost_hlen, sock_hlen; size_t vhost_len, sock_len; bool busyloop_intr = false; bool set_num_buffers; struct socket *sock; struct iov_iter fixup; __virtio16 num_buffers; int recv_pkts = 0; mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX); sock = vhost_vq_get_backend(vq); if (!sock) goto out; if (!vq_meta_prefetch(vq)) goto out; vhost_disable_notify(&net->dev, vq); vhost_net_disable_vq(net, vq); vhost_hlen = nvq->vhost_hlen; sock_hlen = nvq->sock_hlen; vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ? vq->log : NULL; mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF); set_num_buffers = mergeable || vhost_has_feature(vq, VIRTIO_F_VERSION_1); do { sock_len = vhost_net_rx_peek_head_len(net, sock->sk, &busyloop_intr, count); if (!sock_len) break; sock_len += sock_hlen; vhost_len = sock_len + vhost_hlen; headcount = get_rx_bufs(nvq, vq->heads + count, vq->nheads + count, vhost_len, &in, vq_log, &log, likely(mergeable) ? UIO_MAXIOV : 1); /* On error, stop handling until the next kick. */ if (unlikely(headcount < 0)) goto out; /* OK, now we need to know about added descriptors. */ if (!headcount) { if (unlikely(busyloop_intr)) { vhost_poll_queue(&vq->poll); } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { /* They have slipped one in as we were * doing that: check again. */ vhost_disable_notify(&net->dev, vq); continue; } /* Nothing new? Wait for eventfd to tell us * they refilled.
*/ goto out; } busyloop_intr = false; if (nvq->rx_ring) msg.msg_control = vhost_net_buf_consume(&nvq->rxq); /* On overrun, truncate and discard */ if (unlikely(headcount > UIO_MAXIOV)) { iov_iter_init(&msg.msg_iter, ITER_DEST, vq->iov, 1, 1); err = sock->ops->recvmsg(sock, &msg, 1, MSG_DONTWAIT | MSG_TRUNC); pr_debug("Discarded rx packet: len %zd\n", sock_len); continue; } /* We don't need to be notified again. */ iov_iter_init(&msg.msg_iter, ITER_DEST, vq->iov, in, vhost_len); fixup = msg.msg_iter; if (unlikely((vhost_hlen))) { /* We will supply the header ourselves * TODO: support TSO. */ iov_iter_advance(&msg.msg_iter, vhost_hlen); } err = sock->ops->recvmsg(sock, &msg, sock_len, MSG_DONTWAIT | MSG_TRUNC); /* Userspace might have consumed the packet meanwhile: * it's not supposed to do this usually, but might be hard * to prevent. Discard data we got (if any) and keep going. */ if (unlikely(err != sock_len)) { pr_debug("Discarded rx packet: " " len %d, expected %zd\n", err, sock_len); vhost_discard_vq_desc(vq, headcount); continue; } /* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */ if (unlikely(vhost_hlen)) { if (copy_to_iter(&hdr, sizeof(hdr), &fixup) != sizeof(hdr)) { vq_err(vq, "Unable to write vnet_hdr " "at addr %p\n", vq->iov->iov_base); goto out; } } else { /* Header came from socket; we'll need to patch * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF */ iov_iter_advance(&fixup, sizeof(hdr)); } /* TODO: Should check and handle checksum. */ num_buffers = cpu_to_vhost16(vq, headcount); if (likely(set_num_buffers) && copy_to_iter(&num_buffers, sizeof num_buffers, &fixup) != sizeof num_buffers) { vq_err(vq, "Failed num_buffers write"); vhost_discard_vq_desc(vq, headcount); goto out; } nvq->done_idx += headcount; count += in_order ? 1 : headcount; if (nvq->done_idx > VHOST_NET_BATCH) { vhost_net_signal_used(nvq, count); count = 0; } if (unlikely(vq_log)) vhost_log_write(vq, vq_log, log, vhost_len, vq->iov, in); total_len += vhost_len; } while (likely(!vhost_exceeds_weight(vq, ++recv_pkts, total_len))); if (unlikely(busyloop_intr)) vhost_poll_queue(&vq->poll); else if (!sock_len) vhost_net_enable_vq(net, vq); out: vhost_net_signal_used(nvq, count); mutex_unlock(&vq->mutex); } static void handle_tx_kick(struct vhost_work *work) { struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, poll.work); struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); handle_tx(net); } static void handle_rx_kick(struct vhost_work *work) { struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, poll.work); struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); handle_rx(net); } static void handle_tx_net(struct vhost_work *work) { struct vhost_net *net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work); handle_tx(net); } static void handle_rx_net(struct vhost_work *work) { struct vhost_net *net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work); handle_rx(net); } static int vhost_net_open(struct inode *inode, struct file *f) { struct vhost_net *n; struct vhost_dev *dev; struct vhost_virtqueue **vqs; void **queue; struct xdp_buff *xdp; int i; n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!n) return -ENOMEM; vqs = kmalloc_array(VHOST_NET_VQ_MAX, sizeof(*vqs), GFP_KERNEL); if (!vqs) { kvfree(n); return -ENOMEM; } queue = kmalloc_array(VHOST_NET_BATCH, sizeof(void *), GFP_KERNEL); if (!queue) { kfree(vqs); kvfree(n); return -ENOMEM; } n->vqs[VHOST_NET_VQ_RX].rxq.queue = queue; xdp = 
kmalloc_array(VHOST_NET_BATCH, sizeof(*xdp), GFP_KERNEL); if (!xdp) { kfree(vqs); kvfree(n); kfree(queue); return -ENOMEM; } n->vqs[VHOST_NET_VQ_TX].xdp = xdp; dev = &n->dev; vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq; vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq; n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick; n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick; for (i = 0; i < VHOST_NET_VQ_MAX; i++) { n->vqs[i].ubufs = NULL; n->vqs[i].ubuf_info = NULL; n->vqs[i].upend_idx = 0; n->vqs[i].done_idx = 0; n->vqs[i].batched_xdp = 0; n->vqs[i].vhost_hlen = 0; n->vqs[i].sock_hlen = 0; n->vqs[i].rx_ring = NULL; vhost_net_buf_init(&n->vqs[i].rxq); } vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX, UIO_MAXIOV + VHOST_NET_BATCH, VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT, true, NULL); vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev, vqs[VHOST_NET_VQ_TX]); vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev, vqs[VHOST_NET_VQ_RX]); f->private_data = n; page_frag_cache_init(&n->pf_cache); return 0; } static struct socket *vhost_net_stop_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { struct socket *sock; struct vhost_net_virtqueue *nvq = container_of(vq, struct vhost_net_virtqueue, vq); mutex_lock(&vq->mutex); sock = vhost_vq_get_backend(vq); vhost_net_disable_vq(n, vq); vhost_vq_set_backend(vq, NULL); vhost_net_buf_unproduce(nvq); nvq->rx_ring = NULL; mutex_unlock(&vq->mutex); return sock; } static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock, struct socket **rx_sock) { *tx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_TX].vq); *rx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_RX].vq); } static void vhost_net_flush(struct vhost_net *n) { vhost_dev_flush(&n->dev); if (n->vqs[VHOST_NET_VQ_TX].ubufs) { mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); n->tx_flush = true; mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); /* Wait for all lower device DMAs done. */ vhost_net_ubuf_put_and_wait(n->vqs[VHOST_NET_VQ_TX].ubufs); mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); n->tx_flush = false; atomic_set(&n->vqs[VHOST_NET_VQ_TX].ubufs->refcount, 1); mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); } } static int vhost_net_release(struct inode *inode, struct file *f) { struct vhost_net *n = f->private_data; struct socket *tx_sock; struct socket *rx_sock; vhost_net_stop(n, &tx_sock, &rx_sock); vhost_net_flush(n); vhost_dev_stop(&n->dev); vhost_dev_cleanup(&n->dev); vhost_net_vq_reset(n); if (tx_sock) sockfd_put(tx_sock); if (rx_sock) sockfd_put(rx_sock); /* Make sure no callbacks are outstanding */ synchronize_rcu(); /* We do an extra flush before freeing memory, * since jobs can re-queue themselves. 
*/ vhost_net_flush(n); kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue); kfree(n->vqs[VHOST_NET_VQ_TX].xdp); kfree(n->dev.vqs); page_frag_cache_drain(&n->pf_cache); kvfree(n); return 0; } static struct socket *get_raw_socket(int fd) { int r; struct socket *sock = sockfd_lookup(fd, &r); if (!sock) return ERR_PTR(-ENOTSOCK); /* Parameter checking */ if (sock->sk->sk_type != SOCK_RAW) { r = -ESOCKTNOSUPPORT; goto err; } if (sock->sk->sk_family != AF_PACKET) { r = -EPFNOSUPPORT; goto err; } return sock; err: sockfd_put(sock); return ERR_PTR(r); } static struct ptr_ring *get_tap_ptr_ring(struct file *file) { struct ptr_ring *ring; ring = tun_get_tx_ring(file); if (!IS_ERR(ring)) goto out; ring = tap_get_ptr_ring(file); if (!IS_ERR(ring)) goto out; ring = NULL; out: return ring; } static struct socket *get_tap_socket(int fd) { struct file *file = fget(fd); struct socket *sock; if (!file) return ERR_PTR(-EBADF); sock = tun_get_socket(file); if (!IS_ERR(sock)) return sock; sock = tap_get_socket(file); if (IS_ERR(sock)) fput(file); return sock; } static struct socket *get_socket(int fd) { struct socket *sock; /* special case to disable backend */ if (fd == -1) return NULL; sock = get_raw_socket(fd); if (!IS_ERR(sock)) return sock; sock = get_tap_socket(fd); if (!IS_ERR(sock)) return sock; return ERR_PTR(-ENOTSOCK); } static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) { struct socket *sock, *oldsock; struct vhost_virtqueue *vq; struct vhost_net_virtqueue *nvq; struct vhost_net_ubuf_ref *ubufs, *oldubufs = NULL; int r; mutex_lock(&n->dev.mutex); r = vhost_dev_check_owner(&n->dev); if (r) goto err; if (index >= VHOST_NET_VQ_MAX) { r = -ENOBUFS; goto err; } vq = &n->vqs[index].vq; nvq = &n->vqs[index]; mutex_lock(&vq->mutex); if (fd == -1) vhost_clear_msg(&n->dev); /* Verify that ring has been setup correctly. 
*/ if (!vhost_vq_access_ok(vq)) { r = -EFAULT; goto err_vq; } sock = get_socket(fd); if (IS_ERR(sock)) { r = PTR_ERR(sock); goto err_vq; } /* start polling new socket */ oldsock = vhost_vq_get_backend(vq); if (sock != oldsock) { ubufs = vhost_net_ubuf_alloc(vq, sock && vhost_sock_zcopy(sock)); if (IS_ERR(ubufs)) { r = PTR_ERR(ubufs); goto err_ubufs; } vhost_net_disable_vq(n, vq); vhost_vq_set_backend(vq, sock); vhost_net_buf_unproduce(nvq); r = vhost_vq_init_access(vq); if (r) goto err_used; r = vhost_net_enable_vq(n, vq); if (r) goto err_used; if (index == VHOST_NET_VQ_RX) { if (sock) nvq->rx_ring = get_tap_ptr_ring(sock->file); else nvq->rx_ring = NULL; } oldubufs = nvq->ubufs; nvq->ubufs = ubufs; n->tx_packets = 0; n->tx_zcopy_err = 0; n->tx_flush = false; } mutex_unlock(&vq->mutex); if (oldubufs) { vhost_net_ubuf_put_wait_and_free(oldubufs); mutex_lock(&vq->mutex); vhost_zerocopy_signal_used(n, vq); mutex_unlock(&vq->mutex); } if (oldsock) { vhost_dev_flush(&n->dev); sockfd_put(oldsock); } mutex_unlock(&n->dev.mutex); return 0; err_used: vhost_vq_set_backend(vq, oldsock); vhost_net_enable_vq(n, vq); if (ubufs) vhost_net_ubuf_put_wait_and_free(ubufs); err_ubufs: if (sock) sockfd_put(sock); err_vq: mutex_unlock(&vq->mutex); err: mutex_unlock(&n->dev.mutex); return r; } static long vhost_net_reset_owner(struct vhost_net *n) { struct socket *tx_sock = NULL; struct socket *rx_sock = NULL; long err; struct vhost_iotlb *umem; mutex_lock(&n->dev.mutex); err = vhost_dev_check_owner(&n->dev); if (err) goto done; umem = vhost_dev_reset_owner_prepare(); if (!umem) { err = -ENOMEM; goto done; } vhost_net_stop(n, &tx_sock, &rx_sock); vhost_net_flush(n); vhost_dev_stop(&n->dev); vhost_dev_reset_owner(&n->dev, umem); vhost_net_vq_reset(n); done: mutex_unlock(&n->dev.mutex); if (tx_sock) sockfd_put(tx_sock); if (rx_sock) sockfd_put(rx_sock); return err; } static int vhost_net_set_features(struct vhost_net *n, const u64 *features) { size_t vhost_hlen, sock_hlen, hdr_len; int i; hdr_len = virtio_features_test_bit(features, VIRTIO_NET_F_MRG_RXBUF) || virtio_features_test_bit(features, VIRTIO_F_VERSION_1) ? 
sizeof(struct virtio_net_hdr_mrg_rxbuf) : sizeof(struct virtio_net_hdr); if (virtio_features_test_bit(features, VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO) || virtio_features_test_bit(features, VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO)) hdr_len = sizeof(struct virtio_net_hdr_v1_hash_tunnel); if (virtio_features_test_bit(features, VHOST_NET_F_VIRTIO_NET_HDR)) { /* vhost provides vnet_hdr */ vhost_hlen = hdr_len; sock_hlen = 0; } else { /* socket provides vnet_hdr */ vhost_hlen = 0; sock_hlen = hdr_len; } mutex_lock(&n->dev.mutex); if (virtio_features_test_bit(features, VHOST_F_LOG_ALL) && !vhost_log_access_ok(&n->dev)) goto out_unlock; if (virtio_features_test_bit(features, VIRTIO_F_ACCESS_PLATFORM)) { if (vhost_init_device_iotlb(&n->dev)) goto out_unlock; } for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { mutex_lock(&n->vqs[i].vq.mutex); virtio_features_copy(n->vqs[i].vq.acked_features_array, features); n->vqs[i].vhost_hlen = vhost_hlen; n->vqs[i].sock_hlen = sock_hlen; mutex_unlock(&n->vqs[i].vq.mutex); } mutex_unlock(&n->dev.mutex); return 0; out_unlock: mutex_unlock(&n->dev.mutex); return -EFAULT; } static long vhost_net_set_owner(struct vhost_net *n) { int r; mutex_lock(&n->dev.mutex); if (vhost_dev_has_owner(&n->dev)) { r = -EBUSY; goto out; } r = vhost_net_set_ubuf_info(n); if (r) goto out; r = vhost_dev_set_owner(&n->dev); if (r) vhost_net_clear_ubuf_info(n); vhost_net_flush(n); out: mutex_unlock(&n->dev.mutex); return r; } static long vhost_net_ioctl(struct file *f, unsigned int ioctl, unsigned long arg) { u64 all_features[VIRTIO_FEATURES_DWORDS]; struct vhost_net *n = f->private_data; void __user *argp = (void __user *)arg; u64 __user *featurep = argp; struct vhost_vring_file backend; u64 features, count, copied; int r, i; switch (ioctl) { case VHOST_NET_SET_BACKEND: if (copy_from_user(&backend, argp, sizeof backend)) return -EFAULT; return vhost_net_set_backend(n, backend.index, backend.fd); case VHOST_GET_FEATURES: features = vhost_net_features[0]; if (copy_to_user(featurep, &features, sizeof features)) return -EFAULT; return 0; case VHOST_SET_FEATURES: if (copy_from_user(&features, featurep, sizeof features)) return -EFAULT; if (features & ~vhost_net_features[0]) return -EOPNOTSUPP; virtio_features_from_u64(all_features, features); return vhost_net_set_features(n, all_features); case VHOST_GET_FEATURES_ARRAY: if (copy_from_user(&count, featurep, sizeof(count))) return -EFAULT; /* Copy the net features, up to the user-provided buffer size */ argp += sizeof(u64); copied = min(count, VIRTIO_FEATURES_DWORDS); if (copy_to_user(argp, vhost_net_features, copied * sizeof(u64))) return -EFAULT; /* Zero the trailing space provided by user-space, if any; start * clearing after the dwords just copied, not on top of them. */ argp += copied * sizeof(u64); if (clear_user(argp, size_mul(count - copied, sizeof(u64)))) return -EFAULT; return 0; case VHOST_SET_FEATURES_ARRAY: if (copy_from_user(&count, featurep, sizeof(count))) return -EFAULT; virtio_features_zero(all_features); argp += sizeof(u64); copied = min(count, VIRTIO_FEATURES_DWORDS); if (copy_from_user(all_features, argp, copied * sizeof(u64))) return -EFAULT; /* * Any feature specified by user-space above * VIRTIO_FEATURES_MAX is not supported by definition.
*/ for (i = copied; i < count; ++i) { if (copy_from_user(&features, featurep + 1 + i, sizeof(features))) return -EFAULT; if (features) return -EOPNOTSUPP; } for (i = 0; i < VIRTIO_FEATURES_DWORDS; i++) if (all_features[i] & ~vhost_net_features[i]) return -EOPNOTSUPP; return vhost_net_set_features(n, all_features); case VHOST_GET_BACKEND_FEATURES: features = VHOST_NET_BACKEND_FEATURES; if (copy_to_user(featurep, &features, sizeof(features))) return -EFAULT; return 0; case VHOST_SET_BACKEND_FEATURES: if (copy_from_user(&features, featurep, sizeof(features))) return -EFAULT; if (features & ~VHOST_NET_BACKEND_FEATURES) return -EOPNOTSUPP; vhost_set_backend_features(&n->dev, features); return 0; case VHOST_RESET_OWNER: return vhost_net_reset_owner(n); case VHOST_SET_OWNER: return vhost_net_set_owner(n); default: mutex_lock(&n->dev.mutex); r = vhost_dev_ioctl(&n->dev, ioctl, argp); if (r == -ENOIOCTLCMD) r = vhost_vring_ioctl(&n->dev, ioctl, argp); else vhost_net_flush(n); mutex_unlock(&n->dev.mutex); return r; } } static ssize_t vhost_net_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct vhost_net *n = file->private_data; struct vhost_dev *dev = &n->dev; int noblock = file->f_flags & O_NONBLOCK; return vhost_chr_read_iter(dev, to, noblock); } static ssize_t vhost_net_chr_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct vhost_net *n = file->private_data; struct vhost_dev *dev = &n->dev; return vhost_chr_write_iter(dev, from); } static __poll_t vhost_net_chr_poll(struct file *file, poll_table *wait) { struct vhost_net *n = file->private_data; struct vhost_dev *dev = &n->dev; return vhost_chr_poll(file, dev, wait); } static const struct file_operations vhost_net_fops = { .owner = THIS_MODULE, .release = vhost_net_release, .read_iter = vhost_net_chr_read_iter, .write_iter = vhost_net_chr_write_iter, .poll = vhost_net_chr_poll, .unlocked_ioctl = vhost_net_ioctl, .compat_ioctl = compat_ptr_ioctl, .open = vhost_net_open, .llseek = noop_llseek, }; static struct miscdevice vhost_net_misc = { .minor = VHOST_NET_MINOR, .name = "vhost-net", .fops = &vhost_net_fops, }; static int __init vhost_net_init(void) { if (experimental_zcopytx) vhost_net_enable_zcopy(VHOST_NET_VQ_TX); return misc_register(&vhost_net_misc); } module_init(vhost_net_init); static void __exit vhost_net_exit(void) { misc_deregister(&vhost_net_misc); } module_exit(vhost_net_exit); MODULE_VERSION("0.0.1"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Michael S. Tsirkin"); MODULE_DESCRIPTION("Host kernel accelerator for virtio net"); MODULE_ALIAS_MISCDEV(VHOST_NET_MINOR); MODULE_ALIAS("devname:vhost-net"); |
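The ioctl surface above is driven from user space through /dev/vhost-net. What follows is a minimal, hypothetical host-side sketch of binding a TAP file descriptor to the driver with the standard <linux/vhost.h> ioctls; TAP creation and the VHOST_SET_VRING_* ring plumbing are omitted, and without them VHOST_NET_SET_BACKEND would fail the vhost_vq_access_ok() check, so this is orientation only, not a working VMM.

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

/* Hypothetical helper: attach an already-open TAP fd to vhost-net.
 * Error handling is trimmed to the bare minimum. */
int attach_tap_to_vhost(int tap_fd)
{
	struct vhost_vring_file backend;
	uint64_t features;
	int vhost_fd = open("/dev/vhost-net", O_RDWR);

	if (vhost_fd < 0)
		return -1;

	/* Claim the device for this process (handled by vhost_net_set_owner). */
	if (ioctl(vhost_fd, VHOST_SET_OWNER, NULL) < 0)
		goto err;

	/* Negotiate features: this sketch simply accepts everything offered. */
	if (ioctl(vhost_fd, VHOST_GET_FEATURES, &features) < 0 ||
	    ioctl(vhost_fd, VHOST_SET_FEATURES, &features) < 0)
		goto err;

	/* Point both virtqueues (index 0 = RX, 1 = TX) at the TAP socket;
	 * this reaches vhost_net_set_backend() above. */
	for (backend.index = 0; backend.index < 2; backend.index++) {
		backend.fd = tap_fd;
		if (ioctl(vhost_fd, VHOST_NET_SET_BACKEND, &backend) < 0)
			goto err;
	}
	return vhost_fd;
err:
	close(vhost_fd);
	return -1;
}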
/* SPDX-License-Identifier: GPL-2.0 */ #if !defined(_DRM_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) #define _DRM_TRACE_H_ #include <linux/stringify.h> #include <linux/types.h> #include <linux/tracepoint.h> struct drm_file; #undef TRACE_SYSTEM #define TRACE_SYSTEM drm #define TRACE_INCLUDE_FILE drm_trace TRACE_EVENT(drm_vblank_event, TP_PROTO(int crtc, unsigned int seq, ktime_t time, bool high_prec), TP_ARGS(crtc, seq, time, high_prec), TP_STRUCT__entry( __field(int, crtc) __field(unsigned int, seq) __field(ktime_t, time) __field(bool, high_prec) ), TP_fast_assign( __entry->crtc = crtc; __entry->seq = seq; __entry->time = time; __entry->high_prec = high_prec; ), TP_printk("crtc=%d, seq=%u, time=%lld, high-prec=%s", __entry->crtc, __entry->seq, __entry->time, __entry->high_prec ? "true" : "false") ); TRACE_EVENT(drm_vblank_event_queued, TP_PROTO(struct drm_file *file, int crtc, unsigned int seq), TP_ARGS(file, crtc, seq), TP_STRUCT__entry( __field(struct drm_file *, file) __field(int, crtc) __field(unsigned int, seq) ), TP_fast_assign( __entry->file = file; __entry->crtc = crtc; __entry->seq = seq; ), TP_printk("file=%p, crtc=%d, seq=%u", __entry->file, __entry->crtc, __entry->seq) ); TRACE_EVENT(drm_vblank_event_delivered, TP_PROTO(struct drm_file *file, int crtc, unsigned int seq), TP_ARGS(file, crtc, seq), TP_STRUCT__entry( __field(struct drm_file *, file) __field(int, crtc) __field(unsigned int, seq) ), TP_fast_assign( __entry->file = file; __entry->crtc = crtc; __entry->seq = seq; ), TP_printk("file=%p, crtc=%d, seq=%u", __entry->file, __entry->crtc, __entry->seq) ); #endif /* _DRM_TRACE_H_ */ /* This part must be outside protection */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH ../../drivers/gpu/drm #include <trace/define_trace.h>
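Each TRACE_EVENT() above expands into a trace_drm_vblank_event*() helper that the DRM core calls at its instrumentation points, and exactly one compilation unit must define CREATE_TRACE_POINTS before including the header so that <trace/define_trace.h> emits the tracepoint bodies. A minimal sketch of an emission site follows; the wrapper function is illustrative only, not the actual drm_vblank.c code.

#define CREATE_TRACE_POINTS	/* in one .c file only, e.g. drm_trace_points.c */
#include "drm_trace.h"

/* Illustrative call site: report one vblank with a known crtc index,
 * sequence number and timestamp. The call compiles to a patched-out
 * no-op unless the drm_vblank_event tracepoint is enabled. */
static void report_vblank(int pipe, unsigned int seq, ktime_t now,
			  bool high_prec)
{
	trace_drm_vblank_event(pipe, seq, now, high_prec);
}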
// SPDX-License-Identifier: GPL-2.0-only /* * Event char devices, giving access to raw input device events.
* * Copyright (c) 1999-2002 Vojtech Pavlik */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define EVDEV_MINOR_BASE 64 #define EVDEV_MINORS 32 #define EVDEV_MIN_BUFFER_SIZE 64U #define EVDEV_BUF_PACKETS 8 #include <linux/poll.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/init.h> #include <linux/input/mt.h> #include <linux/major.h> #include <linux/device.h> #include <linux/cdev.h> #include "input-compat.h" struct evdev { int open; struct input_handle handle; struct evdev_client __rcu *grab; struct list_head client_list; spinlock_t client_lock; /* protects client_list */ struct mutex mutex; struct device dev; struct cdev cdev; bool exist; }; struct evdev_client { unsigned int head; unsigned int tail; unsigned int packet_head; /* [future] position of the first element of next packet */ spinlock_t buffer_lock; /* protects access to buffer, head and tail */ wait_queue_head_t wait; struct fasync_struct *fasync; struct evdev *evdev; struct list_head node; enum input_clock_type clk_type; bool revoked; unsigned long *evmasks[EV_CNT]; unsigned int bufsize; struct input_event buffer[] __counted_by(bufsize); }; static size_t evdev_get_mask_cnt(unsigned int type) { static const size_t counts[EV_CNT] = { /* EV_SYN==0 is EV_CNT, _not_ SYN_CNT, see EVIOCGBIT */ [EV_SYN] = EV_CNT, [EV_KEY] = KEY_CNT, [EV_REL] = REL_CNT, [EV_ABS] = ABS_CNT, [EV_MSC] = MSC_CNT, [EV_SW] = SW_CNT, [EV_LED] = LED_CNT, [EV_SND] = SND_CNT, [EV_FF] = FF_CNT, }; return (type < EV_CNT) ? counts[type] : 0; } /* requires the buffer lock to be held */ static bool __evdev_is_filtered(struct evdev_client *client, unsigned int type, unsigned int code) { unsigned long *mask; size_t cnt; /* EV_SYN and unknown codes are never filtered */ if (type == EV_SYN || type >= EV_CNT) return false; /* first test whether the type is filtered */ mask = client->evmasks[0]; if (mask && !test_bit(type, mask)) return true; /* unknown values are never filtered */ cnt = evdev_get_mask_cnt(type); if (!cnt || code >= cnt) return false; mask = client->evmasks[type]; return mask && !test_bit(code, mask); } /* flush queued events of type @type, caller must hold client->buffer_lock */ static void __evdev_flush_queue(struct evdev_client *client, unsigned int type) { unsigned int i, head, num; unsigned int mask = client->bufsize - 1; bool is_report; struct input_event *ev; BUG_ON(type == EV_SYN); head = client->tail; client->packet_head = client->tail; /* init to 1 so a leading SYN_REPORT will not be dropped */ num = 1; for (i = client->tail; i != client->head; i = (i + 1) & mask) { ev = &client->buffer[i]; is_report = ev->type == EV_SYN && ev->code == SYN_REPORT; if (ev->type == type) { /* drop matched entry */ continue; } else if (is_report && !num) { /* drop empty SYN_REPORT groups */ continue; } else if (head != i) { /* move entry to fill the gap */ client->buffer[head] = *ev; } num++; head = (head + 1) & mask; if (is_report) { num = 0; client->packet_head = head; } } client->head = head; } static void __evdev_queue_syn_dropped(struct evdev_client *client) { ktime_t *ev_time = input_get_timestamp(client->evdev->handle.dev); struct timespec64 ts = ktime_to_timespec64(ev_time[client->clk_type]); struct input_event ev; ev.input_event_sec = ts.tv_sec; ev.input_event_usec = ts.tv_nsec / NSEC_PER_USEC; ev.type = EV_SYN; ev.code = SYN_DROPPED; ev.value = 0; client->buffer[client->head++] = ev; client->head &= client->bufsize - 1; if (unlikely(client->head == client->tail)) { 
/* drop queue but keep our SYN_DROPPED event */ client->tail = (client->head - 1) & (client->bufsize - 1); client->packet_head = client->tail; } } static void evdev_queue_syn_dropped(struct evdev_client *client) { unsigned long flags; spin_lock_irqsave(&client->buffer_lock, flags); __evdev_queue_syn_dropped(client); spin_unlock_irqrestore(&client->buffer_lock, flags); } static int evdev_set_clk_type(struct evdev_client *client, unsigned int clkid) { unsigned long flags; enum input_clock_type clk_type; switch (clkid) { case CLOCK_REALTIME: clk_type = INPUT_CLK_REAL; break; case CLOCK_MONOTONIC: clk_type = INPUT_CLK_MONO; break; case CLOCK_BOOTTIME: clk_type = INPUT_CLK_BOOT; break; default: return -EINVAL; } if (client->clk_type != clk_type) { client->clk_type = clk_type; /* * Flush pending events and queue SYN_DROPPED event, * but only if the queue is not empty. */ spin_lock_irqsave(&client->buffer_lock, flags); if (client->head != client->tail) { client->packet_head = client->head = client->tail; __evdev_queue_syn_dropped(client); } spin_unlock_irqrestore(&client->buffer_lock, flags); } return 0; } static void __pass_event(struct evdev_client *client, const struct input_event *event) { client->buffer[client->head++] = *event; client->head &= client->bufsize - 1; if (unlikely(client->head == client->tail)) { /* * This effectively "drops" all unconsumed events, leaving * EV_SYN/SYN_DROPPED plus the newest event in the queue. */ client->tail = (client->head - 2) & (client->bufsize - 1); client->buffer[client->tail] = (struct input_event) { .input_event_sec = event->input_event_sec, .input_event_usec = event->input_event_usec, .type = EV_SYN, .code = SYN_DROPPED, .value = 0, }; client->packet_head = client->tail; } if (event->type == EV_SYN && event->code == SYN_REPORT) { client->packet_head = client->head; kill_fasync(&client->fasync, SIGIO, POLL_IN); } } static void evdev_pass_values(struct evdev_client *client, const struct input_value *vals, unsigned int count, ktime_t *ev_time) { const struct input_value *v; struct input_event event; struct timespec64 ts; bool wakeup = false; if (client->revoked) return; ts = ktime_to_timespec64(ev_time[client->clk_type]); event.input_event_sec = ts.tv_sec; event.input_event_usec = ts.tv_nsec / NSEC_PER_USEC; /* Interrupts are disabled, just acquire the lock. */ spin_lock(&client->buffer_lock); for (v = vals; v != vals + count; v++) { if (__evdev_is_filtered(client, v->type, v->code)) continue; if (v->type == EV_SYN && v->code == SYN_REPORT) { /* drop empty SYN_REPORT */ if (client->packet_head == client->head) continue; wakeup = true; } event.type = v->type; event.code = v->code; event.value = v->value; __pass_event(client, &event); } spin_unlock(&client->buffer_lock); if (wakeup) wake_up_interruptible_poll(&client->wait, EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM); } /* * Pass incoming events to all connected clients. 
*/ static unsigned int evdev_events(struct input_handle *handle, struct input_value *vals, unsigned int count) { struct evdev *evdev = handle->private; struct evdev_client *client; ktime_t *ev_time = input_get_timestamp(handle->dev); rcu_read_lock(); client = rcu_dereference(evdev->grab); if (client) evdev_pass_values(client, vals, count, ev_time); else list_for_each_entry_rcu(client, &evdev->client_list, node) evdev_pass_values(client, vals, count, ev_time); rcu_read_unlock(); return count; } static int evdev_fasync(int fd, struct file *file, int on) { struct evdev_client *client = file->private_data; return fasync_helper(fd, file, on, &client->fasync); } static void evdev_free(struct device *dev) { struct evdev *evdev = container_of(dev, struct evdev, dev); input_put_device(evdev->handle.dev); kfree(evdev); } /* * Grabs an event device (along with underlying input device). * This function is called with evdev->mutex taken. */ static int evdev_grab(struct evdev *evdev, struct evdev_client *client) { int error; if (evdev->grab) return -EBUSY; error = input_grab_device(&evdev->handle); if (error) return error; rcu_assign_pointer(evdev->grab, client); return 0; } static int evdev_ungrab(struct evdev *evdev, struct evdev_client *client) { struct evdev_client *grab = rcu_dereference_protected(evdev->grab, lockdep_is_held(&evdev->mutex)); if (grab != client) return -EINVAL; rcu_assign_pointer(evdev->grab, NULL); synchronize_rcu(); input_release_device(&evdev->handle); return 0; } static void evdev_attach_client(struct evdev *evdev, struct evdev_client *client) { spin_lock(&evdev->client_lock); list_add_tail_rcu(&client->node, &evdev->client_list); spin_unlock(&evdev->client_lock); } static void evdev_detach_client(struct evdev *evdev, struct evdev_client *client) { spin_lock(&evdev->client_lock); list_del_rcu(&client->node); spin_unlock(&evdev->client_lock); synchronize_rcu(); } static int evdev_open_device(struct evdev *evdev) { int retval; retval = mutex_lock_interruptible(&evdev->mutex); if (retval) return retval; if (!evdev->exist) retval = -ENODEV; else if (!evdev->open++) { retval = input_open_device(&evdev->handle); if (retval) evdev->open--; } mutex_unlock(&evdev->mutex); return retval; } static void evdev_close_device(struct evdev *evdev) { mutex_lock(&evdev->mutex); if (evdev->exist && !--evdev->open) input_close_device(&evdev->handle); mutex_unlock(&evdev->mutex); } /* * Wake up users waiting for IO so they can disconnect from * dead device. 
*/ static void evdev_hangup(struct evdev *evdev) { struct evdev_client *client; spin_lock(&evdev->client_lock); list_for_each_entry(client, &evdev->client_list, node) { kill_fasync(&client->fasync, SIGIO, POLL_HUP); wake_up_interruptible_poll(&client->wait, EPOLLHUP | EPOLLERR); } spin_unlock(&evdev->client_lock); } static int evdev_release(struct inode *inode, struct file *file) { struct evdev_client *client = file->private_data; struct evdev *evdev = client->evdev; unsigned int i; mutex_lock(&evdev->mutex); if (evdev->exist && !client->revoked) input_flush_device(&evdev->handle, file); evdev_ungrab(evdev, client); mutex_unlock(&evdev->mutex); evdev_detach_client(evdev, client); for (i = 0; i < EV_CNT; ++i) bitmap_free(client->evmasks[i]); kvfree(client); evdev_close_device(evdev); return 0; } static unsigned int evdev_compute_buffer_size(struct input_dev *dev) { unsigned int n_events = max(dev->hint_events_per_packet * EVDEV_BUF_PACKETS, EVDEV_MIN_BUFFER_SIZE); return roundup_pow_of_two(n_events); } static int evdev_open(struct inode *inode, struct file *file) { struct evdev *evdev = container_of(inode->i_cdev, struct evdev, cdev); unsigned int bufsize = evdev_compute_buffer_size(evdev->handle.dev); struct evdev_client *client; int error; client = kvzalloc(struct_size(client, buffer, bufsize), GFP_KERNEL); if (!client) return -ENOMEM; init_waitqueue_head(&client->wait); client->bufsize = bufsize; spin_lock_init(&client->buffer_lock); client->evdev = evdev; evdev_attach_client(evdev, client); error = evdev_open_device(evdev); if (error) goto err_free_client; file->private_data = client; stream_open(inode, file); return 0; err_free_client: evdev_detach_client(evdev, client); kvfree(client); return error; } static ssize_t evdev_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { struct evdev_client *client = file->private_data; struct evdev *evdev = client->evdev; struct input_event event; int retval = 0; /* * Limit amount of data we inject into the input subsystem so that * we do not hold evdev->mutex for too long. 4096 bytes corresponds * to 170 input events. 
*/ count = min(count, 4096); if (count != 0 && count < input_event_size()) return -EINVAL; retval = mutex_lock_interruptible(&evdev->mutex); if (retval) return retval; if (!evdev->exist || client->revoked) { retval = -ENODEV; goto out; } while (retval + input_event_size() <= count) { if (input_event_from_user(buffer + retval, &event)) { retval = -EFAULT; goto out; } retval += input_event_size(); input_inject_event(&evdev->handle, event.type, event.code, event.value); cond_resched(); } out: mutex_unlock(&evdev->mutex); return retval; } static int evdev_fetch_next_event(struct evdev_client *client, struct input_event *event) { int have_event; spin_lock_irq(&client->buffer_lock); have_event = client->packet_head != client->tail; if (have_event) { *event = client->buffer[client->tail++]; client->tail &= client->bufsize - 1; } spin_unlock_irq(&client->buffer_lock); return have_event; } static ssize_t evdev_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) { struct evdev_client *client = file->private_data; struct evdev *evdev = client->evdev; struct input_event event; size_t read = 0; int error; if (count != 0 && count < input_event_size()) return -EINVAL; for (;;) { if (!evdev->exist || client->revoked) return -ENODEV; if (client->packet_head == client->tail && (file->f_flags & O_NONBLOCK)) return -EAGAIN; /* * count == 0 is special - no IO is done but we check * for error conditions (see above). */ if (count == 0) break; while (read + input_event_size() <= count && evdev_fetch_next_event(client, &event)) { if (input_event_to_user(buffer + read, &event)) return -EFAULT; read += input_event_size(); } if (read) break; if (!(file->f_flags & O_NONBLOCK)) { error = wait_event_interruptible(client->wait, client->packet_head != client->tail || !evdev->exist || client->revoked); if (error) return error; } } return read; } /* No kernel lock - fine */ static __poll_t evdev_poll(struct file *file, poll_table *wait) { struct evdev_client *client = file->private_data; struct evdev *evdev = client->evdev; __poll_t mask; poll_wait(file, &client->wait, wait); if (evdev->exist && !client->revoked) mask = EPOLLOUT | EPOLLWRNORM; else mask = EPOLLHUP | EPOLLERR; if (client->packet_head != client->tail) mask |= EPOLLIN | EPOLLRDNORM; return mask; } #ifdef CONFIG_COMPAT #define BITS_PER_LONG_COMPAT (sizeof(compat_long_t) * 8) #define BITS_TO_LONGS_COMPAT(x) ((((x) - 1) / BITS_PER_LONG_COMPAT) + 1) #ifdef __BIG_ENDIAN static int bits_to_user(unsigned long *bits, unsigned int maxbit, unsigned int maxlen, void __user *p, int compat) { int len, i; if (compat) { len = BITS_TO_LONGS_COMPAT(maxbit) * sizeof(compat_long_t); if (len > maxlen) len = maxlen; for (i = 0; i < len / sizeof(compat_long_t); i++) if (copy_to_user((compat_long_t __user *) p + i, (compat_long_t *) bits + i + 1 - ((i % 2) << 1), sizeof(compat_long_t))) return -EFAULT; } else { len = BITS_TO_LONGS(maxbit) * sizeof(long); if (len > maxlen) len = maxlen; if (copy_to_user(p, bits, len)) return -EFAULT; } return len; } static int bits_from_user(unsigned long *bits, unsigned int maxbit, unsigned int maxlen, const void __user *p, int compat) { int len, i; if (compat) { if (maxlen % sizeof(compat_long_t)) return -EINVAL; len = BITS_TO_LONGS_COMPAT(maxbit) * sizeof(compat_long_t); if (len > maxlen) len = maxlen; for (i = 0; i < len / sizeof(compat_long_t); i++) if (copy_from_user((compat_long_t *) bits + i + 1 - ((i % 2) << 1), (compat_long_t __user *) p + i, sizeof(compat_long_t))) return -EFAULT; if (i % 2) *((compat_long_t *) bits + 
i - 1) = 0; } else { if (maxlen % sizeof(long)) return -EINVAL; len = BITS_TO_LONGS(maxbit) * sizeof(long); if (len > maxlen) len = maxlen; if (copy_from_user(bits, p, len)) return -EFAULT; } return len; } #else static int bits_to_user(unsigned long *bits, unsigned int maxbit, unsigned int maxlen, void __user *p, int compat) { int len = compat ? BITS_TO_LONGS_COMPAT(maxbit) * sizeof(compat_long_t) : BITS_TO_LONGS(maxbit) * sizeof(long); if (len > maxlen) len = maxlen; return copy_to_user(p, bits, len) ? -EFAULT : len; } static int bits_from_user(unsigned long *bits, unsigned int maxbit, unsigned int maxlen, const void __user *p, int compat) { size_t chunk_size = compat ? sizeof(compat_long_t) : sizeof(long); int len; if (maxlen % chunk_size) return -EINVAL; len = compat ? BITS_TO_LONGS_COMPAT(maxbit) : BITS_TO_LONGS(maxbit); len *= chunk_size; if (len > maxlen) len = maxlen; return copy_from_user(bits, p, len) ? -EFAULT : len; } #endif /* __BIG_ENDIAN */ #else static int bits_to_user(unsigned long *bits, unsigned int maxbit, unsigned int maxlen, void __user *p, int compat) { int len = BITS_TO_LONGS(maxbit) * sizeof(long); if (len > maxlen) len = maxlen; return copy_to_user(p, bits, len) ? -EFAULT : len; } static int bits_from_user(unsigned long *bits, unsigned int maxbit, unsigned int maxlen, const void __user *p, int compat) { int len; if (maxlen % sizeof(long)) return -EINVAL; len = BITS_TO_LONGS(maxbit) * sizeof(long); if (len > maxlen) len = maxlen; return copy_from_user(bits, p, len) ? -EFAULT : len; } #endif /* CONFIG_COMPAT */ static int str_to_user(const char *str, unsigned int maxlen, void __user *p) { int len; if (!str) return -ENOENT; len = strlen(str) + 1; if (len > maxlen) len = maxlen; return copy_to_user(p, str, len) ? -EFAULT : len; } static int handle_eviocgbit(struct input_dev *dev, unsigned int type, unsigned int size, void __user *p, int compat_mode) { unsigned long *bits; int len; switch (type) { case 0: bits = dev->evbit; len = EV_MAX; break; case EV_KEY: bits = dev->keybit; len = KEY_MAX; break; case EV_REL: bits = dev->relbit; len = REL_MAX; break; case EV_ABS: bits = dev->absbit; len = ABS_MAX; break; case EV_MSC: bits = dev->mscbit; len = MSC_MAX; break; case EV_LED: bits = dev->ledbit; len = LED_MAX; break; case EV_SND: bits = dev->sndbit; len = SND_MAX; break; case EV_FF: bits = dev->ffbit; len = FF_MAX; break; case EV_SW: bits = dev->swbit; len = SW_MAX; break; default: return -EINVAL; } return bits_to_user(bits, len, size, p, compat_mode); } static int evdev_handle_get_keycode(struct input_dev *dev, void __user *p) { struct input_keymap_entry ke = { .len = sizeof(unsigned int), .flags = 0, }; int __user *ip = (int __user *)p; int error; /* legacy case */ if (copy_from_user(ke.scancode, p, sizeof(unsigned int))) return -EFAULT; error = input_get_keycode(dev, &ke); if (error) return error; if (put_user(ke.keycode, ip + 1)) return -EFAULT; return 0; } static int evdev_handle_get_keycode_v2(struct input_dev *dev, void __user *p) { struct input_keymap_entry ke; int error; if (copy_from_user(&ke, p, sizeof(ke))) return -EFAULT; error = input_get_keycode(dev, &ke); if (error) return error; if (copy_to_user(p, &ke, sizeof(ke))) return -EFAULT; return 0; } static int evdev_handle_set_keycode(struct input_dev *dev, void __user *p) { struct input_keymap_entry ke = { .len = sizeof(unsigned int), .flags = 0, }; int __user *ip = (int __user *)p; if (copy_from_user(ke.scancode, p, sizeof(unsigned int))) return -EFAULT; if (get_user(ke.keycode, ip + 1)) return -EFAULT; return 
input_set_keycode(dev, &ke); } static int evdev_handle_set_keycode_v2(struct input_dev *dev, void __user *p) { struct input_keymap_entry ke; if (copy_from_user(&ke, p, sizeof(ke))) return -EFAULT; if (ke.len > sizeof(ke.scancode)) return -EINVAL; return input_set_keycode(dev, &ke); } /* * If we transfer state to the user, we should flush all pending events * of the same type from the client's queue. Otherwise, they might end up * with duplicate events, which can screw up the client's state tracking. * If bits_to_user fails after flushing the queue, we queue a SYN_DROPPED * event so user-space will notice missing events. * * LOCKING: * We need to take event_lock before buffer_lock to avoid deadlocks. But we * need the event_lock only to guarantee consistent state. We can safely release * it while flushing the queue. This allows input-core to handle filters while * we flush the queue. */ static int evdev_handle_get_val(struct evdev_client *client, struct input_dev *dev, unsigned int type, unsigned long *bits, unsigned int maxbit, unsigned int maxlen, void __user *p, int compat) { int ret; unsigned long *mem; mem = bitmap_alloc(maxbit, GFP_KERNEL); if (!mem) return -ENOMEM; spin_lock_irq(&dev->event_lock); spin_lock(&client->buffer_lock); bitmap_copy(mem, bits, maxbit); spin_unlock(&dev->event_lock); __evdev_flush_queue(client, type); spin_unlock_irq(&client->buffer_lock); ret = bits_to_user(mem, maxbit, maxlen, p, compat); if (ret < 0) evdev_queue_syn_dropped(client); bitmap_free(mem); return ret; } static int evdev_handle_mt_request(struct input_dev *dev, unsigned int size, int __user *ip) { const struct input_mt *mt = dev->mt; unsigned int code; int max_slots; int i; if (get_user(code, &ip[0])) return -EFAULT; if (!mt || !input_is_mt_value(code)) return -EINVAL; max_slots = (size - sizeof(__u32)) / sizeof(__s32); for (i = 0; i < mt->num_slots && i < max_slots; i++) { int value = input_mt_get_value(&mt->slots[i], code); if (put_user(value, &ip[1 + i])) return -EFAULT; } return 0; } static int evdev_revoke(struct evdev *evdev, struct evdev_client *client, struct file *file) { client->revoked = true; evdev_ungrab(evdev, client); input_flush_device(&evdev->handle, file); wake_up_interruptible_poll(&client->wait, EPOLLHUP | EPOLLERR); return 0; } /* must be called with evdev-mutex held */ static int evdev_set_mask(struct evdev_client *client, unsigned int type, const void __user *codes, u32 codes_size, int compat) { unsigned long flags, *mask, *oldmask; size_t cnt; int error; /* we allow unknown types and 'codes_size > size' for forward-compat */ cnt = evdev_get_mask_cnt(type); if (!cnt) return 0; mask = bitmap_zalloc(cnt, GFP_KERNEL); if (!mask) return -ENOMEM; error = bits_from_user(mask, cnt - 1, codes_size, codes, compat); if (error < 0) { bitmap_free(mask); return error; } spin_lock_irqsave(&client->buffer_lock, flags); oldmask = client->evmasks[type]; client->evmasks[type] = mask; spin_unlock_irqrestore(&client->buffer_lock, flags); bitmap_free(oldmask); return 0; } /* must be called with evdev-mutex held */ static int evdev_get_mask(struct evdev_client *client, unsigned int type, void __user *codes, u32 codes_size, int compat) { unsigned long *mask; size_t cnt, size, xfer_size; int i; int error; /* we allow unknown types and 'codes_size > size' for forward-compat */ cnt = evdev_get_mask_cnt(type); size = sizeof(unsigned long) * BITS_TO_LONGS(cnt); xfer_size = min_t(size_t, codes_size, size); if (cnt > 0) { mask = client->evmasks[type]; if (mask) { error = bits_to_user(mask, cnt - 1,
xfer_size, codes, compat); if (error < 0) return error; } else { /* fake mask with all bits set */ for (i = 0; i < xfer_size; i++) if (put_user(0xffU, (u8 __user *)codes + i)) return -EFAULT; } } if (xfer_size < codes_size) if (clear_user(codes + xfer_size, codes_size - xfer_size)) return -EFAULT; return 0; } static long evdev_do_ioctl(struct file *file, unsigned int cmd, void __user *p, int compat_mode) { struct evdev_client *client = file->private_data; struct evdev *evdev = client->evdev; struct input_dev *dev = evdev->handle.dev; struct input_absinfo abs; struct input_mask mask; struct ff_effect effect; int __user *ip = (int __user *)p; unsigned int i, t, u, v; unsigned int size; int error; /* First we check for fixed-length commands */ switch (cmd) { case EVIOCGVERSION: return put_user(EV_VERSION, ip); case EVIOCGID: if (copy_to_user(p, &dev->id, sizeof(struct input_id))) return -EFAULT; return 0; case EVIOCGREP: if (!test_bit(EV_REP, dev->evbit)) return -ENOSYS; if (put_user(dev->rep[REP_DELAY], ip)) return -EFAULT; if (put_user(dev->rep[REP_PERIOD], ip + 1)) return -EFAULT; return 0; case EVIOCSREP: if (!test_bit(EV_REP, dev->evbit)) return -ENOSYS; if (get_user(u, ip)) return -EFAULT; if (get_user(v, ip + 1)) return -EFAULT; input_inject_event(&evdev->handle, EV_REP, REP_DELAY, u); input_inject_event(&evdev->handle, EV_REP, REP_PERIOD, v); return 0; case EVIOCRMFF: return input_ff_erase(dev, (int)(unsigned long) p, file); case EVIOCGEFFECTS: i = test_bit(EV_FF, dev->evbit) ? dev->ff->max_effects : 0; if (put_user(i, ip)) return -EFAULT; return 0; case EVIOCGRAB: if (p) return evdev_grab(evdev, client); else return evdev_ungrab(evdev, client); case EVIOCREVOKE: if (p) return -EINVAL; else return evdev_revoke(evdev, client, file); case EVIOCGMASK: { void __user *codes_ptr; if (copy_from_user(&mask, p, sizeof(mask))) return -EFAULT; codes_ptr = (void __user *)(unsigned long)mask.codes_ptr; return evdev_get_mask(client, mask.type, codes_ptr, mask.codes_size, compat_mode); } case EVIOCSMASK: { const void __user *codes_ptr; if (copy_from_user(&mask, p, sizeof(mask))) return -EFAULT; codes_ptr = (const void __user *)(unsigned long)mask.codes_ptr; return evdev_set_mask(client, mask.type, codes_ptr, mask.codes_size, compat_mode); } case EVIOCSCLOCKID: if (copy_from_user(&i, p, sizeof(unsigned int))) return -EFAULT; return evdev_set_clk_type(client, i); case EVIOCGKEYCODE: return evdev_handle_get_keycode(dev, p); case EVIOCSKEYCODE: return evdev_handle_set_keycode(dev, p); case EVIOCGKEYCODE_V2: return evdev_handle_get_keycode_v2(dev, p); case EVIOCSKEYCODE_V2: return evdev_handle_set_keycode_v2(dev, p); } size = _IOC_SIZE(cmd); /* Now check variable-length commands */ #define EVIOC_MASK_SIZE(nr) ((nr) & ~(_IOC_SIZEMASK << _IOC_SIZESHIFT)) switch (EVIOC_MASK_SIZE(cmd)) { case EVIOCGPROP(0): return bits_to_user(dev->propbit, INPUT_PROP_MAX, size, p, compat_mode); case EVIOCGMTSLOTS(0): return evdev_handle_mt_request(dev, size, ip); case EVIOCGKEY(0): return evdev_handle_get_val(client, dev, EV_KEY, dev->key, KEY_MAX, size, p, compat_mode); case EVIOCGLED(0): return evdev_handle_get_val(client, dev, EV_LED, dev->led, LED_MAX, size, p, compat_mode); case EVIOCGSND(0): return evdev_handle_get_val(client, dev, EV_SND, dev->snd, SND_MAX, size, p, compat_mode); case EVIOCGSW(0): return evdev_handle_get_val(client, dev, EV_SW, dev->sw, SW_MAX, size, p, compat_mode); case EVIOCGNAME(0): return str_to_user(dev->name, size, p); case EVIOCGPHYS(0): return str_to_user(dev->phys, size, p); case 
EVIOCGUNIQ(0): return str_to_user(dev->uniq, size, p); case EVIOC_MASK_SIZE(EVIOCSFF): if (input_ff_effect_from_user(p, size, &effect)) return -EFAULT; error = input_ff_upload(dev, &effect, file); if (error) return error; if (put_user(effect.id, &(((struct ff_effect __user *)p)->id))) return -EFAULT; return 0; } /* Multi-number variable-length handlers */ if (_IOC_TYPE(cmd) != 'E') return -EINVAL; if (_IOC_DIR(cmd) == _IOC_READ) { if ((_IOC_NR(cmd) & ~EV_MAX) == _IOC_NR(EVIOCGBIT(0, 0))) return handle_eviocgbit(dev, _IOC_NR(cmd) & EV_MAX, size, p, compat_mode); if ((_IOC_NR(cmd) & ~ABS_MAX) == _IOC_NR(EVIOCGABS(0))) { if (!dev->absinfo) return -EINVAL; t = _IOC_NR(cmd) & ABS_MAX; abs = dev->absinfo[t]; if (copy_to_user(p, &abs, min_t(size_t, size, sizeof(struct input_absinfo)))) return -EFAULT; return 0; } } if (_IOC_DIR(cmd) == _IOC_WRITE) { if ((_IOC_NR(cmd) & ~ABS_MAX) == _IOC_NR(EVIOCSABS(0))) { if (!dev->absinfo) return -EINVAL; t = _IOC_NR(cmd) & ABS_MAX; if (copy_from_user(&abs, p, min_t(size_t, size, sizeof(struct input_absinfo)))) return -EFAULT; if (size < sizeof(struct input_absinfo)) abs.resolution = 0; /* We can't change number of reserved MT slots */ if (t == ABS_MT_SLOT) return -EINVAL; /* * Take event lock to ensure that we are not * changing device parameters in the middle * of event. */ spin_lock_irq(&dev->event_lock); dev->absinfo[t] = abs; spin_unlock_irq(&dev->event_lock); return 0; } } return -EINVAL; } static long evdev_ioctl_handler(struct file *file, unsigned int cmd, void __user *p, int compat_mode) { struct evdev_client *client = file->private_data; struct evdev *evdev = client->evdev; int retval; retval = mutex_lock_interruptible(&evdev->mutex); if (retval) return retval; if (!evdev->exist || client->revoked) { retval = -ENODEV; goto out; } retval = evdev_do_ioctl(file, cmd, p, compat_mode); out: mutex_unlock(&evdev->mutex); return retval; } static long evdev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return evdev_ioctl_handler(file, cmd, (void __user *)arg, 0); } #ifdef CONFIG_COMPAT static long evdev_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg) { return evdev_ioctl_handler(file, cmd, compat_ptr(arg), 1); } #endif static const struct file_operations evdev_fops = { .owner = THIS_MODULE, .read = evdev_read, .write = evdev_write, .poll = evdev_poll, .open = evdev_open, .release = evdev_release, .unlocked_ioctl = evdev_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = evdev_ioctl_compat, #endif .fasync = evdev_fasync, }; /* * Mark device non-existent. This disables writes, ioctls and * prevents new users from opening the device. Already posted * blocking reads will stay, however new ones will fail. */ static void evdev_mark_dead(struct evdev *evdev) { mutex_lock(&evdev->mutex); evdev->exist = false; mutex_unlock(&evdev->mutex); } static void evdev_cleanup(struct evdev *evdev) { struct input_handle *handle = &evdev->handle; evdev_mark_dead(evdev); evdev_hangup(evdev); /* evdev is marked dead so no one else accesses evdev->open */ if (evdev->open) { input_flush_device(handle, NULL); input_close_device(handle); } } /* * Create new evdev device. Note that input core serializes calls * to connect and disconnect. 
*/ static int evdev_connect(struct input_handler *handler, struct input_dev *dev, const struct input_device_id *id) { struct evdev *evdev; int minor; int dev_no; int error; minor = input_get_new_minor(EVDEV_MINOR_BASE, EVDEV_MINORS, true); if (minor < 0) { error = minor; pr_err("failed to reserve new minor: %d\n", error); return error; } evdev = kzalloc(sizeof(struct evdev), GFP_KERNEL); if (!evdev) { error = -ENOMEM; goto err_free_minor; } INIT_LIST_HEAD(&evdev->client_list); spin_lock_init(&evdev->client_lock); mutex_init(&evdev->mutex); evdev->exist = true; dev_no = minor; /* Normalize device number if it falls into legacy range */ if (dev_no < EVDEV_MINOR_BASE + EVDEV_MINORS) dev_no -= EVDEV_MINOR_BASE; dev_set_name(&evdev->dev, "event%d", dev_no); evdev->handle.dev = input_get_device(dev); evdev->handle.name = dev_name(&evdev->dev); evdev->handle.handler = handler; evdev->handle.private = evdev; evdev->dev.devt = MKDEV(INPUT_MAJOR, minor); evdev->dev.class = &input_class; evdev->dev.parent = &dev->dev; evdev->dev.release = evdev_free; device_initialize(&evdev->dev); error = input_register_handle(&evdev->handle); if (error) goto err_free_evdev; cdev_init(&evdev->cdev, &evdev_fops); error = cdev_device_add(&evdev->cdev, &evdev->dev); if (error) goto err_cleanup_evdev; return 0; err_cleanup_evdev: evdev_cleanup(evdev); input_unregister_handle(&evdev->handle); err_free_evdev: put_device(&evdev->dev); err_free_minor: input_free_minor(minor); return error; } static void evdev_disconnect(struct input_handle *handle) { struct evdev *evdev = handle->private; cdev_device_del(&evdev->cdev, &evdev->dev); evdev_cleanup(evdev); input_free_minor(MINOR(evdev->dev.devt)); input_unregister_handle(handle); put_device(&evdev->dev); } static const struct input_device_id evdev_ids[] = { { /* Matches all devices */ .flags = INPUT_DEVICE_ID_MATCH_EVBIT, .evbit = { BIT_MASK(EV_SYN) }, }, { } /* Terminating zero entry */ }; MODULE_DEVICE_TABLE(input, evdev_ids); static struct input_handler evdev_handler = { .events = evdev_events, .connect = evdev_connect, .disconnect = evdev_disconnect, .legacy_minors = true, .minor = EVDEV_MINOR_BASE, .name = "evdev", .id_table = evdev_ids, }; static int __init evdev_init(void) { return input_register_handler(&evdev_handler); } static void __exit evdev_exit(void) { input_unregister_handler(&evdev_handler); } module_init(evdev_init); module_exit(evdev_exit); MODULE_AUTHOR("Vojtech Pavlik <vojtech@ucw.cz>"); MODULE_DESCRIPTION("Input driver event char devices"); MODULE_LICENSE("GPL"); |
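On the other side of this char device sits an ordinary read()/ioctl() consumer. Below is a minimal sketch of a user-space client that drains events and treats EV_SYN/SYN_REPORT as the packet delimiter the queue code above maintains; the device path is illustrative, and real tools would add O_NONBLOCK or poll() handling.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/input.h>

int main(void)
{
	struct input_event ev;
	int fd = open("/dev/input/event0", O_RDONLY); /* example node */

	if (fd < 0)
		return 1;

	/* Events arrive in packets terminated by EV_SYN/SYN_REPORT;
	 * evdev only wakes blocked readers once a packet is complete. */
	while (read(fd, &ev, sizeof(ev)) == sizeof(ev)) {
		if (ev.type == EV_SYN && ev.code == SYN_REPORT) {
			puts("--- packet complete ---");
			continue;
		}
		printf("type %u code %u value %d\n",
		       ev.type, ev.code, ev.value);
	}
	close(fd);
	return 0;
}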
// SPDX-License-Identifier: GPL-2.0 #include <linux/swap_cgroup.h> #include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/swapops.h> /* depends on mm.h include */ static DEFINE_MUTEX(swap_cgroup_mutex); /* Pack the cgroup ids (unsigned shorts) of two entries into one swap_cgroup (atomic_t) */ #define ID_PER_SC (sizeof(struct swap_cgroup) / sizeof(unsigned short)) #define ID_SHIFT (BITS_PER_TYPE(unsigned short)) #define ID_MASK (BIT(ID_SHIFT) - 1) struct swap_cgroup { atomic_t ids; }; struct swap_cgroup_ctrl { struct swap_cgroup *map; }; static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; static unsigned short __swap_cgroup_id_lookup(struct swap_cgroup *map, pgoff_t offset) { unsigned int shift = (offset % ID_PER_SC) * ID_SHIFT; unsigned int old_ids = atomic_read(&map[offset / ID_PER_SC].ids); BUILD_BUG_ON(!is_power_of_2(ID_PER_SC)); BUILD_BUG_ON(sizeof(struct swap_cgroup) != sizeof(atomic_t)); return (old_ids >> shift) & ID_MASK; } static unsigned short __swap_cgroup_id_xchg(struct swap_cgroup *map, pgoff_t offset, unsigned short new_id) { unsigned short old_id; struct swap_cgroup *sc = &map[offset / ID_PER_SC]; unsigned int shift = (offset % ID_PER_SC) * ID_SHIFT; unsigned int new_ids, old_ids = atomic_read(&sc->ids); do { old_id = (old_ids >> shift) & ID_MASK; new_ids = (old_ids & ~(ID_MASK << shift)); new_ids |= ((unsigned int)new_id) << shift; } while (!atomic_try_cmpxchg(&sc->ids, &old_ids, new_ids)); return old_id; } /** * swap_cgroup_record - record mem_cgroup for a set of swap entries. * The entries must belong to one single folio, that folio must be in * the middle of being charged for swap space (swap out), and the * entries must not have been charged yet. * * @folio: the folio that the swap entry belongs to * @id: mem_cgroup ID to be recorded * @ent: the first swap entry to be recorded */ void swap_cgroup_record(struct folio *folio, unsigned short id, swp_entry_t ent) { unsigned int nr_ents = folio_nr_pages(folio); struct swap_cgroup *map; pgoff_t offset, end; unsigned short old; offset = swp_offset(ent); end = offset + nr_ents; map = swap_cgroup_ctrl[swp_type(ent)].map; do { old = __swap_cgroup_id_xchg(map, offset, id); VM_BUG_ON(old); } while (++offset != end); } /** * swap_cgroup_clear - clear mem_cgroup for a set of swap entries. * The entries must be being uncharged from swap. They either belong * to one single folio in the swap cache (swap in for cgroup v1), or * no longer have any users (slot freeing). * * @ent: the first swap entry to be cleared * @nr_ents: number of swap entries to be cleared * * Returns the existing old value.
*/ unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents) { pgoff_t offset, end; struct swap_cgroup *map; unsigned short old, iter = 0; offset = swp_offset(ent); end = offset + nr_ents; map = swap_cgroup_ctrl[swp_type(ent)].map; do { old = __swap_cgroup_id_xchg(map, offset, 0); if (!iter) iter = old; VM_BUG_ON(iter != old); } while (++offset != end); return old; } /** * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry * @ent: swap entry to be looked up. * * Returns the mem_cgroup ID on success, or 0 on failure (0 is not a valid ID). */ unsigned short lookup_swap_cgroup_id(swp_entry_t ent) { struct swap_cgroup_ctrl *ctrl; if (mem_cgroup_disabled()) return 0; ctrl = &swap_cgroup_ctrl[swp_type(ent)]; return __swap_cgroup_id_lookup(ctrl->map, swp_offset(ent)); } int swap_cgroup_swapon(int type, unsigned long max_pages) { struct swap_cgroup *map; struct swap_cgroup_ctrl *ctrl; if (mem_cgroup_disabled()) return 0; BUILD_BUG_ON(sizeof(unsigned short) * ID_PER_SC != sizeof(struct swap_cgroup)); map = vzalloc(DIV_ROUND_UP(max_pages, ID_PER_SC) * sizeof(struct swap_cgroup)); if (!map) goto nomem; ctrl = &swap_cgroup_ctrl[type]; mutex_lock(&swap_cgroup_mutex); ctrl->map = map; mutex_unlock(&swap_cgroup_mutex); return 0; nomem: pr_info("couldn't allocate enough memory for swap_cgroup\n"); pr_info("swap_cgroup can be disabled by swapaccount=0 boot option\n"); return -ENOMEM; } void swap_cgroup_swapoff(int type) { struct swap_cgroup *map; struct swap_cgroup_ctrl *ctrl; if (mem_cgroup_disabled()) return; mutex_lock(&swap_cgroup_mutex); ctrl = &swap_cgroup_ctrl[type]; map = ctrl->map; ctrl->map = NULL; mutex_unlock(&swap_cgroup_mutex); vfree(map); } |
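The helpers above pack two 16-bit cgroup ids into each 32-bit atomic word and swap one of them with a compare-exchange loop. A minimal userspace sketch of the same scheme (C11 atomics standing in for the kernel's atomic_read()/atomic_try_cmpxchg(); names are illustrative):

#include <stdatomic.h>
#include <stdio.h>

#define ID_SHIFT 16				/* bits per packed id */
#define ID_MASK  ((1u << ID_SHIFT) - 1)

/* Exchange the id held in one of the two slots of a packed word. */
static unsigned short id_xchg(atomic_uint *ids, unsigned int slot,
			      unsigned short new_id)
{
	unsigned int shift = (slot % 2) * ID_SHIFT;
	unsigned int old = atomic_load(ids), new;

	do {
		new = (old & ~(ID_MASK << shift)) |
		      ((unsigned int)new_id << shift);
	} while (!atomic_compare_exchange_weak(ids, &old, new));

	return (old >> shift) & ID_MASK;	/* previous id in that slot */
}

int main(void)
{
	atomic_uint ids = 0;

	id_xchg(&ids, 0, 7);		/* record id 7 in slot 0 */
	id_xchg(&ids, 1, 9);		/* record id 9 in slot 1 */
	printf("slot 0 held id %u\n", id_xchg(&ids, 0, 0));	/* clear */
	return 0;
}

swap_cgroup_record() and swap_cgroup_clear() simply repeat this exchange across a folio's contiguous run of swap slots.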
// SPDX-License-Identifier: GPL-2.0 /* * linux/mm/madvise.c * * Copyright (C) 1999 Linus Torvalds * Copyright (C) 2002 Christoph Hellwig */ #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/mempolicy.h> #include <linux/page-isolation.h> #include <linux/page_idle.h> #include <linux/userfaultfd_k.h> #include <linux/hugetlb.h> #include <linux/falloc.h> #include <linux/fadvise.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/mm_inline.h> #include <linux/mmu_context.h> #include <linux/string.h> #include <linux/uio.h> #include <linux/ksm.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/pagewalk.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/shmem_fs.h> #include <linux/mmu_notifier.h> #include <asm/tlb.h> #include "internal.h" #include "swap.h" #define __MADV_SET_ANON_VMA_NAME (-1) /* * Maximum number of attempts we make to install guard pages before we give up * and return -ERESTARTNOINTR to have userspace try again. */ #define MAX_MADVISE_GUARD_RETRIES 3 struct madvise_walk_private { struct mmu_gather *tlb; bool pageout; }; enum madvise_lock_mode { MADVISE_NO_LOCK, MADVISE_MMAP_READ_LOCK, MADVISE_MMAP_WRITE_LOCK, MADVISE_VMA_READ_LOCK, }; struct madvise_behavior_range { unsigned long start; unsigned long end; }; struct madvise_behavior { struct mm_struct *mm; int behavior; struct mmu_gather *tlb; enum madvise_lock_mode lock_mode; struct anon_vma_name *anon_name; /* * The range over which the behaviour is currently being applied. If * traversing multiple VMAs, this is updated for each. */ struct madvise_behavior_range range; /* The VMA and VMA preceding it (if applicable) currently targeted. */ struct vm_area_struct *prev; struct vm_area_struct *vma; bool lock_dropped; }; #ifdef CONFIG_ANON_VMA_NAME static int madvise_walk_vmas(struct madvise_behavior *madv_behavior); struct anon_vma_name *anon_vma_name_alloc(const char *name) { struct anon_vma_name *anon_name; size_t count; /* Add 1 for NUL terminator at the end of the anon_name->name */ count = strlen(name) + 1; anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL); if (anon_name) { kref_init(&anon_name->kref); memcpy(anon_name->name, name, count); } return anon_name; } void anon_vma_name_free(struct kref *kref) { struct anon_vma_name *anon_name = container_of(kref, struct anon_vma_name, kref); kfree(anon_name); } struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) { if (!rwsem_is_locked(&vma->vm_mm->mmap_lock)) vma_assert_locked(vma); return vma->anon_name; } /* mmap_lock should be write-locked */ static int replace_anon_vma_name(struct vm_area_struct *vma, struct anon_vma_name *anon_name) { struct anon_vma_name *orig_name = anon_vma_name(vma); if (!anon_name) { vma->anon_name = NULL; anon_vma_name_put(orig_name); return 0; } if (anon_vma_name_eq(orig_name, anon_name)) return 0; vma->anon_name = anon_vma_name_reuse(anon_name); anon_vma_name_put(orig_name); return 0; } #else /* CONFIG_ANON_VMA_NAME */ static int replace_anon_vma_name(struct vm_area_struct *vma, struct anon_vma_name *anon_name) { if (anon_name) return -EINVAL; return 0; } #endif /* CONFIG_ANON_VMA_NAME */ /* * Update the vm_flags or anon_name on region of a vma, splitting it or merging * it as necessary.
Must be called with mmap_lock held for writing. */ static int madvise_update_vma(vm_flags_t new_flags, struct madvise_behavior *madv_behavior) { struct vm_area_struct *vma = madv_behavior->vma; struct madvise_behavior_range *range = &madv_behavior->range; struct anon_vma_name *anon_name = madv_behavior->anon_name; bool set_new_anon_name = madv_behavior->behavior == __MADV_SET_ANON_VMA_NAME; VMA_ITERATOR(vmi, madv_behavior->mm, range->start); if (new_flags == vma->vm_flags && (!set_new_anon_name || anon_vma_name_eq(anon_vma_name(vma), anon_name))) return 0; if (set_new_anon_name) vma = vma_modify_name(&vmi, madv_behavior->prev, vma, range->start, range->end, anon_name); else vma = vma_modify_flags(&vmi, madv_behavior->prev, vma, range->start, range->end, new_flags); if (IS_ERR(vma)) return PTR_ERR(vma); madv_behavior->vma = vma; /* vm_flags is protected by the mmap_lock held in write mode. */ vma_start_write(vma); vm_flags_reset(vma, new_flags); if (set_new_anon_name) return replace_anon_vma_name(vma, anon_name); return 0; } #ifdef CONFIG_SWAP static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->private; struct swap_iocb *splug = NULL; pte_t *ptep = NULL; spinlock_t *ptl; unsigned long addr; for (addr = start; addr < end; addr += PAGE_SIZE) { pte_t pte; swp_entry_t entry; struct folio *folio; if (!ptep++) { ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (!ptep) break; } pte = ptep_get(ptep); if (!is_swap_pte(pte)) continue; entry = pte_to_swp_entry(pte); if (unlikely(non_swap_entry(entry))) continue; pte_unmap_unlock(ptep, ptl); ptep = NULL; folio = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, vma, addr, &splug); if (folio) folio_put(folio); } if (ptep) pte_unmap_unlock(ptep, ptl); swap_read_unplug(splug); cond_resched(); return 0; } static const struct mm_walk_ops swapin_walk_ops = { .pmd_entry = swapin_walk_pmd_entry, .walk_lock = PGWALK_RDLOCK, }; static void shmem_swapin_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct address_space *mapping) { XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start)); pgoff_t end_index = linear_page_index(vma, end) - 1; struct folio *folio; struct swap_iocb *splug = NULL; rcu_read_lock(); xas_for_each(&xas, folio, end_index) { unsigned long addr; swp_entry_t entry; if (!xa_is_value(folio)) continue; entry = radix_to_swp_entry(folio); /* There might be swapin error entries in shmem mapping. */ if (non_swap_entry(entry)) continue; addr = vma->vm_start + ((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT); xas_pause(&xas); rcu_read_unlock(); folio = read_swap_cache_async(entry, mapping_gfp_mask(mapping), vma, addr, &splug); if (folio) folio_put(folio); rcu_read_lock(); } rcu_read_unlock(); swap_read_unplug(splug); } #endif /* CONFIG_SWAP */ static void mark_mmap_lock_dropped(struct madvise_behavior *madv_behavior) { VM_WARN_ON_ONCE(madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK); madv_behavior->lock_dropped = true; } /* * Schedule all required I/O operations. Do not wait for completion. 
*/ static long madvise_willneed(struct madvise_behavior *madv_behavior) { struct vm_area_struct *vma = madv_behavior->vma; struct mm_struct *mm = madv_behavior->mm; struct file *file = vma->vm_file; unsigned long start = madv_behavior->range.start; unsigned long end = madv_behavior->range.end; loff_t offset; #ifdef CONFIG_SWAP if (!file) { walk_page_range_vma(vma, start, end, &swapin_walk_ops, vma); lru_add_drain(); /* Push any new pages onto the LRU now */ return 0; } if (shmem_mapping(file->f_mapping)) { shmem_swapin_range(vma, start, end, file->f_mapping); lru_add_drain(); /* Push any new pages onto the LRU now */ return 0; } #else if (!file) return -EBADF; #endif if (IS_DAX(file_inode(file))) { /* no bad return value, but ignore advice */ return 0; } /* * Filesystem's fadvise may need to take various locks. We need to * explicitly grab a reference because the vma (and hence the * vma's reference to the file) can go away as soon as we drop * mmap_lock. */ mark_mmap_lock_dropped(madv_behavior); get_file(file); offset = (loff_t)(start - vma->vm_start) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); mmap_read_unlock(mm); vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED); fput(file); mmap_read_lock(mm); return 0; } static inline bool can_do_file_pageout(struct vm_area_struct *vma) { if (!vma->vm_file) return false; /* * paging out pagecache only for non-anonymous mappings that correspond * to the files the calling process could (if tried) open for writing; * otherwise we'd be including shared non-exclusive mappings, which * opens a side channel. */ return inode_owner_or_capable(&nop_mnt_idmap, file_inode(vma->vm_file)) || file_permission(vma->vm_file, MAY_WRITE) == 0; } static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end, struct folio *folio, pte_t *ptep, pte_t *ptentp) { int max_nr = (end - addr) / PAGE_SIZE; return folio_pte_batch_flags(folio, NULL, ptep, ptentp, max_nr, FPB_MERGE_YOUNG_DIRTY); } static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct madvise_walk_private *private = walk->private; struct mmu_gather *tlb = private->tlb; bool pageout = private->pageout; struct mm_struct *mm = tlb->mm; struct vm_area_struct *vma = walk->vma; pte_t *start_pte, *pte, ptent; spinlock_t *ptl; struct folio *folio = NULL; LIST_HEAD(folio_list); bool pageout_anon_only_filter; unsigned int batch_count = 0; int nr; if (fatal_signal_pending(current)) return -EINTR; pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) && !can_do_file_pageout(vma); #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (pmd_trans_huge(*pmd)) { pmd_t orig_pmd; unsigned long next = pmd_addr_end(addr, end); tlb_change_page_size(tlb, HPAGE_PMD_SIZE); ptl = pmd_trans_huge_lock(pmd, vma); if (!ptl) return 0; orig_pmd = *pmd; if (is_huge_zero_pmd(orig_pmd)) goto huge_unlock; if (unlikely(!pmd_present(orig_pmd))) { VM_BUG_ON(thp_migration_supported() && !is_pmd_migration_entry(orig_pmd)); goto huge_unlock; } folio = pmd_folio(orig_pmd); /* Do not interfere with other mappings of this folio */ if (folio_maybe_mapped_shared(folio)) goto huge_unlock; if (pageout_anon_only_filter && !folio_test_anon(folio)) goto huge_unlock; if (next - addr != HPAGE_PMD_SIZE) { int err; folio_get(folio); spin_unlock(ptl); folio_lock(folio); err = split_folio(folio); folio_unlock(folio); folio_put(folio); if (!err) goto regular_folio; return 0; } if (!pageout && pmd_young(orig_pmd)) { pmdp_invalidate(vma, addr, pmd); orig_pmd = pmd_mkold(orig_pmd); 
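/* pmdp_invalidate() above prevents a racing hardware walk from setting the accessed/dirty bits in the stale entry; install the aged PMD and queue its TLB flush. */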
set_pmd_at(mm, addr, pmd, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); } folio_clear_referenced(folio); folio_test_clear_young(folio); if (folio_test_active(folio)) folio_set_workingset(folio); if (pageout) { if (folio_isolate_lru(folio)) { if (folio_test_unevictable(folio)) folio_putback_lru(folio); else list_add(&folio->lru, &folio_list); } } else folio_deactivate(folio); huge_unlock: spin_unlock(ptl); if (pageout) reclaim_pages(&folio_list); return 0; } regular_folio: #endif tlb_change_page_size(tlb, PAGE_SIZE); restart: start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (!start_pte) return 0; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) { nr = 1; ptent = ptep_get(pte); if (++batch_count == SWAP_CLUSTER_MAX) { batch_count = 0; if (need_resched()) { arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); cond_resched(); goto restart; } } if (pte_none(ptent)) continue; if (!pte_present(ptent)) continue; folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; /* * If we encounter a large folio, only split it if it is not * fully mapped within the range we are operating on. Otherwise * leave it as is so that it can be swapped out whole. If we * fail to split a folio, leave it in place and advance to the * next pte in the range. */ if (folio_test_large(folio)) { nr = madvise_folio_pte_batch(addr, end, folio, pte, &ptent); if (nr < folio_nr_pages(folio)) { int err; if (folio_maybe_mapped_shared(folio)) continue; if (pageout_anon_only_filter && !folio_test_anon(folio)) continue; if (!folio_trylock(folio)) continue; folio_get(folio); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); start_pte = NULL; err = split_folio(folio); folio_unlock(folio); folio_put(folio); start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!start_pte) break; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); if (!err) nr = 0; continue; } } /* * Do not interfere with other mappings of this folio, and skip * non-LRU folios. If we have a large folio at this point, we * know it is fully mapped, so if its mapcount is the same as its * number of pages, it must be exclusive. */ if (!folio_test_lru(folio) || folio_mapcount(folio) != folio_nr_pages(folio)) continue; if (pageout_anon_only_filter && !folio_test_anon(folio)) continue; if (!pageout && pte_young(ptent)) { clear_young_dirty_ptes(vma, addr, pte, nr, CYDP_CLEAR_YOUNG); tlb_remove_tlb_entries(tlb, pte, nr, addr); } /* * We are deactivating a folio to accelerate its reclaim. * The VM will not reclaim the folio unless we clear PG_young. * As a side effect, this confuses idle-page tracking, * which will miss the recent reference history.
*/ folio_clear_referenced(folio); folio_test_clear_young(folio); if (folio_test_active(folio)) folio_set_workingset(folio); if (pageout) { if (folio_isolate_lru(folio)) { if (folio_test_unevictable(folio)) folio_putback_lru(folio); else list_add(&folio->lru, &folio_list); } } else folio_deactivate(folio); } if (start_pte) { arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); } if (pageout) reclaim_pages(&folio_list); cond_resched(); return 0; } static const struct mm_walk_ops cold_walk_ops = { .pmd_entry = madvise_cold_or_pageout_pte_range, .walk_lock = PGWALK_RDLOCK, }; static void madvise_cold_page_range(struct mmu_gather *tlb, struct madvise_behavior *madv_behavior) { struct vm_area_struct *vma = madv_behavior->vma; struct madvise_behavior_range *range = &madv_behavior->range; struct madvise_walk_private walk_private = { .pageout = false, .tlb = tlb, }; tlb_start_vma(tlb, vma); walk_page_range_vma(vma, range->start, range->end, &cold_walk_ops, &walk_private); tlb_end_vma(tlb, vma); } static inline bool can_madv_lru_vma(struct vm_area_struct *vma) { return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB)); } static long madvise_cold(struct madvise_behavior *madv_behavior) { struct vm_area_struct *vma = madv_behavior->vma; struct mmu_gather tlb; if (!can_madv_lru_vma(vma)) return -EINVAL; lru_add_drain(); tlb_gather_mmu(&tlb, madv_behavior->mm); madvise_cold_page_range(&tlb, madv_behavior); tlb_finish_mmu(&tlb); return 0; } static void madvise_pageout_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, struct madvise_behavior_range *range) { struct madvise_walk_private walk_private = { .pageout = true, .tlb = tlb, }; tlb_start_vma(tlb, vma); walk_page_range_vma(vma, range->start, range->end, &cold_walk_ops, &walk_private); tlb_end_vma(tlb, vma); } static long madvise_pageout(struct madvise_behavior *madv_behavior) { struct mmu_gather tlb; struct vm_area_struct *vma = madv_behavior->vma; if (!can_madv_lru_vma(vma)) return -EINVAL; /* * If the VMA belongs to a private file mapping, there can be private * dirty pages which can be paged out if even this process is neither * owner nor write capable of the file. We allow private file mappings * further to pageout dirty anon pages. */ if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) && (vma->vm_flags & VM_MAYSHARE))) return 0; lru_add_drain(); tlb_gather_mmu(&tlb, madv_behavior->mm); madvise_pageout_page_range(&tlb, vma, &madv_behavior->range); tlb_finish_mmu(&tlb); return 0; } static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY; struct mmu_gather *tlb = walk->private; struct mm_struct *mm = tlb->mm; struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; pte_t *start_pte, *pte, ptent; struct folio *folio; int nr_swap = 0; unsigned long next; int nr, max_nr; next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd)) if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next)) return 0; tlb_change_page_size(tlb, PAGE_SIZE); start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!start_pte) return 0; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) { nr = 1; ptent = ptep_get(pte); if (pte_none(ptent)) continue; /* * If the pte has swp_entry, just clear page table to * prevent swap-in which is more expensive rather than * (page allocation + zeroing). 
*/ if (!pte_present(ptent)) { swp_entry_t entry; entry = pte_to_swp_entry(ptent); if (!non_swap_entry(entry)) { max_nr = (end - addr) / PAGE_SIZE; nr = swap_pte_batch(pte, max_nr, ptent); nr_swap -= nr; free_swap_and_cache_nr(entry, nr); clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) { pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } continue; } folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; /* * If we encounter a large folio, only split it if it is not * fully mapped within the range we are operating on. Otherwise * leave it as is so that it can be marked as lazyfree. If we * fail to split a folio, leave it in place and advance to the * next pte in the range. */ if (folio_test_large(folio)) { nr = madvise_folio_pte_batch(addr, end, folio, pte, &ptent); if (nr < folio_nr_pages(folio)) { int err; if (folio_maybe_mapped_shared(folio)) continue; if (!folio_trylock(folio)) continue; folio_get(folio); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); start_pte = NULL; err = split_folio(folio); folio_unlock(folio); folio_put(folio); pte = pte_offset_map_lock(mm, pmd, addr, &ptl); start_pte = pte; if (!start_pte) break; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); if (!err) nr = 0; continue; } } if (folio_test_swapcache(folio) || folio_test_dirty(folio)) { if (!folio_trylock(folio)) continue; /* * If we have a large folio at this point, we know it is * fully mapped so if its mapcount is the same as its * number of pages, it must be exclusive. */ if (folio_mapcount(folio) != folio_nr_pages(folio)) { folio_unlock(folio); continue; } if (folio_test_swapcache(folio) && !folio_free_swap(folio)) { folio_unlock(folio); continue; } folio_clear_dirty(folio); folio_unlock(folio); } if (pte_young(ptent) || pte_dirty(ptent)) { clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags); tlb_remove_tlb_entries(tlb, pte, nr, addr); } folio_mark_lazyfree(folio); } if (nr_swap) add_mm_counter(mm, MM_SWAPENTS, nr_swap); if (start_pte) { arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); } cond_resched(); return 0; } static inline enum page_walk_lock get_walk_lock(enum madvise_lock_mode mode) { switch (mode) { case MADVISE_VMA_READ_LOCK: return PGWALK_VMA_RDLOCK_VERIFY; case MADVISE_MMAP_READ_LOCK: return PGWALK_RDLOCK; default: /* Other modes don't require fixing up the walk_lock */ WARN_ON_ONCE(1); return PGWALK_RDLOCK; } } static int madvise_free_single_vma(struct madvise_behavior *madv_behavior) { struct mm_struct *mm = madv_behavior->mm; struct vm_area_struct *vma = madv_behavior->vma; unsigned long start_addr = madv_behavior->range.start; unsigned long end_addr = madv_behavior->range.end; struct mmu_notifier_range range; struct mmu_gather *tlb = madv_behavior->tlb; struct mm_walk_ops walk_ops = { .pmd_entry = madvise_free_pte_range, }; /* MADV_FREE works for only anon vma at the moment */ if (!vma_is_anonymous(vma)) return -EINVAL; range.start = max(vma->vm_start, start_addr); if (range.start >= vma->vm_end) return -EINVAL; range.end = min(vma->vm_end, end_addr); if (range.end <= vma->vm_start) return -EINVAL; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, range.start, range.end); lru_add_drain(); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(&range); tlb_start_vma(tlb, vma); walk_ops.walk_lock = get_walk_lock(madv_behavior->lock_mode); walk_page_range_vma(vma, range.start, range.end, &walk_ops, tlb); 
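/* The walk is complete: close the per-VMA TLB batch and the MMU notifier range opened before it. */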
tlb_end_vma(tlb, vma); mmu_notifier_invalidate_range_end(&range); return 0; } /* * Application no longer needs these pages. If the pages are dirty, * it's OK to just throw them away. The app will be more careful about * data it wants to keep. Be sure to free swap resources too. The * zap_page_range_single call sets things up for shrink_active_list to actually * free these pages later if no one else has touched them in the meantime, * although we could add these pages to a global reuse list for * shrink_active_list to pick up before reclaiming other pages. * * NB: This interface discards data rather than pushes it out to swap, * as some implementations do. This has performance implications for * applications like large transactional databases which want to discard * pages in anonymous maps after committing to backing store the data * that was kept in them. There is no reason to write this data out to * the swap area if the application is discarding it. * * An interface that causes the system to free clean pages and flush * dirty pages is already available as msync(MS_INVALIDATE). */ static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior) { struct madvise_behavior_range *range = &madv_behavior->range; struct zap_details details = { .reclaim_pt = true, .even_cows = true, }; zap_page_range_single_batched( madv_behavior->tlb, madv_behavior->vma, range->start, range->end - range->start, &details); return 0; } static bool madvise_dontneed_free_valid_vma(struct madvise_behavior *madv_behavior) { struct vm_area_struct *vma = madv_behavior->vma; int behavior = madv_behavior->behavior; struct madvise_behavior_range *range = &madv_behavior->range; if (!is_vm_hugetlb_page(vma)) { unsigned int forbidden = VM_PFNMAP; if (behavior != MADV_DONTNEED_LOCKED) forbidden |= VM_LOCKED; return !(vma->vm_flags & forbidden); } if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED) return false; if (range->start & ~huge_page_mask(hstate_vma(vma))) return false; /* * Madvise callers expect the length to be rounded up to PAGE_SIZE * boundaries, and may be unaware that this VMA uses huge pages. * Avoid unexpected data loss by rounding down the number of * huge pages freed. */ range->end = ALIGN_DOWN(range->end, huge_page_size(hstate_vma(vma))); return true; } static long madvise_dontneed_free(struct madvise_behavior *madv_behavior) { struct mm_struct *mm = madv_behavior->mm; struct madvise_behavior_range *range = &madv_behavior->range; int behavior = madv_behavior->behavior; if (!madvise_dontneed_free_valid_vma(madv_behavior)) return -EINVAL; if (range->start == range->end) return 0; if (!userfaultfd_remove(madv_behavior->vma, range->start, range->end)) { struct vm_area_struct *vma; mark_mmap_lock_dropped(madv_behavior); mmap_read_lock(mm); madv_behavior->vma = vma = vma_lookup(mm, range->start); if (!vma) return -ENOMEM; /* * Potential end adjustment for hugetlb vma is OK as * the check below keeps end within vma. */ if (!madvise_dontneed_free_valid_vma(madv_behavior)) return -EINVAL; if (range->end > vma->vm_end) { /* * Don't fail if end > vma->vm_end. If the old * vma was split while the mmap_lock was * released the effect of the concurrent * operation may not cause madvise() to * have an undefined result. There may be an * adjacent next vma that we'll walk * next. userfaultfd_remove() will generate an * UFFD_EVENT_REMOVE repetition on the * end-vma->vm_end range, but the manager can * handle a repetition fine. 
*/ range->end = vma->vm_end; } /* * If the memory region between start and end was * originally backed by 4kB pages and then remapped to * be backed by hugepages while mmap_lock was dropped, * the adjustment for hugetlb vma above may have rounded * end down to the start address. */ if (range->start == range->end) return 0; VM_WARN_ON(range->start > range->end); } if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED) return madvise_dontneed_single_vma(madv_behavior); else if (behavior == MADV_FREE) return madvise_free_single_vma(madv_behavior); else return -EINVAL; } static long madvise_populate(struct madvise_behavior *madv_behavior) { struct mm_struct *mm = madv_behavior->mm; const bool write = madv_behavior->behavior == MADV_POPULATE_WRITE; int locked = 1; unsigned long start = madv_behavior->range.start; unsigned long end = madv_behavior->range.end; long pages; while (start < end) { /* Populate (prefault) page tables readable/writable. */ pages = faultin_page_range(mm, start, end, write, &locked); if (!locked) { mmap_read_lock(mm); locked = 1; } if (pages < 0) { switch (pages) { case -EINTR: return -EINTR; case -EINVAL: /* Incompatible mappings / permissions. */ return -EINVAL; case -EHWPOISON: return -EHWPOISON; case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */ return -EFAULT; default: pr_warn_once("%s: unhandled return value: %ld\n", __func__, pages); fallthrough; case -ENOMEM: /* No VMA or out of memory. */ return -ENOMEM; } } start += pages * PAGE_SIZE; } return 0; } /* * Application wants to free up the pages and associated backing store. * This is effectively punching a hole into the middle of a file. */ static long madvise_remove(struct madvise_behavior *madv_behavior) { loff_t offset; int error; struct file *f; struct mm_struct *mm = madv_behavior->mm; struct vm_area_struct *vma = madv_behavior->vma; unsigned long start = madv_behavior->range.start; unsigned long end = madv_behavior->range.end; mark_mmap_lock_dropped(madv_behavior); if (vma->vm_flags & VM_LOCKED) return -EINVAL; f = vma->vm_file; if (!f || !f->f_mapping || !f->f_mapping->host) { return -EINVAL; } if (!vma_is_shared_maywrite(vma)) return -EACCES; offset = (loff_t)(start - vma->vm_start) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); /* * Filesystem's fallocate may need to take i_rwsem. We need to * explicitly grab a reference because the vma (and hence the * vma's reference to the file) can go away as soon as we drop * mmap_lock. */ get_file(f); if (userfaultfd_remove(vma, start, end)) { /* mmap_lock was not released by userfaultfd_remove() */ mmap_read_unlock(mm); } error = vfs_fallocate(f, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, end - start); fput(f); mmap_read_lock(mm); return error; } static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked) { vm_flags_t disallowed = VM_SPECIAL | VM_HUGETLB; /* * A user could lock after setting a guard range but that's fine, as * they'd not be able to fault in. The issue arises when we try to zap * existing locked VMAs. We don't want to do that. */ if (!allow_locked) disallowed |= VM_LOCKED; return !(vma->vm_flags & disallowed); } static bool is_guard_pte_marker(pte_t ptent) { return is_pte_marker(ptent) && is_guard_swp_entry(pte_to_swp_entry(ptent)); } static int guard_install_pud_entry(pud_t *pud, unsigned long addr, unsigned long next, struct mm_walk *walk) { pud_t pudval = pudp_get(pud); /* If huge return >0 so we abort the operation + zap. 
*/ return pud_trans_huge(pudval); } static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { pmd_t pmdval = pmdp_get(pmd); /* If huge return >0 so we abort the operation + zap. */ return pmd_trans_huge(pmdval); } static int guard_install_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { pte_t pteval = ptep_get(pte); unsigned long *nr_pages = (unsigned long *)walk->private; /* If there is already a guard page marker, we have nothing to do. */ if (is_guard_pte_marker(pteval)) { (*nr_pages)++; return 0; } /* If populated return >0 so we abort the operation + zap. */ return 1; } static int guard_install_set_pte(unsigned long addr, unsigned long next, pte_t *ptep, struct mm_walk *walk) { unsigned long *nr_pages = (unsigned long *)walk->private; /* Simply install a PTE marker, this causes segfault on access. */ *ptep = make_pte_marker(PTE_MARKER_GUARD); (*nr_pages)++; return 0; } static const struct mm_walk_ops guard_install_walk_ops = { .pud_entry = guard_install_pud_entry, .pmd_entry = guard_install_pmd_entry, .pte_entry = guard_install_pte_entry, .install_pte = guard_install_set_pte, .walk_lock = PGWALK_RDLOCK, }; static long madvise_guard_install(struct madvise_behavior *madv_behavior) { struct vm_area_struct *vma = madv_behavior->vma; struct madvise_behavior_range *range = &madv_behavior->range; long err; int i; if (!is_valid_guard_vma(vma, /* allow_locked = */false)) return -EINVAL; /* * If we install guard markers, then the range is no longer * empty from a page table perspective and therefore it's * appropriate to have an anon_vma. * * This ensures that on fork, we copy page tables correctly. */ err = anon_vma_prepare(vma); if (err) return err; /* * Optimistically try to install the guard marker pages first. If any * non-guard pages are encountered, give up and zap the range before * trying again. * * We try a few times before giving up and releasing back to userland to * loop around, releasing locks in the process to avoid contention. This * would only happen if there was a great many racing page faults. * * In most cases we should simply install the guard markers immediately * with no zap or looping. */ for (i = 0; i < MAX_MADVISE_GUARD_RETRIES; i++) { unsigned long nr_pages = 0; /* Returns < 0 on error, == 0 if success, > 0 if zap needed. */ err = walk_page_range_mm(vma->vm_mm, range->start, range->end, &guard_install_walk_ops, &nr_pages); if (err < 0) return err; if (err == 0) { unsigned long nr_expected_pages = PHYS_PFN(range->end - range->start); VM_WARN_ON(nr_pages != nr_expected_pages); return 0; } /* * OK some of the range have non-guard pages mapped, zap * them. This leaves existing guard pages in place. */ zap_page_range_single(vma, range->start, range->end - range->start, NULL); } /* * We were unable to install the guard pages due to being raced by page * faults. This should not happen ordinarily. We return to userspace and * immediately retry, relieving lock contention. */ return restart_syscall(); } static int guard_remove_pud_entry(pud_t *pud, unsigned long addr, unsigned long next, struct mm_walk *walk) { pud_t pudval = pudp_get(pud); /* If huge, cannot have guard pages present, so no-op - skip. */ if (pud_trans_huge(pudval)) walk->action = ACTION_CONTINUE; return 0; } static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { pmd_t pmdval = pmdp_get(pmd); /* If huge, cannot have guard pages present, so no-op - skip. 
*/ if (pmd_trans_huge(pmdval)) walk->action = ACTION_CONTINUE; return 0; } static int guard_remove_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { pte_t ptent = ptep_get(pte); if (is_guard_pte_marker(ptent)) { /* Simply clear the PTE marker. */ pte_clear_not_present_full(walk->mm, addr, pte, false); update_mmu_cache(walk->vma, addr, pte); } return 0; } static const struct mm_walk_ops guard_remove_walk_ops = { .pud_entry = guard_remove_pud_entry, .pmd_entry = guard_remove_pmd_entry, .pte_entry = guard_remove_pte_entry, .walk_lock = PGWALK_RDLOCK, }; static long madvise_guard_remove(struct madvise_behavior *madv_behavior) { struct vm_area_struct *vma = madv_behavior->vma; struct madvise_behavior_range *range = &madv_behavior->range; /* * We're ok with removing guards in mlock()'d ranges, as this is a * non-destructive action. */ if (!is_valid_guard_vma(vma, /* allow_locked = */true)) return -EINVAL; return walk_page_range_vma(vma, range->start, range->end, &guard_remove_walk_ops, NULL); } #ifdef CONFIG_64BIT /* Does the madvise operation result in discarding of mapped data? */ static bool is_discard(int behavior) { switch (behavior) { case MADV_FREE: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: case MADV_REMOVE: case MADV_DONTFORK: case MADV_WIPEONFORK: case MADV_GUARD_INSTALL: return true; } return false; } /* * We are restricted from madvise()'ing mseal()'d VMAs only in very particular * circumstances - discarding of data from read-only anonymous SEALED mappings. * * This is because users cannot trivially discard data from these VMAs, and may * only do so via an appropriate madvise() call. */ static bool can_madvise_modify(struct madvise_behavior *madv_behavior) { struct vm_area_struct *vma = madv_behavior->vma; /* If the VMA isn't sealed we're good. */ if (!vma_is_sealed(vma)) return true; /* For a sealed VMA, we only care about discard operations. */ if (!is_discard(madv_behavior->behavior)) return true; /* * We explicitly permit all file-backed mappings, whether MAP_SHARED or * MAP_PRIVATE. * * The latter causes some complications: one can mmap() a MAP_PRIVATE * mapping read/write, write to it, mprotect() it read-only, then * mseal() it, and a discard will still be permitted. * * However, in order to avoid issues with potential use of madvise(..., * MADV_DONTNEED) of mseal()'d .text mappings we, for the time being, * permit this. */ if (!vma_is_anonymous(vma)) return true; /* If the user could write to the mapping anyway, then this is fine. */ if ((vma->vm_flags & VM_WRITE) && arch_vma_access_permitted(vma, /* write= */ true, /* execute= */ false, /* foreign= */ false)) return true; /* Otherwise, we are not permitted to perform this operation. */ return false; } #else static bool can_madvise_modify(struct madvise_behavior *madv_behavior) { return true; } #endif /* * Apply an madvise behavior to a region of a vma. madvise_update_vma * will handle splitting a vm area into separate areas, each area with its own * behavior.
*/ static int madvise_vma_behavior(struct madvise_behavior *madv_behavior) { int behavior = madv_behavior->behavior; struct vm_area_struct *vma = madv_behavior->vma; vm_flags_t new_flags = vma->vm_flags; struct madvise_behavior_range *range = &madv_behavior->range; int error; if (unlikely(!can_madvise_modify(madv_behavior))) return -EPERM; switch (behavior) { case MADV_REMOVE: return madvise_remove(madv_behavior); case MADV_WILLNEED: return madvise_willneed(madv_behavior); case MADV_COLD: return madvise_cold(madv_behavior); case MADV_PAGEOUT: return madvise_pageout(madv_behavior); case MADV_FREE: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: return madvise_dontneed_free(madv_behavior); case MADV_COLLAPSE: return madvise_collapse(vma, range->start, range->end, &madv_behavior->lock_dropped); case MADV_GUARD_INSTALL: return madvise_guard_install(madv_behavior); case MADV_GUARD_REMOVE: return madvise_guard_remove(madv_behavior); /* The below behaviours update VMAs via madvise_update_vma(). */ case MADV_NORMAL: new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; break; case MADV_SEQUENTIAL: new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; break; case MADV_RANDOM: new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; break; case MADV_DONTFORK: new_flags |= VM_DONTCOPY; break; case MADV_DOFORK: if (new_flags & VM_IO) return -EINVAL; new_flags &= ~VM_DONTCOPY; break; case MADV_WIPEONFORK: /* MADV_WIPEONFORK is only supported on anonymous memory. */ if (vma->vm_file || new_flags & VM_SHARED) return -EINVAL; new_flags |= VM_WIPEONFORK; break; case MADV_KEEPONFORK: if (new_flags & VM_DROPPABLE) return -EINVAL; new_flags &= ~VM_WIPEONFORK; break; case MADV_DONTDUMP: new_flags |= VM_DONTDUMP; break; case MADV_DODUMP: if ((!is_vm_hugetlb_page(vma) && (new_flags & VM_SPECIAL)) || (new_flags & VM_DROPPABLE)) return -EINVAL; new_flags &= ~VM_DONTDUMP; break; case MADV_MERGEABLE: case MADV_UNMERGEABLE: error = ksm_madvise(vma, range->start, range->end, behavior, &new_flags); if (error) goto out; break; case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: error = hugepage_madvise(vma, &new_flags, behavior); if (error) goto out; break; case __MADV_SET_ANON_VMA_NAME: /* Only anonymous mappings can be named */ if (vma->vm_file && !vma_is_anon_shmem(vma)) return -EBADF; break; } /* This is a write operation.*/ VM_WARN_ON_ONCE(madv_behavior->lock_mode != MADVISE_MMAP_WRITE_LOCK); error = madvise_update_vma(new_flags, madv_behavior); out: /* * madvise() returns EAGAIN if kernel resources, such as * slab, are temporarily unavailable. */ if (error == -ENOMEM) error = -EAGAIN; return error; } #ifdef CONFIG_MEMORY_FAILURE /* * Error injection support for memory error handling. */ static int madvise_inject_error(struct madvise_behavior *madv_behavior) { unsigned long size; unsigned long start = madv_behavior->range.start; unsigned long end = madv_behavior->range.end; if (!capable(CAP_SYS_ADMIN)) return -EPERM; for (; start < end; start += size) { unsigned long pfn; struct page *page; int ret; ret = get_user_pages_fast(start, 1, 0, &page); if (ret != 1) return ret; pfn = page_to_pfn(page); /* * When soft offlining hugepages, after migrating the page * we dissolve it, therefore in the second loop "page" will * no longer be a compound page. 
*/ size = page_size(compound_head(page)); if (madv_behavior->behavior == MADV_SOFT_OFFLINE) { pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", pfn, start); ret = soft_offline_page(pfn, MF_COUNT_INCREASED); } else { pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", pfn, start); ret = memory_failure(pfn, MF_ACTION_REQUIRED | MF_COUNT_INCREASED | MF_SW_SIMULATED); if (ret == -EOPNOTSUPP) ret = 0; } if (ret) return ret; } return 0; } static bool is_memory_failure(struct madvise_behavior *madv_behavior) { switch (madv_behavior->behavior) { case MADV_HWPOISON: case MADV_SOFT_OFFLINE: return true; default: return false; } } #else static int madvise_inject_error(struct madvise_behavior *madv_behavior) { return 0; } static bool is_memory_failure(struct madvise_behavior *madv_behavior) { return false; } #endif /* CONFIG_MEMORY_FAILURE */ static bool madvise_behavior_valid(int behavior) { switch (behavior) { case MADV_DOFORK: case MADV_DONTFORK: case MADV_NORMAL: case MADV_SEQUENTIAL: case MADV_RANDOM: case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: case MADV_FREE: case MADV_COLD: case MADV_PAGEOUT: case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: #ifdef CONFIG_KSM case MADV_MERGEABLE: case MADV_UNMERGEABLE: #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: case MADV_COLLAPSE: #endif case MADV_DONTDUMP: case MADV_DODUMP: case MADV_WIPEONFORK: case MADV_KEEPONFORK: case MADV_GUARD_INSTALL: case MADV_GUARD_REMOVE: #ifdef CONFIG_MEMORY_FAILURE case MADV_SOFT_OFFLINE: case MADV_HWPOISON: #endif return true; default: return false; } } /* Can we invoke process_madvise() on a remote mm for the specified behavior? */ static bool process_madvise_remote_valid(int behavior) { switch (behavior) { case MADV_COLD: case MADV_PAGEOUT: case MADV_WILLNEED: case MADV_COLLAPSE: return true; default: return false; } } /* * Try to acquire a VMA read lock if possible. * * We only support this lock over a single VMA, which the input range must * span either partially or fully. * * This function always returns with an appropriate lock held. If a VMA read * lock could be acquired, we return true and set madv_behavior state * accordingly. * * If a VMA read lock could not be acquired, we return false and expect caller to * fallback to mmap lock behaviour. */ static bool try_vma_read_lock(struct madvise_behavior *madv_behavior) { struct mm_struct *mm = madv_behavior->mm; struct vm_area_struct *vma; vma = lock_vma_under_rcu(mm, madv_behavior->range.start); if (!vma) goto take_mmap_read_lock; /* * Must span only a single VMA; uffd and remote processes are * unsupported. */ if (madv_behavior->range.end > vma->vm_end || current->mm != mm || userfaultfd_armed(vma)) { vma_end_read(vma); goto take_mmap_read_lock; } madv_behavior->vma = vma; return true; take_mmap_read_lock: mmap_read_lock(mm); madv_behavior->lock_mode = MADVISE_MMAP_READ_LOCK; return false; } /* * Walk the vmas in range [start,end), and call the madvise_vma_behavior * function on each one. The function will get start and end parameters that * cover the overlap between the current vma and the original range. Any * unmapped regions in the original range will result in this function returning * -ENOMEM while still calling the madvise_vma_behavior function on all of the * existing vmas in the range. Must be called with the mmap_lock held for * reading or writing. 
*/ static int madvise_walk_vmas(struct madvise_behavior *madv_behavior) { struct mm_struct *mm = madv_behavior->mm; struct madvise_behavior_range *range = &madv_behavior->range; /* range is updated to span each VMA, so store end of entire range. */ unsigned long last_end = range->end; int unmapped_error = 0; int error; struct vm_area_struct *prev, *vma; /* * If VMA read lock is supported, apply madvise to a single VMA * tentatively, avoiding walking VMAs. */ if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK && try_vma_read_lock(madv_behavior)) { error = madvise_vma_behavior(madv_behavior); vma_end_read(madv_behavior->vma); return error; } vma = find_vma_prev(mm, range->start, &prev); if (vma && range->start > vma->vm_start) prev = vma; for (;;) { /* Still start < end. */ if (!vma) return -ENOMEM; /* Here start < (last_end|vma->vm_end). */ if (range->start < vma->vm_start) { /* * This indicates a gap between VMAs in the input * range. This does not cause the operation to abort, * rather we simply return -ENOMEM to indicate that this * has happened, but carry on. */ unmapped_error = -ENOMEM; range->start = vma->vm_start; if (range->start >= last_end) break; } /* Here vma->vm_start <= range->start < (last_end|vma->vm_end) */ range->end = min(vma->vm_end, last_end); /* Here vma->vm_start <= range->start < range->end <= (last_end|vma->vm_end). */ madv_behavior->prev = prev; madv_behavior->vma = vma; error = madvise_vma_behavior(madv_behavior); if (error) return error; if (madv_behavior->lock_dropped) { /* We dropped the mmap lock, we can't ref the VMA. */ prev = NULL; vma = NULL; madv_behavior->lock_dropped = false; } else { vma = madv_behavior->vma; prev = vma; } if (vma && range->end < vma->vm_end) range->end = vma->vm_end; if (range->end >= last_end) break; vma = find_vma(mm, vma ? vma->vm_end : range->end); range->start = range->end; } return unmapped_error; } /* * Any behaviour which results in changes to the vma->vm_flags needs to * take mmap_lock for writing. Others, which simply traverse vmas, need * to only take it for reading. */ static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavior) { if (is_memory_failure(madv_behavior)) return MADVISE_NO_LOCK; switch (madv_behavior->behavior) { case MADV_REMOVE: case MADV_WILLNEED: case MADV_COLD: case MADV_PAGEOUT: case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: case MADV_COLLAPSE: case MADV_GUARD_INSTALL: case MADV_GUARD_REMOVE: return MADVISE_MMAP_READ_LOCK; case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: case MADV_FREE: return MADVISE_VMA_READ_LOCK; default: return MADVISE_MMAP_WRITE_LOCK; } } static int madvise_lock(struct madvise_behavior *madv_behavior) { struct mm_struct *mm = madv_behavior->mm; enum madvise_lock_mode lock_mode = get_lock_mode(madv_behavior); switch (lock_mode) { case MADVISE_NO_LOCK: break; case MADVISE_MMAP_WRITE_LOCK: if (mmap_write_lock_killable(mm)) return -EINTR; break; case MADVISE_MMAP_READ_LOCK: mmap_read_lock(mm); break; case MADVISE_VMA_READ_LOCK: /* We will acquire the lock per-VMA in madvise_walk_vmas(). */ break; } madv_behavior->lock_mode = lock_mode; return 0; } static void madvise_unlock(struct madvise_behavior *madv_behavior) { struct mm_struct *mm = madv_behavior->mm; switch (madv_behavior->lock_mode) { case MADVISE_NO_LOCK: return; case MADVISE_MMAP_WRITE_LOCK: mmap_write_unlock(mm); break; case MADVISE_MMAP_READ_LOCK: mmap_read_unlock(mm); break; case MADVISE_VMA_READ_LOCK: /* We will drop the lock per-VMA in madvise_walk_vmas(). 
*/ break; } madv_behavior->lock_mode = MADVISE_NO_LOCK; } static bool madvise_batch_tlb_flush(int behavior) { switch (behavior) { case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: case MADV_FREE: return true; default: return false; } } static void madvise_init_tlb(struct madvise_behavior *madv_behavior) { if (madvise_batch_tlb_flush(madv_behavior->behavior)) tlb_gather_mmu(madv_behavior->tlb, madv_behavior->mm); } static void madvise_finish_tlb(struct madvise_behavior *madv_behavior) { if (madvise_batch_tlb_flush(madv_behavior->behavior)) tlb_finish_mmu(madv_behavior->tlb); } static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior) { size_t len; if (!madvise_behavior_valid(behavior)) return false; if (!PAGE_ALIGNED(start)) return false; len = PAGE_ALIGN(len_in); /* Check to see whether len was rounded up from small -ve to zero */ if (len_in && !len) return false; if (start + len < start) return false; return true; } /* * madvise_should_skip() - Return whether the request is invalid or a no-op. * @start: Start address of madvise-requested address range. * @len_in: Length of madvise-requested address range. * @behavior: Requested madvise behavior. * @err: Pointer to store an error code from the check. * * If the specified behaviour is invalid or nothing would occur, we skip the * operation. This function returns true in those cases, otherwise false. In * the former case we store an error in @err. */ static bool madvise_should_skip(unsigned long start, size_t len_in, int behavior, int *err) { if (!is_valid_madvise(start, len_in, behavior)) { *err = -EINVAL; return true; } if (start + PAGE_ALIGN(len_in) == start) { *err = 0; return true; } return false; } static bool is_madvise_populate(struct madvise_behavior *madv_behavior) { switch (madv_behavior->behavior) { case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: return true; default: return false; } } /* * untagged_addr_remote() assumes mmap_lock is already held. On * architectures like x86 and RISC-V, tagging is tricky because each * mm may have a different tagging mask. However, we might only hold * the per-VMA lock (currently only local processes are supported), * so untagged_addr is used to avoid the mmap_lock assertion for * local processes. */ static inline unsigned long get_untagged_addr(struct mm_struct *mm, unsigned long start) { return current->mm == mm ? untagged_addr(start) : untagged_addr_remote(mm, start); } static int madvise_do_behavior(unsigned long start, size_t len_in, struct madvise_behavior *madv_behavior) { struct blk_plug plug; int error; struct madvise_behavior_range *range = &madv_behavior->range; if (is_memory_failure(madv_behavior)) { range->start = start; range->end = start + len_in; return madvise_inject_error(madv_behavior); } range->start = get_untagged_addr(madv_behavior->mm, start); range->end = range->start + PAGE_ALIGN(len_in); blk_start_plug(&plug); if (is_madvise_populate(madv_behavior)) error = madvise_populate(madv_behavior); else error = madvise_walk_vmas(madv_behavior); blk_finish_plug(&plug); return error; } /* * The madvise(2) system call. * * Applications can use madvise() to advise the kernel how it should * handle paging I/O in this VM area. The idea is to help the kernel * use appropriate read-ahead and caching techniques. The information * provided is advisory only, and can be safely disregarded by the * kernel without affecting the correct operation of the application. * * behavior values: * MADV_NORMAL - the default behavior is to read clusters.
This * results in some read-ahead and read-behind. * MADV_RANDOM - the system should read the minimum amount of data * on any access, since it is unlikely that the appli- * cation will need more than what it asks for. * MADV_SEQUENTIAL - pages in the given range will probably be accessed * once, so they can be aggressively read ahead, and * can be freed soon after they are accessed. * MADV_WILLNEED - the application is notifying the system to read * some pages ahead. * MADV_DONTNEED - the application is finished with the given range, * so the kernel can free resources associated with it. * MADV_FREE - the application marks pages in the given range as lazy free, * where actual purges are postponed until memory pressure happens. * MADV_REMOVE - the application wants to free up the given range of * pages and associated backing store. * MADV_DONTFORK - omit this area from child's address space when forking: * typically, to avoid COWing pages pinned by get_user_pages(). * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. * MADV_WIPEONFORK - present the child process with zero-filled memory in this * range after a fork. * MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK * MADV_HWPOISON - trigger memory error handler as if the given memory range * were corrupted by unrecoverable hardware memory failure. * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory. * MADV_MERGEABLE - the application recommends that KSM try to merge pages in * this area with pages of identical content from other such areas. * MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others. * MADV_HUGEPAGE - the application wants to back the given range by transparent * huge pages in the future. Existing pages might be coalesced and * new pages might be allocated as THP. * MADV_NOHUGEPAGE - mark the given range as not worth being backed by * transparent huge pages so the existing pages will not be * coalesced into THP and new pages will not be allocated as THP. * MADV_COLLAPSE - synchronously coalesce pages into new THP. * MADV_DONTDUMP - the application wants to prevent pages in the given range * from being included in its core dump. * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. * MADV_COLD - the application is not expected to use this memory soon, * deactivate pages in this range so that they can be reclaimed * easily if memory pressure happens. * MADV_PAGEOUT - the application is not expected to use this memory soon, * page out the pages in this range immediately. * MADV_POPULATE_READ - populate (prefault) page tables readable by * triggering read faults if required * MADV_POPULATE_WRITE - populate (prefault) page tables writable by * triggering write faults if required * * return values: * zero - success * -EINVAL - start + len < 0, start is not page-aligned, * "behavior" is not a valid value, or application * is attempting to release locked or shared pages, * or the specified address range includes file, Huge TLB, * MAP_SHARED or VM_PFNMAP range. * -ENOMEM - addresses in the specified range are not currently * mapped, or are outside the AS of the process. * -EIO - an I/O error occurred while paging in data. * -EBADF - map exists, but area maps something that isn't a file. * -EAGAIN - a kernel resource was temporarily unavailable. * -EPERM - memory is sealed.
*/ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) { int error; struct mmu_gather tlb; struct madvise_behavior madv_behavior = { .mm = mm, .behavior = behavior, .tlb = &tlb, }; if (madvise_should_skip(start, len_in, behavior, &error)) return error; error = madvise_lock(&madv_behavior); if (error) return error; madvise_init_tlb(&madv_behavior); error = madvise_do_behavior(start, len_in, &madv_behavior); madvise_finish_tlb(&madv_behavior); madvise_unlock(&madv_behavior); return error; } SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) { return do_madvise(current->mm, start, len_in, behavior); } /* Perform an madvise operation over a vector of addresses and lengths. */ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, int behavior) { ssize_t ret = 0; size_t total_len; struct mmu_gather tlb; struct madvise_behavior madv_behavior = { .mm = mm, .behavior = behavior, .tlb = &tlb, }; total_len = iov_iter_count(iter); ret = madvise_lock(&madv_behavior); if (ret) return ret; madvise_init_tlb(&madv_behavior); while (iov_iter_count(iter)) { unsigned long start = (unsigned long)iter_iov_addr(iter); size_t len_in = iter_iov_len(iter); int error; if (madvise_should_skip(start, len_in, behavior, &error)) ret = error; else ret = madvise_do_behavior(start, len_in, &madv_behavior); /* * An madvise operation is attempting to restart the syscall, * but we cannot proceed as it would not be correct to repeat * the operation in aggregate, and would be surprising to the * user. * * We drop and reacquire locks so it is safe to just loop and * try again. We check for fatal signals in case we need to exit * early anyway. */ if (ret == -ERESTARTNOINTR) { if (fatal_signal_pending(current)) { ret = -EINTR; break; } /* Drop and reacquire lock to unwind race. */ madvise_finish_tlb(&madv_behavior); madvise_unlock(&madv_behavior); ret = madvise_lock(&madv_behavior); if (ret) goto out; madvise_init_tlb(&madv_behavior); continue; } if (ret < 0) break; iov_iter_advance(iter, iter_iov_len(iter)); } madvise_finish_tlb(&madv_behavior); madvise_unlock(&madv_behavior); out: ret = (total_len - iov_iter_count(iter)) ? : ret; return ret; } SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, size_t, vlen, int, behavior, unsigned int, flags) { ssize_t ret; struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; struct task_struct *task; struct mm_struct *mm; unsigned int f_flags; if (flags != 0) { ret = -EINVAL; goto out; } ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) goto out; task = pidfd_get_task(pidfd, &f_flags); if (IS_ERR(task)) { ret = PTR_ERR(task); goto free_iov; } /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (IS_ERR(mm)) { ret = PTR_ERR(mm); goto release_task; } /* * We need only perform this check if we are attempting to manipulate a * remote process's address space. */ if (mm != current->mm && !process_madvise_remote_valid(behavior)) { ret = -EINVAL; goto release_mm; } /* * Require CAP_SYS_NICE for influencing process performance. Note that * only non-destructive hints are currently supported for remote * processes.
*/ if (mm != current->mm && !capable(CAP_SYS_NICE)) { ret = -EPERM; goto release_mm; } ret = vector_madvise(mm, &iter, behavior); release_mm: mmput(mm); release_task: put_task_struct(task); free_iov: kfree(iov); out: return ret; } #ifdef CONFIG_ANON_VMA_NAME #define ANON_VMA_NAME_MAX_LEN 80 #define ANON_VMA_NAME_INVALID_CHARS "\\`$[]" static inline bool is_valid_name_char(char ch) { /* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */ return ch > 0x1f && ch < 0x7f && !strchr(ANON_VMA_NAME_INVALID_CHARS, ch); } static int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, struct anon_vma_name *anon_name) { unsigned long end; unsigned long len; int error; struct madvise_behavior madv_behavior = { .mm = mm, .behavior = __MADV_SET_ANON_VMA_NAME, .anon_name = anon_name, }; if (start & ~PAGE_MASK) return -EINVAL; len = (len_in + ~PAGE_MASK) & PAGE_MASK; /* Check to see whether len was rounded up from small -ve to zero */ if (len_in && !len) return -EINVAL; end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; madv_behavior.range.start = start; madv_behavior.range.end = end; error = madvise_lock(&madv_behavior); if (error) return error; error = madvise_walk_vmas(&madv_behavior); madvise_unlock(&madv_behavior); return error; } int set_anon_vma_name(unsigned long addr, unsigned long size, const char __user *uname) { struct anon_vma_name *anon_name = NULL; struct mm_struct *mm = current->mm; int error; if (uname) { char *name, *pch; name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN); if (IS_ERR(name)) return PTR_ERR(name); for (pch = name; *pch != '\0'; pch++) { if (!is_valid_name_char(*pch)) { kfree(name); return -EINVAL; } } /* anon_vma has its own copy */ anon_name = anon_vma_name_alloc(name); kfree(name); if (!anon_name) return -ENOMEM; } error = madvise_set_anon_name(mm, addr, size, anon_name); anon_vma_name_put(anon_name); return error; } #endif
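A minimal userspace sketch of how the madvise(2) entry points above are exercised; nothing here is part of the kernel sources. It assumes a Linux system and an anonymous private mapping, for which MADV_DONTNEED makes the pages read back as zeroes on the next touch.

/* Editor's sketch: exercise madvise(2) from userspace. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *buf = mmap(NULL, 2 * page, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;
	memset(buf, 0xaa, 2 * page);

	/* start must be page-aligned, or is_valid_madvise() rejects the
	 * request and madvise() fails with EINVAL. */
	if (madvise(buf, 2 * page, MADV_DONTNEED))
		perror("madvise");

	/* Anonymous private pages read back as zero after MADV_DONTNEED. */
	printf("first byte after MADV_DONTNEED: %d\n", buf[0]);

	munmap(buf, 2 * page);
	return 0;
}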
// SPDX-License-Identifier: GPL-2.0-only /* * LED Triggers Core * * Copyright 2005-2007 Openedhand Ltd.
* * Author: Richard Purdie <rpurdie@openedhand.com> */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/device.h> #include <linux/timer.h> #include <linux/rwsem.h> #include <linux/leds.h> #include <linux/slab.h> #include <linux/mm.h> #include "leds.h" /* * Nests outside led_cdev->trigger_lock */ static DECLARE_RWSEM(triggers_list_lock); static LIST_HEAD(trigger_list); /* Used by LED Class */ static inline bool trigger_relevant(struct led_classdev *led_cdev, struct led_trigger *trig) { return !trig->trigger_type || trig->trigger_type == led_cdev->trigger_type; } ssize_t led_trigger_write(struct file *filp, struct kobject *kobj, const struct bin_attribute *bin_attr, char *buf, loff_t pos, size_t count) { struct device *dev = kobj_to_dev(kobj); struct led_classdev *led_cdev = dev_get_drvdata(dev); struct led_trigger *trig; int ret = count; mutex_lock(&led_cdev->led_access); if (led_sysfs_is_disabled(led_cdev)) { ret = -EBUSY; goto unlock; } if (sysfs_streq(buf, "none")) { led_trigger_remove(led_cdev); goto unlock; } if (sysfs_streq(buf, "default")) { led_trigger_set_default(led_cdev); goto unlock; } down_read(&triggers_list_lock); list_for_each_entry(trig, &trigger_list, next_trig) { if (sysfs_streq(buf, trig->name) && trigger_relevant(led_cdev, trig)) { down_write(&led_cdev->trigger_lock); led_trigger_set(led_cdev, trig); up_write(&led_cdev->trigger_lock); up_read(&triggers_list_lock); goto unlock; } } /* we come here only if buf matches no trigger */ ret = -EINVAL; up_read(&triggers_list_lock); unlock: mutex_unlock(&led_cdev->led_access); return ret; } EXPORT_SYMBOL_GPL(led_trigger_write); __printf(3, 4) static int led_trigger_snprintf(char *buf, ssize_t size, const char *fmt, ...) { va_list args; int i; va_start(args, fmt); if (size <= 0) i = vsnprintf(NULL, 0, fmt, args); else i = vscnprintf(buf, size, fmt, args); va_end(args); return i; } static int led_trigger_format(char *buf, size_t size, struct led_classdev *led_cdev) { struct led_trigger *trig; int len = led_trigger_snprintf(buf, size, "%s", led_cdev->trigger ? "none" : "[none]"); if (led_cdev->default_trigger) len += led_trigger_snprintf(buf + len, size - len, " default"); list_for_each_entry(trig, &trigger_list, next_trig) { bool hit; if (!trigger_relevant(led_cdev, trig)) continue; hit = led_cdev->trigger && !strcmp(led_cdev->trigger->name, trig->name); len += led_trigger_snprintf(buf + len, size - len, " %s%s%s", hit ? "[" : "", trig->name, hit ? "]" : ""); } len += led_trigger_snprintf(buf + len, size - len, "\n"); return len; } /* * It was stupid to create 10000 cpu triggers, but we are stuck with it now. * Don't make that mistake again. We work around it here by creating binary * attribute, which is not limited by length. This is _not_ good design, do not * copy it. 
*/ ssize_t led_trigger_read(struct file *filp, struct kobject *kobj, const struct bin_attribute *attr, char *buf, loff_t pos, size_t count) { struct device *dev = kobj_to_dev(kobj); struct led_classdev *led_cdev = dev_get_drvdata(dev); void *data; int len; down_read(&triggers_list_lock); down_read(&led_cdev->trigger_lock); len = led_trigger_format(NULL, 0, led_cdev); data = kvmalloc(len + 1, GFP_KERNEL); if (!data) { up_read(&led_cdev->trigger_lock); up_read(&triggers_list_lock); return -ENOMEM; } len = led_trigger_format(data, len + 1, led_cdev); up_read(&led_cdev->trigger_lock); up_read(&triggers_list_lock); len = memory_read_from_buffer(buf, count, &pos, data, len); kvfree(data); return len; } EXPORT_SYMBOL_GPL(led_trigger_read); /* Caller must ensure led_cdev->trigger_lock held */ int led_trigger_set(struct led_classdev *led_cdev, struct led_trigger *trig) { char *event = NULL; char *envp[2]; const char *name; int ret; if (!led_cdev->trigger && !trig) return 0; name = trig ? trig->name : "none"; event = kasprintf(GFP_KERNEL, "TRIGGER=%s", name); /* Remove any existing trigger */ if (led_cdev->trigger) { spin_lock(&led_cdev->trigger->leddev_list_lock); list_del_rcu(&led_cdev->trig_list); spin_unlock(&led_cdev->trigger->leddev_list_lock); /* ensure it's no longer visible on the led_cdevs list */ synchronize_rcu(); cancel_work_sync(&led_cdev->set_brightness_work); led_stop_software_blink(led_cdev); device_remove_groups(led_cdev->dev, led_cdev->trigger->groups); if (led_cdev->trigger->deactivate) led_cdev->trigger->deactivate(led_cdev); led_cdev->trigger = NULL; led_cdev->trigger_data = NULL; led_cdev->activated = false; led_cdev->flags &= ~LED_INIT_DEFAULT_TRIGGER; led_set_brightness(led_cdev, LED_OFF); } if (trig) { spin_lock(&trig->leddev_list_lock); list_add_tail_rcu(&led_cdev->trig_list, &trig->led_cdevs); spin_unlock(&trig->leddev_list_lock); led_cdev->trigger = trig; /* * Some activate() calls use led_trigger_event() to initialize * the brightness of the LED for which the trigger is being set. * Ensure the led_cdev is visible on trig->led_cdevs for this. 
*/ synchronize_rcu(); /* * If "set brightness to 0" is pending in workqueue, * we don't want that to be reordered after ->activate() */ flush_work(&led_cdev->set_brightness_work); ret = 0; if (trig->activate) ret = trig->activate(led_cdev); else led_set_brightness(led_cdev, trig->brightness); if (ret) goto err_activate; ret = device_add_groups(led_cdev->dev, trig->groups); if (ret) { dev_err(led_cdev->dev, "Failed to add trigger attributes\n"); goto err_add_groups; } } if (event) { envp[0] = event; envp[1] = NULL; if (kobject_uevent_env(&led_cdev->dev->kobj, KOBJ_CHANGE, envp)) dev_err(led_cdev->dev, "%s: Error sending uevent\n", __func__); kfree(event); } return 0; err_add_groups: if (trig->deactivate) trig->deactivate(led_cdev); err_activate: spin_lock(&led_cdev->trigger->leddev_list_lock); list_del_rcu(&led_cdev->trig_list); spin_unlock(&led_cdev->trigger->leddev_list_lock); synchronize_rcu(); led_cdev->trigger = NULL; led_cdev->trigger_data = NULL; led_set_brightness(led_cdev, LED_OFF); kfree(event); return ret; } EXPORT_SYMBOL_GPL(led_trigger_set); void led_trigger_remove(struct led_classdev *led_cdev) { down_write(&led_cdev->trigger_lock); led_trigger_set(led_cdev, NULL); up_write(&led_cdev->trigger_lock); } EXPORT_SYMBOL_GPL(led_trigger_remove); static bool led_match_default_trigger(struct led_classdev *led_cdev, struct led_trigger *trig) { if (!strcmp(led_cdev->default_trigger, trig->name) && trigger_relevant(led_cdev, trig)) { led_cdev->flags |= LED_INIT_DEFAULT_TRIGGER; led_trigger_set(led_cdev, trig); return true; } return false; } void led_trigger_set_default(struct led_classdev *led_cdev) { struct led_trigger *trig; bool found = false; if (!led_cdev->default_trigger) return; if (!strcmp(led_cdev->default_trigger, "none")) { led_trigger_remove(led_cdev); return; } down_read(&triggers_list_lock); down_write(&led_cdev->trigger_lock); list_for_each_entry(trig, &trigger_list, next_trig) { found = led_match_default_trigger(led_cdev, trig); if (found) break; } up_write(&led_cdev->trigger_lock); up_read(&triggers_list_lock); /* * If default trigger wasn't found, maybe trigger module isn't loaded yet. * Once loaded it will re-probe with all led_cdev's. 
*/ if (!found) request_module_nowait("ledtrig:%s", led_cdev->default_trigger); } EXPORT_SYMBOL_GPL(led_trigger_set_default); /* LED Trigger Interface */ int led_trigger_register(struct led_trigger *trig) { struct led_classdev *led_cdev; struct led_trigger *_trig; spin_lock_init(&trig->leddev_list_lock); INIT_LIST_HEAD(&trig->led_cdevs); down_write(&triggers_list_lock); /* Make sure the trigger's name isn't already in use */ list_for_each_entry(_trig, &trigger_list, next_trig) { if (!strcmp(_trig->name, trig->name) && (trig->trigger_type == _trig->trigger_type || !trig->trigger_type || !_trig->trigger_type)) { up_write(&triggers_list_lock); return -EEXIST; } } /* Add to the list of led triggers */ list_add_tail(&trig->next_trig, &trigger_list); up_write(&triggers_list_lock); /* Register with any LEDs that have this as a default trigger */ down_read(&leds_list_lock); list_for_each_entry(led_cdev, &leds_list, node) { down_write(&led_cdev->trigger_lock); if (!led_cdev->trigger && led_cdev->default_trigger) led_match_default_trigger(led_cdev, trig); up_write(&led_cdev->trigger_lock); } up_read(&leds_list_lock); return 0; } EXPORT_SYMBOL_GPL(led_trigger_register); void led_trigger_unregister(struct led_trigger *trig) { struct led_classdev *led_cdev; if (list_empty_careful(&trig->next_trig)) return; /* Remove from the list of led triggers */ down_write(&triggers_list_lock); list_del_init(&trig->next_trig); up_write(&triggers_list_lock); /* Remove anyone actively using this trigger */ down_read(&leds_list_lock); list_for_each_entry(led_cdev, &leds_list, node) { down_write(&led_cdev->trigger_lock); if (led_cdev->trigger == trig) led_trigger_set(led_cdev, NULL); up_write(&led_cdev->trigger_lock); } up_read(&leds_list_lock); } EXPORT_SYMBOL_GPL(led_trigger_unregister); static void devm_led_trigger_release(struct device *dev, void *res) { led_trigger_unregister(*(struct led_trigger **)res); } int devm_led_trigger_register(struct device *dev, struct led_trigger *trig) { struct led_trigger **dr; int rc; dr = devres_alloc(devm_led_trigger_release, sizeof(*dr), GFP_KERNEL); if (!dr) return -ENOMEM; *dr = trig; rc = led_trigger_register(trig); if (rc) devres_free(dr); else devres_add(dev, dr); return rc; } EXPORT_SYMBOL_GPL(devm_led_trigger_register); /* Simple LED Trigger Interface */ void led_trigger_event(struct led_trigger *trig, enum led_brightness brightness) { struct led_classdev *led_cdev; if (!trig) return; trig->brightness = brightness; rcu_read_lock(); list_for_each_entry_rcu(led_cdev, &trig->led_cdevs, trig_list) led_set_brightness(led_cdev, brightness); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(led_trigger_event); void led_mc_trigger_event(struct led_trigger *trig, unsigned int *intensity_value, unsigned int num_colors, enum led_brightness brightness) { struct led_classdev *led_cdev; if (!trig) return; rcu_read_lock(); list_for_each_entry_rcu(led_cdev, &trig->led_cdevs, trig_list) { if (!(led_cdev->flags & LED_MULTI_COLOR)) continue; led_mc_set_brightness(led_cdev, intensity_value, num_colors, brightness); } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(led_mc_trigger_event); static void led_trigger_blink_setup(struct led_trigger *trig, unsigned long delay_on, unsigned long delay_off, int oneshot, int invert) { struct led_classdev *led_cdev; if (!trig) return; rcu_read_lock(); list_for_each_entry_rcu(led_cdev, &trig->led_cdevs, trig_list) { if (oneshot) led_blink_set_oneshot(led_cdev, &delay_on, &delay_off, invert); else led_blink_set_nosleep(led_cdev, delay_on, delay_off); } rcu_read_unlock(); } void 
led_trigger_blink(struct led_trigger *trig, unsigned long delay_on, unsigned long delay_off) { led_trigger_blink_setup(trig, delay_on, delay_off, 0, 0); } EXPORT_SYMBOL_GPL(led_trigger_blink); void led_trigger_blink_oneshot(struct led_trigger *trig, unsigned long delay_on, unsigned long delay_off, int invert) { led_trigger_blink_setup(trig, delay_on, delay_off, 1, invert); } EXPORT_SYMBOL_GPL(led_trigger_blink_oneshot); void led_trigger_register_simple(const char *name, struct led_trigger **tp) { struct led_trigger *trig; int err; trig = kzalloc(sizeof(struct led_trigger), GFP_KERNEL); if (trig) { trig->name = name; err = led_trigger_register(trig); if (err < 0) { kfree(trig); trig = NULL; pr_warn("LED trigger %s failed to register (%d)\n", name, err); } } else { pr_warn("LED trigger %s failed to register (no memory)\n", name); } *tp = trig; } EXPORT_SYMBOL_GPL(led_trigger_register_simple); void led_trigger_unregister_simple(struct led_trigger *trig) { if (trig) led_trigger_unregister(trig); kfree(trig); } EXPORT_SYMBOL_GPL(led_trigger_unregister_simple);
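A minimal out-of-tree module sketch showing the simple trigger interface above in use. The trigger name "demo" and the module itself are made up for illustration; only the led_trigger_register_simple()/led_trigger_event()/led_trigger_unregister_simple() calls defined in this file are assumed.

/* Editor's sketch: a hypothetical trigger named "demo" that pulses any
 * LED bound to it. Not a complete driver. */
#include <linux/init.h>
#include <linux/leds.h>
#include <linux/module.h>

static struct led_trigger *demo_trig;

static int __init demo_trig_init(void)
{
	/* Allocates and registers the trigger; on failure demo_trig is
	 * left NULL, which the event helpers below tolerate. */
	led_trigger_register_simple("demo", &demo_trig);

	/* Every LED whose current trigger is "demo" goes to full brightness. */
	led_trigger_event(demo_trig, LED_FULL);
	return 0;
}

static void __exit demo_trig_exit(void)
{
	led_trigger_event(demo_trig, LED_OFF);
	led_trigger_unregister_simple(demo_trig);
}

module_init(demo_trig_init);
module_exit(demo_trig_exit);
MODULE_LICENSE("GPL");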
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2016 Laura Garcia <nevola@gmail.com> */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <linux/random.h> #include <linux/static_key.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> struct nft_ng_inc { u8 dreg; u32 modulus; atomic_t *counter; u32 offset; }; static u32 nft_ng_inc_gen(struct nft_ng_inc *priv) { u32 nval, oval; do { oval = atomic_read(priv->counter); nval = (oval + 1 < priv->modulus) ? oval + 1 : 0; } while (atomic_cmpxchg(priv->counter, oval, nval) != oval); return nval + priv->offset; } static void nft_ng_inc_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_ng_inc *priv = nft_expr_priv(expr); regs->data[priv->dreg] = nft_ng_inc_gen(priv); } static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = { [NFTA_NG_DREG] = { .type = NLA_U32 }, [NFTA_NG_MODULUS] = { .type = NLA_U32 }, [NFTA_NG_TYPE] = { .type = NLA_U32 }, [NFTA_NG_OFFSET] = { .type = NLA_U32 }, }; static int nft_ng_inc_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_ng_inc *priv = nft_expr_priv(expr); int err; if (tb[NFTA_NG_OFFSET]) priv->offset = ntohl(nla_get_be32(tb[NFTA_NG_OFFSET])); priv->modulus = ntohl(nla_get_be32(tb[NFTA_NG_MODULUS])); if (priv->modulus == 0) return -ERANGE; if (priv->offset + priv->modulus - 1 < priv->offset) return -EOVERFLOW; priv->counter = kmalloc(sizeof(*priv->counter), GFP_KERNEL_ACCOUNT); if (!priv->counter) return -ENOMEM; atomic_set(priv->counter, priv->modulus - 1); err = nft_parse_register_store(ctx, tb[NFTA_NG_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, sizeof(u32)); if (err < 0) goto err; return 0; err: kfree(priv->counter); return err; } static bool nft_ng_inc_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { const struct nft_ng_inc *priv = nft_expr_priv(expr); nft_reg_track_cancel(track, priv->dreg, NFT_REG32_SIZE); return false; } static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg, u32 modulus, enum nft_ng_types type, u32 offset) { if (nft_dump_register(skb, NFTA_NG_DREG, dreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_NG_MODULUS, htonl(modulus))) goto nla_put_failure; if (nla_put_be32(skb, NFTA_NG_TYPE, htonl(type))) goto nla_put_failure; if (nla_put_be32(skb, NFTA_NG_OFFSET, htonl(offset))) goto nla_put_failure; return 0; nla_put_failure:
return -1; } static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_ng_inc *priv = nft_expr_priv(expr); return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_INCREMENTAL, priv->offset); } static void nft_ng_inc_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_ng_inc *priv = nft_expr_priv(expr); kfree(priv->counter); } struct nft_ng_random { u8 dreg; u32 modulus; u32 offset; }; static u32 nft_ng_random_gen(const struct nft_ng_random *priv) { return reciprocal_scale(get_random_u32(), priv->modulus) + priv->offset; } static void nft_ng_random_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_ng_random *priv = nft_expr_priv(expr); regs->data[priv->dreg] = nft_ng_random_gen(priv); } static int nft_ng_random_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_ng_random *priv = nft_expr_priv(expr); if (tb[NFTA_NG_OFFSET]) priv->offset = ntohl(nla_get_be32(tb[NFTA_NG_OFFSET])); priv->modulus = ntohl(nla_get_be32(tb[NFTA_NG_MODULUS])); if (priv->modulus == 0) return -ERANGE; if (priv->offset + priv->modulus - 1 < priv->offset) return -EOVERFLOW; return nft_parse_register_store(ctx, tb[NFTA_NG_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, sizeof(u32)); } static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_ng_random *priv = nft_expr_priv(expr); return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_RANDOM, priv->offset); } static bool nft_ng_random_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { const struct nft_ng_random *priv = nft_expr_priv(expr); nft_reg_track_cancel(track, priv->dreg, NFT_REG32_SIZE); return false; } static struct nft_expr_type nft_ng_type; static const struct nft_expr_ops nft_ng_inc_ops = { .type = &nft_ng_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_inc)), .eval = nft_ng_inc_eval, .init = nft_ng_inc_init, .destroy = nft_ng_inc_destroy, .dump = nft_ng_inc_dump, .reduce = nft_ng_inc_reduce, }; static const struct nft_expr_ops nft_ng_random_ops = { .type = &nft_ng_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_random)), .eval = nft_ng_random_eval, .init = nft_ng_random_init, .dump = nft_ng_random_dump, .reduce = nft_ng_random_reduce, }; static const struct nft_expr_ops * nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { u32 type; if (!tb[NFTA_NG_DREG] || !tb[NFTA_NG_MODULUS] || !tb[NFTA_NG_TYPE]) return ERR_PTR(-EINVAL); type = ntohl(nla_get_be32(tb[NFTA_NG_TYPE])); switch (type) { case NFT_NG_INCREMENTAL: return &nft_ng_inc_ops; case NFT_NG_RANDOM: return &nft_ng_random_ops; } return ERR_PTR(-EINVAL); } static struct nft_expr_type nft_ng_type __read_mostly = { .name = "numgen", .select_ops = nft_ng_select_ops, .policy = nft_ng_policy, .maxattr = NFTA_NG_MAX, .owner = THIS_MODULE, }; static int __init nft_ng_module_init(void) { return nft_register_expr(&nft_ng_type); } static void __exit nft_ng_module_exit(void) { nft_unregister_expr(&nft_ng_type); } module_init(nft_ng_module_init); module_exit(nft_ng_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Laura Garcia <nevola@gmail.com>"); MODULE_ALIAS_NFT_EXPR("numgen"); MODULE_DESCRIPTION("nftables number generator module");
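A userspace analogue of nft_ng_inc_gen()'s lock-free wraparound counter, using C11 atomics in place of the kernel's atomic_cmpxchg(); the modulus and offset values are arbitrary, chosen only to make the wraparound visible.

/* Editor's sketch: counter yields offset+0 .. offset+modulus-1, then wraps. */
#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned int counter;
static const unsigned int modulus = 4;
static const unsigned int offset = 10;

static unsigned int ng_inc_gen(void)
{
	unsigned int oval, nval;

	do {
		oval = atomic_load(&counter);
		nval = (oval + 1 < modulus) ? oval + 1 : 0;
		/* Retry if another thread advanced the counter first. */
	} while (!atomic_compare_exchange_weak(&counter, &oval, nval));

	return nval + offset;
}

int main(void)
{
	/* Mirrors nft_ng_inc_init(): the counter starts at modulus - 1 so
	 * the first value produced is offset + 0. */
	atomic_store(&counter, modulus - 1);

	for (int i = 0; i < 6; i++)
		printf("%u\n", ng_inc_gen()); /* prints 10 11 12 13 10 11 */
	return 0;
}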
// SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/swap.c * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds */ /* * This file contains the default values for the operation of the * Linux VM subsystem. Fine-tuning documentation can be found in * Documentation/admin-guide/sysctl/vm.rst. * Started 18.12.91 * Swap aging added 23.2.95, Stephen Tweedie. * Buffermem limits added 12.3.98, Rik van Riel. */ #include <linux/mm.h> #include <linux/sched.h> #include <linux/kernel_stat.h> #include <linux/swap.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/pagevec.h> #include <linux/init.h> #include <linux/export.h> #include <linux/mm_inline.h> #include <linux/percpu_counter.h> #include <linux/memremap.h> #include <linux/percpu.h> #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/backing-dev.h> #include <linux/memcontrol.h> #include <linux/gfp.h> #include <linux/uio.h> #include <linux/hugetlb.h> #include <linux/page_idle.h> #include <linux/local_lock.h> #include <linux/buffer_head.h> #include "internal.h" #define CREATE_TRACE_POINTS #include <trace/events/pagemap.h> /* How many pages do we try to swap or page in/out together? As a power of 2 */ int page_cluster; static const int page_cluster_max = 31; struct cpu_fbatches { /* * The following folio batches are grouped together because they are protected * by disabling preemption (and interrupts remain enabled).
*/ local_lock_t lock; struct folio_batch lru_add; struct folio_batch lru_deactivate_file; struct folio_batch lru_deactivate; struct folio_batch lru_lazyfree; #ifdef CONFIG_SMP struct folio_batch lru_activate; #endif /* Protecting the following batches which require disabling interrupts */ local_lock_t lock_irq; struct folio_batch lru_move_tail; }; static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = { .lock = INIT_LOCAL_LOCK(lock), .lock_irq = INIT_LOCAL_LOCK(lock_irq), }; static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp, unsigned long *flagsp) { if (folio_test_lru(folio)) { folio_lruvec_relock_irqsave(folio, lruvecp, flagsp); lruvec_del_folio(*lruvecp, folio); __folio_clear_lru_flags(folio); } } /* * This path almost never happens for VM activity - pages are normally freed * in batches. But it gets used by networking - and for compound pages. */ static void page_cache_release(struct folio *folio) { struct lruvec *lruvec = NULL; unsigned long flags; __page_cache_release(folio, &lruvec, &flags); if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); } void __folio_put(struct folio *folio) { if (unlikely(folio_is_zone_device(folio))) { free_zone_device_folio(folio); return; } if (folio_test_hugetlb(folio)) { free_huge_folio(folio); return; } page_cache_release(folio); folio_unqueue_deferred_split(folio); mem_cgroup_uncharge(folio); free_frozen_pages(&folio->page, folio_order(folio)); } EXPORT_SYMBOL(__folio_put); typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio); static void lru_add(struct lruvec *lruvec, struct folio *folio) { int was_unevictable = folio_test_clear_unevictable(folio); long nr_pages = folio_nr_pages(folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* * Is an smp_mb__after_atomic() still required here, before * folio_evictable() tests the mlocked flag, to rule out the possibility * of stranding an evictable folio on an unevictable LRU? I think * not, because __munlock_folio() only clears the mlocked flag * while the LRU lock is held. * * (That is not true of __page_cache_release(), and not necessarily * true of folios_put(): but those only clear the mlocked flag after * folio_put_testzero() has excluded any other users of the folio.) */ if (folio_evictable(folio)) { if (was_unevictable) __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } else { folio_clear_active(folio); folio_set_unevictable(folio); /* * folio->mlock_count = !!folio_test_mlocked(folio)? * But that leaves __mlock_folio() in doubt whether another * actor has already counted the mlock or not. Err on the * safe side, underestimate, let page reclaim fix it, rather * than leaving a page on the unevictable LRU indefinitely. 
*/ folio->mlock_count = 0; if (!was_unevictable) __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); } lruvec_add_folio(lruvec, folio); trace_mm_lru_insertion(folio); } static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) { int i; struct lruvec *lruvec = NULL; unsigned long flags = 0; for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; folio_lruvec_relock_irqsave(folio, &lruvec, &flags); move_fn(lruvec, folio); folio_set_lru(folio); } if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); folios_put(fbatch); } static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch, struct folio *folio, move_fn_t move_fn, bool on_lru, bool disable_irq) { unsigned long flags; if (on_lru && !folio_test_clear_lru(folio)) return; folio_get(folio); if (disable_irq) local_lock_irqsave(&cpu_fbatches.lock_irq, flags); else local_lock(&cpu_fbatches.lock); if (!folio_batch_add(this_cpu_ptr(fbatch), folio) || folio_test_large(folio) || lru_cache_disabled()) folio_batch_move_lru(this_cpu_ptr(fbatch), move_fn); if (disable_irq) local_unlock_irqrestore(&cpu_fbatches.lock_irq, flags); else local_unlock(&cpu_fbatches.lock); } #define folio_batch_add_and_move(folio, op, on_lru) \ __folio_batch_add_and_move( \ &cpu_fbatches.op, \ folio, \ op, \ on_lru, \ offsetof(struct cpu_fbatches, op) >= offsetof(struct cpu_fbatches, lock_irq) \ ) static void lru_move_tail(struct lruvec *lruvec, struct folio *folio) { if (folio_test_unevictable(folio)) return; lruvec_del_folio(lruvec, folio); folio_clear_active(folio); lruvec_add_folio_tail(lruvec, folio); __count_vm_events(PGROTATED, folio_nr_pages(folio)); } /* * Writeback is about to end against a folio which has been marked for * immediate reclaim. If it still appears to be reclaimable, move it * to the tail of the inactive list. * * folio_rotate_reclaimable() must disable IRQs, to prevent nasty races. */ void folio_rotate_reclaimable(struct folio *folio) { if (folio_test_locked(folio) || folio_test_dirty(folio) || folio_test_unevictable(folio)) return; folio_batch_add_and_move(folio, lru_move_tail, true); } void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, unsigned int nr_io, unsigned int nr_rotated) __releases(lruvec->lru_lock) { unsigned long cost; /* * Reflect the relative cost of incurring IO and spending CPU * time on rotations. This doesn't attempt to make a precise * comparison, it just says: if reloads are about comparable * between the LRU lists, or rotations are overwhelmingly * different between them, adjust scan balance for CPU work. */ cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated; if (!cost) { spin_unlock_irq(&lruvec->lru_lock); return; } for (;;) { unsigned long lrusize; /* Record cost event */ if (file) lruvec->file_cost += cost; else lruvec->anon_cost += cost; /* * Decay previous events * * Because workloads change over time (and to avoid * overflow) we keep these statistics as a floating * average, which ends up weighing recent refaults * more than old ones. 
*/ lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) + lruvec_page_state(lruvec, NR_ACTIVE_ANON) + lruvec_page_state(lruvec, NR_INACTIVE_FILE) + lruvec_page_state(lruvec, NR_ACTIVE_FILE); if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) { lruvec->file_cost /= 2; lruvec->anon_cost /= 2; } spin_unlock_irq(&lruvec->lru_lock); lruvec = parent_lruvec(lruvec); if (!lruvec) break; spin_lock_irq(&lruvec->lru_lock); } } void lru_note_cost_refault(struct folio *folio) { struct lruvec *lruvec; lruvec = folio_lruvec_lock_irq(folio); lru_note_cost_unlock_irq(lruvec, folio_is_file_lru(folio), folio_nr_pages(folio), 0); } static void lru_activate(struct lruvec *lruvec, struct folio *folio) { long nr_pages = folio_nr_pages(folio); if (folio_test_active(folio) || folio_test_unevictable(folio)) return; lruvec_del_folio(lruvec, folio); folio_set_active(folio); lruvec_add_folio(lruvec, folio); trace_mm_lru_activate(folio); __count_vm_events(PGACTIVATE, nr_pages); count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages); } #ifdef CONFIG_SMP static void folio_activate_drain(int cpu) { struct folio_batch *fbatch = &per_cpu(cpu_fbatches.lru_activate, cpu); if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_activate); } void folio_activate(struct folio *folio) { if (folio_test_active(folio) || folio_test_unevictable(folio)) return; folio_batch_add_and_move(folio, lru_activate, true); } #else static inline void folio_activate_drain(int cpu) { } void folio_activate(struct folio *folio) { struct lruvec *lruvec; if (!folio_test_clear_lru(folio)) return; lruvec = folio_lruvec_lock_irq(folio); lru_activate(lruvec, folio); unlock_page_lruvec_irq(lruvec); folio_set_lru(folio); } #endif static void __lru_cache_activate_folio(struct folio *folio) { struct folio_batch *fbatch; int i; local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.lru_add); /* * Search backwards on the optimistic assumption that the folio being * activated has just been added to this batch. Note that only * the local batch is examined as a !LRU folio could be in the * process of being released, reclaimed, migrated or on a remote * batch that is currently being drained. Furthermore, marking * a remote batch's folio active potentially hits a race where * a folio is marked active just after it is added to the inactive * list causing accounting errors and BUG_ON checks to trigger. 
*/ for (i = folio_batch_count(fbatch) - 1; i >= 0; i--) { struct folio *batch_folio = fbatch->folios[i]; if (batch_folio == folio) { folio_set_active(folio); break; } } local_unlock(&cpu_fbatches.lock); } #ifdef CONFIG_LRU_GEN static void lru_gen_inc_refs(struct folio *folio) { unsigned long new_flags, old_flags = READ_ONCE(folio->flags); if (folio_test_unevictable(folio)) return; /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio)) { set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); return; } do { if ((old_flags & LRU_REFS_MASK) == LRU_REFS_MASK) { if (!folio_test_workingset(folio)) folio_set_workingset(folio); return; } new_flags = old_flags + BIT(LRU_REFS_PGOFF); } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); } static bool lru_gen_clear_refs(struct folio *folio) { struct lru_gen_folio *lrugen; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); if (gen < 0) return true; set_mask_bits(&folio->flags, LRU_REFS_FLAGS | BIT(PG_workingset), 0); lrugen = &folio_lruvec(folio)->lrugen; /* whether can do without shuffling under the LRU lock */ return gen == lru_gen_from_seq(READ_ONCE(lrugen->min_seq[type])); } #else /* !CONFIG_LRU_GEN */ static void lru_gen_inc_refs(struct folio *folio) { } static bool lru_gen_clear_refs(struct folio *folio) { return false; } #endif /* CONFIG_LRU_GEN */ /** * folio_mark_accessed - Mark a folio as having seen activity. * @folio: The folio to mark. * * This function will perform one of the following transitions: * * * inactive,unreferenced -> inactive,referenced * * inactive,referenced -> active,unreferenced * * active,unreferenced -> active,referenced * * When a newly allocated folio is not yet visible, so safe for non-atomic ops, * __folio_set_referenced() may be substituted for folio_mark_accessed(). */ void folio_mark_accessed(struct folio *folio) { if (folio_test_dropbehind(folio)) return; if (lru_gen_enabled()) { lru_gen_inc_refs(folio); return; } if (!folio_test_referenced(folio)) { folio_set_referenced(folio); } else if (folio_test_unevictable(folio)) { /* * Unevictable pages are on the "LRU_UNEVICTABLE" list. But, * this list is never rotated or maintained, so marking an * unevictable page accessed has no effect. */ } else if (!folio_test_active(folio)) { /* * If the folio is on the LRU, queue it for activation via * cpu_fbatches.lru_activate. Otherwise, assume the folio is in a * folio_batch, mark it active and it'll be moved to the active * LRU on the next drain. */ if (folio_test_lru(folio)) folio_activate(folio); else __lru_cache_activate_folio(folio); folio_clear_referenced(folio); workingset_activation(folio); } if (folio_test_idle(folio)) folio_clear_idle(folio); } EXPORT_SYMBOL(folio_mark_accessed); /** * folio_add_lru - Add a folio to an LRU list. * @folio: The folio to be added to the LRU. * * Queue the folio for addition to the LRU. The decision on whether * to add the page to the [in]active [file|anon] list is deferred until the * folio_batch is drained. This gives a chance for the caller of folio_add_lru() * to have the folio added to the active list using folio_mark_accessed().
*/ void folio_add_lru(struct folio *folio) { VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* see the comment in lru_gen_folio_seq() */ if (lru_gen_enabled() && !folio_test_unevictable(folio) && lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) folio_set_active(folio); folio_batch_add_and_move(folio, lru_add, false); } EXPORT_SYMBOL(folio_add_lru); /** * folio_add_lru_vma() - Add a folio to the appropriate LRU list for this VMA. * @folio: The folio to be added to the LRU. * @vma: VMA in which the folio is mapped. * * If the VMA is mlocked, @folio is added to the unevictable list. * Otherwise, it is treated the same way as folio_add_lru(). */ void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma) { VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED)) mlock_new_folio(folio); else folio_add_lru(folio); } /* * If the folio cannot be invalidated, it is moved to the * inactive list to speed up its reclaim. It is moved to the * head of the list, rather than the tail, to give the flusher * threads some time to write it out, as this is much more * effective than the single-page writeout from reclaim. * * If the folio isn't mapped and dirty/writeback, the folio * could be reclaimed asap using the reclaim flag. * * 1. active, mapped folio -> none * 2. active, dirty/writeback folio -> inactive, head, reclaim * 3. inactive, mapped folio -> none * 4. inactive, dirty/writeback folio -> inactive, head, reclaim * 5. inactive, clean -> inactive, tail * 6. Others -> none * * In 4, it moves to the head of the inactive list so the folio is * written out by flusher threads as this is much more efficient * than the single-page writeout from reclaim. */ static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio) { bool active = folio_test_active(folio) || lru_gen_enabled(); long nr_pages = folio_nr_pages(folio); if (folio_test_unevictable(folio)) return; /* Some processes are using the folio */ if (folio_mapped(folio)) return; lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_clear_referenced(folio); if (folio_test_writeback(folio) || folio_test_dirty(folio)) { /* * Setting the reclaim flag could race with * folio_end_writeback() and confuse readahead. But the * race window is _really_ small and it's not a critical * problem. */ lruvec_add_folio(lruvec, folio); folio_set_reclaim(folio); } else { /* * The folio's writeback ended while it was in the batch. * We move that folio to the tail of the inactive list.
*/ lruvec_add_folio_tail(lruvec, folio); __count_vm_events(PGROTATED, nr_pages); } if (active) { __count_vm_events(PGDEACTIVATE, nr_pages); count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } } static void lru_deactivate(struct lruvec *lruvec, struct folio *folio) { long nr_pages = folio_nr_pages(folio); if (folio_test_unevictable(folio) || !(folio_test_active(folio) || lru_gen_enabled())) return; lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_clear_referenced(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(PGDEACTIVATE, nr_pages); count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio) { long nr_pages = folio_nr_pages(folio); if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) || folio_test_swapcache(folio) || folio_test_unevictable(folio)) return; lruvec_del_folio(lruvec, folio); folio_clear_active(folio); if (lru_gen_enabled()) lru_gen_clear_refs(folio); else folio_clear_referenced(folio); /* * Lazyfree folios are clean anonymous folios. They have * the swapbacked flag cleared, to distinguish them from normal * anonymous folios */ folio_clear_swapbacked(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(PGLAZYFREE, nr_pages); count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages); } /* * Drain pages out of the cpu's folio_batch. * Either "cpu" is the current CPU, and preemption has already been * disabled; or "cpu" is being hot-unplugged, and is already dead. */ void lru_add_drain_cpu(int cpu) { struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu); struct folio_batch *fbatch = &fbatches->lru_add; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_add); fbatch = &fbatches->lru_move_tail; /* Disabling interrupts below acts as a compiler barrier. */ if (data_race(folio_batch_count(fbatch))) { unsigned long flags; /* No harm done if a racing interrupt already did this */ local_lock_irqsave(&cpu_fbatches.lock_irq, flags); folio_batch_move_lru(fbatch, lru_move_tail); local_unlock_irqrestore(&cpu_fbatches.lock_irq, flags); } fbatch = &fbatches->lru_deactivate_file; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_deactivate_file); fbatch = &fbatches->lru_deactivate; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_deactivate); fbatch = &fbatches->lru_lazyfree; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_lazyfree); folio_activate_drain(cpu); } /** * deactivate_file_folio() - Deactivate a file folio. * @folio: Folio to deactivate. * * This function hints to the VM that @folio is a good reclaim candidate, * for example if its invalidation fails due to the folio being dirty * or under writeback. * * Context: Caller holds a reference on the folio. */ void deactivate_file_folio(struct folio *folio) { /* Deactivating an unevictable folio will not accelerate reclaim */ if (folio_test_unevictable(folio)) return; if (lru_gen_enabled() && lru_gen_clear_refs(folio)) return; folio_batch_add_and_move(folio, lru_deactivate_file, true); } /* * folio_deactivate - deactivate a folio * @folio: folio to deactivate * * folio_deactivate() moves @folio to the inactive list if @folio was on the * active list and was not unevictable. This is done to accelerate the * reclaim of @folio. */ void folio_deactivate(struct folio *folio) { if (folio_test_unevictable(folio)) return; if (lru_gen_enabled() ? 
lru_gen_clear_refs(folio) : !folio_test_active(folio)) return; folio_batch_add_and_move(folio, lru_deactivate, true); } /** * folio_mark_lazyfree - make an anon folio lazyfree * @folio: folio to deactivate * * folio_mark_lazyfree() moves @folio to the inactive file list. * This is done to accelerate the reclaim of @folio. */ void folio_mark_lazyfree(struct folio *folio) { if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) || folio_test_swapcache(folio) || folio_test_unevictable(folio)) return; folio_batch_add_and_move(folio, lru_lazyfree, true); } void lru_add_drain(void) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); local_unlock(&cpu_fbatches.lock); mlock_drain_local(); } /* * It's called from per-cpu workqueue context in SMP case so * lru_add_drain_cpu and invalidate_bh_lrus_cpu should run on * the same cpu. It shouldn't be a problem in !SMP case since * the core is only one and the locks will disable preemption. */ static void lru_add_and_bh_lrus_drain(void) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); local_unlock(&cpu_fbatches.lock); invalidate_bh_lrus_cpu(); mlock_drain_local(); } void lru_add_drain_cpu_zone(struct zone *zone) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); drain_local_pages(zone); local_unlock(&cpu_fbatches.lock); mlock_drain_local(); } #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); static void lru_add_drain_per_cpu(struct work_struct *dummy) { lru_add_and_bh_lrus_drain(); } static bool cpu_needs_drain(unsigned int cpu) { struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu); /* Check these in order of likelihood that they're not zero */ return folio_batch_count(&fbatches->lru_add) || folio_batch_count(&fbatches->lru_move_tail) || folio_batch_count(&fbatches->lru_deactivate_file) || folio_batch_count(&fbatches->lru_deactivate) || folio_batch_count(&fbatches->lru_lazyfree) || folio_batch_count(&fbatches->lru_activate) || need_mlock_drain(cpu) || has_bh_in_lru(cpu, NULL); } /* * Doesn't need any cpu hotplug locking because we do rely on per-cpu * kworkers being shut down before our page_alloc_cpu_dead callback is * executed on the offlined cpu. * Calling this function with cpu hotplug locks held can actually lead * to obscure indirect dependencies via WQ context. */ static inline void __lru_add_drain_all(bool force_all_cpus) { /* * lru_drain_gen - Global pages generation number * * (A) Definition: global lru_drain_gen = x implies that all generations * 0 < n <= x are already *scheduled* for draining. * * This is an optimization for the highly-contended use case where a * user space workload keeps constantly generating a flow of pages for * each CPU. */ static unsigned int lru_drain_gen; static struct cpumask has_work; static DEFINE_MUTEX(lock); unsigned cpu, this_gen; /* * Make sure nobody triggers this path before mm_percpu_wq is fully * initialized. */ if (WARN_ON(!mm_percpu_wq)) return; /* * Guarantee folio_batch counter stores visible by this CPU * are visible to other CPUs before loading the current drain * generation. */ smp_mb(); /* * (B) Locally cache global LRU draining generation number * * The read barrier ensures that the counter is loaded before the mutex * is taken. It pairs with smp_mb() inside the mutex critical section * at (D). */ this_gen = smp_load_acquire(&lru_drain_gen); mutex_lock(&lock); /* * (C) Exit the draining operation if a newer generation, from another * lru_add_drain_all(), was already scheduled for draining. 
Check (A). */ if (unlikely(this_gen != lru_drain_gen && !force_all_cpus)) goto done; /* * (D) Increment global generation number * * Pairs with smp_load_acquire() at (B), outside of the critical * section. Use a full memory barrier to guarantee that the * new global drain generation number is stored before loading * folio_batch counters. * * This pairing must be done here, before the for_each_online_cpu loop * below which drains the page vectors. * * Let x, y, and z represent some system CPU numbers, where x < y < z. * Assume CPU #z is in the middle of the for_each_online_cpu loop * below and has already reached CPU #y's per-cpu data. CPU #x comes * along, adds some pages to its per-cpu vectors, then calls * lru_add_drain_all(). * * If the paired barrier is done at any later step, e.g. after the * loop, CPU #x will just exit at (C) and miss flushing out all of its * added pages. */ WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1); smp_mb(); cpumask_clear(&has_work); for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); if (cpu_needs_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); __cpumask_set_cpu(cpu, &has_work); } } for_each_cpu(cpu, &has_work) flush_work(&per_cpu(lru_add_drain_work, cpu)); done: mutex_unlock(&lock); } void lru_add_drain_all(void) { __lru_add_drain_all(false); } #else void lru_add_drain_all(void) { lru_add_drain(); } #endif /* CONFIG_SMP */ atomic_t lru_disable_count = ATOMIC_INIT(0); /* * lru_cache_disable() needs to be called before we start compiling * a list of folios to be migrated using folio_isolate_lru(). * It drains folios on the LRU cache and then disables it on all cpus until * lru_cache_enable is called. * * Must be paired with a call to lru_cache_enable(). */ void lru_cache_disable(void) { atomic_inc(&lru_disable_count); /* * Readers of lru_disable_count are protected by either disabling * preemption or rcu_read_lock: * * preempt_disable, local_irq_disable [bh_lru_lock()] * rcu_read_lock [rt_spin_lock CONFIG_PREEMPT_RT] * preempt_disable [local_lock !CONFIG_PREEMPT_RT] * * Since v5.1 kernel, synchronize_rcu() is guaranteed to wait on * preempt_disable() regions of code. So any CPU which sees * lru_disable_count = 0 will have exited the critical * section when synchronize_rcu() returns. */ synchronize_rcu_expedited(); #ifdef CONFIG_SMP __lru_add_drain_all(true); #else lru_add_and_bh_lrus_drain(); #endif } /** * folios_put_refs - Reduce the reference count on a batch of folios. * @folios: The folios. * @refs: The number of refs to subtract from each folio. * * Like folio_put(), but for a batch of folios. This is more efficient * than writing the loop yourself as it will optimise the locks which need * to be taken if the folios are freed. The folios batch is returned * empty and ready to be reused for another batch; there is no need * to reinitialise it. If @refs is NULL, we subtract one from each * folio refcount. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) { int i, j; struct lruvec *lruvec = NULL; unsigned long flags = 0; for (i = 0, j = 0; i < folios->nr; i++) { struct folio *folio = folios->folios[i]; unsigned int nr_refs = refs ?
refs[i] : 1; if (is_huge_zero_folio(folio)) continue; if (folio_is_zone_device(folio)) { if (lruvec) { unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } if (folio_ref_sub_and_test(folio, nr_refs)) free_zone_device_folio(folio); continue; } if (!folio_ref_sub_and_test(folio, nr_refs)) continue; /* hugetlb has its own memcg */ if (folio_test_hugetlb(folio)) { if (lruvec) { unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } free_huge_folio(folio); continue; } folio_unqueue_deferred_split(folio); __page_cache_release(folio, &lruvec, &flags); if (j != i) folios->folios[j] = folio; j++; } if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); if (!j) { folio_batch_reinit(folios); return; } folios->nr = j; mem_cgroup_uncharge_folios(folios); free_unref_folios(folios); } EXPORT_SYMBOL(folios_put_refs); /** * release_pages - batched put_page() * @arg: array of pages to release * @nr: number of pages * * Decrement the reference count on all the pages in @arg. If it * fell to zero, remove the page from the LRU and free it. * * Note that the argument can be an array of pages, encoded pages, * or folio pointers. We ignore any encoded bits, and turn any of * them into just a folio that gets free'd. */ void release_pages(release_pages_arg arg, int nr) { struct folio_batch fbatch; int refs[PAGEVEC_SIZE]; struct encoded_page **encoded = arg.encoded_pages; int i; folio_batch_init(&fbatch); for (i = 0; i < nr; i++) { /* Turn any of the argument types into a folio */ struct folio *folio = page_folio(encoded_page_ptr(encoded[i])); /* Is our next entry actually "nr_pages" -> "nr_refs" ? */ refs[fbatch.nr] = 1; if (unlikely(encoded_page_flags(encoded[i]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) refs[fbatch.nr] = encoded_nr_pages(encoded[++i]); if (folio_batch_add(&fbatch, folio) > 0) continue; folios_put_refs(&fbatch, refs); } if (fbatch.nr) folios_put_refs(&fbatch, refs); } EXPORT_SYMBOL(release_pages); /* * The folios which we're about to release may be in the deferred lru-addition * queues. That would prevent them from really being freed right now. That's * OK from a correctness point of view but is inefficient - those folios may be * cache-warm and we want to give them back to the page allocator ASAP. * * So __folio_batch_release() will drain those queues here. * folio_batch_move_lru() calls folios_put() directly to avoid * mutual recursion. */ void __folio_batch_release(struct folio_batch *fbatch) { if (!fbatch->percpu_pvec_drained) { lru_add_drain(); fbatch->percpu_pvec_drained = true; } folios_put(fbatch); } EXPORT_SYMBOL(__folio_batch_release); /** * folio_batch_remove_exceptionals() - Prune non-folios from a batch. * @fbatch: The batch to prune * * find_get_entries() fills a batch with both folios and shadow/swap/DAX * entries. This function prunes all the non-folio entries from @fbatch * without leaving holes, so that it can be passed on to folio-only batch * operations. 
 */
void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
{
	unsigned int i, j;

	for (i = 0, j = 0; i < folio_batch_count(fbatch); i++) {
		struct folio *folio = fbatch->folios[i];
		if (!xa_is_value(folio))
			fbatch->folios[j++] = folio;
	}
	fbatch->nr = j;
}

static const struct ctl_table swap_sysctl_table[] = {
	{
		.procname	= "page-cluster",
		.data		= &page_cluster,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= (void *)&page_cluster_max,
	}
};

/*
 * Perform any setup for the swap system
 */
void __init swap_setup(void)
{
	unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT);

	/* Use a smaller cluster for small-memory machines */
	if (megs < 16)
		page_cluster = 2;
	else
		page_cluster = 3;

	/*
	 * Right now other parts of the system mean that we
	 * _really_ don't want to cluster much more.
	 */

	register_sysctl_init("vm", swap_sysctl_table);
}
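/*
 * Editor's sketch (not part of mm/swap.c above): the pairing that the
 * lru_cache_disable() comment requires.  Per-CPU LRU batches are drained
 * and kept disabled while folios are isolated, then re-enabled.  The
 * function name and the single-folio scope are illustrative assumptions;
 * folio_isolate_lru() is declared in mm-internal headers, so real callers
 * live under mm/.
 */
#include <linux/swap.h>
#include <linux/list.h>

static int isolate_folio_for_migration(struct folio *folio,
				       struct list_head *list)
{
	int err = -EBUSY;

	lru_cache_disable();	/* drain LRU batches and keep them off */
	if (folio_isolate_lru(folio)) {
		list_add_tail(&folio->lru, list);
		err = 0;
	}
	lru_cache_enable();	/* must pair with lru_cache_disable() */

	return err;
}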
| 436 834 139 1599 134 19 15708 16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 | /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef __ASM_GENERIC_BITOPS_GENERIC_NON_ATOMIC_H #define __ASM_GENERIC_BITOPS_GENERIC_NON_ATOMIC_H #include <linux/bits.h> #include <asm/barrier.h> #ifndef _LINUX_BITOPS_H #error only <linux/bitops.h> can be included directly #endif /* * Generic definitions for bit operations, should not be used in regular code * directly. */ /** * generic___set_bit - Set a bit in memory * @nr: the bit to set * @addr: the address to start counting from * * Unlike set_bit(), this function is non-atomic and may be reordered. * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ static __always_inline void generic___set_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); *p |= mask; } static __always_inline void generic___clear_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); *p &= ~mask; } /** * generic___change_bit - Toggle a bit in memory * @nr: the bit to change * @addr: the address to start counting from * * Unlike change_bit(), this function is non-atomic and may be reordered. * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ static __always_inline void generic___change_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); *p ^= mask; } /** * generic___test_and_set_bit - Set a bit and return its old value * @nr: Bit to set * @addr: Address to count from * * This operation is non-atomic and can be reordered. * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. */ static __always_inline bool generic___test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); unsigned long old = *p; *p = old | mask; return (old & mask) != 0; } /** * generic___test_and_clear_bit - Clear a bit and return its old value * @nr: Bit to clear * @addr: Address to count from * * This operation is non-atomic and can be reordered. * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. */ static __always_inline bool generic___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); unsigned long old = *p; *p = old & ~mask; return (old & mask) != 0; } /* WARNING: non atomic and it can be reordered! 
*/ static __always_inline bool generic___test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); unsigned long old = *p; *p = old ^ mask; return (old & mask) != 0; } /** * generic_test_bit - Determine whether a bit is set * @nr: bit number to test * @addr: Address to start counting from */ static __always_inline bool generic_test_bit(unsigned long nr, const volatile unsigned long *addr) { /* * Unlike the bitops with the '__' prefix above, this one *is* atomic, * so `volatile` must always stay here with no cast-aways. See * `Documentation/atomic_bitops.txt` for the details. */ return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); } /** * generic_test_bit_acquire - Determine, with acquire semantics, whether a bit is set * @nr: bit number to test * @addr: Address to start counting from */ static __always_inline bool generic_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr) { unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); return 1UL & (smp_load_acquire(p) >> (nr & (BITS_PER_LONG-1))); } /* * const_*() definitions provide good compile-time optimizations when * the passed arguments can be resolved at compile time. */ #define const___set_bit generic___set_bit #define const___clear_bit generic___clear_bit #define const___change_bit generic___change_bit #define const___test_and_set_bit generic___test_and_set_bit #define const___test_and_clear_bit generic___test_and_clear_bit #define const___test_and_change_bit generic___test_and_change_bit #define const_test_bit_acquire generic_test_bit_acquire /** * const_test_bit - Determine whether a bit is set * @nr: bit number to test * @addr: Address to start counting from * * A version of generic_test_bit() which discards the `volatile` qualifier to * allow a compiler to optimize code harder. Non-atomic and to be called only * for testing compile-time constants, e.g. by the corresponding macros, not * directly from "regular" code. */ static __always_inline bool const_test_bit(unsigned long nr, const volatile unsigned long *addr) { const unsigned long *p = (const unsigned long *)addr + BIT_WORD(nr); unsigned long mask = BIT_MASK(nr); unsigned long val = *p; return !!(val & mask); } #endif /* __ASM_GENERIC_BITOPS_GENERIC_NON_ATOMIC_H */ |
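/*
 * Editor's sketch (not part of the header above): the generic___*() helpers
 * are non-atomic, so the caller must provide its own exclusion.  Here the
 * public __test_and_set_bit() wrapper -- which these generic versions back
 * on most architectures -- is used under a caller-held spinlock.  The names
 * slot_lock, slot_map and claim_slot() are hypothetical.
 */
#include <linux/bitops.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(slot_lock);
static unsigned long slot_map[BITS_TO_LONGS(64)];

static bool claim_slot(unsigned int nr)
{
	bool was_set;

	spin_lock(&slot_lock);
	/* safe: slot_lock serializes every access to slot_map */
	was_set = __test_and_set_bit(nr, slot_map);
	spin_unlock(&slot_lock);

	return !was_set;	/* true if we claimed a previously clear bit */
}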
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 | /* SPDX-License-Identifier: GPL-2.0+ */ /* * MACsec netdev header, used for h/w accelerated implementations. * * Copyright (c) 2015 Sabrina Dubroca <sd@queasysnail.net> */ #ifndef _NET_MACSEC_H_ #define _NET_MACSEC_H_ #include <linux/u64_stats_sync.h> #include <linux/if_vlan.h> #include <uapi/linux/if_link.h> #include <uapi/linux/if_macsec.h> #define MACSEC_DEFAULT_PN_LEN 4 #define MACSEC_XPN_PN_LEN 8 #define MACSEC_NUM_AN 4 /* 2 bits for the association number */ #define MACSEC_SCI_LEN 8 #define MACSEC_PORT_ES (htons(0x0001)) #define MACSEC_TCI_VERSION 0x80 #define MACSEC_TCI_ES 0x40 /* end station */ #define MACSEC_TCI_SC 0x20 /* SCI present */ #define MACSEC_TCI_SCB 0x10 /* epon */ #define MACSEC_TCI_E 0x08 /* encryption */ #define MACSEC_TCI_C 0x04 /* changed text */ #define MACSEC_AN_MASK 0x03 /* association number */ #define MACSEC_TCI_CONFID (MACSEC_TCI_E | MACSEC_TCI_C) #define MACSEC_DEFAULT_ICV_LEN 16 typedef u64 __bitwise sci_t; typedef u32 __bitwise ssci_t; struct metadata_dst; typedef union salt { struct { ssci_t ssci; __be64 pn; } __packed; u8 bytes[MACSEC_SALT_LEN]; } __packed salt_t; typedef union pn { struct { #if defined(__LITTLE_ENDIAN_BITFIELD) u32 lower; u32 upper; #elif defined(__BIG_ENDIAN_BITFIELD) u32 upper; u32 lower; #else #error "Please fix <asm/byteorder.h>" #endif }; u64 full64; } pn_t; /** * struct macsec_key - SA key * @id: user-provided key identifier * @tfm: crypto struct, key storage * @salt: salt used to generate IV in XPN cipher suites */ struct macsec_key { u8 id[MACSEC_KEYID_LEN]; struct crypto_aead *tfm; salt_t salt; }; struct macsec_rx_sc_stats { __u64 InOctetsValidated; __u64 InOctetsDecrypted; __u64 InPktsUnchecked; __u64 InPktsDelayed; __u64 InPktsOK; __u64 InPktsInvalid; __u64 InPktsLate; __u64 InPktsNotValid; __u64 InPktsNotUsingSA; __u64 InPktsUnusedSA; }; struct macsec_rx_sa_stats { __u32 InPktsOK; __u32 InPktsInvalid; __u32 InPktsNotValid; __u32 InPktsNotUsingSA; __u32 InPktsUnusedSA; }; struct macsec_tx_sa_stats { __u32 OutPktsProtected; __u32 OutPktsEncrypted; }; struct macsec_tx_sc_stats { __u64 OutPktsProtected; __u64 
OutPktsEncrypted; __u64 OutOctetsProtected; __u64 OutOctetsEncrypted; }; struct macsec_dev_stats { __u64 OutPktsUntagged; __u64 InPktsUntagged; __u64 OutPktsTooLong; __u64 InPktsNoTag; __u64 InPktsBadTag; __u64 InPktsUnknownSCI; __u64 InPktsNoSCI; __u64 InPktsOverrun; }; /** * struct macsec_rx_sa - receive secure association * @active: * @next_pn: packet number expected for the next packet * @lock: protects next_pn manipulations * @key: key structure * @ssci: short secure channel identifier * @stats: per-SA stats */ struct macsec_rx_sa { struct macsec_key key; ssci_t ssci; spinlock_t lock; union { pn_t next_pn_halves; u64 next_pn; }; refcount_t refcnt; bool active; struct macsec_rx_sa_stats __percpu *stats; struct macsec_rx_sc *sc; struct rcu_head rcu; }; struct pcpu_rx_sc_stats { struct macsec_rx_sc_stats stats; struct u64_stats_sync syncp; }; struct pcpu_tx_sc_stats { struct macsec_tx_sc_stats stats; struct u64_stats_sync syncp; }; /** * struct macsec_rx_sc - receive secure channel * @sci: secure channel identifier for this SC * @active: channel is active * @sa: array of secure associations * @stats: per-SC stats */ struct macsec_rx_sc { struct macsec_rx_sc __rcu *next; sci_t sci; bool active; struct macsec_rx_sa __rcu *sa[MACSEC_NUM_AN]; struct pcpu_rx_sc_stats __percpu *stats; refcount_t refcnt; struct rcu_head rcu_head; }; /** * struct macsec_tx_sa - transmit secure association * @active: * @next_pn: packet number to use for the next packet * @lock: protects next_pn manipulations * @key: key structure * @ssci: short secure channel identifier * @stats: per-SA stats */ struct macsec_tx_sa { struct macsec_key key; ssci_t ssci; spinlock_t lock; union { pn_t next_pn_halves; u64 next_pn; }; refcount_t refcnt; bool active; struct macsec_tx_sa_stats __percpu *stats; struct rcu_head rcu; }; /** * struct macsec_tx_sc - transmit secure channel * @active: * @encoding_sa: association number of the SA currently in use * @encrypt: encrypt packets on transmit, or authenticate only * @send_sci: always include the SCI in the SecTAG * @end_station: * @scb: single copy broadcast flag * @sa: array of secure associations * @stats: stats for this TXSC * @md_dst: MACsec offload metadata dst */ struct macsec_tx_sc { bool active; u8 encoding_sa; bool encrypt; bool send_sci; bool end_station; bool scb; struct macsec_tx_sa __rcu *sa[MACSEC_NUM_AN]; struct pcpu_tx_sc_stats __percpu *stats; struct metadata_dst *md_dst; }; /** * struct macsec_secy - MACsec Security Entity * @netdev: netdevice for this SecY * @n_rx_sc: number of receive secure channels configured on this SecY * @sci: secure channel identifier used for tx * @key_len: length of keys used by the cipher suite * @icv_len: length of ICV used by the cipher suite * @validate_frames: validation mode * @xpn: enable XPN for this SecY * @operational: MAC_Operational flag * @protect_frames: enable protection for this SecY * @replay_protect: enable packet number checks on receive * @replay_window: size of the replay window * @tx_sc: transmit secure channel * @rx_sc: linked list of receive secure channels */ struct macsec_secy { struct net_device *netdev; unsigned int n_rx_sc; sci_t sci; u16 key_len; u16 icv_len; enum macsec_validation_type validate_frames; bool xpn; bool operational; bool protect_frames; bool replay_protect; u32 replay_window; struct macsec_tx_sc tx_sc; struct macsec_rx_sc __rcu *rx_sc; }; /** * struct macsec_context - MACsec context for hardware offloading * @netdev: a valid pointer to a struct net_device if @offload == * MACSEC_OFFLOAD_MAC * 
@phydev: a valid pointer to a struct phy_device if @offload == * MACSEC_OFFLOAD_PHY * @offload: MACsec offload status * @secy: pointer to a MACsec SecY * @rx_sc: pointer to a RX SC * @update_pn: when updating the SA, update the next PN * @assoc_num: association number of the target SA * @key: key of the target SA * @rx_sa: pointer to an RX SA if a RX SA is added/updated/removed * @tx_sa: pointer to an TX SA if a TX SA is added/updated/removed * @tx_sc_stats: pointer to TX SC stats structure * @tx_sa_stats: pointer to TX SA stats structure * @rx_sc_stats: pointer to RX SC stats structure * @rx_sa_stats: pointer to RX SA stats structure * @dev_stats: pointer to dev stats structure */ struct macsec_context { union { struct net_device *netdev; struct phy_device *phydev; }; enum macsec_offload offload; struct macsec_secy *secy; struct macsec_rx_sc *rx_sc; struct { bool update_pn; unsigned char assoc_num; u8 key[MACSEC_MAX_KEY_LEN]; union { struct macsec_rx_sa *rx_sa; struct macsec_tx_sa *tx_sa; }; } sa; union { struct macsec_tx_sc_stats *tx_sc_stats; struct macsec_tx_sa_stats *tx_sa_stats; struct macsec_rx_sc_stats *rx_sc_stats; struct macsec_rx_sa_stats *rx_sa_stats; struct macsec_dev_stats *dev_stats; } stats; }; /** * struct macsec_ops - MACsec offloading operations * @mdo_dev_open: called when the MACsec interface transitions to the up state * @mdo_dev_stop: called when the MACsec interface transitions to the down * state * @mdo_add_secy: called when a new SecY is added * @mdo_upd_secy: called when the SecY flags are changed or the MAC address of * the MACsec interface is changed * @mdo_del_secy: called when the hw offload is disabled or the MACsec * interface is removed * @mdo_add_rxsc: called when a new RX SC is added * @mdo_upd_rxsc: called when a certain RX SC is updated * @mdo_del_rxsc: called when a certain RX SC is removed * @mdo_add_rxsa: called when a new RX SA is added * @mdo_upd_rxsa: called when a certain RX SA is updated * @mdo_del_rxsa: called when a certain RX SA is removed * @mdo_add_txsa: called when a new TX SA is added * @mdo_upd_txsa: called when a certain TX SA is updated * @mdo_del_txsa: called when a certain TX SA is removed * @mdo_get_dev_stats: called when dev stats are read * @mdo_get_tx_sc_stats: called when TX SC stats are read * @mdo_get_tx_sa_stats: called when TX SA stats are read * @mdo_get_rx_sc_stats: called when RX SC stats are read * @mdo_get_rx_sa_stats: called when RX SA stats are read * @mdo_insert_tx_tag: called to insert the TX tag * @needed_headroom: number of bytes reserved at the beginning of the sk_buff * for the TX tag * @needed_tailroom: number of bytes reserved at the end of the sk_buff for the * TX tag * @rx_uses_md_dst: whether MACsec device offload supports sk_buff md_dst */ struct macsec_ops { /* Device wide */ int (*mdo_dev_open)(struct macsec_context *ctx); int (*mdo_dev_stop)(struct macsec_context *ctx); /* SecY */ int (*mdo_add_secy)(struct macsec_context *ctx); int (*mdo_upd_secy)(struct macsec_context *ctx); int (*mdo_del_secy)(struct macsec_context *ctx); /* Security channels */ int (*mdo_add_rxsc)(struct macsec_context *ctx); int (*mdo_upd_rxsc)(struct macsec_context *ctx); int (*mdo_del_rxsc)(struct macsec_context *ctx); /* Security associations */ int (*mdo_add_rxsa)(struct macsec_context *ctx); int (*mdo_upd_rxsa)(struct macsec_context *ctx); int (*mdo_del_rxsa)(struct macsec_context *ctx); int (*mdo_add_txsa)(struct macsec_context *ctx); int (*mdo_upd_txsa)(struct macsec_context *ctx); int (*mdo_del_txsa)(struct macsec_context 
*ctx); /* Statistics */ int (*mdo_get_dev_stats)(struct macsec_context *ctx); int (*mdo_get_tx_sc_stats)(struct macsec_context *ctx); int (*mdo_get_tx_sa_stats)(struct macsec_context *ctx); int (*mdo_get_rx_sc_stats)(struct macsec_context *ctx); int (*mdo_get_rx_sa_stats)(struct macsec_context *ctx); /* Offload tag */ int (*mdo_insert_tx_tag)(struct phy_device *phydev, struct sk_buff *skb); unsigned int needed_headroom; unsigned int needed_tailroom; bool rx_uses_md_dst; }; void macsec_pn_wrapped(struct macsec_secy *secy, struct macsec_tx_sa *tx_sa); static inline bool macsec_send_sci(const struct macsec_secy *secy) { const struct macsec_tx_sc *tx_sc = &secy->tx_sc; return tx_sc->send_sci || (secy->n_rx_sc > 1 && !tx_sc->end_station && !tx_sc->scb); } struct net_device *macsec_get_real_dev(const struct net_device *dev); bool macsec_netdev_is_offloaded(struct net_device *dev); static inline void *macsec_netdev_priv(const struct net_device *dev) { #if IS_ENABLED(CONFIG_VLAN_8021Q) if (is_vlan_dev(dev)) return netdev_priv(vlan_dev_priv(dev)->real_dev); #endif return netdev_priv(dev); } static inline u64 sci_to_cpu(sci_t sci) { return be64_to_cpu((__force __be64)sci); } #endif /* _NET_MACSEC_H_ */ |
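/*
 * Editor's sketch (not part of the header above): the minimal shape of a
 * MACsec offload provider filling in struct macsec_ops.  The my_*()
 * callbacks are hypothetical; a real driver implements whichever mdo_*
 * hooks its hardware supports and exposes the ops through its netdev or
 * PHY driver integration.
 */
#include <net/macsec.h>

static int my_macsec_dev_open(struct macsec_context *ctx)
{
	/* program the hardware for ctx->secy; return 0 on success */
	return 0;
}

static int my_macsec_add_txsa(struct macsec_context *ctx)
{
	/* install key material from ctx->sa for association ctx->sa.assoc_num */
	return 0;
}

static const struct macsec_ops my_macsec_ops = {
	.mdo_dev_open	= my_macsec_dev_open,
	.mdo_add_txsa	= my_macsec_add_txsa,
};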
| 25 42 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_STATIC_CALL_H #define _LINUX_STATIC_CALL_H /* * Static call support * * Static calls use code patching to hard-code function pointers into direct * branch instructions. They give the flexibility of function pointers, but * with improved performance. This is especially important for cases where * retpolines would otherwise be used, as retpolines can significantly impact * performance. * * * API overview: * * DECLARE_STATIC_CALL(name, func); * DEFINE_STATIC_CALL(name, func); * DEFINE_STATIC_CALL_NULL(name, typename); * DEFINE_STATIC_CALL_RET0(name, typename); * * __static_call_return0; * * static_call(name)(args...); * static_call_cond(name)(args...); * static_call_update(name, func); * static_call_query(name); * * EXPORT_STATIC_CALL{,_TRAMP}{,_GPL}() * * Usage example: * * # Start with the following functions (with identical prototypes): * int func_a(int arg1, int arg2); * int func_b(int arg1, int arg2); * * # Define a 'my_name' reference, associated with func_a() by default * DEFINE_STATIC_CALL(my_name, func_a); * * # Call func_a() * static_call(my_name)(arg1, arg2); * * # Update 'my_name' to point to func_b() * static_call_update(my_name, &func_b); * * # Call func_b() * static_call(my_name)(arg1, arg2); * * * Implementation details: * * This requires some arch-specific code (CONFIG_HAVE_STATIC_CALL). * Otherwise basic indirect calls are used (with function pointers). * * Each static_call() site calls into a trampoline associated with the name. * The trampoline has a direct branch to the default function. Updates to a * name will modify the trampoline's branch destination. * * If the arch has CONFIG_HAVE_STATIC_CALL_INLINE, then the call sites * themselves will be patched at runtime to call the functions directly, * rather than calling through the trampoline. This requires objtool or a * compiler plugin to detect all the static_call() sites and annotate them * in the .static_call_sites section. * * * Notes on NULL function pointers: * * Static_call()s support NULL functions, with many of the caveats that * regular function pointers have. 
 *
 * Clearly calling a NULL function pointer is 'BAD', so too for
 * static_call()s (although when HAVE_STATIC_CALL it might not be immediately
 * fatal). A NULL static_call can be the result of:
 *
 *   DECLARE_STATIC_CALL_NULL(my_static_call, void (*)(int));
 *
 * which is equivalent to declaring a NULL function pointer with just a
 * typename:
 *
 *   void (*my_func_ptr)(int arg1) = NULL;
 *
 * or using static_call_update() with a NULL function. In both cases the
 * HAVE_STATIC_CALL implementation will patch the trampoline with a RET
 * instruction, instead of an immediate tail-call JMP. HAVE_STATIC_CALL_INLINE
 * architectures can patch the trampoline call to a NOP.
 *
 * In all cases, any argument evaluation is unconditional. Unlike a regular
 * conditional function pointer call:
 *
 *   if (my_func_ptr)
 *       my_func_ptr(arg1)
 *
 * where the argument evaluation also depends on the pointer value.
 *
 * When calling a static_call that can be NULL, use:
 *
 *   static_call_cond(name)(arg1);
 *
 * which will include the required value tests to avoid NULL-pointer
 * dereferences.
 *
 * To query which function is currently set to be called, use:
 *
 *   func = static_call_query(name);
 *
 *
 * DEFINE_STATIC_CALL_RET0 / __static_call_return0:
 *
 * Just like how DEFINE_STATIC_CALL_NULL() / static_call_cond() optimize the
 * conditional void function call, DEFINE_STATIC_CALL_RET0 /
 * __static_call_return0 optimize the do-nothing return-0 function.
 *
 * This feature is strictly UB per the C standard (since it casts a function
 * pointer to a different signature) and relies on the architecture ABI to
 * make things work. In particular it relies on Caller Stack-cleanup and the
 * whole return register being clobbered for short return values. All normal
 * CDECL style ABIs conform.
 *
 * In particular the x86_64 implementation replaces the 5 byte CALL
 * instruction at the callsite with a 5 byte clear of the RAX register,
 * completely eliding any function call overhead.
 *
 * Notably argument setup is unconditional.
 *
 *
 * EXPORT_STATIC_CALL() vs EXPORT_STATIC_CALL_TRAMP():
 *
 * The difference is that the _TRAMP variant tries to only export the
 * trampoline with the result that a module can use static_call{,_cond}() but
 * not static_call_update().
 *
 */

#include <linux/types.h>
#include <linux/cpu.h>
#include <linux/static_call_types.h>

#ifdef CONFIG_HAVE_STATIC_CALL
#include <asm/static_call.h>

/*
 * Either @site or @tramp can be NULL.
*/ extern void arch_static_call_transform(void *site, void *tramp, void *func, bool tail); #define STATIC_CALL_TRAMP_ADDR(name) &STATIC_CALL_TRAMP(name) #else #define STATIC_CALL_TRAMP_ADDR(name) NULL #endif #define static_call_update(name, func) \ ({ \ typeof(&STATIC_CALL_TRAMP(name)) __F = (func); \ __static_call_update(&STATIC_CALL_KEY(name), \ STATIC_CALL_TRAMP_ADDR(name), __F); \ }) #define static_call_query(name) (READ_ONCE(STATIC_CALL_KEY(name).func)) #ifdef CONFIG_HAVE_STATIC_CALL_INLINE extern int static_call_initialized; extern int __init static_call_init(void); extern void static_call_force_reinit(void); struct static_call_mod { struct static_call_mod *next; struct module *mod; /* for vmlinux, mod == NULL */ struct static_call_site *sites; }; /* For finding the key associated with a trampoline */ struct static_call_tramp_key { s32 tramp; s32 key; }; extern void __static_call_update(struct static_call_key *key, void *tramp, void *func); extern int static_call_mod_init(struct module *mod); extern int static_call_text_reserved(void *start, void *end); extern long __static_call_return0(void); #define DEFINE_STATIC_CALL(name, _func) \ DECLARE_STATIC_CALL(name, _func); \ struct static_call_key STATIC_CALL_KEY(name) = { \ .func = _func, \ .type = 1, \ }; \ ARCH_DEFINE_STATIC_CALL_TRAMP(name, _func) #define DEFINE_STATIC_CALL_NULL(name, _func) \ DECLARE_STATIC_CALL(name, _func); \ struct static_call_key STATIC_CALL_KEY(name) = { \ .func = NULL, \ .type = 1, \ }; \ ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) #define DEFINE_STATIC_CALL_RET0(name, _func) \ DECLARE_STATIC_CALL(name, _func); \ struct static_call_key STATIC_CALL_KEY(name) = { \ .func = __static_call_return0, \ .type = 1, \ }; \ ARCH_DEFINE_STATIC_CALL_RET0_TRAMP(name) #define static_call_cond(name) (void)__static_call(name) #define EXPORT_STATIC_CALL(name) \ EXPORT_SYMBOL(STATIC_CALL_KEY(name)); \ EXPORT_SYMBOL(STATIC_CALL_TRAMP(name)) #define EXPORT_STATIC_CALL_GPL(name) \ EXPORT_SYMBOL_GPL(STATIC_CALL_KEY(name)); \ EXPORT_SYMBOL_GPL(STATIC_CALL_TRAMP(name)) /* Leave the key unexported, so modules can't change static call targets: */ #define EXPORT_STATIC_CALL_TRAMP(name) \ EXPORT_SYMBOL(STATIC_CALL_TRAMP(name)); \ ARCH_ADD_TRAMP_KEY(name) #define EXPORT_STATIC_CALL_TRAMP_GPL(name) \ EXPORT_SYMBOL_GPL(STATIC_CALL_TRAMP(name)); \ ARCH_ADD_TRAMP_KEY(name) #elif defined(CONFIG_HAVE_STATIC_CALL) #define static_call_initialized 0 static inline int static_call_init(void) { return 0; } #define DEFINE_STATIC_CALL(name, _func) \ DECLARE_STATIC_CALL(name, _func); \ struct static_call_key STATIC_CALL_KEY(name) = { \ .func = _func, \ }; \ ARCH_DEFINE_STATIC_CALL_TRAMP(name, _func) #define DEFINE_STATIC_CALL_NULL(name, _func) \ DECLARE_STATIC_CALL(name, _func); \ struct static_call_key STATIC_CALL_KEY(name) = { \ .func = NULL, \ }; \ ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) #define DEFINE_STATIC_CALL_RET0(name, _func) \ DECLARE_STATIC_CALL(name, _func); \ struct static_call_key STATIC_CALL_KEY(name) = { \ .func = __static_call_return0, \ }; \ ARCH_DEFINE_STATIC_CALL_RET0_TRAMP(name) #define static_call_cond(name) (void)__static_call(name) static inline void __static_call_update(struct static_call_key *key, void *tramp, void *func) { cpus_read_lock(); WRITE_ONCE(key->func, func); arch_static_call_transform(NULL, tramp, func, false); cpus_read_unlock(); } static inline int static_call_text_reserved(void *start, void *end) { return 0; } extern long __static_call_return0(void); #define EXPORT_STATIC_CALL(name) \ 
EXPORT_SYMBOL(STATIC_CALL_KEY(name)); \ EXPORT_SYMBOL(STATIC_CALL_TRAMP(name)) #define EXPORT_STATIC_CALL_GPL(name) \ EXPORT_SYMBOL_GPL(STATIC_CALL_KEY(name)); \ EXPORT_SYMBOL_GPL(STATIC_CALL_TRAMP(name)) /* Leave the key unexported, so modules can't change static call targets: */ #define EXPORT_STATIC_CALL_TRAMP(name) \ EXPORT_SYMBOL(STATIC_CALL_TRAMP(name)) #define EXPORT_STATIC_CALL_TRAMP_GPL(name) \ EXPORT_SYMBOL_GPL(STATIC_CALL_TRAMP(name)) #else /* Generic implementation */ #define static_call_initialized 0 static inline int static_call_init(void) { return 0; } static inline long __static_call_return0(void) { return 0; } #define __DEFINE_STATIC_CALL(name, _func, _func_init) \ DECLARE_STATIC_CALL(name, _func); \ struct static_call_key STATIC_CALL_KEY(name) = { \ .func = _func_init, \ } #define DEFINE_STATIC_CALL(name, _func) \ __DEFINE_STATIC_CALL(name, _func, _func) #define DEFINE_STATIC_CALL_NULL(name, _func) \ __DEFINE_STATIC_CALL(name, _func, NULL) #define DEFINE_STATIC_CALL_RET0(name, _func) \ __DEFINE_STATIC_CALL(name, _func, __static_call_return0) static inline void __static_call_nop(void) { } /* * This horrific hack takes care of two things: * * - it ensures the compiler will only load the function pointer ONCE, * which avoids a reload race. * * - it ensures the argument evaluation is unconditional, similar * to the HAVE_STATIC_CALL variant. * * Sadly current GCC/Clang (10 for both) do not optimize this properly * and will emit an indirect call for the NULL case :-( */ #define __static_call_cond(name) \ ({ \ void *func = READ_ONCE(STATIC_CALL_KEY(name).func); \ if (!func) \ func = &__static_call_nop; \ (typeof(STATIC_CALL_TRAMP(name))*)func; \ }) #define static_call_cond(name) (void)__static_call_cond(name) static inline void __static_call_update(struct static_call_key *key, void *tramp, void *func) { WRITE_ONCE(key->func, func); } static inline int static_call_text_reserved(void *start, void *end) { return 0; } #define EXPORT_STATIC_CALL(name) EXPORT_SYMBOL(STATIC_CALL_KEY(name)) #define EXPORT_STATIC_CALL_GPL(name) EXPORT_SYMBOL_GPL(STATIC_CALL_KEY(name)) #endif /* CONFIG_HAVE_STATIC_CALL */ #endif /* _LINUX_STATIC_CALL_H */ |
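/*
 * Editor's sketch (not part of the header above): the NULL/conditional
 * pattern the header describes, for a hook that may legitimately be unset.
 * The names my_hook, trace_handler, fire_hook and enable_hook are
 * hypothetical.
 */
#include <linux/static_call.h>

static void trace_handler(int arg)
{
	/* slow path, only reached once the hook has been installed */
}

DEFINE_STATIC_CALL_NULL(my_hook, trace_handler);

static void fire_hook(int arg)
{
	/* no NULL check needed; the unset case is patched to a RET/NOP */
	static_call_cond(my_hook)(arg);
}

static void enable_hook(void)
{
	static_call_update(my_hook, trace_handler);
}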
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM vmalloc

#if !defined(_TRACE_VMALLOC_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_VMALLOC_H

#include <linux/tracepoint.h>

/**
 * alloc_vmap_area - called when a new vmap allocation occurs
 * @addr: an allocated address
 * @size: a requested size
 * @align: a requested alignment
 * @vstart: a requested start range
 * @vend: a requested end range
 * @failed: whether the allocation failed
 *
 * This event is used for debugging purposes. It can give extra
 * information for a developer about how often it occurs and which
 * parameters are passed for further validation.
 */
TRACE_EVENT(alloc_vmap_area,

	TP_PROTO(unsigned long addr, unsigned long size, unsigned long align,
		unsigned long vstart, unsigned long vend, int failed),

	TP_ARGS(addr, size, align, vstart, vend, failed),

	TP_STRUCT__entry(
		__field(unsigned long, addr)
		__field(unsigned long, size)
		__field(unsigned long, align)
		__field(unsigned long, vstart)
		__field(unsigned long, vend)
		__field(int, failed)
	),

	TP_fast_assign(
		__entry->addr = addr;
		__entry->size = size;
		__entry->align = align;
		__entry->vstart = vstart;
		__entry->vend = vend;
		__entry->failed = failed;
	),

	TP_printk("va_start: %lu size=%lu align=%lu vstart=0x%lx vend=0x%lx failed=%d",
		__entry->addr, __entry->size, __entry->align,
		__entry->vstart, __entry->vend, __entry->failed)
);

/**
 * purge_vmap_area_lazy - called when vmap areas were lazily freed
 * @start: purging start address
 * @end: purging end address
 * @npurged: number of purged vmap areas
 *
 * This event is used for debugging purposes. It gives some
 * indication about the start:end range and how many objects
 * are released.
 */
TRACE_EVENT(purge_vmap_area_lazy,

	TP_PROTO(unsigned long start, unsigned long end,
		unsigned int npurged),

	TP_ARGS(start, end, npurged),

	TP_STRUCT__entry(
		__field(unsigned long, start)
		__field(unsigned long, end)
		__field(unsigned int, npurged)
	),

	TP_fast_assign(
		__entry->start = start;
		__entry->end = end;
		__entry->npurged = npurged;
	),

	TP_printk("start=0x%lx end=0x%lx num_purged=%u",
		__entry->start, __entry->end, __entry->npurged)
);

/**
 * free_vmap_area_noflush - called when a vmap area is freed
 * @va_start: a start address of VA
 * @nr_lazy: number of current lazy pages
 * @nr_lazy_max: number of maximum lazy pages
 *
 * This event is used for debugging purposes. It gives some
 * indication about a VA that is released, the number of currently
 * outstanding areas, and the maximum allowed threshold before
 * all of them are dropped.
 */
TRACE_EVENT(free_vmap_area_noflush,

	TP_PROTO(unsigned long va_start, unsigned long nr_lazy,
		unsigned long nr_lazy_max),

	TP_ARGS(va_start, nr_lazy, nr_lazy_max),

	TP_STRUCT__entry(
		__field(unsigned long, va_start)
		__field(unsigned long, nr_lazy)
		__field(unsigned long, nr_lazy_max)
	),

	TP_fast_assign(
		__entry->va_start = va_start;
		__entry->nr_lazy = nr_lazy;
		__entry->nr_lazy_max = nr_lazy_max;
	),

	TP_printk("va_start=0x%lx nr_lazy=%lu nr_lazy_max=%lu",
		__entry->va_start, __entry->nr_lazy, __entry->nr_lazy_max)
);

#endif /* _TRACE_VMALLOC_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
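/*
 * Editor's sketch (not part of the header above): TRACE_EVENT(alloc_vmap_area,
 * ...) generates a trace_alloc_vmap_area() call that mm/vmalloc.c invokes at
 * the allocation site, roughly as below.  The surrounding function is
 * illustrative only.
 */
#include <trace/events/vmalloc.h>

static void report_vmap_alloc(unsigned long addr, unsigned long size,
			      unsigned long align, unsigned long vstart,
			      unsigned long vend, int failed)
{
	/* compiles down to a patched-out branch unless the event is enabled */
	trace_alloc_vmap_area(addr, size, align, vstart, vend, failed);
}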
// SPDX-License-Identifier: GPL-2.0-only
/*
 * dma-fence-util: misc functions for dma_fence objects
 *
 * Copyright (C) 2022 Advanced Micro Devices, Inc.
 * Authors:
 *	Christian König <christian.koenig@amd.com>
 */

#include <linux/dma-fence.h>
#include <linux/dma-fence-array.h>
#include <linux/dma-fence-chain.h>
#include <linux/dma-fence-unwrap.h>
#include <linux/slab.h>
#include <linux/sort.h>

/* Internal helper to start new array iteration, don't use directly */
static struct dma_fence *
__dma_fence_unwrap_array(struct dma_fence_unwrap *cursor)
{
	cursor->array = dma_fence_chain_contained(cursor->chain);
	cursor->index = 0;
	return dma_fence_array_first(cursor->array);
}

/**
 * dma_fence_unwrap_first - return the first fence from fence containers
 * @head: the entrypoint into the containers
 * @cursor: current position inside the containers
 *
 * Unwraps potential dma_fence_chain/dma_fence_array containers and returns
 * the first fence.
 */
struct dma_fence *dma_fence_unwrap_first(struct dma_fence *head,
					 struct dma_fence_unwrap *cursor)
{
	cursor->chain = dma_fence_get(head);
	return __dma_fence_unwrap_array(cursor);
}
EXPORT_SYMBOL_GPL(dma_fence_unwrap_first);

/**
 * dma_fence_unwrap_next - return the next fence from the fence containers
 * @cursor: current position inside the containers
 *
 * Continues unwrapping the dma_fence_chain/dma_fence_array containers and
 * returns the next fence from them.
 */
struct dma_fence *dma_fence_unwrap_next(struct dma_fence_unwrap *cursor)
{
	struct dma_fence *tmp;

	++cursor->index;
	tmp = dma_fence_array_next(cursor->array, cursor->index);
	if (tmp)
		return tmp;

	cursor->chain = dma_fence_chain_walk(cursor->chain);
	return __dma_fence_unwrap_array(cursor);
}
EXPORT_SYMBOL_GPL(dma_fence_unwrap_next);

static int fence_cmp(const void *_a, const void *_b)
{
	struct dma_fence *a = *(struct dma_fence **)_a;
	struct dma_fence *b = *(struct dma_fence **)_b;

	if (a->context < b->context)
		return -1;
	else if (a->context > b->context)
		return 1;

	if (dma_fence_is_later(b, a))
		return 1;
	else if (dma_fence_is_later(a, b))
		return -1;

	return 0;
}

/**
 * dma_fence_dedup_array - Sort and deduplicate an array of dma_fence pointers
 * @fences: Array of dma_fence pointers to be deduplicated
 * @num_fences: Number of entries in the @fences array
 *
 * Sorts the input array by context, then removes duplicate
 * fences with the same context, keeping only the most recent one.
 *
 * The array is modified in-place and unreferenced duplicate fences are
 * released via dma_fence_put(). The function returns the new number of
 * fences after deduplication.
 *
 * Return: Number of unique fences remaining in the array.
 */
int dma_fence_dedup_array(struct dma_fence **fences, int num_fences)
{
	int i, j;

	sort(fences, num_fences, sizeof(*fences), fence_cmp, NULL);

	/*
	 * Only keep the most recent fence for each context.
	 */
	j = 0;
	for (i = 1; i < num_fences; i++) {
		if (fences[i]->context == fences[j]->context)
			dma_fence_put(fences[i]);
		else
			fences[++j] = fences[i];
	}

	return ++j;
}
EXPORT_SYMBOL_GPL(dma_fence_dedup_array);

/* Implementation for the dma_fence_merge() macro, don't use directly */
struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences,
					   struct dma_fence **fences,
					   struct dma_fence_unwrap *iter)
{
	struct dma_fence *tmp, *unsignaled = NULL, **array;
	struct dma_fence_array *result;
	ktime_t timestamp;
	int i, count;

	count = 0;
	timestamp = ns_to_ktime(0);
	for (i = 0; i < num_fences; ++i) {
		dma_fence_unwrap_for_each(tmp, &iter[i], fences[i]) {
			if (!dma_fence_is_signaled(tmp)) {
				dma_fence_put(unsignaled);
				unsignaled = dma_fence_get(tmp);
				++count;
			} else {
				ktime_t t = dma_fence_timestamp(tmp);

				if (ktime_after(t, timestamp))
					timestamp = t;
			}
		}
	}

	/*
	 * If we couldn't find a pending fence just return a private signaled
	 * fence with the timestamp of the last signaled one.
	 *
	 * Or if there was a single unsignaled fence left we can return it
	 * directly and early since that is a major path on many workloads.
	 */
	if (count == 0)
		return dma_fence_allocate_private_stub(timestamp);
	else if (count == 1)
		return unsignaled;

	dma_fence_put(unsignaled);

	array = kmalloc_array(count, sizeof(*array), GFP_KERNEL);
	if (!array)
		return NULL;

	count = 0;
	for (i = 0; i < num_fences; ++i) {
		dma_fence_unwrap_for_each(tmp, &iter[i], fences[i]) {
			if (!dma_fence_is_signaled(tmp)) {
				array[count++] = dma_fence_get(tmp);
			} else {
				ktime_t t = dma_fence_timestamp(tmp);

				if (ktime_after(t, timestamp))
					timestamp = t;
			}
		}
	}

	if (count == 0 || count == 1)
		goto return_fastpath;

	count = dma_fence_dedup_array(array, count);

	if (count > 1) {
		result = dma_fence_array_create(count, array,
						dma_fence_context_alloc(1),
						1, false);
		if (!result) {
			for (i = 0; i < count; i++)
				dma_fence_put(array[i]);
			tmp = NULL;
			goto return_tmp;
		}
		return &result->base;
	}

return_fastpath:
	if (count == 0)
		tmp = dma_fence_allocate_private_stub(timestamp);
	else
		tmp = array[0];

return_tmp:
	kfree(array);
	return tmp;
}
EXPORT_SYMBOL_GPL(__dma_fence_unwrap_merge);
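/*
 * Editor's sketch (not part of the file above): walking every component
 * fence of a possibly-wrapped fence with the iterator implemented here.
 * 'composite' may be a plain fence, a dma_fence_chain or a dma_fence_array;
 * the loop visits each leaf fence exactly once.  The function name is
 * hypothetical.
 */
#include <linux/dma-fence.h>
#include <linux/dma-fence-unwrap.h>

static bool all_fences_signaled(struct dma_fence *composite)
{
	struct dma_fence_unwrap iter;
	struct dma_fence *f;

	dma_fence_unwrap_for_each(f, &iter, composite) {
		if (!dma_fence_is_signaled(f))
			return false;
	}

	return true;
}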
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_PAGE_OWNER_H
#define __LINUX_PAGE_OWNER_H

#include <linux/jump_label.h>

#ifdef CONFIG_PAGE_OWNER
extern struct static_key_false page_owner_inited;
extern struct page_ext_operations page_owner_ops;

extern void __reset_page_owner(struct page *page, unsigned short order);
extern void __set_page_owner(struct page *page,
			unsigned short order, gfp_t gfp_mask);
extern void __split_page_owner(struct page *page, int old_order,
			int new_order);
extern void __folio_copy_owner(struct folio *newfolio, struct folio *old);
extern void __folio_set_owner_migrate_reason(struct folio *folio, int reason);
extern void __dump_page_owner(const struct page *page);
extern void pagetypeinfo_showmixedcount_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone);

static inline void reset_page_owner(struct page *page, unsigned short order)
{
	if (static_branch_unlikely(&page_owner_inited))
		__reset_page_owner(page, order);
}

static inline void set_page_owner(struct page *page,
			unsigned short order, gfp_t gfp_mask)
{
	if (static_branch_unlikely(&page_owner_inited))
		__set_page_owner(page, order, gfp_mask);
}

static inline void split_page_owner(struct page *page, int old_order,
			int new_order)
{
	if (static_branch_unlikely(&page_owner_inited))
		__split_page_owner(page, old_order, new_order);
}

static inline void folio_copy_owner(struct folio *newfolio, struct folio *old)
{
	if (static_branch_unlikely(&page_owner_inited))
		__folio_copy_owner(newfolio, old);
}

static inline void folio_set_owner_migrate_reason(struct folio *folio, int reason)
{
	if (static_branch_unlikely(&page_owner_inited))
		__folio_set_owner_migrate_reason(folio, reason);
}

static inline void dump_page_owner(const struct page *page)
{
	if (static_branch_unlikely(&page_owner_inited))
		__dump_page_owner(page);
}
#else
static inline void reset_page_owner(struct page *page, unsigned short order)
{
}
static inline void set_page_owner(struct page *page,
			unsigned short order, gfp_t gfp_mask)
{
}
static inline void split_page_owner(struct page *page, int old_order,
			int new_order)
{
}
static inline void folio_copy_owner(struct folio *newfolio, struct folio *folio)
{
}
static inline void folio_set_owner_migrate_reason(struct folio *folio, int reason)
{
}
static inline void dump_page_owner(const struct page *page)
{
}
#endif /* CONFIG_PAGE_OWNER */

#endif /* __LINUX_PAGE_OWNER_H */
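/*
 * Editor's sketch (not part of the header above): the same static-key guard
 * pattern the wrappers use, applied to a hypothetical feature.  While the
 * key is false, the branch is patched out and the fast path costs only a
 * NOP; __my_feature_hook() stands in for an out-of-line slow path.
 */
#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(my_feature_inited);

void __my_feature_hook(void);	/* hypothetical out-of-line slow path */

static inline void my_feature_hook(void)
{
	if (static_branch_unlikely(&my_feature_inited))
		__my_feature_hook();
}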
3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 
4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 
5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 
5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/ext4/inode.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 * from
 *
 * linux/fs/minix/inode.c
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 *
 * 64-bit file support on 64-bit platforms by Jakub Jelinek
 * (jj@sunsite.ms.mff.cuni.cz)
 *
 * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
 */

#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/rmap.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/iomap.h>
#include <linux/iversion.h>

#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "truncate.h"

#include <trace/events/ext4.h>

static void ext4_journalled_zero_new_buffers(handle_t *handle,
					     struct inode *inode,
					     struct folio *folio,
					     unsigned from, unsigned to);

static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
			     struct ext4_inode_info *ei)
{
	__u32 csum;
	__u16 dummy_csum = 0;
	int offset = offsetof(struct ext4_inode, i_checksum_lo);
	unsigned int csum_size = sizeof(dummy_csum);

	csum = ext4_chksum(ei->i_csum_seed, (__u8 *)raw, offset);
	csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size);
	offset += csum_size;
	csum = ext4_chksum(csum, (__u8 *)raw + offset,
			   EXT4_GOOD_OLD_INODE_SIZE - offset);

	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
		offset = offsetof(struct ext4_inode, i_checksum_hi);
		csum = ext4_chksum(csum, (__u8 *)raw + EXT4_GOOD_OLD_INODE_SIZE,
				   offset - EXT4_GOOD_OLD_INODE_SIZE);
		if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
			csum = ext4_chksum(csum, (__u8 *)&dummy_csum,
					   csum_size);
			offset += csum_size;
		}
		csum = ext4_chksum(csum, (__u8 *)raw + offset,
				   EXT4_INODE_SIZE(inode->i_sb) - offset);
	}

	return csum;
}

static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
				  struct ext4_inode_info *ei)
{
	__u32 provided, calculated;

	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
	    cpu_to_le32(EXT4_OS_LINUX) ||
!ext4_has_feature_metadata_csum(inode->i_sb)) return 1; provided = le16_to_cpu(raw->i_checksum_lo); calculated = ext4_inode_csum(inode, raw, ei); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16; else calculated &= 0xFFFF; return provided == calculated; } void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { __u32 csum; if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || !ext4_has_feature_metadata_csum(inode->i_sb)) return; csum = ext4_inode_csum(inode, raw, ei); raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) raw->i_checksum_hi = cpu_to_le16(csum >> 16); } static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { trace_ext4_begin_ordered_truncate(inode, new_size); /* * If jinode is zero, then we never opened the file for * writing, so there's no need to call * jbd2_journal_begin_ordered_truncate() since there's no * outstanding writes we need to flush. */ if (!EXT4_I(inode)->jinode) return 0; return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode, new_size); } /* * Test whether an inode is a fast symlink. * A fast symlink has its symlink data stored in ext4_inode_info->i_data. */ int ext4_inode_is_fast_symlink(struct inode *inode) { if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { int ea_blocks = EXT4_I(inode)->i_file_acl ? EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; if (ext4_has_inline_data(inode)) return 0; return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); } return S_ISLNK(inode->i_mode) && inode->i_size && (inode->i_size < EXT4_N_BLOCKS * 4); } /* * Called at the last iput() if i_nlink is zero. */ void ext4_evict_inode(struct inode *inode) { handle_t *handle; int err; /* * Credits for final inode cleanup and freeing: * sb + inode (ext4_orphan_del()), block bitmap, group descriptor * (xattr block freeing), bitmap, group descriptor (inode freeing) */ int extra_credits = 6; struct ext4_xattr_inode_array *ea_inode_array = NULL; bool freeze_protected = false; trace_ext4_evict_inode(inode); dax_break_layout_final(inode); if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL) ext4_evict_ea_inode(inode); if (inode->i_nlink) { truncate_inode_pages_final(&inode->i_data); goto no_delete; } if (is_bad_inode(inode)) goto no_delete; dquot_initialize(inode); if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages_final(&inode->i_data); /* * For inodes with journalled data, transaction commit could have * dirtied the inode. And for inodes with dioread_nolock, unwritten * extents converting worker could merge extents and also have dirtied * the inode. Flush worker is ignoring it because of I_FREEING flag but * we still need to remove the inode from the writeback lists. */ if (!list_empty_careful(&inode->i_io_list)) inode_io_list_del(inode); /* * Protect us against freezing - iput() caller didn't have to have any * protection against it. When we are in a running transaction though, * we are already protected against freezing and we cannot grab further * protection due to lock ordering constraints. 
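	 *
	 * A minimal sketch of the pattern used below (illustrative only;
	 * the real code must also drop the protection on every error path):
	 *
	 *	if (!ext4_journal_current_handle()) {
	 *		sb_start_intwrite(inode->i_sb);
	 *		freeze_protected = true;
	 *	}
	 *	...
	 *	if (freeze_protected)
	 *		sb_end_intwrite(inode->i_sb);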
	 */
	if (!ext4_journal_current_handle()) {
		sb_start_intwrite(inode->i_sb);
		freeze_protected = true;
	}

	if (!IS_NOQUOTA(inode))
		extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);

	/*
	 * Block bitmap, group descriptor, and inode are accounted in both
	 * ext4_blocks_for_truncate() and extra_credits. So subtract 3.
	 */
	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
				    ext4_blocks_for_truncate(inode) +
				    extra_credits - 3);
	if (IS_ERR(handle)) {
		ext4_std_error(inode->i_sb, PTR_ERR(handle));
		/*
		 * If we're going to skip the normal cleanup, we still need to
		 * make sure that the in-core orphan linked list is properly
		 * cleaned up.
		 */
		ext4_orphan_del(NULL, inode);
		if (freeze_protected)
			sb_end_intwrite(inode->i_sb);
		goto no_delete;
	}

	if (IS_SYNC(inode))
		ext4_handle_sync(handle);

	/*
	 * Set inode->i_size to 0 before calling ext4_truncate(). We need
	 * special handling of symlinks here because i_size is used to
	 * determine whether ext4_inode_info->i_data contains symlink data or
	 * block mappings. Setting i_size to 0 will remove its fast symlink
	 * status. Erase i_data so that it becomes a valid empty block map.
	 */
	if (ext4_inode_is_fast_symlink(inode))
		memset(EXT4_I(inode)->i_data, 0, sizeof(EXT4_I(inode)->i_data));
	inode->i_size = 0;
	err = ext4_mark_inode_dirty(handle, inode);
	if (err) {
		ext4_warning(inode->i_sb,
			     "couldn't mark inode dirty (err %d)", err);
		goto stop_handle;
	}
	if (inode->i_blocks) {
		err = ext4_truncate(inode);
		if (err) {
			ext4_error_err(inode->i_sb, -err,
				       "couldn't truncate inode %lu (err %d)",
				       inode->i_ino, err);
			goto stop_handle;
		}
	}

	/* Remove xattr references. */
	err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array,
				      extra_credits);
	if (err) {
		ext4_warning(inode->i_sb, "xattr delete (err %d)", err);
stop_handle:
		ext4_journal_stop(handle);
		ext4_orphan_del(NULL, inode);
		if (freeze_protected)
			sb_end_intwrite(inode->i_sb);
		ext4_xattr_inode_array_free(ea_inode_array);
		goto no_delete;
	}

	/*
	 * Kill off the orphan record which ext4_truncate created.
	 * AKPM: I think this can be inside the above `if'.
	 * Note that ext4_orphan_del() has to be able to cope with the
	 * deletion of a non-existent orphan - this is because we don't
	 * know if ext4_truncate() actually created an orphan record.
	 * (Well, we could do this if we need to, but heck - it works)
	 */
	ext4_orphan_del(handle, inode);
	EXT4_I(inode)->i_dtime = (__u32)ktime_get_real_seconds();

	/*
	 * One subtle ordering requirement: if anything has gone wrong
	 * (transaction abort, IO errors, whatever), then we can still
	 * do these next steps (the fs will already have been marked as
	 * having errors), but we can't free the inode if the mark_dirty
	 * fails.
	 */
	if (ext4_mark_inode_dirty(handle, inode))
		/* If that failed, just do the required in-core inode clear. */
		ext4_clear_inode(inode);
	else
		ext4_free_inode(handle, inode);
	ext4_journal_stop(handle);
	if (freeze_protected)
		sb_end_intwrite(inode->i_sb);
	ext4_xattr_inode_array_free(ea_inode_array);
	return;

no_delete:
	/*
	 * Catch the case where something else has accidentally dirtied the
	 * evicting inode, which could cause inode use-after-free issues
	 * later.
	 */
	WARN_ON_ONCE(!list_empty_careful(&inode->i_io_list));

	if (!list_empty(&EXT4_I(inode)->i_fc_list))
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
	ext4_clear_inode(inode);	/* We must guarantee clearing of inode...
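	 * even when nothing is freed on disk; the no_delete path must leave
	 * the in-core inode just as cleared as a real delete would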
*/ } #ifdef CONFIG_QUOTA qsize_t *ext4_get_reserved_space(struct inode *inode) { return &EXT4_I(inode)->i_reserved_quota; } #endif /* * Called with i_data_sem down, which is important since we can call * ext4_discard_preallocations() from here. */ void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); spin_lock(&ei->i_block_reservation_lock); trace_ext4_da_update_reserve_space(inode, used, quota_claim); if (unlikely(used > ei->i_reserved_data_blocks)) { ext4_warning(inode->i_sb, "%s: ino %lu, used %d " "with only %d reserved data blocks", __func__, inode->i_ino, used, ei->i_reserved_data_blocks); WARN_ON(1); used = ei->i_reserved_data_blocks; } /* Update per-inode reservations */ ei->i_reserved_data_blocks -= used; percpu_counter_sub(&sbi->s_dirtyclusters_counter, used); spin_unlock(&ei->i_block_reservation_lock); /* Update quota subsystem for data blocks */ if (quota_claim) dquot_claim_block(inode, EXT4_C2B(sbi, used)); else { /* * We did fallocate with an offset that is already delayed * allocated. So on delayed allocated writeback we should * not re-claim the quota for fallocated blocks. */ dquot_release_reservation_block(inode, EXT4_C2B(sbi, used)); } /* * If we have done all the pending block allocations and if * there aren't any writers on the inode, we can discard the * inode's preallocations. */ if ((ei->i_reserved_data_blocks == 0) && !inode_is_open_for_write(inode)) ext4_discard_preallocations(inode); } static int __check_block_validity(struct inode *inode, const char *func, unsigned int line, struct ext4_map_blocks *map) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; if (journal && inode == journal->j_inode) return 0; if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) { ext4_error_inode(inode, func, line, map->m_pblk, "lblock %lu mapped to illegal pblock %llu " "(length %d)", (unsigned long) map->m_lblk, map->m_pblk, map->m_len); return -EFSCORRUPTED; } return 0; } int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, ext4_lblk_t len) { int ret; if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) return fscrypt_zeroout_range(inode, lblk, pblk, len); ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); if (ret > 0) ret = 0; return ret; } /* * For generic regular files, when updating the extent tree, Ext4 should * hold the i_rwsem and invalidate_lock exclusively. This ensures * exclusion against concurrent page faults, as well as reads and writes. */ #ifdef CONFIG_EXT4_DEBUG void ext4_check_map_extents_env(struct inode *inode) { if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; if (!S_ISREG(inode->i_mode) || IS_NOQUOTA(inode) || IS_VERITY(inode) || is_special_ino(inode->i_sb, inode->i_ino) || (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) || ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || ext4_verity_in_progress(inode)) return; WARN_ON_ONCE(!inode_is_locked(inode) && !rwsem_is_locked(&inode->i_mapping->invalidate_lock)); } #else void ext4_check_map_extents_env(struct inode *inode) {} #endif #define check_block_validity(inode, map) \ __check_block_validity((inode), __func__, __LINE__, (map)) #ifdef ES_AGGRESSIVE_TEST static void ext4_map_blocks_es_recheck(handle_t *handle, struct inode *inode, struct ext4_map_blocks *es_map, struct ext4_map_blocks *map, int flags) { int retval; map->m_flags = 0; /* * There is a race window that the result is not the same. * e.g. 
xfstests #223 when dioread_nolock is enabled. The reason is
	 * that we look up a block mapping in the extent status tree without
	 * taking i_data_sem, so by that time the unwritten extent could
	 * already have been converted.
	 */
	down_read(&EXT4_I(inode)->i_data_sem);
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
		retval = ext4_ext_map_blocks(handle, inode, map, 0);
	} else {
		retval = ext4_ind_map_blocks(handle, inode, map, 0);
	}
	up_read((&EXT4_I(inode)->i_data_sem));

	/*
	 * We don't check m_len because extents can be collapsed in the
	 * status tree, so the m_len values might not be equal.
	 */
	if (es_map->m_lblk != map->m_lblk ||
	    es_map->m_flags != map->m_flags ||
	    es_map->m_pblk != map->m_pblk) {
		printk("ES cache assertion failed for inode: %lu "
		       "es_cached ex [%d/%d/%llu/%x] != "
		       "found ex [%d/%d/%llu/%x] retval %d flags %x\n",
		       inode->i_ino, es_map->m_lblk, es_map->m_len,
		       es_map->m_pblk, es_map->m_flags, map->m_lblk,
		       map->m_len, map->m_pblk, map->m_flags,
		       retval, flags);
	}
}
#endif /* ES_AGGRESSIVE_TEST */

static int ext4_map_query_blocks_next_in_leaf(handle_t *handle,
			struct inode *inode, struct ext4_map_blocks *map,
			unsigned int orig_mlen)
{
	struct ext4_map_blocks map2;
	unsigned int status, status2;
	int retval;

	status = map->m_flags & EXT4_MAP_UNWRITTEN ?
		EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;

	WARN_ON_ONCE(!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF));
	WARN_ON_ONCE(orig_mlen <= map->m_len);

	/* Prepare map2 for lookup in next leaf block */
	map2.m_lblk = map->m_lblk + map->m_len;
	map2.m_len = orig_mlen - map->m_len;
	map2.m_flags = 0;
	retval = ext4_ext_map_blocks(handle, inode, &map2, 0);

	if (retval <= 0) {
		ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
				      map->m_pblk, status, false);
		return map->m_len;
	}

	if (unlikely(retval != map2.m_len)) {
		ext4_warning(inode->i_sb,
			     "ES len assertion failed for inode "
			     "%lu: retval %d != map->m_len %d",
			     inode->i_ino, retval, map2.m_len);
		WARN_ON(1);
	}

	status2 = map2.m_flags & EXT4_MAP_UNWRITTEN ?
		EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;

	/*
	 * If map2 is contiguous with map, then let's insert it as a single
	 * extent in es cache and return the combined length of both the maps.
	 */
	if (map->m_pblk + map->m_len == map2.m_pblk &&
	    status == status2) {
		ext4_es_insert_extent(inode, map->m_lblk,
				      map->m_len + map2.m_len,
				      map->m_pblk, status, false);
		map->m_len += map2.m_len;
	} else {
		ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
				      map->m_pblk, status, false);
	}

	return map->m_len;
}

static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
				 struct ext4_map_blocks *map, int flags)
{
	unsigned int status;
	int retval;
	unsigned int orig_mlen = map->m_len;

	flags &= EXT4_EX_QUERY_FILTER;
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
		retval = ext4_ext_map_blocks(handle, inode, map, flags);
	else
		retval = ext4_ind_map_blocks(handle, inode, map, flags);

	if (retval <= 0)
		return retval;

	if (unlikely(retval != map->m_len)) {
		ext4_warning(inode->i_sb,
			     "ES len assertion failed for inode "
			     "%lu: retval %d != map->m_len %d",
			     inode->i_ino, retval, map->m_len);
		WARN_ON(1);
	}

	/*
	 * No need to query next in leaf:
	 * - if returned extent is not last in leaf or
	 * - if the last in leaf is the full requested range
	 */
	if (!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF) ||
	    map->m_len == orig_mlen) {
		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
		ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
				      map->m_pblk, status, false);
		return retval;
	}

	return ext4_map_query_blocks_next_in_leaf(handle, inode, map,
						  orig_mlen);
}

static int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
				  struct ext4_map_blocks *map, int flags)
{
	struct extent_status es;
	unsigned int status;
	int err, retval = 0;

	/*
	 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE, which
	 * indicates that the blocks and quota have already been checked
	 * when the data was copied into the page cache.
	 */
	if (map->m_flags & EXT4_MAP_DELAYED)
		flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;

	/*
	 * Here we clear m_flags because after allocating a new extent,
	 * it will be set again.
	 */
	map->m_flags &= ~EXT4_MAP_FLAGS;

	/*
	 * We need to check for EXT4 here because migrate could have
	 * changed the inode type in between.
	 */
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
		retval = ext4_ext_map_blocks(handle, inode, map, flags);
	} else {
		retval = ext4_ind_map_blocks(handle, inode, map, flags);

		/*
		 * We allocated new blocks which will result in i_data's
		 * format changing. Force the migrate to fail by clearing
		 * migrate flags.
		 */
		if (retval > 0 && map->m_flags & EXT4_MAP_NEW)
			ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
	}
	if (retval <= 0)
		return retval;

	if (unlikely(retval != map->m_len)) {
		ext4_warning(inode->i_sb,
			     "ES len assertion failed for inode %lu: "
			     "retval %d != map->m_len %d",
			     inode->i_ino, retval, map->m_len);
		WARN_ON(1);
	}

	/*
	 * We have to zeroout blocks before inserting them into extent
	 * status tree. Otherwise someone could look them up there and
	 * use them before they are really zeroed. We also have to
	 * unmap metadata before zeroing as otherwise writeback can
	 * overwrite zeros with stale data from block device.
	 */
	if (flags & EXT4_GET_BLOCKS_ZERO &&
	    map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) {
		err = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk,
					 map->m_len);
		if (err)
			return err;
	}

	/*
	 * If the extent has been zeroed out, we don't need to update
	 * extent status tree.
	 */
	if (flags & EXT4_GET_BLOCKS_PRE_IO &&
	    ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
		if (ext4_es_is_written(&es))
			return retval;
	}

	status = map->m_flags & EXT4_MAP_UNWRITTEN ?
			EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
	ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
			      map->m_pblk, status,
			      flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE);

	return retval;
}

/*
 * The ext4_map_blocks() function tries to look up the requested blocks,
 * and returns them if the blocks are already mapped.
 *
 * Otherwise it takes the write lock of i_data_sem, allocates blocks,
 * stores the allocated blocks in the result buffer head and marks it
 * mapped.
 *
 * For extent-based files it calls ext4_ext_map_blocks(); otherwise it
 * calls ext4_ind_map_blocks() to handle files using indirect block
 * mapping.
 *
 * On success, it returns the number of blocks being mapped or allocated.
 * If flags doesn't contain EXT4_GET_BLOCKS_CREATE and the blocks are
 * pre-allocated and unwritten, the resulting @map is marked as unwritten.
 * If the flags contain EXT4_GET_BLOCKS_CREATE, it will mark @map as mapped.
 *
 * It returns 0 if a plain lookup failed (blocks have not been allocated);
 * in that case, @map is returned as unmapped but we still fill in
 * map->m_len to indicate the length of the hole starting at map->m_lblk.
 *
 * It returns the error in case of allocation failure.
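 *
 * A hedged usage sketch (illustrative only, not a caller in this file):
 *
 *	struct ext4_map_blocks map = { .m_lblk = lblk, .m_len = 1 };
 *	int ret = ext4_map_blocks(handle, inode, &map, 0);
 *
 *	ret > 0:  ret blocks are mapped at map.m_pblk
 *	ret == 0: hole at lblk, map.m_len gives the hole's length
 *	ret < 0:  error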
 */
int ext4_map_blocks(handle_t *handle, struct inode *inode,
		    struct ext4_map_blocks *map, int flags)
{
	struct extent_status es;
	int retval;
	int ret = 0;
	unsigned int orig_mlen = map->m_len;
#ifdef ES_AGGRESSIVE_TEST
	struct ext4_map_blocks orig_map;

	memcpy(&orig_map, map, sizeof(*map));
#endif

	map->m_flags = 0;
	ext_debug(inode, "flag 0x%x, max_blocks %u, logical block %lu\n",
		  flags, map->m_len, (unsigned long) map->m_lblk);

	/*
	 * ext4_map_blocks returns an int, and m_len is an unsigned int
	 */
	if (unlikely(map->m_len > INT_MAX))
		map->m_len = INT_MAX;

	/* We can only handle block numbers less than EXT_MAX_BLOCKS */
	if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
		return -EFSCORRUPTED;

	/*
	 * Callers from the context of data submission are the only exceptions
	 * for regular files that do not hold the i_rwsem or invalidate_lock.
	 * However, caching unrelated ranges is not permitted.
	 */
	if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
		WARN_ON_ONCE(!(flags & EXT4_EX_NOCACHE));
	else
		ext4_check_map_extents_env(inode);

	/* Look up the extent status tree first */
	if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
		if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
			map->m_pblk = ext4_es_pblock(&es) +
					map->m_lblk - es.es_lblk;
			map->m_flags |= ext4_es_is_written(&es) ?
					EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
			retval = es.es_len - (map->m_lblk - es.es_lblk);
			if (retval > map->m_len)
				retval = map->m_len;
			map->m_len = retval;
		} else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
			map->m_pblk = 0;
			map->m_flags |= ext4_es_is_delayed(&es) ?
					EXT4_MAP_DELAYED : 0;
			retval = es.es_len - (map->m_lblk - es.es_lblk);
			if (retval > map->m_len)
				retval = map->m_len;
			map->m_len = retval;
			retval = 0;
		} else {
			BUG();
		}

		if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT)
			return retval;
#ifdef ES_AGGRESSIVE_TEST
		ext4_map_blocks_es_recheck(handle, inode, map,
					   &orig_map, flags);
#endif
		if (!(flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) ||
		    orig_mlen == map->m_len)
			goto found;

		map->m_len = orig_mlen;
	}

	/*
	 * In the cached no-wait query mode there is nothing more we can do
	 * if we cannot find the extent in the cache.
	 */
	if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT)
		return 0;

	/*
	 * Try to see if we can get the block without requesting a new
	 * file system block.
	 */
	down_read(&EXT4_I(inode)->i_data_sem);
	retval = ext4_map_query_blocks(handle, inode, map, flags);
	up_read((&EXT4_I(inode)->i_data_sem));

found:
	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
		ret = check_block_validity(inode, map);
		if (ret != 0)
			return ret;
	}

	/* If it is only a block(s) lookup */
	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
		return retval;

	/*
	 * Return if the blocks have already been allocated.
	 *
	 * Note that if blocks have been preallocated,
	 * ext4_ext_map_blocks() returns with the buffer head unmapped.
	 */
	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
		/*
		 * If we need to convert the extent to unwritten
		 * we continue and do the actual work in
		 * ext4_ext_map_blocks().
		 */
		if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
			return retval;

	ext4_fc_track_inode(handle, inode);

	/*
	 * Allocating new blocks and/or writing to an unwritten extent
	 * may result in updating i_data, so we take the write lock of
	 * i_data_sem and call get_block() with the create == 1 flag.
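	 *
	 * (The lookup above ran under down_read(); allocation can modify
	 * both the extent tree and the extent status cache, hence the
	 * down_write() below.)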
*/ down_write(&EXT4_I(inode)->i_data_sem); retval = ext4_map_create_blocks(handle, inode, map, flags); up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); if (ret != 0) return ret; /* * Inodes with freshly allocated blocks where contents will be * visible after transaction commit must be on transaction's * ordered data list. */ if (map->m_flags & EXT4_MAP_NEW && !(map->m_flags & EXT4_MAP_UNWRITTEN) && !(flags & EXT4_GET_BLOCKS_ZERO) && !ext4_is_quota_file(inode) && ext4_should_order_data(inode)) { loff_t start_byte = (loff_t)map->m_lblk << inode->i_blkbits; loff_t length = (loff_t)map->m_len << inode->i_blkbits; if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) ret = ext4_jbd2_inode_add_wait(handle, inode, start_byte, length); else ret = ext4_jbd2_inode_add_write(handle, inode, start_byte, length); if (ret) return ret; } } if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN || map->m_flags & EXT4_MAP_MAPPED)) ext4_fc_track_range(handle, inode, map->m_lblk, map->m_lblk + map->m_len - 1); if (retval < 0) ext_debug(inode, "failed with err %d\n", retval); return retval; } /* * Update EXT4_MAP_FLAGS in bh->b_state. For buffer heads attached to pages * we have to be careful as someone else may be manipulating b_state as well. */ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags) { unsigned long old_state; unsigned long new_state; flags &= EXT4_MAP_FLAGS; /* Dummy buffer_head? Set non-atomically. */ if (!bh->b_folio) { bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags; return; } /* * Someone else may be modifying b_state. Be careful! This is ugly but * once we get rid of using bh as a container for mapping information * to pass to / from get_block functions, this can go away. */ old_state = READ_ONCE(bh->b_state); do { new_state = (old_state & ~EXT4_MAP_FLAGS) | flags; } while (unlikely(!try_cmpxchg(&bh->b_state, &old_state, new_state))); } /* * Make sure that the current journal transaction has enough credits to map * one extent. Return -EAGAIN if it cannot extend the current running * transaction. */ static inline int ext4_journal_ensure_extent_credits(handle_t *handle, struct inode *inode) { int credits; int ret; /* Called from ext4_da_write_begin() which has no handle started? */ if (!handle) return 0; credits = ext4_chunk_trans_blocks(inode, 1); ret = __ext4_journal_ensure_credits(handle, credits, credits, 0); return ret <= 0 ? ret : -EAGAIN; } static int _ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int flags) { struct ext4_map_blocks map; int ret = 0; if (ext4_has_inline_data(inode)) return -ERANGE; map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map, flags); if (ret > 0) { map_bh(bh, inode->i_sb, map.m_pblk); ext4_update_bh_state(bh, map.m_flags); bh->b_size = inode->i_sb->s_blocksize * map.m_len; ret = 0; } else if (ret == 0) { /* hole case, need to fill in bh->b_size */ bh->b_size = inode->i_sb->s_blocksize * map.m_len; } return ret; } int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { return _ext4_get_block(inode, iblock, bh, create ? EXT4_GET_BLOCKS_CREATE : 0); } /* * Get block function used when preparing for buffered write if we require * creating an unwritten extent if blocks haven't been allocated. The extent * will be converted to written after the IO is complete. 
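 *
 * Illustrative use (this is how the buffered-write path below wires it
 * up; see ext4_write_begin()):
 *
 *	err = ext4_block_write_begin(handle, folio, pos, len,
 *				     ext4_get_block_unwritten);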
*/ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { int ret = 0; ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n", inode->i_ino, create); ret = _ext4_get_block(inode, iblock, bh_result, EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); /* * If the buffer is marked unwritten, mark it as new to make sure it is * zeroed out correctly in case of partial writes. Otherwise, there is * a chance of stale data getting exposed. */ if (ret == 0 && buffer_unwritten(bh_result)) set_buffer_new(bh_result); return ret; } /* Maximum number of blocks we map for direct IO at once. */ #define DIO_MAX_BLOCKS 4096 /* * `handle' can be NULL if create is zero */ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, ext4_lblk_t block, int map_flags) { struct ext4_map_blocks map; struct buffer_head *bh; int create = map_flags & EXT4_GET_BLOCKS_CREATE; bool nowait = map_flags & EXT4_GET_BLOCKS_CACHED_NOWAIT; int err; ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) || handle != NULL || create == 0); ASSERT(create == 0 || !nowait); map.m_lblk = block; map.m_len = 1; err = ext4_map_blocks(handle, inode, &map, map_flags); if (err == 0) return create ? ERR_PTR(-ENOSPC) : NULL; if (err < 0) return ERR_PTR(err); if (nowait) return sb_find_get_block(inode->i_sb, map.m_pblk); /* * Since bh could introduce extra ref count such as referred by * journal_head etc. Try to avoid using __GFP_MOVABLE here * as it may fail the migration when journal_head remains. */ bh = getblk_unmovable(inode->i_sb->s_bdev, map.m_pblk, inode->i_sb->s_blocksize); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); if (map.m_flags & EXT4_MAP_NEW) { ASSERT(create != 0); ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) || (handle != NULL)); /* * Now that we do not always journal data, we should * keep in mind whether this should always journal the * new buffer as metadata. For now, regular file * writes use ext4_get_block instead, so it's not a * problem. */ lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); err = ext4_journal_get_create_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (unlikely(err)) { unlock_buffer(bh); goto errout; } if (!buffer_uptodate(bh)) { memset(bh->b_data, 0, inode->i_sb->s_blocksize); set_buffer_uptodate(bh); } unlock_buffer(bh); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, inode, bh); if (unlikely(err)) goto errout; } else BUFFER_TRACE(bh, "not a new buffer"); return bh; errout: brelse(bh); return ERR_PTR(err); } struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, ext4_lblk_t block, int map_flags) { struct buffer_head *bh; int ret; bh = ext4_getblk(handle, inode, block, map_flags); if (IS_ERR(bh)) return bh; if (!bh || ext4_buffer_uptodate(bh)) return bh; ret = ext4_read_bh_lock(bh, REQ_META | REQ_PRIO, true); if (ret) { put_bh(bh); return ERR_PTR(ret); } return bh; } /* Read a contiguous batch of blocks. */ int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, bool wait, struct buffer_head **bhs) { int i, err; for (i = 0; i < bh_count; i++) { bhs[i] = ext4_getblk(NULL, inode, block + i, 0 /* map_flags */); if (IS_ERR(bhs[i])) { err = PTR_ERR(bhs[i]); bh_count = i; goto out_brelse; } } for (i = 0; i < bh_count; i++) /* Note that NULL bhs[i] is valid because of holes. 
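 * ext4_getblk() returns NULL rather than an error for a hole when
 * EXT4_GET_BLOCKS_CREATE is not set (see above), so a NULL entry
 * here simply means there is nothing to read.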
 */
		if (bhs[i] && !ext4_buffer_uptodate(bhs[i]))
			ext4_read_bh_lock(bhs[i], REQ_META | REQ_PRIO, false);

	if (!wait)
		return 0;

	for (i = 0; i < bh_count; i++)
		if (bhs[i])
			wait_on_buffer(bhs[i]);

	for (i = 0; i < bh_count; i++) {
		if (bhs[i] && !buffer_uptodate(bhs[i])) {
			err = -EIO;
			goto out_brelse;
		}
	}
	return 0;

out_brelse:
	for (i = 0; i < bh_count; i++) {
		brelse(bhs[i]);
		bhs[i] = NULL;
	}
	return err;
}

int ext4_walk_page_buffers(handle_t *handle, struct inode *inode,
			   struct buffer_head *head,
			   unsigned from, unsigned to,
			   int *partial,
			   int (*fn)(handle_t *handle, struct inode *inode,
				     struct buffer_head *bh))
{
	struct buffer_head *bh;
	unsigned block_start, block_end;
	unsigned blocksize = head->b_size;
	int err, ret = 0;
	struct buffer_head *next;

	for (bh = head, block_start = 0;
	     ret == 0 && (bh != head || !block_start);
	     block_start = block_end, bh = next) {
		next = bh->b_this_page;
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (partial && !buffer_uptodate(bh))
				*partial = 1;
			continue;
		}
		err = (*fn)(handle, inode, bh);
		if (!ret)
			ret = err;
	}
	return ret;
}

/*
 * Helper for handling dirtying of journalled data. We also mark the folio
 * dirty so that the writeback code knows that this folio (and inode)
 * contains dirty data. ext4_writepages() then commits the appropriate
 * transaction to make the data stable.
 */
static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh)
{
	struct folio *folio = bh->b_folio;
	struct inode *inode = folio->mapping->host;

	/* only regular files have a_ops */
	if (S_ISREG(inode->i_mode))
		folio_mark_dirty(folio);
	return ext4_handle_dirty_metadata(handle, NULL, bh);
}

int do_journal_get_write_access(handle_t *handle, struct inode *inode,
				struct buffer_head *bh)
{
	if (!buffer_mapped(bh) || buffer_freed(bh))
		return 0;
	BUFFER_TRACE(bh, "get write access");
	return ext4_journal_get_write_access(handle, inode->i_sb, bh,
					     EXT4_JTR_NONE);
}

int ext4_block_write_begin(handle_t *handle, struct folio *folio,
			   loff_t pos, unsigned len,
			   get_block_t *get_block)
{
	unsigned int from = offset_in_folio(folio, pos);
	unsigned to = from + len;
	struct inode *inode = folio->mapping->host;
	unsigned block_start, block_end;
	sector_t block;
	int err = 0;
	unsigned blocksize = inode->i_sb->s_blocksize;
	unsigned bbits;
	struct buffer_head *bh, *head, *wait[2];
	int nr_wait = 0;
	int i;
	bool should_journal_data = ext4_should_journal_data(inode);

	BUG_ON(!folio_test_locked(folio));
	BUG_ON(to > folio_size(folio));
	BUG_ON(from > to);

	head = folio_buffers(folio);
	if (!head)
		head = create_empty_buffers(folio, blocksize, 0);
	bbits = ilog2(blocksize);
	block = (sector_t)folio->index << (PAGE_SHIFT - bbits);

	for (bh = head, block_start = 0; bh != head || !block_start;
	     block++, block_start = block_end, bh = bh->b_this_page) {
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (folio_test_uptodate(folio)) {
				set_buffer_uptodate(bh);
			}
			continue;
		}
		if (WARN_ON_ONCE(buffer_new(bh)))
			clear_buffer_new(bh);
		if (!buffer_mapped(bh)) {
			WARN_ON(bh->b_size != blocksize);
			err = ext4_journal_ensure_extent_credits(handle, inode);
			if (!err)
				err = get_block(inode, block, bh, 1);
			if (err)
				break;
			if (buffer_new(bh)) {
				/*
				 * We may be zeroing partial buffers or all new
				 * buffers in case of failure. Prepare JBD2 for
				 * that.
				 */
				if (should_journal_data)
					do_journal_get_write_access(handle,
								    inode, bh);
				if (folio_test_uptodate(folio)) {
					/*
					 * Unlike __block_write_begin() we leave
					 * dirtying of new uptodate buffers to
					 * ->write_end() time or
					 * folio_zero_new_buffers().
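					 * (In data=journal mode buffers must
					 * be dirtied via
					 * ext4_dirty_journalled_data() rather
					 * than marked dirty directly, which
					 * is why the dirtying is deferred.)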
*/ set_buffer_uptodate(bh); continue; } if (block_end > to || block_start < from) folio_zero_segments(folio, to, block_end, block_start, from); continue; } } if (folio_test_uptodate(folio)) { set_buffer_uptodate(bh); continue; } if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh) && (block_start < from || block_end > to)) { ext4_read_bh_lock(bh, 0, false); wait[nr_wait++] = bh; } } /* * If we issued read requests, let them complete. */ for (i = 0; i < nr_wait; i++) { wait_on_buffer(wait[i]); if (!buffer_uptodate(wait[i])) err = -EIO; } if (unlikely(err)) { if (should_journal_data) ext4_journalled_zero_new_buffers(handle, inode, folio, from, to); else folio_zero_new_buffers(folio, from, to); } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) { for (i = 0; i < nr_wait; i++) { int err2; err2 = fscrypt_decrypt_pagecache_blocks(folio, blocksize, bh_offset(wait[i])); if (err2) { clear_buffer_uptodate(wait[i]); err = err2; } } } return err; } /* * To preserve ordering, it is essential that the hole instantiation and * the data write be encapsulated in a single transaction. We cannot * close off a transaction and start a new one between the ext4_get_block() * and the ext4_write_end(). So doing the jbd2_journal_start at the start of * ext4_write_begin() is the right place. */ static int ext4_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; int ret, needed_blocks; handle_t *handle; int retries = 0; struct folio *folio; pgoff_t index; unsigned from, to; ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) return ret; trace_ext4_write_begin(inode, pos, len); /* * Reserve one block more for addition to orphan list in case * we allocate blocks but write fails for some reason */ needed_blocks = ext4_chunk_trans_extent(inode, ext4_journal_blocks_per_folio(inode)) + 1; index = pos >> PAGE_SHIFT; if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, foliop); if (ret < 0) return ret; if (ret == 1) return 0; } /* * write_begin_get_folio() can take a long time if the * system is thrashing due to memory pressure, or if the folio * is being written back. So grab it first before we start * the transaction handle. This also allows us to allocate * the folio (if needed) without using GFP_NOFS. */ retry_grab: folio = write_begin_get_folio(iocb, mapping, index, len); if (IS_ERR(folio)) return PTR_ERR(folio); if (pos + len > folio_pos(folio) + folio_size(folio)) len = folio_pos(folio) + folio_size(folio) - pos; from = offset_in_folio(folio, pos); to = from + len; /* * The same as page allocation, we prealloc buffer heads before * starting the handle. 
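	 * (Presumably for the same reason as the folio itself: doing the
	 * allocation while no handle is running keeps it outside the
	 * transaction and avoids GFP_NOFS constraints.)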
*/ if (!folio_buffers(folio)) create_empty_buffers(folio, inode->i_sb->s_blocksize, 0); folio_unlock(folio); retry_journal: handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { folio_put(folio); return PTR_ERR(handle); } folio_lock(folio); if (folio->mapping != mapping) { /* The folio got truncated from under us */ folio_unlock(folio); folio_put(folio); ext4_journal_stop(handle); goto retry_grab; } /* In case writeback began while the folio was unlocked */ folio_wait_stable(folio); if (ext4_should_dioread_nolock(inode)) ret = ext4_block_write_begin(handle, folio, pos, len, ext4_get_block_unwritten); else ret = ext4_block_write_begin(handle, folio, pos, len, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, folio_buffers(folio), from, to, NULL, do_journal_get_write_access); } if (ret) { bool extended = (pos + len > inode->i_size) && !ext4_verity_in_progress(inode); folio_unlock(folio); /* * ext4_block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need * i_size_read because we hold i_rwsem. * * Add inode to orphan list in case we crash before * truncate finishes */ if (extended && ext4_can_truncate(inode)) ext4_orphan_add(handle, inode); ext4_journal_stop(handle); if (extended) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might * still be on the orphan list; we need to * make sure the inode is removed from the * orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } if (ret == -EAGAIN || (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))) goto retry_journal; folio_put(folio); return ret; } *foliop = folio; return ret; } /* For write_end() in data=journal mode */ static int write_end_fn(handle_t *handle, struct inode *inode, struct buffer_head *bh) { int ret; if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; set_buffer_uptodate(bh); ret = ext4_dirty_journalled_data(handle, bh); clear_buffer_meta(bh); clear_buffer_prio(bh); clear_buffer_new(bh); return ret; } /* * We need to pick up the new inode size which generic_commit_write gave us * `iocb` can be NULL - eg, when called from page_symlink(). * * ext4 never places buffers on inode->i_mapping->i_private_list. metadata * buffers are managed internally. */ static int ext4_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; int ret = 0, ret2; int i_size_changed = 0; bool verity = ext4_verity_in_progress(inode); trace_ext4_write_end(inode, pos, len, copied); if (ext4_has_inline_data(inode) && ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) return ext4_write_inline_data_end(inode, pos, len, copied, folio); copied = block_write_end(pos, len, copied, folio); /* * it's important to update i_size while still holding folio lock: * page writeout could otherwise come in and zero beyond i_size. * * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree * blocks are being written past EOF, so skip the i_size update. 
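	 *
	 * (Note the ordering below: i_size is updated while the folio is
	 * still locked, but marking the inode dirty is deferred until
	 * after folio_unlock(); see the comment further down.)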
*/ if (!verity) i_size_changed = ext4_update_inode_size(inode, pos + copied); folio_unlock(folio); folio_put(folio); if (old_size < pos && !verity) { pagecache_isize_extended(inode, old_size, pos); ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); } /* * Don't mark the inode dirty under folio lock. First, it unnecessarily * makes the holding time of folio lock longer. Second, it forces lock * ordering of folio lock and transaction start for journaling * filesystems. */ if (i_size_changed) ret = ext4_mark_inode_dirty(handle, inode); if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* If we have allocated more blocks and copied * less, we will have blocks allocated outside * inode->i_size, so truncate them. */ ext4_orphan_add(handle, inode); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; if (pos + len > inode->i_size && !verity) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } return ret ? ret : copied; } /* * This is a private version of folio_zero_new_buffers() which doesn't * set the buffer to be dirty, since in data=journalled mode we need * to call ext4_dirty_journalled_data() instead. */ static void ext4_journalled_zero_new_buffers(handle_t *handle, struct inode *inode, struct folio *folio, unsigned from, unsigned to) { unsigned int block_start = 0, block_end; struct buffer_head *head, *bh; bh = head = folio_buffers(folio); do { block_end = block_start + bh->b_size; if (buffer_new(bh)) { if (block_end > from && block_start < to) { if (!folio_test_uptodate(folio)) { unsigned start, size; start = max(from, block_start); size = min(to, block_end) - start; folio_zero_range(folio, start, size); } clear_buffer_new(bh); write_end_fn(handle, inode, bh); } } block_start = block_end; bh = bh->b_this_page; } while (bh != head); } static int ext4_journalled_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; int ret = 0, ret2; int partial = 0; unsigned from, to; int size_changed = 0; bool verity = ext4_verity_in_progress(inode); trace_ext4_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_SIZE - 1); to = from + len; BUG_ON(!ext4_handle_valid(handle)); if (ext4_has_inline_data(inode)) return ext4_write_inline_data_end(inode, pos, len, copied, folio); if (unlikely(copied < len) && !folio_test_uptodate(folio)) { copied = 0; ext4_journalled_zero_new_buffers(handle, inode, folio, from, to); } else { if (unlikely(copied < len)) ext4_journalled_zero_new_buffers(handle, inode, folio, from + copied, to); ret = ext4_walk_page_buffers(handle, inode, folio_buffers(folio), from, from + copied, &partial, write_end_fn); if (!partial) folio_mark_uptodate(folio); } if (!verity) size_changed = ext4_update_inode_size(inode, pos + copied); EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; folio_unlock(folio); folio_put(folio); if (old_size < pos && !verity) { pagecache_isize_extended(inode, old_size, pos); ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); } if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; } if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* If we have allocated more blocks and copied * less, we will have blocks allocated outside * inode->i_size, so truncate them. */ ext4_orphan_add(handle, inode); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; if (pos + len > inode->i_size && !verity) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } return ret ? ret : copied; } /* * Reserve space for 'nr_resv' clusters */ static int ext4_da_reserve_space(struct inode *inode, int nr_resv) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); int ret; /* * We will charge metadata quota at writeout time; this saves * us from metadata over-estimation, though we may go over by * a small amount in the end. Here we just reserve for data. */ ret = dquot_reserve_block(inode, EXT4_C2B(sbi, nr_resv)); if (ret) return ret; spin_lock(&ei->i_block_reservation_lock); if (ext4_claim_free_clusters(sbi, nr_resv, 0)) { spin_unlock(&ei->i_block_reservation_lock); dquot_release_reservation_block(inode, EXT4_C2B(sbi, nr_resv)); return -ENOSPC; } ei->i_reserved_data_blocks += nr_resv; trace_ext4_da_reserve_space(inode, nr_resv); spin_unlock(&ei->i_block_reservation_lock); return 0; /* success */ } void ext4_da_release_space(struct inode *inode, int to_free) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); if (!to_free) return; /* Nothing to release, exit */ spin_lock(&EXT4_I(inode)->i_block_reservation_lock); trace_ext4_da_release_space(inode, to_free); if (unlikely(to_free > ei->i_reserved_data_blocks)) { /* * If there aren't enough reserved blocks, then the * counter is messed up somewhere. Since this * function is called from the invalidate path, it's * harmless to return without any action. */ ext4_warning(inode->i_sb, "ext4_da_release_space: " "ino %lu, to_free %d with only %d reserved " "data blocks", inode->i_ino, to_free, ei->i_reserved_data_blocks); WARN_ON(1); to_free = ei->i_reserved_data_blocks; } ei->i_reserved_data_blocks -= to_free; /* update fs dirty data blocks counter */ percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free); spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); } /* * Delayed allocation stuff */ struct mpage_da_data { /* These are input fields for ext4_do_writepages() */ struct inode *inode; struct writeback_control *wbc; unsigned int can_map:1; /* Can writepages call map blocks? */ /* These are internal state of ext4_do_writepages() */ loff_t start_pos; /* The start pos to write */ loff_t next_pos; /* Current pos to examine */ loff_t end_pos; /* Last pos to examine */ /* * Extent to map - this can be after start_pos because that can be * fully mapped. We somewhat abuse m_flags to store whether the extent * is delalloc or unwritten. */ struct ext4_map_blocks map; struct ext4_io_submit io_submit; /* IO submission data */ unsigned int do_map:1; unsigned int scanned_until_end:1; unsigned int journalled_more_data:1; }; static void mpage_release_unused_pages(struct mpage_da_data *mpd, bool invalidate) { unsigned nr, i; pgoff_t index, end; struct folio_batch fbatch; struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; /* This is necessary when next_pos == 0.
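A sketch of the defensive clamp in ext4_da_release_space() above: the counter here is a plain userspace stand-in, but the shape of the check is the same - warn and clamp rather than underflow the reservation count:

#include <stdio.h>

static unsigned reserved_data_blocks;

/* Never subtract more than is actually reserved; warn when asked to. */
static void release_space(unsigned to_free)
{
    if (!to_free)
        return;
    if (to_free > reserved_data_blocks) {
        fprintf(stderr, "warning: to_free %u with only %u reserved\n",
                to_free, reserved_data_blocks);
        to_free = reserved_data_blocks;
    }
    reserved_data_blocks -= to_free;
}

int main(void)
{
    reserved_data_blocks = 4;
    release_space(2);
    release_space(10);      /* triggers the warning-and-clamp path */
    printf("reserved=%u\n", reserved_data_blocks);
    return 0;
}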
*/ if (mpd->start_pos >= mpd->next_pos) return; mpd->scanned_until_end = 0; if (invalidate) { ext4_lblk_t start, last; start = EXT4_B_TO_LBLK(inode, mpd->start_pos); last = mpd->next_pos >> inode->i_blkbits; /* * avoid racing with extent status tree scans made by * ext4_insert_delayed_block() */ down_write(&EXT4_I(inode)->i_data_sem); ext4_es_remove_extent(inode, start, last - start); up_write(&EXT4_I(inode)->i_data_sem); } folio_batch_init(&fbatch); index = mpd->start_pos >> PAGE_SHIFT; end = mpd->next_pos >> PAGE_SHIFT; while (index < end) { nr = filemap_get_folios(mapping, &index, end - 1, &fbatch); if (nr == 0) break; for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; if (folio_pos(folio) < mpd->start_pos) continue; if (folio_next_index(folio) > end) continue; BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); if (invalidate) { if (folio_mapped(folio)) folio_clear_dirty_for_io(folio); block_invalidate_folio(folio, 0, folio_size(folio)); folio_clear_uptodate(folio); } folio_unlock(folio); } folio_batch_release(&fbatch); } } static void ext4_print_free_blocks(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct super_block *sb = inode->i_sb; struct ext4_inode_info *ei = EXT4_I(inode); ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", EXT4_C2B(EXT4_SB(inode->i_sb), ext4_count_free_clusters(sb))); ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", (long long) EXT4_C2B(EXT4_SB(sb), percpu_counter_sum(&sbi->s_freeclusters_counter))); ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", (long long) EXT4_C2B(EXT4_SB(sb), percpu_counter_sum(&sbi->s_dirtyclusters_counter))); ext4_msg(sb, KERN_CRIT, "Block reservation details"); ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", ei->i_reserved_data_blocks); return; } /* * Check whether the cluster containing lblk has been allocated or has * a delalloc reservation. * * Returns 0 if the cluster doesn't have either, 1 if it has a delalloc * reservation, 2 if it's already been allocated, negative error code on * failure. */ static int ext4_clu_alloc_state(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int ret; /* Has delalloc reservation? */ if (ext4_es_scan_clu(inode, &ext4_es_is_delayed, lblk)) return 1; /* Already been allocated? */ if (ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk)) return 2; ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk)); if (ret < 0) return ret; if (ret > 0) return 2; return 0; } /* * ext4_insert_delayed_blocks - adds multiple delayed blocks to the extents * status tree, incrementing the reserved * cluster/block count or making pending * reservations where needed * * @inode - file containing the newly added blocks * @lblk - start logical block to be added * @len - length of blocks to be added * * Returns 0 on success, negative error code on failure. */ static int ext4_insert_delayed_blocks(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int ret; bool lclu_allocated = false; bool end_allocated = false; ext4_lblk_t resv_clu; ext4_lblk_t end = lblk + len - 1; /* * If the cluster containing lblk or end is shared with a delayed, * written, or unwritten extent in a bigalloc file system, it's * already been accounted for and does not need to be reserved. * A pending reservation must be made for the cluster if it's * shared with a written or unwritten extent and doesn't already * have one.
Written and unwritten extents can be purged from the * extents status tree if the system is under memory pressure, so * it's necessary to examine the extent tree if a search of the * extents status tree doesn't get a match. */ if (sbi->s_cluster_ratio == 1) { ret = ext4_da_reserve_space(inode, len); if (ret != 0) /* ENOSPC */ return ret; } else { /* bigalloc */ resv_clu = EXT4_B2C(sbi, end) - EXT4_B2C(sbi, lblk) + 1; ret = ext4_clu_alloc_state(inode, lblk); if (ret < 0) return ret; if (ret > 0) { resv_clu--; lclu_allocated = (ret == 2); } if (EXT4_B2C(sbi, lblk) != EXT4_B2C(sbi, end)) { ret = ext4_clu_alloc_state(inode, end); if (ret < 0) return ret; if (ret > 0) { resv_clu--; end_allocated = (ret == 2); } } if (resv_clu) { ret = ext4_da_reserve_space(inode, resv_clu); if (ret != 0) /* ENOSPC */ return ret; } } ext4_es_insert_delayed_extent(inode, lblk, len, lclu_allocated, end_allocated); return 0; } /* * Looks up the requested blocks and sets the delalloc extent map. * First, try to look up the extent entry that contains the requested * blocks in the extent status tree without i_data_sem; then try to look * up the on-disk extent mapping with i_data_sem held in read mode; * finally, hold i_data_sem in write mode, look up again, and add a * delalloc extent entry if it still couldn't find any extent. Pass out * the mapped extent through @map and return 0 on success. */ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map) { struct extent_status es; int retval; #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; memcpy(&orig_map, map, sizeof(*map)); #endif map->m_flags = 0; ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len, (unsigned long) map->m_lblk); ext4_check_map_extents_env(inode); /* Look up the extent status tree first */ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { map->m_len = min_t(unsigned int, map->m_len, es.es_len - (map->m_lblk - es.es_lblk)); if (ext4_es_is_hole(&es)) goto add_delayed; found: /* * A delayed extent could be allocated by fallocate, * so we need to check for it. */ if (ext4_es_is_delayed(&es)) { map->m_flags |= EXT4_MAP_DELAYED; return 0; } map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; if (ext4_es_is_written(&es)) map->m_flags |= EXT4_MAP_MAPPED; else if (ext4_es_is_unwritten(&es)) map->m_flags |= EXT4_MAP_UNWRITTEN; else BUG(); #ifdef ES_AGGRESSIVE_TEST ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); #endif return 0; } /* * Try to see if we can get the block without requesting a new * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); if (ext4_has_inline_data(inode)) retval = 0; else retval = ext4_map_query_blocks(NULL, inode, map, 0); up_read(&EXT4_I(inode)->i_data_sem); if (retval) return retval < 0 ? retval : 0; add_delayed: down_write(&EXT4_I(inode)->i_data_sem); /* * Page fault path (ext4_page_mkwrite does not take i_rwsem) * and fallocate path (no folio lock) can race. Make sure we * look up the extent status tree here again while i_data_sem * is held in write mode, before inserting a new da entry in * the extent status tree. */ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { map->m_len = min_t(unsigned int, map->m_len, es.es_len - (map->m_lblk - es.es_lblk)); if (!ext4_es_is_hole(&es)) { up_write(&EXT4_I(inode)->i_data_sem); goto found; } } else if (!ext4_has_inline_data(inode)) { retval = ext4_map_query_blocks(NULL, inode, map, 0); if (retval) { up_write(&EXT4_I(inode)->i_data_sem); return retval < 0 ?
retval : 0; } } map->m_flags |= EXT4_MAP_DELAYED; retval = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len); up_write(&EXT4_I(inode)->i_data_sem); return retval; } /* * This is a special get_block_t callback which is used by * ext4_da_write_begin(). It will either return a mapped block or * reserve space for a single block. * * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. * We also have b_blocknr = -1 and b_bdev initialized properly * * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. * We also have b_blocknr = the physical block mapping the unwritten extent * and b_bdev initialized properly. */ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { struct ext4_map_blocks map; sector_t invalid_block = ~((sector_t) 0xffff); int ret = 0; BUG_ON(create == 0); BUG_ON(bh->b_size != inode->i_sb->s_blocksize); if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) invalid_block = ~0; map.m_lblk = iblock; map.m_len = 1; /* * First, we need to know whether the block is already allocated; * preallocated blocks are unmapped but should be treated * the same as allocated blocks. */ ret = ext4_da_map_blocks(inode, &map); if (ret < 0) return ret; if (map.m_flags & EXT4_MAP_DELAYED) { map_bh(bh, inode->i_sb, invalid_block); set_buffer_new(bh); set_buffer_delay(bh); return 0; } map_bh(bh, inode->i_sb, map.m_pblk); ext4_update_bh_state(bh, map.m_flags); if (buffer_unwritten(bh)) { /* A delayed write to unwritten bh should be marked * new and mapped. Mapped ensures that we don't do * get_block multiple times when we write to the same * offset and new ensures that we do proper zero out * for partial write. */ set_buffer_new(bh); set_buffer_mapped(bh); } return 0; } static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio) { mpd->start_pos += folio_size(folio); mpd->wbc->nr_to_write -= folio_nr_pages(folio); folio_unlock(folio); } static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) { size_t len; loff_t size; int err; WARN_ON_ONCE(folio_pos(folio) != mpd->start_pos); folio_clear_dirty_for_io(folio); /* * We have to be very careful here! Nothing protects writeback path * against i_size changes and the page can be writeably mapped into * page tables. So an application can be growing i_size and writing * data through mmap while writeback runs. folio_clear_dirty_for_io() * write-protects our page in page tables and the page cannot get * written to again until we release folio lock. So only after * folio_clear_dirty_for_io() are we safe to sample i_size for * ext4_bio_write_folio() to zero out the tail of the written page. We * rely on the barrier provided by folio_test_clear_dirty() in * folio_clear_dirty_for_io() to make sure i_size is really sampled only * after page tables are updated. */ size = i_size_read(mpd->inode); len = folio_size(folio); if (folio_pos(folio) + len > size && !ext4_verity_in_progress(mpd->inode)) len = size & (len - 1); err = ext4_bio_write_folio(&mpd->io_submit, folio, len); return err; } #define BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay)) /* * mballoc gives us at most this number of blocks... * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). * The rest of mballoc seems to handle chunks up to full group size.
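This accumulation cap interacts with the merge test in mpage_add_bh_to_extent(), which follows. Below is a toy userspace model of that merge rule (adjacent block, same state, length below the allocator's limit); names and values are illustrative only:

#include <stdio.h>
#include <stdbool.h>

#define MAX_EXTENT_LEN 2048     /* mirrors MAX_WRITEPAGES_EXTENT_LEN */

struct extent { unsigned long lblk; unsigned len; unsigned state; };

/* A block can join the extent only if the extent is empty, or the block
 * is adjacent, in the same state, and the length cap is not reached. */
static bool try_add(struct extent *ex, unsigned long lblk, unsigned state)
{
    if (ex->len == 0) {
        ex->lblk = lblk;
        ex->len = 1;
        ex->state = state;
        return true;
    }
    if (ex->len >= MAX_EXTENT_LEN)
        return false;
    if (lblk == ex->lblk + ex->len && state == ex->state) {
        ex->len++;
        return true;
    }
    return false;
}

int main(void)
{
    struct extent ex = { 0 };
    unsigned long blocks[] = { 10, 11, 12, 20 };
    for (int i = 0; i < 4; i++)
        if (!try_add(&ex, blocks[i], 1))
            printf("block %lu cannot be merged; extent is submitted\n",
                   blocks[i]);
    printf("extent: lblk=%lu len=%u\n", ex.lblk, ex.len);
    return 0;
}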
*/ #define MAX_WRITEPAGES_EXTENT_LEN 2048 /* * mpage_add_bh_to_extent - try to add bh to extent of blocks to map * * @mpd - extent of blocks * @lblk - logical number of the block in the file * @bh - buffer head we want to add to the extent * * The function is used to collect contiguous blocks in the same state. If the * buffer doesn't require mapping for writeback and we haven't started the * extent of buffers to map yet, the function returns 'true' immediately - the * caller can write the buffer right away. Otherwise the function returns true * if the block has been added to the extent, false if the block couldn't be * added. */ static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, struct buffer_head *bh) { struct ext4_map_blocks *map = &mpd->map; /* Buffer that doesn't need mapping for writeback? */ if (!buffer_dirty(bh) || !buffer_mapped(bh) || (!buffer_delay(bh) && !buffer_unwritten(bh))) { /* So far no extent to map => we write the buffer right away */ if (map->m_len == 0) return true; return false; } /* First block in the extent? */ if (map->m_len == 0) { /* We cannot map unless handle is started... */ if (!mpd->do_map) return false; map->m_lblk = lblk; map->m_len = 1; map->m_flags = bh->b_state & BH_FLAGS; return true; } /* Don't go larger than mballoc is willing to allocate */ if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) return false; /* Can we merge the block into our big extent? */ if (lblk == map->m_lblk + map->m_len && (bh->b_state & BH_FLAGS) == map->m_flags) { map->m_len++; return true; } return false; } /* * mpage_process_page_bufs - submit page buffers for IO or add them to extent * * @mpd - extent of blocks for mapping * @head - the first buffer in the page * @bh - buffer we should start processing from * @lblk - logical number of the block in the file corresponding to @bh * * Walk through page buffers from @bh up to @head (exclusive) and either submit * the page for IO if all buffers in this page were mapped and there's no * accumulated extent of buffers to map or add buffers in the page to the * extent of buffers to map. The function returns 1 if the caller can continue * by processing the next page, 0 if it should stop adding buffers to the * extent to map because we cannot extend it anymore. It can also return a * value < 0 in case of an error during IO submission. */ static int mpage_process_page_bufs(struct mpage_da_data *mpd, struct buffer_head *head, struct buffer_head *bh, ext4_lblk_t lblk) { struct inode *inode = mpd->inode; int err; ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1) >> inode->i_blkbits; if (ext4_verity_in_progress(inode)) blocks = EXT_MAX_BLOCKS; do { BUG_ON(buffer_locked(bh)); if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) { /* Found extent to map? */ if (mpd->map.m_len) return 0; /* Buffer needs mapping and handle is not started? */ if (!mpd->do_map) return 0; /* Everything mapped so far and we hit EOF */ break; } } while (lblk++, (bh = bh->b_this_page) != head); /* So far everything mapped? Submit the page for IO. */ if (mpd->map.m_len == 0) { err = mpage_submit_folio(mpd, head->b_folio); if (err < 0) return err; mpage_folio_done(mpd, head->b_folio); } if (lblk >= blocks) { mpd->scanned_until_end = 1; return 0; } return 1; } /* * mpage_process_folio - update folio buffers corresponding to changed extent * and may submit fully mapped page for IO * @mpd: description of extent to map, on return next extent to map * @folio: Contains these buffers. * @m_lblk: logical block mapping.
* @m_pblk: corresponding physical mapping. * @map_bh: determines on return whether this page requires any further * mapping or not. * * Scan given folio buffers corresponding to changed extent and update buffer * state according to new extent state. * We map delalloc buffers to their physical location, clear unwritten bits. * If the given folio is not fully mapped, we update @mpd to the next extent in * the given folio that needs mapping & return @map_bh as true. */ static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio, ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk, bool *map_bh) { struct buffer_head *head, *bh; ext4_io_end_t *io_end = mpd->io_submit.io_end; ext4_lblk_t lblk = *m_lblk; ext4_fsblk_t pblock = *m_pblk; int err = 0; int blkbits = mpd->inode->i_blkbits; ssize_t io_end_size = 0; struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end); bh = head = folio_buffers(folio); do { if (lblk < mpd->map.m_lblk) continue; if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { /* * Buffer after end of mapped extent. * Find next buffer in the folio to map. */ mpd->map.m_len = 0; mpd->map.m_flags = 0; io_end_vec->size += io_end_size; err = mpage_process_page_bufs(mpd, head, bh, lblk); if (err > 0) err = 0; if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) { io_end_vec = ext4_alloc_io_end_vec(io_end); if (IS_ERR(io_end_vec)) { err = PTR_ERR(io_end_vec); goto out; } io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits; } *map_bh = true; goto out; } if (buffer_delay(bh)) { clear_buffer_delay(bh); bh->b_blocknr = pblock++; } clear_buffer_unwritten(bh); io_end_size += (1 << blkbits); } while (lblk++, (bh = bh->b_this_page) != head); io_end_vec->size += io_end_size; *map_bh = false; out: *m_lblk = lblk; *m_pblk = pblock; return err; } /* * mpage_map_buffers - update buffers corresponding to changed extent and * submit fully mapped pages for IO * * @mpd - description of extent to map, on return next extent to map * * Scan buffers corresponding to changed extent (we expect corresponding pages * to be already locked) and update buffer state according to new extent state. * We map delalloc buffers to their physical location, clear unwritten bits, * and mark buffers as uninit when we perform writes to unwritten extents * and do extent conversion after IO is finished. If the last page is not fully * mapped, we update @map to the next extent in the last page that needs * mapping. Otherwise we submit the page for IO. */ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) { struct folio_batch fbatch; unsigned nr, i; struct inode *inode = mpd->inode; int bpp_bits = PAGE_SHIFT - inode->i_blkbits; pgoff_t start, end; ext4_lblk_t lblk; ext4_fsblk_t pblock; int err; bool map_bh = false; start = mpd->map.m_lblk >> bpp_bits; end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; pblock = mpd->map.m_pblk; folio_batch_init(&fbatch); while (start <= end) { nr = filemap_get_folios(inode->i_mapping, &start, end, &fbatch); if (nr == 0) break; for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; lblk = folio->index << bpp_bits; err = mpage_process_folio(mpd, folio, &lblk, &pblock, &map_bh); /* * If map_bh is true, means page may require further bh * mapping, or maybe the page was submitted for IO. * So we return to call further extent mapping. */ if (err < 0 || map_bh) goto out; /* Page fully mapped - let IO run! 
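A toy model of the buffer walk in mpage_process_folio() above: buffers before the mapped extent are skipped, buffers inside it receive consecutive physical block numbers, and the first buffer past the extent ends the walk. Purely illustrative userspace code, not kernel API:

#include <stdio.h>

int main(void)
{
    unsigned long m_lblk = 100, m_len = 3, m_pblk = 5000;
    unsigned long lblk = 99;    /* assumed first block of the folio */
    unsigned long blocknr[6];

    for (int i = 0; i < 6; i++, lblk++) {
        if (lblk < m_lblk)
            continue;           /* before the mapped extent */
        if (lblk >= m_lblk + m_len)
            break;              /* extent exhausted: needs further mapping */
        blocknr[i] = m_pblk++;  /* delayed buffer gets its physical block */
        printf("lblk %lu -> pblk %lu\n", lblk, blocknr[i]);
    }
    return 0;
}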
*/ err = mpage_submit_folio(mpd, folio); if (err < 0) goto out; mpage_folio_done(mpd, folio); } folio_batch_release(&fbatch); } /* Extent fully mapped and matches with page boundary. We are done. */ mpd->map.m_len = 0; mpd->map.m_flags = 0; return 0; out: folio_batch_release(&fbatch); return err; } static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) { struct inode *inode = mpd->inode; struct ext4_map_blocks *map = &mpd->map; int get_blocks_flags; int err, dioread_nolock; /* Make sure transaction has enough credits for this extent */ err = ext4_journal_ensure_extent_credits(handle, inode); if (err < 0) return err; trace_ext4_da_write_pages_extent(inode, map); /* * Call ext4_map_blocks() to allocate any delayed allocation blocks, or * to convert an unwritten extent to be initialized (in the case * where we have written into one or more preallocated blocks). It is * possible that we're going to need more metadata blocks than * previously reserved. However we must not fail because we're in * writeback and there is nothing we can do about it so it might result * in data loss. So use reserved blocks to allocate metadata if * possible. In addition, do not cache any unrelated extents, as it * only holds the folio lock but does not hold the i_rwsem or * invalidate_lock, which could corrupt the extent status tree. */ get_blocks_flags = EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_GET_BLOCKS_IO_SUBMIT | EXT4_EX_NOCACHE; dioread_nolock = ext4_should_dioread_nolock(inode); if (dioread_nolock) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; err = ext4_map_blocks(handle, inode, map, get_blocks_flags); if (err < 0) return err; if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) { if (!mpd->io_submit.io_end->handle && ext4_handle_valid(handle)) { mpd->io_submit.io_end->handle = handle->h_rsv_handle; handle->h_rsv_handle = NULL; } ext4_set_io_unwritten_flag(mpd->io_submit.io_end); } BUG_ON(map->m_len == 0); return 0; } /* * This is used to submit mapped buffers in a single folio that is not fully * mapped for various reasons, such as insufficient space or journal credits. */ static int mpage_submit_partial_folio(struct mpage_da_data *mpd) { struct inode *inode = mpd->inode; struct folio *folio; loff_t pos; int ret; folio = filemap_get_folio(inode->i_mapping, mpd->start_pos >> PAGE_SHIFT); if (IS_ERR(folio)) return PTR_ERR(folio); /* * The mapped position should be within the current processing folio * but must not be the folio start position. */ pos = ((loff_t)mpd->map.m_lblk) << inode->i_blkbits; if (WARN_ON_ONCE((folio_pos(folio) == pos) || !folio_contains(folio, pos >> PAGE_SHIFT))) return -EINVAL; ret = mpage_submit_folio(mpd, folio); if (ret) goto out; /* * Update start_pos to prevent this folio from being released in * mpage_release_unused_pages(), it will be reset to the aligned folio * pos when this folio is written again in the next round. Additionally, * do not update wbc->nr_to_write here, as it will be updated once the * entire folio has finished processing. */ mpd->start_pos = pos; out: folio_unlock(folio); folio_put(folio); return ret; } /* * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length * mpd->len and submit pages underlying it for IO * * @handle - handle for journal operations * @mpd - extent to map * @give_up_on_write - we set this to true iff there is a fatal error and there * is no hope of writing the data. The caller should discard * dirty pages to avoid infinite loops. 
* * The function maps extent starting at mpd->lblk of length mpd->len. If it is * delayed, blocks are allocated, if it is unwritten, we may need to convert * them to initialized or split the described range from larger unwritten * extent. Note that we need not map the whole described range since the * allocation can return fewer blocks or the range is covered by more unwritten * extents. We cannot map more because we are limited by reserved transaction * credits. On the other hand we always make sure that the last touched page is * fully mapped so that it can be written out (and thus forward progress is * guaranteed). After mapping we submit all mapped pages for IO. */ static int mpage_map_and_submit_extent(handle_t *handle, struct mpage_da_data *mpd, bool *give_up_on_write) { struct inode *inode = mpd->inode; struct ext4_map_blocks *map = &mpd->map; int err; loff_t disksize; int progress = 0; ext4_io_end_t *io_end = mpd->io_submit.io_end; struct ext4_io_end_vec *io_end_vec; io_end_vec = ext4_alloc_io_end_vec(io_end); if (IS_ERR(io_end_vec)) return PTR_ERR(io_end_vec); io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits; do { err = mpage_map_one_extent(handle, mpd); if (err < 0) { struct super_block *sb = inode->i_sb; if (ext4_emergency_state(sb)) goto invalidate_dirty_pages; /* * Let the upper layers retry transient errors. * In the case of ENOSPC, if ext4_count_free_clusters() * is non-zero, a commit should free up blocks. */ if ((err == -ENOMEM) || (err == -EAGAIN) || (err == -ENOSPC && ext4_count_free_clusters(sb))) { /* * We may have already allocated extents for * some bhs inside the folio, issue the * corresponding data to prevent stale data. */ if (progress) { if (mpage_submit_partial_folio(mpd)) goto invalidate_dirty_pages; goto update_disksize; } return err; } ext4_msg(sb, KERN_CRIT, "Delayed block allocation failed for " "inode %lu at logical offset %llu with" " max blocks %u with error %d", inode->i_ino, (unsigned long long)map->m_lblk, (unsigned)map->m_len, -err); ext4_msg(sb, KERN_CRIT, "This should not happen!! Data will " "be lost\n"); if (err == -ENOSPC) ext4_print_free_blocks(inode); invalidate_dirty_pages: *give_up_on_write = true; return err; } progress = 1; /* * Update buffer state, submit mapped pages, and get us a new * extent to map */ err = mpage_map_and_submit_buffers(mpd); if (err < 0) goto update_disksize; } while (map->m_len); update_disksize: /* * Update on-disk size after IO is submitted. Races with * truncate are avoided by checking i_size under i_data_sem.
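The i_disksize update that follows clamps the candidate value to i_size and only ever moves i_disksize forward. A minimal sketch of that ordering, with plain integers standing in for the inode fields:

#include <stdio.h>

int main(void)
{
    long long i_size = 10000, i_disksize = 4096;
    long long disksize = 12288;     /* mpd->start_pos after submission */

    if (disksize > i_size)
        disksize = i_size;          /* never expose blocks past EOF */
    if (disksize > i_disksize)
        i_disksize = disksize;      /* monotonically non-decreasing */
    printf("i_disksize=%lld\n", i_disksize);
    return 0;
}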
*/ disksize = mpd->start_pos; if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) { int err2; loff_t i_size; down_write(&EXT4_I(inode)->i_data_sem); i_size = i_size_read(inode); if (disksize > i_size) disksize = i_size; if (disksize > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = disksize; up_write(&EXT4_I(inode)->i_data_sem); err2 = ext4_mark_inode_dirty(handle, inode); if (err2) { ext4_error_err(inode->i_sb, -err2, "Failed to mark inode %lu dirty", inode->i_ino); } if (!err) err = err2; } return err; } static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio, size_t len) { struct buffer_head *page_bufs = folio_buffers(folio); struct inode *inode = folio->mapping->host; int ret, err; ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, NULL, do_journal_get_write_access); err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, NULL, write_end_fn); if (ret == 0) ret = err; err = ext4_jbd2_inode_add_write(handle, inode, folio_pos(folio), len); if (ret == 0) ret = err; EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; return ret; } static int mpage_journal_page_buffers(handle_t *handle, struct mpage_da_data *mpd, struct folio *folio) { struct inode *inode = mpd->inode; loff_t size = i_size_read(inode); size_t len = folio_size(folio); folio_clear_checked(folio); mpd->wbc->nr_to_write -= folio_nr_pages(folio); if (folio_pos(folio) + len > size && !ext4_verity_in_progress(inode)) len = size & (len - 1); return ext4_journal_folio_buffers(handle, folio, len); } /* * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages * needing mapping, submit mapped pages * * @mpd - where to look for pages * * Walk dirty pages in the mapping. If they are fully mapped, submit them for * IO immediately. If we cannot map blocks, we submit just the already mapped * buffers in the page for IO and keep the page dirty. When we can map blocks * and we find a page which isn't mapped we start accumulating an extent of * buffers underlying these pages that need mapping (formed by either delayed * or unwritten buffers). We also lock the pages containing these buffers. The * extent found is returned in @mpd structure (starting at mpd->lblk with * length mpd->len blocks). * * Note that this function can attach bios to one io_end structure which are * neither logically nor physically contiguous. Although it may seem like an * unnecessary complication, it is actually inevitable in the blocksize < * pagesize case as we need to track IO to all buffers underlying a page in * one io_end.
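Both mpage_submit_folio() and mpage_journal_page_buffers() above trim the write-out length with len = size & (len - 1) when a folio straddles EOF. That works because folio sizes are powers of two, so masking i_size with folio_size - 1 yields the byte offset of EOF inside the folio. A small demonstration with assumed sizes:

#include <stdio.h>

int main(void)
{
    unsigned long folio_size = 16384;   /* power of two, assumed */
    unsigned long long size = 20480;    /* i_size falls inside this folio */
    unsigned long len = folio_size;

    /* folio start, recovered for the sake of the check below */
    unsigned long long folio_pos =
        size & ~(unsigned long long)(folio_size - 1);
    if (folio_pos + len > size)
        len = size & (len - 1);         /* 20480 & 16383 == 4096 */
    printf("write-out length = %lu\n", len);
    return 0;
}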
*/ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) { struct address_space *mapping = mpd->inode->i_mapping; struct folio_batch fbatch; unsigned int nr_folios; pgoff_t index = mpd->start_pos >> PAGE_SHIFT; pgoff_t end = mpd->end_pos >> PAGE_SHIFT; xa_mark_t tag; int i, err = 0; int blkbits = mpd->inode->i_blkbits; ext4_lblk_t lblk; struct buffer_head *head; handle_t *handle = NULL; int bpp = ext4_journal_blocks_per_folio(mpd->inode); if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; mpd->map.m_len = 0; mpd->next_pos = mpd->start_pos; if (ext4_should_journal_data(mpd->inode)) { handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE, bpp); if (IS_ERR(handle)) return PTR_ERR(handle); } folio_batch_init(&fbatch); while (index <= end) { nr_folios = filemap_get_folios_tag(mapping, &index, end, tag, &fbatch); if (nr_folios == 0) break; for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; /* * Accumulated enough dirty pages? This doesn't apply * to WB_SYNC_ALL mode. For integrity sync we have to * keep going because someone may be concurrently * dirtying pages, and we might have synced a lot of * newly appeared dirty pages, but have not synced all * of the old dirty pages. */ if (mpd->wbc->sync_mode == WB_SYNC_NONE && mpd->wbc->nr_to_write <= mpd->map.m_len >> (PAGE_SHIFT - blkbits)) goto out; /* If we can't merge this page, we are done. */ if (mpd->map.m_len > 0 && mpd->next_pos != folio_pos(folio)) goto out; if (handle) { err = ext4_journal_ensure_credits(handle, bpp, 0); if (err < 0) goto out; } folio_lock(folio); /* * If the page is no longer dirty, or its mapping no * longer corresponds to inode we are writing (which * means it has been truncated or invalidated), or the * page is already under writeback and we are not doing * a data integrity writeback, skip the page */ if (!folio_test_dirty(folio) || (folio_test_writeback(folio) && (mpd->wbc->sync_mode == WB_SYNC_NONE)) || unlikely(folio->mapping != mapping)) { folio_unlock(folio); continue; } folio_wait_writeback(folio); BUG_ON(folio_test_writeback(folio)); /* * Should never happen but for buggy code in * other subsystems that call * set_page_dirty() without properly warning * the file system first. See [1] for more * information. * * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz */ if (!folio_buffers(folio)) { ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", folio->index); folio_clear_dirty(folio); folio_unlock(folio); continue; } if (mpd->map.m_len == 0) mpd->start_pos = folio_pos(folio); mpd->next_pos = folio_pos(folio) + folio_size(folio); /* * Writeout when we cannot modify metadata is simple. * Just submit the page. For data=journal mode we * first handle writeout of the page for checkpoint and * only after that handle delayed page dirtying. This * makes sure current data is checkpointed to the final * location before possibly journalling it again which * is desirable when the page is frequently dirtied * through a pin. */ if (!mpd->can_map) { err = mpage_submit_folio(mpd, folio); if (err < 0) goto out; /* Pending dirtying of journalled data? 
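The WB_SYNC_NONE early-exit above stops the scan once the blocks accumulated for mapping already cover the remaining page budget. A toy model of that bookkeeping, assuming 4 KiB pages and 1 KiB blocks; all values are fabricated:

#include <stdio.h>

int main(void)
{
    int page_shift = 12, blkbits = 10;  /* 4 KiB pages, 1 KiB blocks */
    long nr_to_write = 8;               /* pages the flusher may still write */
    unsigned map_len = 0;               /* blocks accumulated so far */

    for (int folio = 0; folio < 100; folio++) {
        /* same shape as: nr_to_write <= map_len >> (PAGE_SHIFT - blkbits) */
        if (nr_to_write <= map_len >> (page_shift - blkbits)) {
            printf("budget reached after %d folios\n", folio);
            break;
        }
        map_len += 4;                   /* one fully-delayed 4-block folio */
    }
    return 0;
}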
*/ if (folio_test_checked(folio)) { err = mpage_journal_page_buffers(handle, mpd, folio); if (err < 0) goto out; mpd->journalled_more_data = 1; } mpage_folio_done(mpd, folio); } else { /* Add all dirty buffers to mpd */ lblk = ((ext4_lblk_t)folio->index) << (PAGE_SHIFT - blkbits); head = folio_buffers(folio); err = mpage_process_page_bufs(mpd, head, head, lblk); if (err <= 0) goto out; err = 0; } } folio_batch_release(&fbatch); cond_resched(); } mpd->scanned_until_end = 1; if (handle) ext4_journal_stop(handle); return 0; out: folio_batch_release(&fbatch); if (handle) ext4_journal_stop(handle); return err; } static int ext4_do_writepages(struct mpage_da_data *mpd) { struct writeback_control *wbc = mpd->wbc; pgoff_t writeback_index = 0; long nr_to_write = wbc->nr_to_write; int range_whole = 0; int cycled = 1; handle_t *handle = NULL; struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; int needed_blocks, rsv_blocks = 0, ret = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); struct blk_plug plug; bool give_up_on_write = false; trace_ext4_writepages(inode, wbc); /* * No pages to write? This is mainly a kludge to avoid starting * a transaction for special inodes like journal inode on last iput() * because that could violate lock ordering on umount */ if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) goto out_writepages; /* * If the filesystem has aborted, it is read-only, so return * right away instead of dumping stack traces later on that * will obscure the real source of the problem. We test * fs shutdown state instead of sb->s_flag's SB_RDONLY because * the latter could be true if the filesystem is mounted * read-only, and in that case, ext4_writepages should * *never* be called, so if that ever happens, we would want * the stack trace. */ ret = ext4_emergency_state(mapping->host->i_sb); if (unlikely(ret)) goto out_writepages; /* * If we have inline data and arrive here, it means that * we will soon create the block for the 1st page, so * we'd better clear the inline data here. */ if (ext4_has_inline_data(inode)) { /* Just inode will be modified... */ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out_writepages; } BUG_ON(ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)); ext4_destroy_inline_data(handle, inode); ext4_journal_stop(handle); } /* * data=journal mode does not do delalloc so we just need to writeout / * journal already mapped buffers. On the other hand we need to commit * transaction to make data stable. We expect all the data to be * already in the journal (the only exception are DMA pinned pages * dirtied behind our back) so we commit transaction here and run the * writeback loop to checkpoint them. The checkpointing is not actually * necessary to make data persistent *but* quite a few places (extent * shifting operations, fsverity, ...) depend on being able to drop * pagecache pages after calling filemap_write_and_wait() and for that * checkpointing needs to happen. */ if (ext4_should_journal_data(inode)) { mpd->can_map = 0; if (wbc->sync_mode == WB_SYNC_ALL) ext4_fc_commit(sbi->s_journal, EXT4_I(inode)->i_datasync_tid); } mpd->journalled_more_data = 0; if (ext4_should_dioread_nolock(inode)) { int bpf = ext4_journal_blocks_per_folio(inode); /* * We may need to convert up to one extent per block in * the folio and we may dirty the inode. 
*/ rsv_blocks = 1 + ext4_ext_index_trans_blocks(inode, bpf); } if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; if (writeback_index) cycled = 0; mpd->start_pos = writeback_index << PAGE_SHIFT; mpd->end_pos = LLONG_MAX; } else { mpd->start_pos = wbc->range_start; mpd->end_pos = wbc->range_end; } ext4_io_submit_init(&mpd->io_submit, wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, mpd->start_pos >> PAGE_SHIFT, mpd->end_pos >> PAGE_SHIFT); blk_start_plug(&plug); /* * First writeback pages that don't need mapping - we can avoid * starting a transaction unnecessarily and also avoid being blocked * in the block layer on device congestion while having transaction * started. */ mpd->do_map = 0; mpd->scanned_until_end = 0; mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); if (!mpd->io_submit.io_end) { ret = -ENOMEM; goto unplug; } ret = mpage_prepare_extent_to_map(mpd); /* Unlock pages we didn't use */ mpage_release_unused_pages(mpd, false); /* Submit prepared bio */ ext4_io_submit(&mpd->io_submit); ext4_put_io_end_defer(mpd->io_submit.io_end); mpd->io_submit.io_end = NULL; if (ret < 0) goto unplug; while (!mpd->scanned_until_end && wbc->nr_to_write > 0) { /* For each extent of pages we use new io_end */ mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); if (!mpd->io_submit.io_end) { ret = -ENOMEM; break; } WARN_ON_ONCE(!mpd->can_map); /* * We have two constraints: We find one extent to map and we * must always write out whole page (makes a difference when * blocksize < pagesize) so that we don't block on IO when we * try to write out the rest of the page. Journalled mode is * not supported by delalloc. */ BUG_ON(ext4_should_journal_data(inode)); /* * Calculate the number of credits needed to reserve for one * extent of up to MAX_WRITEPAGES_EXTENT_LEN blocks. It will * attempt to extend the transaction or start a new iteration * if the reserved credits are insufficient. */ needed_blocks = ext4_chunk_trans_blocks(inode, MAX_WRITEPAGES_EXTENT_LEN); /* start a new transaction */ handle = ext4_journal_start_with_reserve(inode, EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " "%ld pages, ino %lu; err %d", __func__, wbc->nr_to_write, inode->i_ino, ret); /* Release allocated io_end */ ext4_put_io_end(mpd->io_submit.io_end); mpd->io_submit.io_end = NULL; break; } mpd->do_map = 1; trace_ext4_da_write_folios_start(inode, mpd->start_pos, mpd->next_pos, wbc); ret = mpage_prepare_extent_to_map(mpd); if (!ret && mpd->map.m_len) ret = mpage_map_and_submit_extent(handle, mpd, &give_up_on_write); /* * Caution: If the handle is synchronous, * ext4_journal_stop() can wait for transaction commit * to finish which may depend on writeback of pages to * complete or on page lock to be released. In that * case, we have to wait until after we have * submitted all the IO, released page locks we hold, * and dropped io_end reference (for extent conversion * to be able to complete) before stopping the handle. */ if (!ext4_handle_valid(handle) || handle->h_sync == 0) { ext4_journal_stop(handle); handle = NULL; mpd->do_map = 0; } /* Unlock pages we didn't use */ mpage_release_unused_pages(mpd, give_up_on_write); /* Submit prepared bio */ ext4_io_submit(&mpd->io_submit); /* * Drop our io_end reference we got from init. 
We have * to be careful and use deferred io_end finishing if * we are still holding the transaction as we can * release the last reference to io_end which may end * up doing unwritten extent conversion. */ if (handle) { ext4_put_io_end_defer(mpd->io_submit.io_end); ext4_journal_stop(handle); } else ext4_put_io_end(mpd->io_submit.io_end); mpd->io_submit.io_end = NULL; trace_ext4_da_write_folios_end(inode, mpd->start_pos, mpd->next_pos, wbc, ret); if (ret == -ENOSPC && sbi->s_journal) { /* * Commit the transaction which would * free blocks released in the transaction * and try again */ jbd2_journal_force_commit_nested(sbi->s_journal); ret = 0; continue; } if (ret == -EAGAIN) ret = 0; /* Fatal error - ENOMEM, EIO... */ if (ret) break; } unplug: blk_finish_plug(&plug); if (!ret && !cycled && wbc->nr_to_write > 0) { cycled = 1; mpd->end_pos = (writeback_index << PAGE_SHIFT) - 1; mpd->start_pos = 0; goto retry; } /* Update index */ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) /* * Set the writeback_index so that range_cyclic * mode will write it back later */ mapping->writeback_index = mpd->start_pos >> PAGE_SHIFT; out_writepages: trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); return ret; } static int ext4_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct super_block *sb = mapping->host->i_sb; struct mpage_da_data mpd = { .inode = mapping->host, .wbc = wbc, .can_map = 1, }; int ret; int alloc_ctx; ret = ext4_emergency_state(sb); if (unlikely(ret)) return ret; alloc_ctx = ext4_writepages_down_read(sb); ret = ext4_do_writepages(&mpd); /* * For data=journal writeback we could have come across pages marked * for delayed dirtying (PageChecked) which were just added to the * running transaction. Try once more to get them to stable storage. */ if (!ret && mpd.journalled_more_data) ret = ext4_do_writepages(&mpd); ext4_writepages_up_read(sb, alloc_ctx); return ret; } int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .range_start = jinode->i_dirty_start, .range_end = jinode->i_dirty_end, }; struct mpage_da_data mpd = { .inode = jinode->i_vfs_inode, .wbc = &wbc, .can_map = 0, }; return ext4_do_writepages(&mpd); } static int ext4_dax_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; long nr_to_write = wbc->nr_to_write; struct inode *inode = mapping->host; int alloc_ctx; ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) return ret; alloc_ctx = ext4_writepages_down_read(inode->i_sb); trace_ext4_writepages(inode, wbc); ret = dax_writeback_mapping_range(mapping, EXT4_SB(inode->i_sb)->s_daxdev, wbc); trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); ext4_writepages_up_read(inode->i_sb, alloc_ctx); return ret; } static int ext4_nonda_switch(struct super_block *sb) { s64 free_clusters, dirty_clusters; struct ext4_sb_info *sbi = EXT4_SB(sb); /* * Switch to non-delalloc mode if we are running low * on free blocks. The free block accounting via percpu * counters can get slightly wrong with percpu_counter_batch getting * accumulated on each CPU without updating global counters. * Delalloc needs accurate free block accounting, so switch * to non-delalloc when we are close to the error range.
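A stand-alone sketch of the two thresholds ext4_nonda_switch() checks (the code follows); FREECLUSTERS_WATERMARK here is an arbitrary stand-in for the real ext4 constant:

#include <stdio.h>

#define FREECLUSTERS_WATERMARK 1024     /* assumed value, for illustration */

/* Fall back to nodelalloc when free < 1.5 * dirty, or when free drops
 * under dirty plus a fixed watermark. */
static int nonda_switch(long long free_clusters, long long dirty_clusters)
{
    return 2 * free_clusters < 3 * dirty_clusters ||
           free_clusters < dirty_clusters + FREECLUSTERS_WATERMARK;
}

int main(void)
{
    printf("%d\n", nonda_switch(10000, 100));   /* plenty free: 0 */
    printf("%d\n", nonda_switch(1400, 1000));   /* 1400 < 1500: 1 */
    return 0;
}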
*/ free_clusters = percpu_counter_read_positive(&sbi->s_freeclusters_counter); dirty_clusters = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); /* * Start pushing delalloc when 1/2 of free blocks are dirty. */ if (dirty_clusters && (free_clusters < 2 * dirty_clusters)) try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); if (2 * free_clusters < 3 * dirty_clusters || free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) { /* * free block count is less than 150% of dirty blocks * or free blocks is less than watermark */ return 1; } return 0; } static int ext4_da_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { int ret, retries = 0; struct folio *folio; pgoff_t index; struct inode *inode = mapping->host; ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) return ret; index = pos >> PAGE_SHIFT; if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; return ext4_write_begin(iocb, mapping, pos, len, foliop, fsdata); } *fsdata = (void *)0; trace_ext4_da_write_begin(inode, pos, len); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_generic_write_inline_data(mapping, inode, pos, len, foliop, fsdata, true); if (ret < 0) return ret; if (ret == 1) return 0; } retry: folio = write_begin_get_folio(iocb, mapping, index, len); if (IS_ERR(folio)) return PTR_ERR(folio); if (pos + len > folio_pos(folio) + folio_size(folio)) len = folio_pos(folio) + folio_size(folio) - pos; ret = ext4_block_write_begin(NULL, folio, pos, len, ext4_da_get_block_prep); if (ret < 0) { folio_unlock(folio); folio_put(folio); /* * block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need * i_size_read because we hold inode lock. */ if (pos + len > inode->i_size) ext4_truncate_failed_write(inode); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; return ret; } *foliop = folio; return ret; } /* * Check if we should update i_disksize * when write to the end of file but not require block allocation */ static int ext4_da_should_update_i_disksize(struct folio *folio, unsigned long offset) { struct buffer_head *bh; struct inode *inode = folio->mapping->host; unsigned int idx; int i; bh = folio_buffers(folio); idx = offset >> inode->i_blkbits; for (i = 0; i < idx; i++) bh = bh->b_this_page; if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) return 0; return 1; } static int ext4_da_do_write_end(struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio) { struct inode *inode = mapping->host; loff_t old_size = inode->i_size; bool disksize_changed = false; loff_t new_i_size, zero_len = 0; handle_t *handle; if (unlikely(!folio_buffers(folio))) { folio_unlock(folio); folio_put(folio); return -EIO; } /* * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES * flag, which all that's needed to trigger page writeback. */ copied = block_write_end(pos, len, copied, folio); new_i_size = pos + copied; /* * It's important to update i_size while still holding folio lock, * because folio writeout could otherwise come in and zero beyond * i_size. * * Since we are holding inode lock, we are sure i_disksize <= * i_size. We also know that if i_disksize < i_size, there are * delalloc writes pending in the range up to i_size. 
If the end of * the current write is <= i_size, there's no need to touch * i_disksize since writeback will push i_disksize up to i_size * eventually. If the end of the current write is > i_size and * inside an allocated block which ext4_da_should_update_i_disksize() * checked, we need to update i_disksize here, since ext4_writepages() * paths that do not allocate blocks will not update i_disksize on * their own. */ if (new_i_size > inode->i_size) { unsigned long end; i_size_write(inode, new_i_size); end = offset_in_folio(folio, new_i_size - 1); if (copied && ext4_da_should_update_i_disksize(folio, end)) { ext4_update_i_disksize(inode, new_i_size); disksize_changed = true; } } folio_unlock(folio); folio_put(folio); if (pos > old_size) { pagecache_isize_extended(inode, old_size, pos); zero_len = pos - old_size; } if (!disksize_changed && !zero_len) return copied; handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) return PTR_ERR(handle); if (zero_len) ext4_zero_partial_blocks(handle, inode, old_size, zero_len); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); return copied; } static int ext4_da_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int write_mode = (int)(unsigned long)fsdata; if (write_mode == FALL_BACK_TO_NONDELALLOC) return ext4_write_end(iocb, mapping, pos, len, copied, folio, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); if (write_mode != CONVERT_INLINE_DATA && ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && ext4_has_inline_data(inode)) return ext4_write_inline_data_end(inode, pos, len, copied, folio); if (unlikely(copied < len) && !folio_test_uptodate(folio)) copied = 0; return ext4_da_do_write_end(mapping, pos, len, copied, folio); } /* * Force all delayed allocation blocks to be allocated for a given inode. */ int ext4_alloc_da_blocks(struct inode *inode) { trace_ext4_alloc_da_blocks(inode); if (!EXT4_I(inode)->i_reserved_data_blocks) return 0; /* * We do something simple for now. The filemap_flush() will * also start triggering a write of the data blocks, which is * not strictly speaking necessary (and for users of * laptop_mode, not even desirable). However, to do otherwise * would require replicating code paths in: * * ext4_writepages() -> * write_cache_pages() ---> (via passed in callback function) * __mpage_da_writepage() --> * mpage_add_bh_to_extent() * mpage_da_map_blocks() * * The problem is that write_cache_pages(), located in * mm/page-writeback.c, marks pages clean in preparation for * doing I/O, which is not desirable if we're not planning on * doing I/O at all. * * We could call write_cache_pages(), and then redirty all of * the pages by calling redirty_page_for_writepage() but that * would be ugly in the extreme. So instead we would need to * replicate parts of the code in the above functions, * simplifying them because we wouldn't actually intend to * write out the pages, but rather only collect contiguous * logical block extents, call the multi-block allocator, and * then update the buffer heads with the block allocations. * * For now, though, we'll cheat by calling filemap_flush(), * which will map the blocks, and start the I/O, but not * actually wait for the I/O to complete. */ return filemap_flush(inode->i_mapping); } /* * bmap() is special. It gets used by applications such as lilo and by * the swapper to find the on-disk block of a specific piece of data.
* * Naturally, this is dangerous if the block concerned is still in the * journal. If somebody makes a swapfile on an ext4 data-journaling * filesystem and enables swap, then they may get a nasty shock when the * data getting swapped to that swapfile suddenly gets overwritten by * the original zeros written out previously to the journal and * awaiting writeback in the kernel's buffer cache. * * So, if we see any bmap calls here on a modified, data-journaled file, * take extra steps to flush any blocks which might be in the cache. */ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; sector_t ret = 0; inode_lock_shared(inode); /* * We can get here for an inline file via the FIBMAP ioctl */ if (ext4_has_inline_data(inode)) goto out; if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && (test_opt(inode->i_sb, DELALLOC) || ext4_should_journal_data(inode))) { /* * With delalloc or journalled data we want to sync the file so * that we can make sure we allocate blocks for the file and the * data is in place for the user to see it */ filemap_write_and_wait(mapping); } ret = iomap_bmap(mapping, block, &ext4_iomap_ops); out: inode_unlock_shared(inode); return ret; } static int ext4_read_folio(struct file *file, struct folio *folio) { int ret = -EAGAIN; struct inode *inode = folio->mapping->host; trace_ext4_read_folio(inode, folio); if (ext4_has_inline_data(inode)) ret = ext4_readpage_inline(inode, folio); if (ret == -EAGAIN) return ext4_mpage_readpages(inode, NULL, folio); return ret; } static void ext4_readahead(struct readahead_control *rac) { struct inode *inode = rac->mapping->host; /* If the file has inline data, no need to do readahead. */ if (ext4_has_inline_data(inode)) return; ext4_mpage_readpages(inode, rac, NULL); } static void ext4_invalidate_folio(struct folio *folio, size_t offset, size_t length) { trace_ext4_invalidate_folio(folio, offset, length); /* No journalling happens on data buffers when this function is used */ WARN_ON(folio_buffers(folio) && buffer_jbd(folio_buffers(folio))); block_invalidate_folio(folio, offset, length); } static int __ext4_journalled_invalidate_folio(struct folio *folio, size_t offset, size_t length) { journal_t *journal = EXT4_JOURNAL(folio->mapping->host); trace_ext4_journalled_invalidate_folio(folio, offset, length); /* * If it's a full truncate we just forget about the pending dirtying */ if (offset == 0 && length == folio_size(folio)) folio_clear_checked(folio); return jbd2_journal_invalidate_folio(journal, folio, offset, length); } /* Wrapper for aops... */ static void ext4_journalled_invalidate_folio(struct folio *folio, size_t offset, size_t length) { WARN_ON(__ext4_journalled_invalidate_folio(folio, offset, length) < 0); } static bool ext4_release_folio(struct folio *folio, gfp_t wait) { struct inode *inode = folio->mapping->host; journal_t *journal = EXT4_JOURNAL(inode); trace_ext4_release_folio(inode, folio); /* Page has dirty journalled data -> cannot release */ if (folio_test_checked(folio)) return false; if (journal) return jbd2_journal_try_to_free_buffers(journal, folio); else return try_to_free_buffers(folio); } static bool ext4_inode_datasync_dirty(struct inode *inode) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; if (journal) { if (jbd2_transaction_committed(journal, EXT4_I(inode)->i_datasync_tid)) return false; if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT)) return !list_empty(&EXT4_I(inode)->i_fc_list); return true; } /* Any metadata buffers to write?
*/ if (!list_empty(&inode->i_mapping->i_private_list)) return true; return inode->i_state & I_DIRTY_DATASYNC; } static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, struct ext4_map_blocks *map, loff_t offset, loff_t length, unsigned int flags) { u8 blkbits = inode->i_blkbits; /* * Writes that span EOF might trigger an I/O size update on completion, * so consider them to be dirty for the purpose of O_DSYNC, even if * no other metadata changes are being made or pending. */ iomap->flags = 0; if (ext4_inode_datasync_dirty(inode) || offset + length > i_size_read(inode)) iomap->flags |= IOMAP_F_DIRTY; if (map->m_flags & EXT4_MAP_NEW) iomap->flags |= IOMAP_F_NEW; /* HW-offload atomics are always used */ if (flags & IOMAP_ATOMIC) iomap->flags |= IOMAP_F_ATOMIC_BIO; if (flags & IOMAP_DAX) iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; else iomap->bdev = inode->i_sb->s_bdev; iomap->offset = (u64) map->m_lblk << blkbits; iomap->length = (u64) map->m_len << blkbits; if ((map->m_flags & EXT4_MAP_MAPPED) && !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) iomap->flags |= IOMAP_F_MERGED; /* * Flags passed to ext4_map_blocks() for direct I/O writes can result * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits * set. In order for any allocated unwritten extents to be converted * into written extents correctly within the ->end_io() handler, we * need to ensure that the iomap->type is set appropriately. Hence we * need to check whether the EXT4_MAP_UNWRITTEN bit has been set first. */ if (map->m_flags & EXT4_MAP_UNWRITTEN) { iomap->type = IOMAP_UNWRITTEN; iomap->addr = (u64) map->m_pblk << blkbits; if (flags & IOMAP_DAX) iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; } else if (map->m_flags & EXT4_MAP_MAPPED) { iomap->type = IOMAP_MAPPED; iomap->addr = (u64) map->m_pblk << blkbits; if (flags & IOMAP_DAX) iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; } else if (map->m_flags & EXT4_MAP_DELAYED) { iomap->type = IOMAP_DELALLOC; iomap->addr = IOMAP_NULL_ADDR; } else { iomap->type = IOMAP_HOLE; iomap->addr = IOMAP_NULL_ADDR; } } static int ext4_map_blocks_atomic_write_slow(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map) { ext4_lblk_t m_lblk = map->m_lblk; unsigned int m_len = map->m_len; unsigned int mapped_len = 0, m_flags = 0; ext4_fsblk_t next_pblk; bool check_next_pblk = false; int ret = 0; WARN_ON_ONCE(!ext4_has_feature_bigalloc(inode->i_sb)); /* * This is a slow path in case of mixed mapping. We use * EXT4_GET_BLOCKS_CREATE_ZERO flag here to make sure we get a single * contiguous mapped extent. This will ensure any unwritten or hole * regions within the requested range are zeroed out and we return * a single contiguous mapped extent. */ m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; do { ret = ext4_map_blocks(handle, inode, map, m_flags); if (ret < 0 && ret != -ENOSPC) goto out_err; /* * This should never happen, but let's return an error code to * avoid an infinite loop in here. */ if (ret == 0) { ret = -EFSCORRUPTED; ext4_warning_inode(inode, "ext4_map_blocks() couldn't allocate blocks m_flags: 0x%x, ret:%d", m_flags, ret); goto out_err; } /* * With bigalloc we should never get ENOSPC nor discontiguous * physical extents.
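 * (The caller guarantees the underlying cluster is already allocated, so
 * repeated ext4_map_blocks() calls can only hand back pieces of that one
 * cluster; a discontiguous m_pblk or ENOSPC here therefore indicates
 * corruption.)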
*/ if ((check_next_pblk && next_pblk != map->m_pblk) || ret == -ENOSPC) { ext4_warning_inode(inode, "Non-contiguous allocation detected: expected %llu, got %llu, " "or ext4_map_blocks() returned out of space ret: %d", next_pblk, map->m_pblk, ret); ret = -EFSCORRUPTED; goto out_err; } next_pblk = map->m_pblk + map->m_len; check_next_pblk = true; mapped_len += map->m_len; map->m_lblk += map->m_len; map->m_len = m_len - mapped_len; } while (mapped_len < m_len); /* * We might have done some work in the above loop, so we need to query the * start of the physical extent, based on the original m_lblk and m_len. * Let's also ensure we were able to allocate the required range for the * mixed mapping case. */ map->m_lblk = m_lblk; map->m_len = m_len; map->m_flags = 0; ret = ext4_map_blocks(handle, inode, map, EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF); if (ret != m_len) { ext4_warning_inode(inode, "allocation failed for atomic write request m_lblk:%u, m_len:%u, ret:%d\n", m_lblk, m_len, ret); ret = -EINVAL; } return ret; out_err: /* reset map before returning an error */ map->m_lblk = m_lblk; map->m_len = m_len; map->m_flags = 0; return ret; } /* * ext4_map_blocks_atomic_write: Helper routine to ensure the entire requested * range in @map [lblk, lblk + len) is one single contiguous extent with no * mixed mappings. * * We first use m_flags passed to us by our caller (ext4_iomap_alloc()). * We only call EXT4_GET_BLOCKS_ZERO in the slow path, when the underlying * physical extent for the requested range does not have a single contiguous * mapping type i.e. (Hole, Mapped, or Unwritten) throughout. * In that case we will loop over the requested range to allocate and zero out * the unwritten / holes in between, to get a single mapped extent from * [m_lblk, m_lblk + m_len). Note that this is only possible because we know * this can be called only with a bigalloc enabled filesystem where the underlying * cluster is already allocated. This avoids allocating discontiguous extents * in the slow path due to multiple calls to ext4_map_blocks(). * The slow path is mostly a non-performance-critical path, so it should be ok to * loop using ext4_map_blocks() with appropriate flags to allocate & zero the * underlying short holes/unwritten extents within the requested range. */ static int ext4_map_blocks_atomic_write(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int m_flags, bool *force_commit) { ext4_lblk_t m_lblk = map->m_lblk; unsigned int m_len = map->m_len; int ret = 0; WARN_ON_ONCE(m_len > 1 && !ext4_has_feature_bigalloc(inode->i_sb)); ret = ext4_map_blocks(handle, inode, map, m_flags); if (ret < 0 || ret == m_len) goto out; /* * This is a mixed mapping case where we were not able to allocate * a single contiguous extent. In that case let's reset the requested * mapping and call the slow path. */ map->m_lblk = m_lblk; map->m_len = m_len; map->m_flags = 0; /* * Taking the slow path means we have a mixed mapping, so we will need * to force a txn commit. */ *force_commit = true; return ext4_map_blocks_atomic_write_slow(handle, inode, map); out: return ret; } static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, unsigned int flags) { handle_t *handle; u8 blkbits = inode->i_blkbits; int ret, dio_credits, m_flags = 0, retries = 0; bool force_commit = false; /* * Trim the mapping request to the maximum value that we can map at * once for direct I/O. */ if (map->m_len > DIO_MAX_BLOCKS) map->m_len = DIO_MAX_BLOCKS; /* * journal credits estimation for atomic writes.
We call * ext4_map_blocks() to find out if there could be a mixed mapping. If yes, * then let's assume the number of physical extents required can be m_len, i.e. * every alternate block can be unwritten or a hole. */ if (flags & IOMAP_ATOMIC) { unsigned int orig_mlen = map->m_len; ret = ext4_map_blocks(NULL, inode, map, 0); if (ret < 0) return ret; if (map->m_len < orig_mlen) { map->m_len = orig_mlen; dio_credits = ext4_meta_trans_blocks(inode, orig_mlen, map->m_len); } else { dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); } } else { dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); } retry: /* * Either we allocate blocks and then don't get an unwritten extent, so * in that case we have reserved enough credits. Or, the blocks are * already allocated and unwritten. In that case, the extent conversion * fits into the credits as well. */ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); if (IS_ERR(handle)) return PTR_ERR(handle); /* * DAX and direct I/O are the only two operations that are currently * supported with IOMAP_WRITE. */ WARN_ON(!(flags & (IOMAP_DAX | IOMAP_DIRECT))); if (flags & IOMAP_DAX) m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; /* * We use i_size instead of i_disksize here because delalloc writeback * can complete at any point during the I/O and subsequently push the * i_disksize out to i_size. This could be beyond where direct I/O is * happening and thus expose allocated blocks to direct I/O reads. */ else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode)) m_flags = EXT4_GET_BLOCKS_CREATE; else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; if (flags & IOMAP_ATOMIC) ret = ext4_map_blocks_atomic_write(handle, inode, map, m_flags, &force_commit); else ret = ext4_map_blocks(handle, inode, map, m_flags); /* * We cannot fill holes in indirect tree based inodes as that could * expose stale data in the case of a crash. Use the magic error code * to fallback to buffered I/O. */ if (!m_flags && !ret) ret = -ENOTBLK; ext4_journal_stop(handle); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; /* * Force commit the current transaction if the allocation spans a mixed * mapping range. This ensures any pending metadata updates (like * unwritten to written extents conversion) in this range are in * consistent state with the file data blocks, before performing the * actual write I/O. If the commit fails, the whole I/O must be aborted * to prevent any possible torn writes. */ if (ret > 0 && force_commit) { int ret2; ret2 = ext4_force_commit(inode->i_sb); if (ret2) return ret2; } return ret; } static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { int ret; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; unsigned int orig_mlen; if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) return -EINVAL; if (WARN_ON_ONCE(ext4_has_inline_data(inode))) return -ERANGE; /* * Calculate the first and last logical blocks respectively. */ map.m_lblk = offset >> blkbits; map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; orig_mlen = map.m_len; if (flags & IOMAP_WRITE) { /* * We check here whether the blocks are already allocated; if so, we * don't need to start a journal txn and can directly return * the mapping information. This could boost performance, * especially for multi-threaded overwrite requests.
*/ if (offset + length <= i_size_read(inode)) { ret = ext4_map_blocks(NULL, inode, &map, 0); /* * For atomic writes the entire requested length should * be mapped. */ if (map.m_flags & EXT4_MAP_MAPPED) { if ((!(flags & IOMAP_ATOMIC) && ret > 0) || (flags & IOMAP_ATOMIC && ret >= orig_mlen)) goto out; } map.m_len = orig_mlen; } ret = ext4_iomap_alloc(inode, &map, flags); } else { /* * This can be called for the overwrite path from * ext4_iomap_overwrite_begin(). */ ret = ext4_map_blocks(NULL, inode, &map, 0); } if (ret < 0) return ret; out: /* * When inline encryption is enabled, sometimes I/O to an encrypted file * has to be broken up to guarantee DUN contiguity. Handle this by * limiting the length of the mapping returned. */ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); /* * Before returning to iomap, let's ensure the allocated mapping * covers the entire requested length for atomic writes. */ if (flags & IOMAP_ATOMIC) { if (map.m_len < (length >> blkbits)) { WARN_ON_ONCE(1); return -EINVAL; } } ext4_set_iomap(inode, iomap, &map, offset, length, flags); return 0; } static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { int ret; /* * Even for writes we don't need to allocate blocks, so just pretend * we are reading to save the overhead of starting a transaction. */ flags &= ~IOMAP_WRITE; ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap); WARN_ON_ONCE(!ret && iomap->type != IOMAP_MAPPED); return ret; } static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written) { /* must be a directio to fall back to buffered */ if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) != (IOMAP_WRITE | IOMAP_DIRECT)) return false; /* atomic writes are all-or-nothing */ if (flags & IOMAP_ATOMIC) return false; /* can only try again if we wrote nothing */ return written == 0; } static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { /* * Check to see whether an error occurred while writing out the data to * the allocated blocks. If so, return the magic error code for a * non-atomic write so that we fall back to buffered I/O and attempt to * complete the remainder of the I/O. * For non-atomic writes, any blocks that may have been * allocated in preparation for the direct I/O will be reused during * buffered I/O. For atomic writes, we never fall back to buffered I/O. */ if (ext4_want_directio_fallback(flags, written)) return -ENOTBLK; return 0; } const struct iomap_ops ext4_iomap_ops = { .iomap_begin = ext4_iomap_begin, .iomap_end = ext4_iomap_end, }; const struct iomap_ops ext4_iomap_overwrite_ops = { .iomap_begin = ext4_iomap_overwrite_begin, .iomap_end = ext4_iomap_end, }; static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { int ret; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) return -EINVAL; if (ext4_has_inline_data(inode)) { ret = ext4_inline_data_iomap(inode, iomap); if (ret != -EAGAIN) { if (ret == 0 && offset >= iomap->length) ret = -ENOENT; return ret; } } /* * Calculate the first and last logical block respectively. */ map.m_lblk = offset >> blkbits; map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; /* * Fiemap callers may call for offset beyond s_bitmap_maxbytes.
* So handle it here instead of querying ext4_map_blocks(), * since ext4_map_blocks() would warn about it and return an * -EIO error. */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (offset >= sbi->s_bitmap_maxbytes) { map.m_flags = 0; goto set_iomap; } } ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) return ret; set_iomap: ext4_set_iomap(inode, iomap, &map, offset, length, flags); return 0; } const struct iomap_ops ext4_iomap_report_ops = { .iomap_begin = ext4_iomap_begin_report, }; /* * For data=journal mode, folio should be marked dirty only when it was * writeably mapped. When that happens, it was already attached to the * transaction and marked as jbddirty (we take care of this in * ext4_page_mkwrite()). On transaction commit, we writeprotect page mappings * so we should have nothing to do here, except for the case when someone * had the page pinned and dirtied the page through this pin (e.g. by doing * direct IO to it). In that case we'd need to attach buffers here to the * transaction but we cannot due to lock ordering. We cannot just dirty the * folio and leave attached buffers clean, because the buffers' dirty state is * "definitive". We cannot just set the buffers dirty or jbddirty because all * the journalling code will explode. So what we do is to mark the folio * "pending dirty" and next time ext4_writepages() is called, attach buffers * to the transaction appropriately. */ static bool ext4_journalled_dirty_folio(struct address_space *mapping, struct folio *folio) { WARN_ON_ONCE(!folio_buffers(folio)); if (folio_maybe_dma_pinned(folio)) folio_set_checked(folio); return filemap_dirty_folio(mapping, folio); } static bool ext4_dirty_folio(struct address_space *mapping, struct folio *folio) { WARN_ON_ONCE(!folio_test_locked(folio) && !folio_test_dirty(folio)); WARN_ON_ONCE(!folio_buffers(folio)); return block_dirty_folio(mapping, folio); } static int ext4_iomap_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { return iomap_swapfile_activate(sis, file, span, &ext4_iomap_report_ops); } static const struct address_space_operations ext4_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_write_end, .dirty_folio = ext4_dirty_folio, .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_journalled_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_journalled_write_end, .dirty_folio = ext4_journalled_dirty_folio, .bmap = ext4_bmap, .invalidate_folio = ext4_journalled_invalidate_folio, .release_folio = ext4_release_folio, .migrate_folio = buffer_migrate_folio_norefs, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_da_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, .writepages = ext4_writepages, .write_begin = ext4_da_write_begin, .write_end = ext4_da_write_end, .dirty_folio = ext4_dirty_folio, .bmap = ext4_bmap, .invalidate_folio =
ext4_invalidate_folio, .release_folio = ext4_release_folio, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_dax_aops = { .writepages = ext4_dax_writepages, .dirty_folio = noop_dirty_folio, .bmap = ext4_bmap, .swap_activate = ext4_iomap_swap_activate, }; void ext4_set_aops(struct inode *inode) { switch (ext4_inode_journal_mode(inode)) { case EXT4_INODE_ORDERED_DATA_MODE: case EXT4_INODE_WRITEBACK_DATA_MODE: break; case EXT4_INODE_JOURNAL_DATA_MODE: inode->i_mapping->a_ops = &ext4_journalled_aops; return; default: BUG(); } if (IS_DAX(inode)) inode->i_mapping->a_ops = &ext4_dax_aops; else if (test_opt(inode->i_sb, DELALLOC)) inode->i_mapping->a_ops = &ext4_da_aops; else inode->i_mapping->a_ops = &ext4_aops; } /* * Here we can't skip an unwritten buffer even though it usually reads zero * because it might have data in pagecache (eg, if called from ext4_zero_range, * ext4_punch_hole, etc) which needs to be properly zeroed out. Otherwise a * racing writeback can come later and flush the stale pagecache to disk. */ static int __ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { unsigned int offset, blocksize, pos; ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; struct folio *folio; int err = 0; folio = __filemap_get_folio(mapping, from >> PAGE_SHIFT, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mapping_gfp_constraint(mapping, ~__GFP_FS)); if (IS_ERR(folio)) return PTR_ERR(folio); blocksize = inode->i_sb->s_blocksize; iblock = folio->index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); bh = folio_buffers(folio); if (!bh) bh = create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ offset = offset_in_folio(folio, from); pos = blocksize; while (offset >= pos) { bh = bh->b_this_page; iblock++; pos += blocksize; } if (buffer_freed(bh)) { BUFFER_TRACE(bh, "freed: skip"); goto unlock; } if (!buffer_mapped(bh)) { BUFFER_TRACE(bh, "unmapped"); ext4_get_block(inode, iblock, bh, 0); /* unmapped? It's a hole - nothing to do */ if (!buffer_mapped(bh)) { BUFFER_TRACE(bh, "still unmapped"); goto unlock; } } /* Ok, it's mapped. Make sure it's up-to-date */ if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { err = ext4_read_bh_lock(bh, 0, true); if (err) goto unlock; if (fscrypt_inode_uses_fs_layer_crypto(inode)) { /* We expect the key to be set. */ BUG_ON(!fscrypt_has_encryption_key(inode)); err = fscrypt_decrypt_pagecache_blocks(folio, blocksize, bh_offset(bh)); if (err) { clear_buffer_uptodate(bh); goto unlock; } } } if (ext4_should_journal_data(inode)) { BUFFER_TRACE(bh, "get write access"); err = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) goto unlock; } folio_zero_range(folio, offset, length); BUFFER_TRACE(bh, "zeroed end of block"); if (ext4_should_journal_data(inode)) { err = ext4_dirty_journalled_data(handle, bh); } else { err = 0; mark_buffer_dirty(bh); if (ext4_should_order_data(inode)) err = ext4_jbd2_inode_add_write(handle, inode, from, length); } unlock: folio_unlock(folio); folio_put(folio); return err; } /* * ext4_block_zero_page_range() zeros out a mapping of length 'length' * starting from file offset 'from'. The range to be zeroed must * be contained within one block.
If the specified range exceeds * the end of the block it will be shortened to the end of the block * that corresponds to 'from'. */ static int ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { struct inode *inode = mapping->host; unsigned offset = from & (PAGE_SIZE-1); unsigned blocksize = inode->i_sb->s_blocksize; unsigned max = blocksize - (offset & (blocksize - 1)); /* * correct length if it does not fall between * 'from' and the end of the block */ if (length > max || length < 0) length = max; if (IS_DAX(inode)) { return dax_zero_range(inode, from, length, NULL, &ext4_iomap_ops); } return __ext4_block_zero_page_range(handle, mapping, from, length); } /* * ext4_block_truncate_page() zeroes out a mapping from file offset `from' * up to the end of the block which corresponds to `from'. * This is required during truncate. We need to physically zero the tail end * of that block so it doesn't yield old data if the file is later grown. */ static int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from) { unsigned offset = from & (PAGE_SIZE-1); unsigned length; unsigned blocksize; struct inode *inode = mapping->host; /* If we are processing an encrypted inode during orphan list handling */ if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode)) return 0; blocksize = inode->i_sb->s_blocksize; length = blocksize - (offset & (blocksize - 1)); return ext4_block_zero_page_range(handle, mapping, from, length); } int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t length) { struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; unsigned partial_start, partial_end; ext4_fsblk_t start, end; loff_t byte_end = (lstart + length - 1); int err = 0; partial_start = lstart & (sb->s_blocksize - 1); partial_end = byte_end & (sb->s_blocksize - 1); start = lstart >> sb->s_blocksize_bits; end = byte_end >> sb->s_blocksize_bits; /* Handle partial zero within the single block */ if (start == end && (partial_start || (partial_end != sb->s_blocksize - 1))) { err = ext4_block_zero_page_range(handle, mapping, lstart, length); return err; } /* Handle partial zero out on the start of the range */ if (partial_start) { err = ext4_block_zero_page_range(handle, mapping, lstart, sb->s_blocksize); if (err) return err; } /* Handle partial zero out on the end of the range */ if (partial_end != sb->s_blocksize - 1) err = ext4_block_zero_page_range(handle, mapping, byte_end - partial_end, partial_end + 1); return err; } int ext4_can_truncate(struct inode *inode) { if (S_ISREG(inode->i_mode)) return 1; if (S_ISDIR(inode->i_mode)) return 1; if (S_ISLNK(inode->i_mode)) return !ext4_inode_is_fast_symlink(inode); return 0; } /* * We have to make sure i_disksize gets properly updated before we truncate * page cache due to hole punching or zero range. Otherwise i_disksize update * can get lost as it may have been postponed to submission of writeback but * that will never happen after we truncate page cache.
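 * (Concretely: writeback would have pushed i_disksize up to i_size when
 * submitting those dirty pages; once the pages are dropped, that
 * submission never happens and the on-disk size stays stale.)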
*/ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, loff_t len) { handle_t *handle; int ret; loff_t size = i_size_read(inode); WARN_ON(!inode_is_locked(inode)); if (offset > size || offset + len < size) return 0; if (EXT4_I(inode)->i_disksize >= size) return 0; handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); if (IS_ERR(handle)) return PTR_ERR(handle); ext4_update_i_disksize(inode, size); ret = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); return ret; } static inline void ext4_truncate_folio(struct inode *inode, loff_t start, loff_t end) { unsigned long blocksize = i_blocksize(inode); struct folio *folio; /* Nothing to be done if no complete block needs to be truncated. */ if (round_up(start, blocksize) >= round_down(end, blocksize)) return; folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT); if (IS_ERR(folio)) return; if (folio_mkclean(folio)) folio_mark_dirty(folio); folio_unlock(folio); folio_put(folio); } int ext4_truncate_page_cache_block_range(struct inode *inode, loff_t start, loff_t end) { unsigned long blocksize = i_blocksize(inode); int ret; /* * For journalled data we need to write (and checkpoint) pages * before discarding page cache to avoid inconsistent data on disk * in case of a crash before the freeing or unwritten-extent conversion * transaction is committed. */ if (ext4_should_journal_data(inode)) { ret = filemap_write_and_wait_range(inode->i_mapping, start, end - 1); if (ret) return ret; goto truncate_pagecache; } /* * If the block size is less than the page size, the file's mapped * blocks within one page could be freed or converted to unwritten. * So it's necessary to remove writable userspace mappings, and then * ext4_page_mkwrite() can be called during subsequent write access * to these partial folios. */ if (!IS_ALIGNED(start | end, PAGE_SIZE) && blocksize < PAGE_SIZE && start < inode->i_size) { loff_t page_boundary = round_up(start, PAGE_SIZE); ext4_truncate_folio(inode, start, min(page_boundary, end)); if (end > page_boundary) ext4_truncate_folio(inode, round_down(end, PAGE_SIZE), end); } truncate_pagecache: truncate_pagecache_range(inode, start, end - 1); return 0; } static void ext4_wait_dax_page(struct inode *inode) { filemap_invalidate_unlock(inode->i_mapping); schedule(); filemap_invalidate_lock(inode->i_mapping); } int ext4_break_layouts(struct inode *inode) { if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock))) return -EINVAL; return dax_break_layout_inode(inode, ext4_wait_dax_page); } /* * ext4_punch_hole: punches a hole in a file by releasing the blocks * associated with the given offset and length * * @file: File handle of the file * @offset: The offset where the hole will begin * @length: The length of the hole * * Returns: 0 on success or negative on failure */ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; ext4_lblk_t start_lblk, end_lblk; loff_t max_end = sb->s_maxbytes; loff_t end = offset + length; handle_t *handle; unsigned int credits; int ret; trace_ext4_punch_hole(inode, offset, length, 0); WARN_ON_ONCE(!inode_is_locked(inode)); /* * For indirect-block based inodes, make sure that the hole ends at least * one block before the maximum file size.
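 *
 * From userspace a hole is typically punched with fallocate(2); a minimal
 * sketch (illustrative only; fd is an O_RDWR file descriptor):
 *
 *	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 *		  offset, length);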
*/ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize; /* No need to punch hole beyond i_size */ if (offset >= inode->i_size || offset >= max_end) return 0; /* * If the hole extends beyond i_size, set the hole to end after * the page that contains i_size. */ if (end > inode->i_size) end = round_up(inode->i_size, PAGE_SIZE); if (end > max_end) end = max_end; length = end - offset; /* * Attach jinode to inode for jbd2 if we do any zeroing of partial * block. */ if (!IS_ALIGNED(offset | end, sb->s_blocksize)) { ret = ext4_inode_attach_jinode(inode); if (ret < 0) return ret; } ret = ext4_update_disksize_before_punch(inode, offset, length); if (ret) return ret; /* Now release the pages and zero block aligned part of pages */ ret = ext4_truncate_page_cache_block_range(inode, offset, end); if (ret) return ret; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_chunk_trans_extent(inode, 2); else credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(sb, ret); return ret; } ret = ext4_zero_partial_blocks(handle, inode, offset, length); if (ret) goto out_handle; /* If there are blocks to remove, do it */ start_lblk = EXT4_B_TO_LBLK(inode, offset); end_lblk = end >> inode->i_blkbits; if (end_lblk > start_lblk) { ext4_lblk_t hole_len = end_lblk - start_lblk; ext4_fc_track_inode(handle, inode); ext4_check_map_extents_env(inode); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, start_lblk, hole_len); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_remove_space(inode, start_lblk, end_lblk - 1); else ret = ext4_ind_remove_space(handle, inode, start_lblk, end_lblk); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); goto out_handle; } ext4_es_insert_extent(inode, start_lblk, hole_len, ~0, EXTENT_STATUS_HOLE, 0); up_write(&EXT4_I(inode)->i_data_sem); } ext4_fc_track_range(handle, inode, start_lblk, end_lblk); ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) goto out_handle; ext4_update_inode_fsync_trans(handle, inode, 1); if (IS_SYNC(inode)) ext4_handle_sync(handle); out_handle: ext4_journal_stop(handle); return ret; } int ext4_inode_attach_jinode(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct jbd2_inode *jinode; if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal) return 0; jinode = jbd2_alloc_inode(GFP_KERNEL); spin_lock(&inode->i_lock); if (!ei->jinode) { if (!jinode) { spin_unlock(&inode->i_lock); return -ENOMEM; } ei->jinode = jinode; jbd2_journal_init_jbd_inode(ei->jinode, inode); jinode = NULL; } spin_unlock(&inode->i_lock); if (unlikely(jinode != NULL)) jbd2_free_inode(jinode); return 0; } /* * ext4_truncate() * * We block out ext4_get_block() block instantiations across the entire * transaction, and VFS/VM ensures that ext4_truncate() cannot run * simultaneously on behalf of the same inode. * * As we work through the truncate and commit bits of it to the journal there * is one core, guiding principle: the file's tree must always be consistent on * disk. We must be able to restart the truncate after a crash. * * The file's tree may be transiently inconsistent in memory (although it * probably isn't), but whenever we close off and commit a journal transaction, * the contents of (the filesystem + the journal) must be consistent and * restartable.
It's pretty simple, really: bottom up, right to left (although * left-to-right works OK too). * * Note that at recovery time, journal replay occurs *before* the restart of * truncate against the orphan inode list. * * The committed inode has the new, desired i_size (which is the same as * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see * that this inode's truncate did not complete and it will again call * ext4_truncate() to have another go. So there will be instantiated blocks * to the right of the truncation point in a crashed ext4 filesystem. But * that's fine - as long as they are linked from the inode, the post-crash * ext4_truncate() run will find them and release them. */ int ext4_truncate(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); unsigned int credits; int err = 0, err2; handle_t *handle; struct address_space *mapping = inode->i_mapping; /* * There is a possibility that we're either freeing the inode * or it's a completely new inode. In those cases we might not * have i_rwsem locked because it's not necessary. */ if (!(inode->i_state & (I_NEW|I_FREEING))) WARN_ON(!inode_is_locked(inode)); trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) goto out_trace; if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); if (ext4_has_inline_data(inode)) { int has_inline = 1; err = ext4_inline_data_truncate(inode, &has_inline); if (err || has_inline) goto out_trace; } /* If we zero-out tail of the page, we have to create jinode for jbd2 */ if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { err = ext4_inode_attach_jinode(inode); if (err) goto out_trace; } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_chunk_trans_extent(inode, 1); else credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto out_trace; } if (inode->i_size & (inode->i_sb->s_blocksize - 1)) ext4_block_truncate_page(handle, mapping, inode->i_size); /* * We add the inode to the orphan list, so that if this * truncate spans multiple transactions, and we crash, we will * resume the truncate when the filesystem recovers. It also * marks the inode dirty, to catch the new size. * * Implication: the file must always be in a sane, consistent * truncatable state while each transaction commits. */ err = ext4_orphan_add(handle, inode); if (err) goto out_stop; ext4_fc_track_inode(handle, inode); ext4_check_map_extents_env(inode); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) err = ext4_ext_truncate(handle, inode); else ext4_ind_truncate(handle, inode); up_write(&ei->i_data_sem); if (err) goto out_stop; if (IS_SYNC(inode)) ext4_handle_sync(handle); out_stop: /* * If this was a simple ftruncate() and the file will remain alive, * then we need to clear up the orphan record which we created above. * However, if this was a real unlink then we were called by * ext4_evict_inode(), and we allow that function to clean up the * orphan info for us. 
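 * (Hence the i_nlink check below: a non-zero link count means the inode
 * stays alive, i.e. the plain ftruncate() case.)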
*/ if (inode->i_nlink) ext4_orphan_del(handle, inode); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); err2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(err2 && !err)) err = err2; ext4_journal_stop(handle); out_trace: trace_ext4_truncate_exit(inode); return err; } static inline u64 ext4_inode_peek_iversion(const struct inode *inode) { if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) return inode_peek_iversion_raw(inode); else return inode_peek_iversion(inode); } static int ext4_inode_blocks_set(struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { struct inode *inode = &(ei->vfs_inode); u64 i_blocks = READ_ONCE(inode->i_blocks); struct super_block *sb = inode->i_sb; if (i_blocks <= ~0U) { /* * i_blocks can be represented in a 32 bit variable * as a multiple of 512 bytes */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = 0; ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); return 0; } /* * This should never happen since sb->s_maxbytes should not have * allowed this, sb->s_maxbytes was set according to the huge_file * feature in ext4_fill_super(). */ if (!ext4_has_feature_huge_file(sb)) return -EFSCORRUPTED; if (i_blocks <= 0xffffffffffffULL) { /* * i_blocks can be represented in a 48 bit variable * as a multiple of 512 bytes */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); } else { ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); /* i_blocks is stored in units of the filesystem block size */ i_blocks = i_blocks >> (inode->i_blkbits - 9); raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); } return 0; } static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode) { struct ext4_inode_info *ei = EXT4_I(inode); uid_t i_uid; gid_t i_gid; projid_t i_projid; int block; int err; err = ext4_inode_blocks_set(raw_inode, ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); i_uid = i_uid_read(inode); i_gid = i_gid_read(inode); i_projid = from_kprojid(&init_user_ns, ei->i_projid); if (!(test_opt(inode->i_sb, NO_UID32))) { raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); /* * Fix up interoperability with old kernels. Otherwise, * old inodes get re-used with the upper 16 bits of the * uid/gid intact.
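 * For example, uid 100000 (0x186A0) is stored as i_uid_low = 0x86A0 and
 * i_uid_high = 0x1; a deleted inode keeps only the low 16 bits (see the
 * zeroing below) so that an old 16-bit-uid kernel reusing it does not
 * inherit a stale high part.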
*/ if (ei->i_dtime && list_empty(&ei->i_orphan)) { raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } else { raw_inode->i_uid_high = cpu_to_le16(high_16_bits(i_uid)); raw_inode->i_gid_high = cpu_to_le16(high_16_bits(i_gid)); } } else { raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid)); raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); EXT4_INODE_SET_CTIME(inode, raw_inode); EXT4_INODE_SET_MTIME(inode, raw_inode); EXT4_INODE_SET_ATIME(inode, raw_inode); EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) raw_inode->i_file_acl_high = cpu_to_le16(ei->i_file_acl >> 32); raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); ext4_isize_set(raw_inode, ei->i_disksize); raw_inode->i_generation = cpu_to_le32(inode->i_generation); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { if (old_valid_dev(inode->i_rdev)) { raw_inode->i_block[0] = cpu_to_le32(old_encode_dev(inode->i_rdev)); raw_inode->i_block[1] = 0; } else { raw_inode->i_block[0] = 0; raw_inode->i_block[1] = cpu_to_le32(new_encode_dev(inode->i_rdev)); raw_inode->i_block[2] = 0; } } else if (!ext4_has_inline_data(inode)) { for (block = 0; block < EXT4_N_BLOCKS; block++) raw_inode->i_block[block] = ei->i_data[block]; } if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { u64 ivers = ext4_inode_peek_iversion(inode); raw_inode->i_disk_version = cpu_to_le32(ivers); if (ei->i_extra_isize) { if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) raw_inode->i_version_hi = cpu_to_le32(ivers >> 32); raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); } } if (i_projid != EXT4_DEF_PROJID && !ext4_has_feature_project(inode->i_sb)) err = err ?: -EFSCORRUPTED; if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) raw_inode->i_projid = cpu_to_le32(i_projid); ext4_inode_csum_set(inode, raw_inode, ei); return err; } /* * ext4_get_inode_loc returns with an extra refcount against the inode's * underlying buffer_head on success. If we pass 'inode' and it does not * have in-inode xattr, we have all inode data in memory that is needed * to recreate the on-disk version of this inode. 
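 * (That is what lets the function below skip reading the inode table
 * block when the inode bitmap shows every other inode in the block is
 * free; see the bitmap check in its body.)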
*/ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, struct inode *inode, struct ext4_iloc *iloc, ext4_fsblk_t *ret_block) { struct ext4_group_desc *gdp; struct buffer_head *bh; ext4_fsblk_t block; struct blk_plug plug; int inodes_per_block, inode_offset; iloc->bh = NULL; if (ino < EXT4_ROOT_INO || ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) return -EFSCORRUPTED; iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); if (!gdp) return -EIO; /* * Figure out the offset within the block group inode table */ inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; inode_offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)); iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); block = ext4_inode_table(sb, gdp); if ((block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) || (block >= ext4_blocks_count(EXT4_SB(sb)->s_es))) { ext4_error(sb, "Invalid inode table block %llu in " "block_group %u", block, iloc->block_group); return -EFSCORRUPTED; } block += (inode_offset / inodes_per_block); bh = sb_getblk(sb, block); if (unlikely(!bh)) return -ENOMEM; if (ext4_buffer_uptodate(bh)) goto has_buffer; lock_buffer(bh); if (ext4_buffer_uptodate(bh)) { /* Someone brought it uptodate while we waited */ unlock_buffer(bh); goto has_buffer; } /* * If we have all information of the inode in memory and this * is the only valid inode in the block, we need not read the * block. */ if (inode && !ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { struct buffer_head *bitmap_bh; int i, start; start = inode_offset & ~(inodes_per_block - 1); /* Is the inode bitmap in cache? */ bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); if (unlikely(!bitmap_bh)) goto make_io; /* * If the inode bitmap isn't in cache then the * optimisation may end up performing two reads instead * of one, so skip it. */ if (!buffer_uptodate(bitmap_bh)) { brelse(bitmap_bh); goto make_io; } for (i = start; i < start + inodes_per_block; i++) { if (i == inode_offset) continue; if (ext4_test_bit(i, bitmap_bh->b_data)) break; } brelse(bitmap_bh); if (i == start + inodes_per_block) { struct ext4_inode *raw_inode = (struct ext4_inode *) (bh->b_data + iloc->offset); /* all other inodes are free, so skip I/O */ memset(bh->b_data, 0, bh->b_size); if (!ext4_test_inode_state(inode, EXT4_STATE_NEW)) ext4_fill_raw_inode(inode, raw_inode); set_buffer_uptodate(bh); unlock_buffer(bh); goto has_buffer; } } make_io: /* * If we need to do any I/O, try to pre-readahead extra * blocks from the inode table. */ blk_start_plug(&plug); if (EXT4_SB(sb)->s_inode_readahead_blks) { ext4_fsblk_t b, end, table; unsigned num; __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; table = ext4_inode_table(sb, gdp); /* s_inode_readahead_blks is always a power of 2 */ b = block & ~((ext4_fsblk_t) ra_blks - 1); if (table > b) b = table; end = b + ra_blks; num = EXT4_INODES_PER_GROUP(sb); if (ext4_has_group_desc_csum(sb)) num -= ext4_itable_unused_count(sb, gdp); table += num / inodes_per_block; if (end > table) end = table; while (b <= end) ext4_sb_breadahead_unmovable(sb, b++); } /* * There are other valid inodes in the buffer, this inode * has in-inode xattrs, or we don't have this inode in memory. * Read the block from disk. 
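 * (The readahead window computed above is power-of-2 aligned: with
 * s_inode_readahead_blks == 32, an inode in itable block 100 starts
 * readahead at block 96, clamped to the blocks covered by in-use
 * inodes.)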
*/ trace_ext4_load_inode(sb, ino); ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL, ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO)); blk_finish_plug(&plug); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { if (ret_block) *ret_block = block; brelse(bh); return -EIO; } has_buffer: iloc->bh = bh; return 0; } static int __ext4_get_inode_loc_noinmem(struct inode *inode, struct ext4_iloc *iloc) { ext4_fsblk_t err_blk = 0; int ret; ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, NULL, iloc, &err_blk); if (ret == -EIO) ext4_error_inode_block(inode, err_blk, EIO, "unable to read itable block"); return ret; } int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) { ext4_fsblk_t err_blk = 0; int ret; ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, inode, iloc, &err_blk); if (ret == -EIO) ext4_error_inode_block(inode, err_blk, EIO, "unable to read itable block"); return ret; } int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino, struct ext4_iloc *iloc) { return __ext4_get_inode_loc(sb, ino, NULL, iloc, NULL); } static bool ext4_should_enable_dax(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (test_opt2(inode->i_sb, DAX_NEVER)) return false; if (!S_ISREG(inode->i_mode)) return false; if (ext4_should_journal_data(inode)) return false; if (ext4_has_inline_data(inode)) return false; if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT)) return false; if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY)) return false; if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) return false; if (test_opt(inode->i_sb, DAX_ALWAYS)) return true; return ext4_test_inode_flag(inode, EXT4_INODE_DAX); } void ext4_set_inode_flags(struct inode *inode, bool init) { unsigned int flags = EXT4_I(inode)->i_flags; unsigned int new_fl = 0; WARN_ON_ONCE(IS_DAX(inode) && init); if (flags & EXT4_SYNC_FL) new_fl |= S_SYNC; if (flags & EXT4_APPEND_FL) new_fl |= S_APPEND; if (flags & EXT4_IMMUTABLE_FL) new_fl |= S_IMMUTABLE; if (flags & EXT4_NOATIME_FL) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; /* Because of the way inode_set_flags() works we must preserve S_DAX * here if already set. 
*/ new_fl |= (inode->i_flags & S_DAX); if (init && ext4_should_enable_dax(inode)) new_fl |= S_DAX; if (flags & EXT4_ENCRYPT_FL) new_fl |= S_ENCRYPTED; if (flags & EXT4_CASEFOLD_FL) new_fl |= S_CASEFOLD; if (flags & EXT4_VERITY_FL) new_fl |= S_VERITY; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX| S_ENCRYPTED|S_CASEFOLD|S_VERITY); } static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { blkcnt_t i_blocks; struct inode *inode = &(ei->vfs_inode); struct super_block *sb = inode->i_sb; if (ext4_has_feature_huge_file(sb)) { /* we are using the combined 48 bit field */ i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | le32_to_cpu(raw_inode->i_blocks_lo); if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) { /* i_blocks is stored in units of the filesystem block size */ return i_blocks << (inode->i_blkbits - 9); } else { return i_blocks; } } else { return le32_to_cpu(raw_inode->i_blocks_lo); } } static inline int ext4_iget_extra_inode(struct inode *inode, struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { __le32 *magic = (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; if (EXT4_INODE_HAS_XATTR_SPACE(inode) && *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { int err; err = xattr_check_inode(inode, IHDR(inode, raw_inode), ITAIL(inode, raw_inode)); if (err) return err; ext4_set_inode_state(inode, EXT4_STATE_XATTR); err = ext4_find_inline_data_nolock(inode); if (!err && ext4_has_inline_data(inode)) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); return err; } else EXT4_I(inode)->i_inline_off = 0; return 0; } int ext4_get_projid(struct inode *inode, kprojid_t *projid) { if (!ext4_has_feature_project(inode->i_sb)) return -EOPNOTSUPP; *projid = EXT4_I(inode)->i_projid; return 0; } /* * ext4 has a self-managed i_version for ea inodes; it stores the lower 32 bits * of the refcount in i_version, so use raw values if the inode has the * EXT4_EA_INODE_FL flag set.
*/ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val) { if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) inode_set_iversion_raw(inode, val); else inode_set_iversion_queried(inode, val); } static int check_igot_inode(struct inode *inode, ext4_iget_flags flags, const char *function, unsigned int line) { const char *err_str; if (flags & EXT4_IGET_EA_INODE) { if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { err_str = "missing EA_INODE flag"; goto error; } if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) || EXT4_I(inode)->i_file_acl) { err_str = "ea_inode with extended attributes"; goto error; } } else { if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { /* * open_by_handle_at() could provide an old inode number * that has since been reused for an ea_inode; this does * not indicate filesystem corruption */ if (flags & EXT4_IGET_HANDLE) return -ESTALE; err_str = "unexpected EA_INODE flag"; goto error; } } if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) { err_str = "unexpected bad inode w/o EXT4_IGET_BAD"; goto error; } return 0; error: ext4_error_inode(inode, function, line, 0, "%s", err_str); return -EFSCORRUPTED; } static bool ext4_should_enable_large_folio(struct inode *inode) { struct super_block *sb = inode->i_sb; if (!S_ISREG(inode->i_mode)) return false; if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) return false; if (ext4_has_feature_verity(sb)) return false; if (ext4_has_feature_encrypt(sb)) return false; return true; } /* * Limit the maximum folio order to 2048 blocks to prevent overestimation * of reserve handle credits during the folio writeback in environments * where the PAGE_SIZE exceeds 4KB. */ #define EXT4_MAX_PAGECACHE_ORDER(i) \ umin(MAX_PAGECACHE_ORDER, (11 + (i)->i_blkbits - PAGE_SHIFT)) void ext4_set_inode_mapping_order(struct inode *inode) { if (!ext4_should_enable_large_folio(inode)) return; mapping_set_folio_order_range(inode->i_mapping, 0, EXT4_MAX_PAGECACHE_ORDER(inode)); } struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ext4_iget_flags flags, const char *function, unsigned int line) { struct ext4_iloc iloc; struct ext4_inode *raw_inode; struct ext4_inode_info *ei; struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct inode *inode; journal_t *journal = EXT4_SB(sb)->s_journal; long ret; loff_t size; int block; uid_t i_uid; gid_t i_gid; projid_t i_projid; if ((!(flags & EXT4_IGET_SPECIAL) && is_special_ino(sb, ino)) || (ino < EXT4_ROOT_INO) || (ino > le32_to_cpu(es->s_inodes_count))) { if (flags & EXT4_IGET_HANDLE) return ERR_PTR(-ESTALE); __ext4_error(sb, function, line, false, EFSCORRUPTED, 0, "inode #%lu: comm %s: iget: illegal inode #", ino, current->comm); return ERR_PTR(-EFSCORRUPTED); } inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) { ret = check_igot_inode(inode, flags, function, line); if (ret) { iput(inode); return ERR_PTR(ret); } return inode; } ei = EXT4_I(inode); iloc.bh = NULL; ret = __ext4_get_inode_loc_noinmem(inode, &iloc); if (ret < 0) goto bad_inode; raw_inode = ext4_raw_inode(&iloc); if ((flags & EXT4_IGET_HANDLE) && (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) { ret = -ESTALE; goto bad_inode; } if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > EXT4_INODE_SIZE(inode->i_sb) || (ei->i_extra_isize & 3)) { ext4_error_inode(inode, 
function, line, 0, "iget: bad extra_isize %u " "(inode size %u)", ei->i_extra_isize, EXT4_INODE_SIZE(inode->i_sb)); ret = -EFSCORRUPTED; goto bad_inode; } } else ei->i_extra_isize = 0; /* Precompute checksum seed for inode metadata */ if (ext4_has_feature_metadata_csum(sb)) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = raw_inode->i_generation; csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } if ((!ext4_inode_csum_verify(inode, raw_inode, ei) || ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) && (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))) { ext4_error_inode_err(inode, function, line, 0, EFSBADCRC, "iget: checksum invalid"); ret = -EFSBADCRC; goto bad_inode; } inode->i_mode = le16_to_cpu(raw_inode->i_mode); i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); if (ext4_has_feature_project(sb) && EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid); else i_projid = EXT4_DEF_PROJID; if (!(test_opt(inode->i_sb, NO_UID32))) { i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } i_uid_write(inode, i_uid); i_gid_write(inode, i_gid); ei->i_projid = make_kprojid(&init_user_ns, i_projid); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ei->i_inline_off = 0; ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. * This is needed because nfsd might try to access dead inodes. * The test is the same one that e2fsck uses. * NeilBrown 1999oct15 */ if (inode->i_nlink == 0) { if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL || !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && ino != EXT4_BOOT_LOADER_INO) { /* this inode is deleted or unallocated */ if (flags & EXT4_IGET_SPECIAL) { ext4_error_inode(inode, function, line, 0, "iget: special inode unallocated"); ret = -EFSCORRUPTED; } else ret = -ESTALE; goto bad_inode; } /* The only unlinked inodes we let through here have * valid i_mode and are being read by the orphan * recovery code: that's fine, we're about to complete * the process of deleting those. * OR it is the EXT4_BOOT_LOADER_INO which is * not initialized on a new filesystem. */ } ei->i_flags = le32_to_cpu(raw_inode->i_flags); ext4_set_inode_flags(inode, true); inode->i_blocks = ext4_inode_blocks(raw_inode, ei); ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); if (ext4_has_feature_64bit(sb)) ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(sb, raw_inode); size = i_size_read(inode); if (size < 0 || size > ext4_get_maxbytes(inode)) { ext4_error_inode(inode, function, line, 0, "iget: bad i_size value: %lld", size); ret = -EFSCORRUPTED; goto bad_inode; } /* * If dir_index is not enabled but there's a dir with the INDEX flag set, * we'd normally treat htree data as empty space. But with metadata * checksumming enabled that would corrupt the checksums, so forbid it.
*/ if (!ext4_has_feature_dir_index(sb) && ext4_has_feature_metadata_csum(sb) && ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { ext4_error_inode(inode, function, line, 0, "iget: Dir with htree data on filesystem without dir_index feature."); ret = -EFSCORRUPTED; goto bad_inode; } ei->i_disksize = inode->i_size; #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; #endif inode->i_generation = le32_to_cpu(raw_inode->i_generation); ei->i_block_group = iloc.block_group; ei->i_last_alloc_group = ~0; /* * NOTE! The in-memory inode i_data array is in little-endian order * even on big-endian machines: we do NOT byteswap the block numbers! */ for (block = 0; block < EXT4_N_BLOCKS; block++) ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); ext4_fc_init_inode(&ei->vfs_inode); /* * Set transaction id's of transactions that have to be committed * to finish f[data]sync. We set them to currently running transaction * as we cannot be sure that the inode or some of its metadata isn't * part of the transaction - the inode could have been reclaimed and * now it is reread from disk. */ if (journal) { transaction_t *transaction; tid_t tid; read_lock(&journal->j_state_lock); if (journal->j_running_transaction) transaction = journal->j_running_transaction; else transaction = journal->j_committing_transaction; if (transaction) tid = transaction->t_tid; else tid = journal->j_commit_sequence; read_unlock(&journal->j_state_lock); ei->i_sync_tid = tid; ei->i_datasync_tid = tid; } if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { if (ei->i_extra_isize == 0) { /* The extra space is currently unused. Use it. */ BUILD_BUG_ON(sizeof(struct ext4_inode) & 3); ei->i_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; } else { ret = ext4_iget_extra_inode(inode, raw_inode, ei); if (ret) goto bad_inode; } } EXT4_INODE_GET_CTIME(inode, raw_inode); EXT4_INODE_GET_ATIME(inode, raw_inode); EXT4_INODE_GET_MTIME(inode, raw_inode); EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { u64 ivers = le32_to_cpu(raw_inode->i_disk_version); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) ivers |= (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; } ext4_inode_set_iversion_queried(inode, ivers); } ret = 0; if (ei->i_file_acl && !ext4_inode_block_valid(inode, ei->i_file_acl, 1)) { ext4_error_inode(inode, function, line, 0, "iget: bad extended attribute block %llu", ei->i_file_acl); ret = -EFSCORRUPTED; goto bad_inode; } else if (!ext4_has_inline_data(inode)) { /* validate the block references in the inode */ if (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) && (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || (S_ISLNK(inode->i_mode) && !ext4_inode_is_fast_symlink(inode)))) { if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_check_inode(inode); else ret = ext4_ind_check_inode(inode); } } if (ret) goto bad_inode; if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; } else if (S_ISLNK(inode->i_mode)) { /* VFS does not allow setting these so must be corruption */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { ext4_error_inode(inode, function, line, 0, "iget: immutable or append flags " "not allowed on symlinks"); ret = -EFSCORRUPTED; goto bad_inode; } if 
(IS_ENCRYPTED(inode)) { inode->i_op = &ext4_encrypted_symlink_inode_operations; } else if (ext4_inode_is_fast_symlink(inode)) { inode->i_op = &ext4_fast_symlink_inode_operations; if (inode->i_size == 0 || inode->i_size >= sizeof(ei->i_data) || strnlen((char *)ei->i_data, inode->i_size + 1) != inode->i_size) { ext4_error_inode(inode, function, line, 0, "invalid fast symlink length %llu", (unsigned long long)inode->i_size); ret = -EFSCORRUPTED; goto bad_inode; } inode_set_cached_link(inode, (char *)ei->i_data, inode->i_size); } else { inode->i_op = &ext4_symlink_inode_operations; } } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { inode->i_op = &ext4_special_inode_operations; if (raw_inode->i_block[0]) init_special_inode(inode, inode->i_mode, old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); else init_special_inode(inode, inode->i_mode, new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); } else if (ino == EXT4_BOOT_LOADER_INO) { make_bad_inode(inode); } else { ret = -EFSCORRUPTED; ext4_error_inode(inode, function, line, 0, "iget: bogus i_mode (%o)", inode->i_mode); goto bad_inode; } if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) { ext4_error_inode(inode, function, line, 0, "casefold flag without casefold feature"); ret = -EFSCORRUPTED; goto bad_inode; } ext4_set_inode_mapping_order(inode); ret = check_igot_inode(inode, flags, function, line); /* * -ESTALE here means there is nothing inherently wrong with the inode, * it's just not an inode we can return for an fhandle lookup. */ if (ret == -ESTALE) { brelse(iloc.bh); unlock_new_inode(inode); iput(inode); return ERR_PTR(-ESTALE); } if (ret) goto bad_inode; brelse(iloc.bh); unlock_new_inode(inode); return inode; bad_inode: brelse(iloc.bh); iget_failed(inode); return ERR_PTR(ret); } static void __ext4_update_other_inode_time(struct super_block *sb, unsigned long orig_ino, unsigned long ino, struct ext4_inode *raw_inode) { struct inode *inode; inode = find_inode_by_ino_rcu(sb, ino); if (!inode) return; if (!inode_is_dirtytime_only(inode)) return; spin_lock(&inode->i_lock); if (inode_is_dirtytime_only(inode)) { struct ext4_inode_info *ei = EXT4_I(inode); inode->i_state &= ~I_DIRTY_TIME; spin_unlock(&inode->i_lock); spin_lock(&ei->i_raw_lock); EXT4_INODE_SET_CTIME(inode, raw_inode); EXT4_INODE_SET_MTIME(inode, raw_inode); EXT4_INODE_SET_ATIME(inode, raw_inode); ext4_inode_csum_set(inode, raw_inode, ei); spin_unlock(&ei->i_raw_lock); trace_ext4_other_inode_update_time(inode, orig_ino); return; } spin_unlock(&inode->i_lock); } /* * Opportunistically update the other time fields for other inodes in * the same inode table block. */ static void ext4_update_other_inodes_time(struct super_block *sb, unsigned long orig_ino, char *buf) { unsigned long ino; int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; int inode_size = EXT4_INODE_SIZE(sb); /* * Calculate the first inode in the inode table block. Inode * numbers are one-based. That is, the first inode in a block * (assuming 4k blocks and 256 byte inodes) is (n*16 + 1). */ ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1; rcu_read_lock(); for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) { if (ino == orig_ino) continue; __ext4_update_other_inode_time(sb, orig_ino, ino, (struct ext4_inode *)buf); } rcu_read_unlock(); } /* * Post the struct inode info into an on-disk inode location in the * buffer-cache. 
This gobbles the caller's reference to the * buffer_head in the inode location struct. * * The caller must have write access to iloc->bh. */ static int ext4_do_update_inode(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) { struct ext4_inode *raw_inode = ext4_raw_inode(iloc); struct ext4_inode_info *ei = EXT4_I(inode); struct buffer_head *bh = iloc->bh; struct super_block *sb = inode->i_sb; int err; int need_datasync = 0, set_large_file = 0; spin_lock(&ei->i_raw_lock); /* * For fields not tracked in the in-memory inode, initialise them * to zero for new inodes. */ if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode)) need_datasync = 1; if (ei->i_disksize > 0x7fffffffULL) { if (!ext4_has_feature_large_file(sb) || EXT4_SB(sb)->s_es->s_rev_level == cpu_to_le32(EXT4_GOOD_OLD_REV)) set_large_file = 1; } err = ext4_fill_raw_inode(inode, raw_inode); spin_unlock(&ei->i_raw_lock); if (err) { EXT4_ERROR_INODE(inode, "corrupted inode contents"); goto out_brelse; } if (inode->i_sb->s_flags & SB_LAZYTIME) ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, bh->b_data); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) goto out_error; ext4_clear_inode_state(inode, EXT4_STATE_NEW); if (set_large_file) { BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); err = ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh, EXT4_JTR_NONE); if (err) goto out_error; lock_buffer(EXT4_SB(sb)->s_sbh); ext4_set_feature_large_file(sb); ext4_superblock_csum_set(sb); unlock_buffer(EXT4_SB(sb)->s_sbh); ext4_handle_sync(handle); err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); } ext4_update_inode_fsync_trans(handle, inode, need_datasync); out_error: ext4_std_error(inode->i_sb, err); out_brelse: brelse(bh); return err; } /* * ext4_write_inode() * * We are called from a few places: * * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. * Here, there will be no transaction running. We wait for any running * transaction to commit. * * - Within flush work (sys_sync(), kupdate and such). * We wait on commit, if told to. * * - Within iput_final() -> write_inode_now() * We wait on commit, if told to. * * In all cases it is actually safe for us to return without doing anything, * because the inode has been copied into a raw inode buffer in * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL * writeback. * * Note that we are absolutely dependent upon all inode dirtiers doing the * right thing: they *must* call mark_inode_dirty() after dirtying info in * which we are interested. * * It would be a bug for them to not do this. The code: * * mark_inode_dirty(inode) * stuff(); * inode->i_size = expr; * * is in error because write_inode() could occur while `stuff()' is running, * and the new i_size will be lost. Plus the inode will no longer be on the * superblock's dirty inode list. */ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; err = ext4_emergency_state(inode->i_sb); if (unlikely(err)) return err; if (EXT4_SB(inode->i_sb)->s_journal) { if (ext4_journal_current_handle()) { ext4_debug("called recursively, non-PF_MEMALLOC!\n"); dump_stack(); return -EIO; } /* * No need to force transaction in WB_SYNC_NONE mode. 
Also * ext4_sync_fs() will force the commit after everything is * written. */ if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) return 0; err = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, EXT4_I(inode)->i_sync_tid); } else { struct ext4_iloc iloc; err = __ext4_get_inode_loc_noinmem(inode, &iloc); if (err) return err; /* * sync(2) will flush the whole buffer cache. No need to do * it here separately for each inode. */ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO, "IO error syncing inode"); err = -EIO; } brelse(iloc.bh); } return err; } /* * In data=journal mode ext4_journalled_invalidate_folio() may fail to invalidate * buffers that are attached to a folio straddling i_size and are undergoing * commit. In that case we have to wait for commit to finish and try again. */ static void ext4_wait_for_tail_page_commit(struct inode *inode) { unsigned offset; journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid; int ret; bool has_transaction; offset = inode->i_size & (PAGE_SIZE - 1); /* * If the folio is fully truncated, we don't need to wait for any commit * (and we even should not as __ext4_journalled_invalidate_folio() may * strip all buffers from the folio but keep the folio dirty which can then * confuse e.g. concurrent ext4_writepages() seeing dirty folio without * buffers). Also we don't need to wait for any commit if all buffers in * the folio remain valid. This is most beneficial for the common case of * blocksize == PAGESIZE. */ if (!offset || offset > (PAGE_SIZE - i_blocksize(inode))) return; while (1) { struct folio *folio = filemap_lock_folio(inode->i_mapping, inode->i_size >> PAGE_SHIFT); if (IS_ERR(folio)) return; ret = __ext4_journalled_invalidate_folio(folio, offset, folio_size(folio) - offset); folio_unlock(folio); folio_put(folio); if (ret != -EBUSY) return; has_transaction = false; read_lock(&journal->j_state_lock); if (journal->j_committing_transaction) { commit_tid = journal->j_committing_transaction->t_tid; has_transaction = true; } read_unlock(&journal->j_state_lock); if (has_transaction) jbd2_log_wait_commit(journal, commit_tid); } } /* * ext4_setattr() * * Called from notify_change. * * We want to trap VFS attempts to truncate the file as soon as * possible. In particular, we want to make sure that when the VFS * shrinks i_size, we put the inode on the orphan list and modify * i_disksize immediately, so that during the subsequent flushing of * dirty pages and freeing of disk blocks, we can guarantee that any * commit will leave the blocks being flushed in an unused state on * disk. (On recovery, the inode will get truncated and the blocks will * be freed, so we have a strong guarantee that no future commit will * leave these blocks visible to the user.) * * Another thing we have to assure is that if we are in ordered mode * and the inode is still attached to the committing transaction, we must * start writeout of all the dirty pages which are being truncated. * This way we are sure that all the data written in the previous * transaction are already on disk (truncate waits for pages under * writeback). * * Called with inode->i_rwsem down.
*/ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error, rc = 0; int orphan = 0; const unsigned int ia_valid = attr->ia_valid; bool inc_ivers = true; error = ext4_emergency_state(inode->i_sb); if (unlikely(error)) return error; if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; if (unlikely(IS_APPEND(inode) && (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)))) return -EPERM; error = setattr_prepare(idmap, dentry, attr); if (error) return error; error = fscrypt_prepare_setattr(dentry, attr); if (error) return error; error = fsverity_prepare_setattr(dentry, attr); if (error) return error; if (is_quota_modification(idmap, inode, attr)) { error = dquot_initialize(inode); if (error) return error; } if (i_uid_needs_update(idmap, attr, inode) || i_gid_needs_update(idmap, attr, inode)) { handle_t *handle; /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? - but truncate inode update has it) */ handle = ext4_journal_start(inode, EXT4_HT_QUOTA, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; } /* dquot_transfer() calls back ext4_get_inode_usage() which * counts xattr inode references. */ down_read(&EXT4_I(inode)->xattr_sem); error = dquot_transfer(idmap, inode, attr); up_read(&EXT4_I(inode)->xattr_sem); if (error) { ext4_journal_stop(handle); return error; } /* Update corresponding info in inode so that everything is in * one transaction */ i_uid_update(idmap, attr, inode); i_gid_update(idmap, attr, inode); error = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); if (unlikely(error)) { return error; } } if (attr->ia_valid & ATTR_SIZE) { handle_t *handle; loff_t oldsize = inode->i_size; loff_t old_disksize; int shrink = (attr->ia_size < inode->i_size); if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (attr->ia_size > sbi->s_bitmap_maxbytes) { return -EFBIG; } } if (!S_ISREG(inode->i_mode)) { return -EINVAL; } if (attr->ia_size == inode->i_size) inc_ivers = false; if (shrink) { if (ext4_should_order_data(inode)) { error = ext4_begin_ordered_truncate(inode, attr->ia_size); if (error) goto err_out; } /* * Blocks are going to be removed from the inode. Wait * for dio in flight. */ inode_dio_wait(inode); } filemap_invalidate_lock(inode->i_mapping); rc = ext4_break_layouts(inode); if (rc) { filemap_invalidate_unlock(inode->i_mapping); goto err_out; } if (attr->ia_size != inode->i_size) { /* attach jbd2 jinode for EOF folio tail zeroing */ if (attr->ia_size & (inode->i_sb->s_blocksize - 1) || oldsize & (inode->i_sb->s_blocksize - 1)) { error = ext4_inode_attach_jinode(inode); if (error) goto out_mmap_sem; } handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto out_mmap_sem; } if (ext4_handle_valid(handle) && shrink) { error = ext4_orphan_add(handle, inode); orphan = 1; } /* * Update c/mtime and tail zero the EOF folio on * truncate up. ext4_truncate() handles the shrink case * below. */ if (!shrink) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (oldsize & (inode->i_sb->s_blocksize - 1)) ext4_block_truncate_page(handle, inode->i_mapping, oldsize); } if (shrink) ext4_fc_track_range(handle, inode, (attr->ia_size > 0 ? 
attr->ia_size - 1 : 0) >> inode->i_sb->s_blocksize_bits, EXT_MAX_BLOCKS - 1); else ext4_fc_track_range( handle, inode, (oldsize > 0 ? oldsize - 1 : oldsize) >> inode->i_sb->s_blocksize_bits, (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >> inode->i_sb->s_blocksize_bits); down_write(&EXT4_I(inode)->i_data_sem); old_disksize = EXT4_I(inode)->i_disksize; EXT4_I(inode)->i_disksize = attr->ia_size; /* * We have to update i_size under i_data_sem together * with i_disksize to avoid races with writeback code * running ext4_wb_update_i_disksize(). */ if (!error) i_size_write(inode, attr->ia_size); else EXT4_I(inode)->i_disksize = old_disksize; up_write(&EXT4_I(inode)->i_data_sem); rc = ext4_mark_inode_dirty(handle, inode); if (!error) error = rc; ext4_journal_stop(handle); if (error) goto out_mmap_sem; if (!shrink) { pagecache_isize_extended(inode, oldsize, inode->i_size); } else if (ext4_should_journal_data(inode)) { ext4_wait_for_tail_page_commit(inode); } } /* * Truncate pagecache after we've waited for commit * in data=journal mode to make pages freeable. */ truncate_pagecache(inode, inode->i_size); /* * Call ext4_truncate() even if i_size didn't change to * truncate possible preallocated blocks. */ if (attr->ia_size <= oldsize) { rc = ext4_truncate(inode); if (rc) error = rc; } out_mmap_sem: filemap_invalidate_unlock(inode->i_mapping); } if (!error) { if (inc_ivers) inode_inc_iversion(inode); setattr_copy(idmap, inode, attr); mark_inode_dirty(inode); } /* * If the call to ext4_truncate failed to get a transaction handle at * all, we need to clean up the in-core orphan list manually. */ if (orphan && inode->i_nlink) ext4_orphan_del(NULL, inode); if (!error && (ia_valid & ATTR_MODE)) rc = posix_acl_chmod(idmap, dentry, inode->i_mode); err_out: if (error) ext4_std_error(inode->i_sb, error); if (!error) error = rc; return error; } u32 ext4_dio_alignment(struct inode *inode) { if (fsverity_active(inode)) return 0; if (ext4_should_journal_data(inode)) return 0; if (ext4_has_inline_data(inode)) return 0; if (IS_ENCRYPTED(inode)) { if (!fscrypt_dio_supported(inode)) return 0; return i_blocksize(inode); } return 1; /* use the iomap defaults */ } int ext4_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct ext4_inode *raw_inode; struct ext4_inode_info *ei = EXT4_I(inode); unsigned int flags; if ((request_mask & STATX_BTIME) && EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) { stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = ei->i_crtime.tv_sec; stat->btime.tv_nsec = ei->i_crtime.tv_nsec; } /* * Return the DIO alignment restrictions if requested. We only return * this information when requested, since on encrypted files it might * take a fair bit of work to get if the file wasn't opened recently. 
*/ if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) { u32 dio_align = ext4_dio_alignment(inode); stat->result_mask |= STATX_DIOALIGN; if (dio_align == 1) { struct block_device *bdev = inode->i_sb->s_bdev; /* iomap defaults */ stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; stat->dio_offset_align = bdev_logical_block_size(bdev); } else { stat->dio_mem_align = dio_align; stat->dio_offset_align = dio_align; } } if ((request_mask & STATX_WRITE_ATOMIC) && S_ISREG(inode->i_mode)) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned int awu_min = 0, awu_max = 0; if (ext4_inode_can_atomic_write(inode)) { awu_min = sbi->s_awu_min; awu_max = sbi->s_awu_max; } generic_fill_statx_atomic_writes(stat, awu_min, awu_max, 0); } flags = ei->i_flags & EXT4_FL_USER_VISIBLE; if (flags & EXT4_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; if (flags & EXT4_COMPR_FL) stat->attributes |= STATX_ATTR_COMPRESSED; if (flags & EXT4_ENCRYPT_FL) stat->attributes |= STATX_ATTR_ENCRYPTED; if (flags & EXT4_IMMUTABLE_FL) stat->attributes |= STATX_ATTR_IMMUTABLE; if (flags & EXT4_NODUMP_FL) stat->attributes |= STATX_ATTR_NODUMP; if (flags & EXT4_VERITY_FL) stat->attributes |= STATX_ATTR_VERITY; stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_COMPRESSED | STATX_ATTR_ENCRYPTED | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP | STATX_ATTR_VERITY); generic_fillattr(idmap, request_mask, inode, stat); return 0; } int ext4_file_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); u64 delalloc_blocks; ext4_getattr(idmap, path, stat, request_mask, query_flags); /* * If there is inline data in the inode, the inode will normally not * have data blocks allocated (it may have an external xattr block). * Report at least one sector for such files, so tools like tar, rsync, * others don't incorrectly think the file is completely sparse. */ if (unlikely(ext4_has_inline_data(inode))) stat->blocks += (stat->size + 511) >> 9; /* * We can't update i_blocks if the block allocation is delayed; * otherwise, in the case of a system crash before the real block * allocation is done, we would have i_blocks inconsistent with * the on-disk file blocks. * We always keep i_blocks updated together with real * allocation. But to avoid confusing userspace, stat * will return the blocks that include the delayed allocation * blocks for this file. */ delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), EXT4_I(inode)->i_reserved_data_blocks); stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9); return 0; } static int ext4_index_trans_blocks(struct inode *inode, int lblocks, int pextents) { if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return ext4_ind_trans_blocks(inode, lblocks); return ext4_ext_index_trans_blocks(inode, pextents); } /* * Account for index blocks, block group bitmaps and block group * descriptor blocks when modifying data blocks and index blocks; in the * worst case, the index blocks are spread over different block groups. * * If data blocks are discontiguous, they may spread over * different block groups too. If they are contiguous, with flexbg, * they could still cross a block group boundary.
* * Also account for superblock, inode, quota and xattr blocks */ int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents) { ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); int gdpblocks; int idxblocks; int ret; /* * How many index and leaf blocks do we need to touch to map @lblocks * logical blocks to @pextents physical extents? */ idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); /* * Now let's see how many group bitmaps and group descriptors need * to be accounted for */ groups = idxblocks + pextents; gdpblocks = groups; if (groups > ngroups) groups = ngroups; if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; /* bitmaps and block group descriptor blocks */ ret = idxblocks + groups + gdpblocks; /* Blocks for super block, inode, quota and xattr blocks */ ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); return ret; } /* * Calculate the journal credits for modifying the number of blocks * in a single extent within one transaction. 'nrblocks' is used only * for non-extent inodes. For extent type inodes, 'nrblocks' can be * zero if the exact number of blocks is unknown. */ int ext4_chunk_trans_extent(struct inode *inode, int nrblocks) { int ret; ret = ext4_meta_trans_blocks(inode, nrblocks, 1); /* Account for data blocks for journalled mode */ if (ext4_should_journal_data(inode)) ret += nrblocks; return ret; } /* * Calculate the journal credits for a chunk of data modification. * * This is called from DIO, fallocate, or anything else that calls * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks. * * journal buffers for data blocks are not included here, as DIO * and fallocate do not need to journal data buffers. */ int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) { return ext4_meta_trans_blocks(inode, nrblocks, 1); } /* * The caller must have previously called ext4_reserve_inode_write(). * Given this, we know that the caller already has write access to iloc->bh. */ int ext4_mark_iloc_dirty(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) { int err = 0; err = ext4_emergency_state(inode->i_sb); if (unlikely(err)) { put_bh(iloc->bh); return err; } ext4_fc_track_inode(handle, inode); /* the do_update_inode consumes one bh->b_count */ get_bh(iloc->bh); /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ err = ext4_do_update_inode(handle, inode, iloc); put_bh(iloc->bh); return err; } /* * On success, we end up with an outstanding reference count against * iloc->bh. This _must_ be cleaned up later.
*/ int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) { int err; err = ext4_emergency_state(inode->i_sb); if (unlikely(err)) return err; err = ext4_get_inode_loc(inode, iloc); if (!err) { BUFFER_TRACE(iloc->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, iloc->bh, EXT4_JTR_NONE); if (err) { brelse(iloc->bh); iloc->bh = NULL; } ext4_fc_track_inode(handle, inode); } ext4_std_error(inode->i_sb, err); return err; } static int __ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, struct ext4_iloc *iloc, handle_t *handle, int *no_expand) { struct ext4_inode *raw_inode; struct ext4_xattr_ibody_header *header; unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); int error; /* this was checked at iget time, but double check for good measure */ if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) || (ei->i_extra_isize & 3)) { EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)", ei->i_extra_isize, EXT4_INODE_SIZE(inode->i_sb)); return -EFSCORRUPTED; } if ((new_extra_isize < ei->i_extra_isize) || (new_extra_isize < 4) || (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE)) return -EINVAL; /* Should never happen */ raw_inode = ext4_raw_inode(iloc); header = IHDR(inode, raw_inode); /* No extended attributes present */ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize, 0, new_extra_isize - EXT4_I(inode)->i_extra_isize); EXT4_I(inode)->i_extra_isize = new_extra_isize; return 0; } /* * We may need to allocate an external xattr block so we need quotas * initialized. Here we can be called with various locks held so we * cannot afford to initialize quotas ourselves. So just bail. */ if (dquot_initialize_needed(inode)) return -EAGAIN; /* try to expand with EAs present */ error = ext4_expand_extra_isize_ea(inode, new_extra_isize, raw_inode, handle); if (error) { /* * Inode size expansion failed; don't try again */ *no_expand = 1; } return error; } /* * Expand an inode by new_extra_isize bytes. * Returns 0 on success or negative error number on failure. */ static int ext4_try_to_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, struct ext4_iloc iloc, handle_t *handle) { int no_expand; int error; if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) return -EOVERFLOW; /* * In nojournal mode, we can immediately attempt to expand * the inode. When journaled, we first need to obtain extra * buffer credits since we may write into the EA block * with this same handle. If journal_extend fails, then it will * only result in a minor loss of functionality for that inode. * If this is felt to be critical, then e2fsck should be run to * force a large enough s_min_extra_isize.
*/ if (ext4_journal_extend(handle, EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0) != 0) return -ENOSPC; if (ext4_write_trylock_xattr(inode, &no_expand) == 0) return -EBUSY; error = __ext4_expand_extra_isize(inode, new_extra_isize, &iloc, handle, &no_expand); ext4_write_unlock_xattr(inode, &no_expand); return error; } int ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, struct ext4_iloc *iloc) { handle_t *handle; int no_expand; int error, rc; if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { brelse(iloc->bh); return -EOVERFLOW; } handle = ext4_journal_start(inode, EXT4_HT_INODE, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) { error = PTR_ERR(handle); brelse(iloc->bh); return error; } ext4_write_lock_xattr(inode, &no_expand); BUFFER_TRACE(iloc->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, inode->i_sb, iloc->bh, EXT4_JTR_NONE); if (error) { brelse(iloc->bh); goto out_unlock; } error = __ext4_expand_extra_isize(inode, new_extra_isize, iloc, handle, &no_expand); rc = ext4_mark_iloc_dirty(handle, inode, iloc); if (!error) error = rc; out_unlock: ext4_write_unlock_xattr(inode, &no_expand); ext4_journal_stop(handle); return error; } /* * What we do here is to mark the in-core inode as clean with respect to inode * dirtiness (it may still be data-dirty). * This means that the in-core inode may be reaped by prune_icache * without having to perform any I/O. This is a very good thing, * because *any* task may call prune_icache - even ones which * have a transaction open against a different journal. * * Is this cheating? Not really. Sure, we haven't written the * inode out, but prune_icache isn't a user-visible syncing function. * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) * we start and wait on commits. */ int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode, const char *func, unsigned int line) { struct ext4_iloc iloc; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int err; might_sleep(); trace_ext4_mark_inode_dirty(inode, _RET_IP_); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto out; if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize) ext4_try_to_expand_extra_isize(inode, sbi->s_want_extra_isize, iloc, handle); err = ext4_mark_iloc_dirty(handle, inode, &iloc); out: if (unlikely(err)) ext4_error_inode_err(inode, func, line, 0, err, "mark_inode_dirty error"); return err; } /* * ext4_dirty_inode() is called from __mark_inode_dirty() * * We're really interested in the case where a file is being extended. * i_size has been changed by generic_commit_write() and we thus need * to include the updated inode in the current transaction. * * Also, dquot_alloc_block() will always dirty the inode when blocks * are allocated to the file. * * If the inode is marked synchronous, we don't honour that here - doing * so would cause a commit on atime updates, which we don't bother doing. * We handle synchronous inodes at the highest possible level. */ void ext4_dirty_inode(struct inode *inode, int flags) { handle_t *handle; handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) return; ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); } int ext4_change_inode_journal_flag(struct inode *inode, int val) { journal_t *journal; handle_t *handle; int err; int alloc_ctx; /* * We have to be very careful here: changing a data block's * journaling status dynamically is dangerous. 
If we write a * data block to the journal, change the status and then delete * that block, we risk forgetting to revoke the old log record * from the journal and so a subsequent replay can corrupt data. * So, first we make sure that the journal is empty and that * nobody is changing anything. */ journal = EXT4_JOURNAL(inode); if (!journal) return 0; if (is_journal_aborted(journal)) return -EROFS; /* Wait for all existing dio workers */ inode_dio_wait(inode); /* * Before flushing the journal and switching inode's aops, we have * to flush all dirty data the inode has. There can be outstanding * delayed allocations, there can be unwritten extents created by * fallocate or buffered writes in dioread_nolock mode covered by * dirty data which can be converted only after flushing the dirty * data (and journalled aops don't know how to handle these cases). */ if (val) { filemap_invalidate_lock(inode->i_mapping); err = filemap_write_and_wait(inode->i_mapping); if (err < 0) { filemap_invalidate_unlock(inode->i_mapping); return err; } } alloc_ctx = ext4_writepages_down_write(inode->i_sb); jbd2_journal_lock_updates(journal); /* * OK, there are no updates running now, and all cached data is * synced to disk. We are now in a completely consistent state * which doesn't have anything in the journal, and we know that * no filesystem updates are running, so it is safe to modify * the inode's in-core data-journaling state flag now. */ if (val) ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); else { err = jbd2_journal_flush(journal, 0); if (err < 0) { jbd2_journal_unlock_updates(journal); ext4_writepages_up_write(inode->i_sb, alloc_ctx); return err; } ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); } ext4_set_aops(inode); jbd2_journal_unlock_updates(journal); ext4_writepages_up_write(inode->i_sb, alloc_ctx); if (val) filemap_invalidate_unlock(inode->i_mapping); /* Finally we can mark the inode as dirty. */ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) return PTR_ERR(handle); ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle); err = ext4_mark_inode_dirty(handle, inode); ext4_handle_sync(handle); ext4_journal_stop(handle); ext4_std_error(inode->i_sb, err); return err; } static int ext4_bh_unmapped(handle_t *handle, struct inode *inode, struct buffer_head *bh) { return !buffer_mapped(bh); } static int ext4_block_page_mkwrite(struct inode *inode, struct folio *folio, get_block_t get_block) { handle_t *handle; loff_t size; unsigned long len; int credits; int ret; credits = ext4_chunk_trans_extent(inode, ext4_journal_blocks_per_folio(inode)); handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, credits); if (IS_ERR(handle)) return PTR_ERR(handle); folio_lock(folio); size = i_size_read(inode); /* Page got truncated from under us? 
*/ if (folio->mapping != inode->i_mapping || folio_pos(folio) > size) { ret = -EFAULT; goto out_error; } len = folio_size(folio); if (folio_pos(folio) + len > size) len = size - folio_pos(folio); ret = ext4_block_write_begin(handle, folio, 0, len, get_block); if (ret) goto out_error; if (!ext4_should_journal_data(inode)) { block_commit_write(folio, 0, len); folio_mark_dirty(folio); } else { ret = ext4_journal_folio_buffers(handle, folio, len); if (ret) goto out_error; } ext4_journal_stop(handle); folio_wait_stable(folio); return ret; out_error: folio_unlock(folio); ext4_journal_stop(handle); return ret; } vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio = page_folio(vmf->page); loff_t size; unsigned long len; int err; vm_fault_t ret; struct file *file = vma->vm_file; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; get_block_t *get_block = ext4_get_block; int retries = 0; if (unlikely(IS_IMMUTABLE(inode))) return VM_FAULT_SIGBUS; sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); filemap_invalidate_lock_shared(mapping); err = ext4_convert_inline_data(inode); if (err) goto out_ret; /* * On data journalling we skip straight to the transaction handle: * there's no delalloc; page truncated will be checked later; the * early return w/ all buffers mapped (calculates size/len) can't * be used; and there's no dioread_nolock, so only ext4_get_block. */ if (ext4_should_journal_data(inode)) goto retry_alloc; /* Delalloc case is easy... */ if (test_opt(inode->i_sb, DELALLOC) && !ext4_nonda_switch(inode->i_sb)) { do { err = block_page_mkwrite(vma, vmf, ext4_da_get_block_prep); } while (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)); goto out_ret; } folio_lock(folio); size = i_size_read(inode); /* Page got truncated from under us? */ if (folio->mapping != mapping || folio_pos(folio) > size) { folio_unlock(folio); ret = VM_FAULT_NOPAGE; goto out; } len = folio_size(folio); if (folio_pos(folio) + len > size) len = size - folio_pos(folio); /* * Return if we have all the buffers mapped. This avoids the need to do * journal_start/journal_stop which can block and take a long time * * This cannot be done for data journalling, as we have to add the * inode to the transaction's list to writeprotect pages on commit. */ if (folio_buffers(folio)) { if (!ext4_walk_page_buffers(NULL, inode, folio_buffers(folio), 0, len, NULL, ext4_bh_unmapped)) { /* Wait so that we don't change page under IO */ folio_wait_stable(folio); ret = VM_FAULT_LOCKED; goto out; } } folio_unlock(folio); /* OK, we need to fill the hole... */ if (ext4_should_dioread_nolock(inode)) get_block = ext4_get_block_unwritten; retry_alloc: /* Start journal and allocate blocks */ err = ext4_block_page_mkwrite(inode, folio, get_block); if (err == -EAGAIN || (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))) goto retry_alloc; out_ret: ret = vmf_fs_error(err); out: filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(inode->i_sb); return ret; }
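/*
 * Illustrative aside, not part of the filesystem source above: the
 * STATX_DIOALIGN values filled in by ext4_getattr()/ext4_dio_alignment()
 * can be queried from userspace roughly as below. A minimal sketch,
 * assuming glibc >= 2.28 for the statx() wrapper and uapi headers new
 * enough to define STATX_DIOALIGN (Linux v6.1+); error handling is
 * abbreviated.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	struct statx stx;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	/* Request only the direct I/O alignment information. */
	if (statx(AT_FDCWD, argv[1], 0, STATX_DIOALIGN, &stx) != 0) {
		perror("statx");
		return 1;
	}
	/* Per the statx(2) docs, zero alignments mean DIO is unsupported. */
	if (stx.stx_mask & STATX_DIOALIGN)
		printf("dio_mem_align=%u dio_offset_align=%u\n",
		       stx.stx_dio_mem_align, stx.stx_dio_offset_align);
	else
		printf("kernel did not report STATX_DIOALIGN\n");
	return 0;
}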
// SPDX-License-Identifier: GPL-2.0-only /* * drivers/net/veth.c * * Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc * * Author: Pavel Emelianov <xemul@openvz.org> * Ethtool interface from: Eric W.
Biederman <ebiederm@xmission.com> * */ #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/ethtool.h> #include <linux/etherdevice.h> #include <linux/u64_stats_sync.h> #include <net/rtnetlink.h> #include <net/dst.h> #include <net/netdev_lock.h> #include <net/xfrm.h> #include <net/xdp.h> #include <linux/veth.h> #include <linux/module.h> #include <linux/bpf.h> #include <linux/filter.h> #include <linux/ptr_ring.h> #include <linux/bpf_trace.h> #include <linux/net_tstamp.h> #include <linux/skbuff_ref.h> #include <net/page_pool/helpers.h> #define DRV_NAME "veth" #define DRV_VERSION "1.0" #define VETH_XDP_FLAG BIT(0) #define VETH_RING_SIZE 256 #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) #define VETH_XDP_TX_BULK_SIZE 16 #define VETH_XDP_BATCH 16 struct veth_stats { u64 rx_drops; /* xdp */ u64 xdp_packets; u64 xdp_bytes; u64 xdp_redirect; u64 xdp_drops; u64 xdp_tx; u64 xdp_tx_err; u64 peer_tq_xdp_xmit; u64 peer_tq_xdp_xmit_err; }; struct veth_rq_stats { struct veth_stats vs; struct u64_stats_sync syncp; }; struct veth_rq { struct napi_struct xdp_napi; struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */ struct net_device *dev; struct bpf_prog __rcu *xdp_prog; struct xdp_mem_info xdp_mem; struct veth_rq_stats stats; bool rx_notify_masked; struct ptr_ring xdp_ring; struct xdp_rxq_info xdp_rxq; struct page_pool *page_pool; }; struct veth_priv { struct net_device __rcu *peer; atomic64_t dropped; struct bpf_prog *_xdp_prog; struct veth_rq *rq; unsigned int requested_headroom; }; struct veth_xdp_tx_bq { struct xdp_frame *q[VETH_XDP_TX_BULK_SIZE]; unsigned int count; }; /* * ethtool interface */ struct veth_q_stat_desc { char desc[ETH_GSTRING_LEN]; size_t offset; }; #define VETH_RQ_STAT(m) offsetof(struct veth_stats, m) static const struct veth_q_stat_desc veth_rq_stats_desc[] = { { "xdp_packets", VETH_RQ_STAT(xdp_packets) }, { "xdp_bytes", VETH_RQ_STAT(xdp_bytes) }, { "drops", VETH_RQ_STAT(rx_drops) }, { "xdp_redirect", VETH_RQ_STAT(xdp_redirect) }, { "xdp_drops", VETH_RQ_STAT(xdp_drops) }, { "xdp_tx", VETH_RQ_STAT(xdp_tx) }, { "xdp_tx_errors", VETH_RQ_STAT(xdp_tx_err) }, }; #define VETH_RQ_STATS_LEN ARRAY_SIZE(veth_rq_stats_desc) static const struct veth_q_stat_desc veth_tq_stats_desc[] = { { "xdp_xmit", VETH_RQ_STAT(peer_tq_xdp_xmit) }, { "xdp_xmit_errors", VETH_RQ_STAT(peer_tq_xdp_xmit_err) }, }; #define VETH_TQ_STATS_LEN ARRAY_SIZE(veth_tq_stats_desc) static struct { const char string[ETH_GSTRING_LEN]; } ethtool_stats_keys[] = { { "peer_ifindex" }, }; struct veth_xdp_buff { struct xdp_buff xdp; struct sk_buff *skb; }; static int veth_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { cmd->base.speed = SPEED_10000; cmd->base.duplex = DUPLEX_FULL; cmd->base.port = PORT_TP; cmd->base.autoneg = AUTONEG_DISABLE; return 0; } static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { strscpy(info->driver, DRV_NAME, sizeof(info->driver)); strscpy(info->version, DRV_VERSION, sizeof(info->version)); } static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf) { u8 *p = buf; int i, j; switch(stringset) { case ETH_SS_STATS: memcpy(p, &ethtool_stats_keys, sizeof(ethtool_stats_keys)); p += sizeof(ethtool_stats_keys); for (i = 0; i < dev->real_num_rx_queues; i++) for (j = 0; j < VETH_RQ_STATS_LEN; j++)
ethtool_sprintf(&p, "tx_queue_%u_%.18s", i, veth_tq_stats_desc[j].desc); page_pool_ethtool_stats_get_strings(p); break; } } static int veth_get_sset_count(struct net_device *dev, int sset) { switch (sset) { case ETH_SS_STATS: return ARRAY_SIZE(ethtool_stats_keys) + VETH_RQ_STATS_LEN * dev->real_num_rx_queues + VETH_TQ_STATS_LEN * dev->real_num_tx_queues + page_pool_ethtool_stats_get_count(); default: return -EOPNOTSUPP; } } static void veth_get_page_pool_stats(struct net_device *dev, u64 *data) { #ifdef CONFIG_PAGE_POOL_STATS struct veth_priv *priv = netdev_priv(dev); struct page_pool_stats pp_stats = {}; int i; for (i = 0; i < dev->real_num_rx_queues; i++) { if (!priv->rq[i].page_pool) continue; page_pool_get_stats(priv->rq[i].page_pool, &pp_stats); } page_pool_ethtool_stats_get(data, &pp_stats); #endif /* CONFIG_PAGE_POOL_STATS */ } static void veth_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { struct veth_priv *rcv_priv, *priv = netdev_priv(dev); struct net_device *peer = rtnl_dereference(priv->peer); int i, j, idx, pp_idx; data[0] = peer ? peer->ifindex : 0; idx = 1; for (i = 0; i < dev->real_num_rx_queues; i++) { const struct veth_rq_stats *rq_stats = &priv->rq[i].stats; const void *stats_base = (void *)&rq_stats->vs; unsigned int start; size_t offset; do { start = u64_stats_fetch_begin(&rq_stats->syncp); for (j = 0; j < VETH_RQ_STATS_LEN; j++) { offset = veth_rq_stats_desc[j].offset; data[idx + j] = *(u64 *)(stats_base + offset); } } while (u64_stats_fetch_retry(&rq_stats->syncp, start)); idx += VETH_RQ_STATS_LEN; } pp_idx = idx; if (!peer) goto page_pool_stats; rcv_priv = netdev_priv(peer); for (i = 0; i < peer->real_num_rx_queues; i++) { const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats; const void *base = (void *)&rq_stats->vs; unsigned int start, tx_idx = idx; size_t offset; tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN; do { start = u64_stats_fetch_begin(&rq_stats->syncp); for (j = 0; j < VETH_TQ_STATS_LEN; j++) { offset = veth_tq_stats_desc[j].offset; data[tx_idx + j] += *(u64 *)(base + offset); } } while (u64_stats_fetch_retry(&rq_stats->syncp, start)); } pp_idx = idx + dev->real_num_tx_queues * VETH_TQ_STATS_LEN; page_pool_stats: veth_get_page_pool_stats(dev, &data[pp_idx]); } static void veth_get_channels(struct net_device *dev, struct ethtool_channels *channels) { channels->tx_count = dev->real_num_tx_queues; channels->rx_count = dev->real_num_rx_queues; channels->max_tx = dev->num_tx_queues; channels->max_rx = dev->num_rx_queues; } static int veth_set_channels(struct net_device *dev, struct ethtool_channels *ch); static const struct ethtool_ops veth_ethtool_ops = { .get_drvinfo = veth_get_drvinfo, .get_link = ethtool_op_get_link, .get_strings = veth_get_strings, .get_sset_count = veth_get_sset_count, .get_ethtool_stats = veth_get_ethtool_stats, .get_link_ksettings = veth_get_link_ksettings, .get_ts_info = ethtool_op_get_ts_info, .get_channels = veth_get_channels, .set_channels = veth_set_channels, }; /* general routines */ static bool veth_is_xdp_frame(void *ptr) { return (unsigned long)ptr & VETH_XDP_FLAG; } static struct xdp_frame *veth_ptr_to_xdp(void *ptr) { return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); } static void *veth_xdp_to_ptr(struct xdp_frame *xdp) { return (void *)((unsigned long)xdp | VETH_XDP_FLAG); } static void veth_ptr_free(void *ptr) { if (veth_is_xdp_frame(ptr)) xdp_return_frame(veth_ptr_to_xdp(ptr)); else kfree_skb(ptr); } static void __veth_xdp_flush(struct veth_rq *rq) { /* Write 
ptr_ring before reading rx_notify_masked */ smp_mb(); if (!READ_ONCE(rq->rx_notify_masked) && napi_schedule_prep(&rq->xdp_napi)) { WRITE_ONCE(rq->rx_notify_masked, true); __napi_schedule(&rq->xdp_napi); } } static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) { if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) return NETDEV_TX_BUSY; /* signal qdisc layer */ return NET_RX_SUCCESS; /* same as NETDEV_TX_OK */ } static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, struct veth_rq *rq, bool xdp) { return __dev_forward_skb(dev, skb) ?: xdp ? veth_xdp_rx(rq, skb) : __netif_rx(skb); } /* return true if the specified skb has chances of GRO aggregation * Don't strive for accuracy, but try to avoid GRO overhead in the most * common scenarios. * When XDP is enabled, all traffic is considered eligible, as the xmit * device has TSO off. * When TSO is enabled on the xmit device, we are likely interested only * in UDP aggregation, explicitly check for that if the skb is suspected * - the sock_wfree destructor is used by UDP, ICMP and XDP sockets - * to belong to locally generated UDP traffic. */ static bool veth_skb_is_eligible_for_gro(const struct net_device *dev, const struct net_device *rcv, const struct sk_buff *skb) { return !(dev->features & NETIF_F_ALL_TSO) || (skb->destructor == sock_wfree && rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD)); } static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) { struct veth_priv *rcv_priv, *priv = netdev_priv(dev); struct veth_rq *rq = NULL; struct netdev_queue *txq; struct net_device *rcv; int length = skb->len; bool use_napi = false; int ret, rxq; rcu_read_lock(); rcv = rcu_dereference(priv->peer); if (unlikely(!rcv) || !pskb_may_pull(skb, ETH_HLEN)) { kfree_skb(skb); goto drop; } rcv_priv = netdev_priv(rcv); rxq = skb_get_queue_mapping(skb); if (rxq < rcv->real_num_rx_queues) { rq = &rcv_priv->rq[rxq]; /* The napi pointer is available when an XDP program is * attached or when GRO is enabled * Don't bother with napi/GRO if the skb can't be aggregated */ use_napi = rcu_access_pointer(rq->napi) && veth_skb_is_eligible_for_gro(dev, rcv, skb); } skb_tx_timestamp(skb); ret = veth_forward_skb(rcv, skb, rq, use_napi); switch (ret) { case NET_RX_SUCCESS: /* same as NETDEV_TX_OK */ if (!use_napi) dev_sw_netstats_tx_add(dev, 1, length); else __veth_xdp_flush(rq); break; case NETDEV_TX_BUSY: /* If a qdisc is attached to our virtual device, returning * NETDEV_TX_BUSY is allowed. */ txq = netdev_get_tx_queue(dev, rxq); if (qdisc_txq_has_no_queue(txq)) { dev_kfree_skb_any(skb); goto drop; } /* Restore Eth hdr pulled by dev_forward_skb/eth_type_trans */ __skb_push(skb, ETH_HLEN); /* Depend on prior success packets started NAPI consumer via * __veth_xdp_flush(). Cancel TXQ stop if consumer stopped, * paired with empty check in veth_poll(). 
*/ netif_tx_stop_queue(txq); smp_mb__after_atomic(); if (unlikely(__ptr_ring_empty(&rq->xdp_ring))) netif_tx_wake_queue(txq); break; case NET_RX_DROP: /* same as NET_XMIT_DROP */ drop: atomic64_inc(&priv->dropped); ret = NET_XMIT_DROP; break; default: net_crit_ratelimited("%s(%s): Invalid return code(%d)", __func__, dev->name, ret); } rcu_read_unlock(); return ret; } static void veth_stats_rx(struct veth_stats *result, struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); int i; result->peer_tq_xdp_xmit_err = 0; result->xdp_packets = 0; result->xdp_tx_err = 0; result->xdp_bytes = 0; result->rx_drops = 0; for (i = 0; i < dev->num_rx_queues; i++) { u64 packets, bytes, drops, xdp_tx_err, peer_tq_xdp_xmit_err; struct veth_rq_stats *stats = &priv->rq[i].stats; unsigned int start; do { start = u64_stats_fetch_begin(&stats->syncp); peer_tq_xdp_xmit_err = stats->vs.peer_tq_xdp_xmit_err; xdp_tx_err = stats->vs.xdp_tx_err; packets = stats->vs.xdp_packets; bytes = stats->vs.xdp_bytes; drops = stats->vs.rx_drops; } while (u64_stats_fetch_retry(&stats->syncp, start)); result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err; result->xdp_tx_err += xdp_tx_err; result->xdp_packets += packets; result->xdp_bytes += bytes; result->rx_drops += drops; } } static void veth_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *tot) { struct veth_priv *priv = netdev_priv(dev); struct net_device *peer; struct veth_stats rx; tot->tx_dropped = atomic64_read(&priv->dropped); dev_fetch_sw_netstats(tot, dev->tstats); veth_stats_rx(&rx, dev); tot->tx_dropped += rx.xdp_tx_err; tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err; tot->rx_bytes += rx.xdp_bytes; tot->rx_packets += rx.xdp_packets; rcu_read_lock(); peer = rcu_dereference(priv->peer); if (peer) { struct rtnl_link_stats64 tot_peer = {}; dev_fetch_sw_netstats(&tot_peer, peer->tstats); tot->rx_bytes += tot_peer.tx_bytes; tot->rx_packets += tot_peer.tx_packets; veth_stats_rx(&rx, peer); tot->tx_dropped += rx.peer_tq_xdp_xmit_err; tot->rx_dropped += rx.xdp_tx_err; tot->tx_bytes += rx.xdp_bytes; tot->tx_packets += rx.xdp_packets; } rcu_read_unlock(); } /* fake multicast ability */ static void veth_set_multicast_list(struct net_device *dev) { } static int veth_select_rxq(struct net_device *dev) { return smp_processor_id() % dev->real_num_rx_queues; } static struct net_device *veth_peer_dev(struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); /* Callers must be under RCU read side. */ return rcu_dereference(priv->peer); } static int veth_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags, bool ndo_xmit) { struct veth_priv *rcv_priv, *priv = netdev_priv(dev); int i, ret = -ENXIO, nxmit = 0; struct net_device *rcv; unsigned int max_len; struct veth_rq *rq; if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) return -EINVAL; rcu_read_lock(); rcv = rcu_dereference(priv->peer); if (unlikely(!rcv)) goto out; rcv_priv = netdev_priv(rcv); rq = &rcv_priv->rq[veth_select_rxq(rcv)]; /* The napi pointer is set if NAPI is enabled, which ensures that * xdp_ring is initialized on receive side and the peer device is up. 
*/ if (!rcu_access_pointer(rq->napi)) goto out; max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN; spin_lock(&rq->xdp_ring.producer_lock); for (i = 0; i < n; i++) { struct xdp_frame *frame = frames[i]; void *ptr = veth_xdp_to_ptr(frame); if (unlikely(xdp_get_frame_len(frame) > max_len || __ptr_ring_produce(&rq->xdp_ring, ptr))) break; nxmit++; } spin_unlock(&rq->xdp_ring.producer_lock); if (flags & XDP_XMIT_FLUSH) __veth_xdp_flush(rq); ret = nxmit; if (ndo_xmit) { u64_stats_update_begin(&rq->stats.syncp); rq->stats.vs.peer_tq_xdp_xmit += nxmit; rq->stats.vs.peer_tq_xdp_xmit_err += n - nxmit; u64_stats_update_end(&rq->stats.syncp); } out: rcu_read_unlock(); return ret; } static int veth_ndo_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags) { int err; err = veth_xdp_xmit(dev, n, frames, flags, true); if (err < 0) { struct veth_priv *priv = netdev_priv(dev); atomic64_add(n, &priv->dropped); } return err; } static void veth_xdp_flush_bq(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) { int sent, i, err = 0, drops; sent = veth_xdp_xmit(rq->dev, bq->count, bq->q, 0, false); if (sent < 0) { err = sent; sent = 0; } for (i = sent; unlikely(i < bq->count); i++) xdp_return_frame(bq->q[i]); drops = bq->count - sent; trace_xdp_bulk_tx(rq->dev, sent, drops, err); u64_stats_update_begin(&rq->stats.syncp); rq->stats.vs.xdp_tx += sent; rq->stats.vs.xdp_tx_err += drops; u64_stats_update_end(&rq->stats.syncp); bq->count = 0; } static void veth_xdp_flush(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) { struct veth_priv *rcv_priv, *priv = netdev_priv(rq->dev); struct net_device *rcv; struct veth_rq *rcv_rq; rcu_read_lock(); veth_xdp_flush_bq(rq, bq); rcv = rcu_dereference(priv->peer); if (unlikely(!rcv)) goto out; rcv_priv = netdev_priv(rcv); rcv_rq = &rcv_priv->rq[veth_select_rxq(rcv)]; /* xdp_ring is initialized on receive side? 
*/ if (unlikely(!rcu_access_pointer(rcv_rq->xdp_prog))) goto out; __veth_xdp_flush(rcv_rq); out: rcu_read_unlock(); } static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp, struct veth_xdp_tx_bq *bq) { struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp); if (unlikely(!frame)) return -EOVERFLOW; if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE)) veth_xdp_flush_bq(rq, bq); bq->q[bq->count++] = frame; return 0; } static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq, struct xdp_frame *frame, struct veth_xdp_tx_bq *bq, struct veth_stats *stats) { struct xdp_frame orig_frame; struct bpf_prog *xdp_prog; rcu_read_lock(); xdp_prog = rcu_dereference(rq->xdp_prog); if (likely(xdp_prog)) { struct veth_xdp_buff vxbuf; struct xdp_buff *xdp = &vxbuf.xdp; u32 act; xdp_convert_frame_to_buff(frame, xdp); xdp->rxq = &rq->xdp_rxq; vxbuf.skb = NULL; act = bpf_prog_run_xdp(xdp_prog, xdp); switch (act) { case XDP_PASS: if (xdp_update_frame_from_buff(xdp, frame)) goto err_xdp; break; case XDP_TX: orig_frame = *frame; xdp->rxq->mem.type = frame->mem_type; if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) { trace_xdp_exception(rq->dev, xdp_prog, act); frame = &orig_frame; stats->rx_drops++; goto err_xdp; } stats->xdp_tx++; rcu_read_unlock(); goto xdp_xmit; case XDP_REDIRECT: orig_frame = *frame; xdp->rxq->mem.type = frame->mem_type; if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) { frame = &orig_frame; stats->rx_drops++; goto err_xdp; } stats->xdp_redirect++; rcu_read_unlock(); goto xdp_xmit; default: bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(rq->dev, xdp_prog, act); fallthrough; case XDP_DROP: stats->xdp_drops++; goto err_xdp; } } rcu_read_unlock(); return frame; err_xdp: rcu_read_unlock(); xdp_return_frame(frame); xdp_xmit: return NULL; } /* frames array contains VETH_XDP_BATCH at most */ static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames, int n_xdpf, struct veth_xdp_tx_bq *bq, struct veth_stats *stats) { void *skbs[VETH_XDP_BATCH]; int i; if (unlikely(!napi_skb_cache_get_bulk(skbs, n_xdpf))) { for (i = 0; i < n_xdpf; i++) xdp_return_frame(frames[i]); stats->rx_drops += n_xdpf; return; } for (i = 0; i < n_xdpf; i++) { struct sk_buff *skb = skbs[i]; skb = __xdp_build_skb_from_frame(frames[i], skb, rq->dev); if (!skb) { xdp_return_frame(frames[i]); stats->rx_drops++; continue; } napi_gro_receive(&rq->xdp_napi, skb); } } static void veth_xdp_get(struct xdp_buff *xdp) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); int i; get_page(virt_to_page(xdp->data)); if (likely(!xdp_buff_has_frags(xdp))) return; for (i = 0; i < sinfo->nr_frags; i++) __skb_frag_ref(&sinfo->frags[i]); } static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq, struct xdp_buff *xdp, struct sk_buff **pskb) { struct sk_buff *skb = *pskb; u32 frame_sz; if (skb_shared(skb) || skb_head_is_locked(skb) || skb_shinfo(skb)->nr_frags || skb_headroom(skb) < XDP_PACKET_HEADROOM) { if (skb_pp_cow_data(rq->page_pool, pskb, XDP_PACKET_HEADROOM)) goto drop; skb = *pskb; } /* SKB "head" area always have tailroom for skb_shared_info */ frame_sz = skb_end_pointer(skb) - skb->head; frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq); xdp_prepare_buff(xdp, skb->head, skb_headroom(skb), skb_headlen(skb), true); if (skb_is_nonlinear(skb)) { skb_shinfo(skb)->xdp_frags_size = skb->data_len; xdp_buff_set_frags_flag(xdp); } else { xdp_buff_clear_frags_flag(xdp); } *pskb = skb; return 0; drop: consume_skb(skb); 
*pskb = NULL; return -ENOMEM; } static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct sk_buff *skb, struct veth_xdp_tx_bq *bq, struct veth_stats *stats) { void *orig_data, *orig_data_end; struct bpf_prog *xdp_prog; struct veth_xdp_buff vxbuf; struct xdp_buff *xdp = &vxbuf.xdp; u32 act, metalen; int off; skb_prepare_for_gro(skb); rcu_read_lock(); xdp_prog = rcu_dereference(rq->xdp_prog); if (unlikely(!xdp_prog)) { rcu_read_unlock(); goto out; } __skb_push(skb, skb->data - skb_mac_header(skb)); if (veth_convert_skb_to_xdp_buff(rq, xdp, &skb)) goto drop; vxbuf.skb = skb; orig_data = xdp->data; orig_data_end = xdp->data_end; act = bpf_prog_run_xdp(xdp_prog, xdp); switch (act) { case XDP_PASS: break; case XDP_TX: veth_xdp_get(xdp); consume_skb(skb); xdp->rxq->mem = rq->xdp_mem; if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) { trace_xdp_exception(rq->dev, xdp_prog, act); stats->rx_drops++; goto err_xdp; } stats->xdp_tx++; rcu_read_unlock(); goto xdp_xmit; case XDP_REDIRECT: veth_xdp_get(xdp); consume_skb(skb); xdp->rxq->mem = rq->xdp_mem; if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) { stats->rx_drops++; goto err_xdp; } stats->xdp_redirect++; rcu_read_unlock(); goto xdp_xmit; default: bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(rq->dev, xdp_prog, act); fallthrough; case XDP_DROP: stats->xdp_drops++; goto xdp_drop; } rcu_read_unlock(); /* check if bpf_xdp_adjust_head was used */ off = orig_data - xdp->data; if (off > 0) __skb_push(skb, off); else if (off < 0) __skb_pull(skb, -off); skb_reset_mac_header(skb); /* check if bpf_xdp_adjust_tail was used */ off = xdp->data_end - orig_data_end; if (off != 0) __skb_put(skb, off); /* positive on grow, negative on shrink */ /* XDP frag metadata (e.g. nr_frags) is updated by eBPF helpers * (e.g. bpf_xdp_adjust_tail), so we need to update data_len here. */ if (xdp_buff_has_frags(xdp)) skb->data_len = skb_shinfo(skb)->xdp_frags_size; else skb->data_len = 0; skb->protocol = eth_type_trans(skb, rq->dev); metalen = xdp->data - xdp->data_meta; if (metalen) skb_metadata_set(skb, metalen); out: return skb; drop: stats->rx_drops++; xdp_drop: rcu_read_unlock(); kfree_skb(skb); return NULL; err_xdp: rcu_read_unlock(); xdp_return_buff(xdp); xdp_xmit: return NULL; } static int veth_xdp_rcv(struct veth_rq *rq, int budget, struct veth_xdp_tx_bq *bq, struct veth_stats *stats) { struct veth_priv *priv = netdev_priv(rq->dev); int queue_idx = rq->xdp_rxq.queue_index; struct netdev_queue *peer_txq; struct net_device *peer_dev; int i, done = 0, n_xdpf = 0; void *xdpf[VETH_XDP_BATCH]; /* NAPI functions as RCU section */ peer_dev = rcu_dereference_check(priv->peer, rcu_read_lock_bh_held()); peer_txq = peer_dev ?
netdev_get_tx_queue(peer_dev, queue_idx) : NULL; for (i = 0; i < budget; i++) { void *ptr = __ptr_ring_consume(&rq->xdp_ring); if (!ptr) break; if (veth_is_xdp_frame(ptr)) { /* ndo_xdp_xmit */ struct xdp_frame *frame = veth_ptr_to_xdp(ptr); stats->xdp_bytes += xdp_get_frame_len(frame); frame = veth_xdp_rcv_one(rq, frame, bq, stats); if (frame) { /* XDP_PASS */ xdpf[n_xdpf++] = frame; if (n_xdpf == VETH_XDP_BATCH) { veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats); n_xdpf = 0; } } } else { /* ndo_start_xmit */ struct sk_buff *skb = ptr; stats->xdp_bytes += skb->len; skb = veth_xdp_rcv_skb(rq, skb, bq, stats); if (skb) { if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC)) netif_receive_skb(skb); else napi_gro_receive(&rq->xdp_napi, skb); } } done++; } if (n_xdpf) veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats); u64_stats_update_begin(&rq->stats.syncp); rq->stats.vs.xdp_redirect += stats->xdp_redirect; rq->stats.vs.xdp_bytes += stats->xdp_bytes; rq->stats.vs.xdp_drops += stats->xdp_drops; rq->stats.vs.rx_drops += stats->rx_drops; rq->stats.vs.xdp_packets += done; u64_stats_update_end(&rq->stats.syncp); if (peer_txq && unlikely(netif_tx_queue_stopped(peer_txq))) netif_tx_wake_queue(peer_txq); return done; } static int veth_poll(struct napi_struct *napi, int budget) { struct veth_rq *rq = container_of(napi, struct veth_rq, xdp_napi); struct veth_stats stats = {}; struct veth_xdp_tx_bq bq; int done; bq.count = 0; xdp_set_return_frame_no_direct(); done = veth_xdp_rcv(rq, budget, &bq, &stats); if (stats.xdp_redirect > 0) xdp_do_flush(); if (done < budget && napi_complete_done(napi, done)) { /* Write rx_notify_masked before reading ptr_ring */ smp_store_mb(rq->rx_notify_masked, false); if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { if (napi_schedule_prep(&rq->xdp_napi)) { WRITE_ONCE(rq->rx_notify_masked, true); __napi_schedule(&rq->xdp_napi); } } } if (stats.xdp_tx > 0) veth_xdp_flush(rq, &bq); xdp_clear_return_frame_no_direct(); return done; } static int veth_create_page_pool(struct veth_rq *rq) { struct page_pool_params pp_params = { .order = 0, .pool_size = VETH_RING_SIZE, .nid = NUMA_NO_NODE, .dev = &rq->dev->dev, }; rq->page_pool = page_pool_create(&pp_params); if (IS_ERR(rq->page_pool)) { int err = PTR_ERR(rq->page_pool); rq->page_pool = NULL; return err; } return 0; } static int __veth_napi_enable_range(struct net_device *dev, int start, int end) { struct veth_priv *priv = netdev_priv(dev); int err, i; for (i = start; i < end; i++) { err = veth_create_page_pool(&priv->rq[i]); if (err) goto err_page_pool; } for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL); if (err) goto err_xdp_ring; } for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; napi_enable(&rq->xdp_napi); rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); } return 0; err_xdp_ring: for (i--; i >= start; i--) ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); i = end; err_page_pool: for (i--; i >= start; i--) { page_pool_destroy(priv->rq[i].page_pool); priv->rq[i].page_pool = NULL; } return err; } static int __veth_napi_enable(struct net_device *dev) { return __veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); } static void veth_napi_del_range(struct net_device *dev, int start, int end) { struct veth_priv *priv = netdev_priv(dev); int i; for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; rcu_assign_pointer(priv->rq[i].napi, NULL); napi_disable(&rq->xdp_napi); __netif_napi_del(&rq->xdp_napi); } 
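/* an RCU grace period must pass after clearing the rq->napi pointers above, so no concurrent reader can still use them once the rings below are destroyed */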
synchronize_net(); for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; rq->rx_notify_masked = false; ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free); } for (i = start; i < end; i++) { page_pool_destroy(priv->rq[i].page_pool); priv->rq[i].page_pool = NULL; } } static void veth_napi_del(struct net_device *dev) { veth_napi_del_range(dev, 0, dev->real_num_rx_queues); } static bool veth_gro_requested(const struct net_device *dev) { return !!(dev->wanted_features & NETIF_F_GRO); } static int veth_enable_xdp_range(struct net_device *dev, int start, int end, bool napi_already_on) { struct veth_priv *priv = netdev_priv(dev); int err, i; for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; if (!napi_already_on) netif_napi_add(dev, &rq->xdp_napi, veth_poll); err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id); if (err < 0) goto err_rxq_reg; err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, MEM_TYPE_PAGE_SHARED, NULL); if (err < 0) goto err_reg_mem; /* Save original mem info as it can be overwritten */ rq->xdp_mem = rq->xdp_rxq.mem; } return 0; err_reg_mem: xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); err_rxq_reg: for (i--; i >= start; i--) { struct veth_rq *rq = &priv->rq[i]; xdp_rxq_info_unreg(&rq->xdp_rxq); if (!napi_already_on) netif_napi_del(&rq->xdp_napi); } return err; } static void veth_disable_xdp_range(struct net_device *dev, int start, int end, bool delete_napi) { struct veth_priv *priv = netdev_priv(dev); int i; for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; rq->xdp_rxq.mem = rq->xdp_mem; xdp_rxq_info_unreg(&rq->xdp_rxq); if (delete_napi) netif_napi_del(&rq->xdp_napi); } } static int veth_enable_xdp(struct net_device *dev) { bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP); struct veth_priv *priv = netdev_priv(dev); int err, i; if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) { err = veth_enable_xdp_range(dev, 0, dev->real_num_rx_queues, napi_already_on); if (err) return err; if (!napi_already_on) { err = __veth_napi_enable(dev); if (err) { veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true); return err; } } } for (i = 0; i < dev->real_num_rx_queues; i++) { rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog); rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); } return 0; } static void veth_disable_xdp(struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); int i; for (i = 0; i < dev->real_num_rx_queues; i++) rcu_assign_pointer(priv->rq[i].xdp_prog, NULL); if (!netif_running(dev) || !veth_gro_requested(dev)) veth_napi_del(dev); veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, false); } static int veth_napi_enable_range(struct net_device *dev, int start, int end) { struct veth_priv *priv = netdev_priv(dev); int err, i; for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; netif_napi_add(dev, &rq->xdp_napi, veth_poll); } err = __veth_napi_enable_range(dev, start, end); if (err) { for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; netif_napi_del(&rq->xdp_napi); } return err; } return err; } static int veth_napi_enable(struct net_device *dev) { return veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); } static void veth_disable_range_safe(struct net_device *dev, int start, int end) { struct veth_priv *priv = netdev_priv(dev); if (start >= end) return; if (priv->_xdp_prog) { veth_napi_del_range(dev, start, end); veth_disable_xdp_range(dev, start, end, false); } else if (veth_gro_requested(dev)) { veth_napi_del_range(dev, start, end); } } 
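/* counterpart of veth_disable_range_safe(): bring up XDP and/or NAPI state for the newly added queues in [start, end) */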
static int veth_enable_range_safe(struct net_device *dev, int start, int end) { struct veth_priv *priv = netdev_priv(dev); int err; if (start >= end) return 0; if (priv->_xdp_prog) { /* these channels are freshly initialized, napi is not on there even * when GRO is requested */ err = veth_enable_xdp_range(dev, start, end, false); if (err) return err; err = __veth_napi_enable_range(dev, start, end); if (err) { /* on error always delete the newly added napis */ veth_disable_xdp_range(dev, start, end, true); return err; } } else if (veth_gro_requested(dev)) { return veth_napi_enable_range(dev, start, end); } return 0; } static void veth_set_xdp_features(struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); struct net_device *peer; peer = rtnl_dereference(priv->peer); if (peer && peer->real_num_tx_queues <= dev->real_num_rx_queues) { struct veth_priv *priv_peer = netdev_priv(peer); xdp_features_t val = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | NETDEV_XDP_ACT_RX_SG; if (priv_peer->_xdp_prog || veth_gro_requested(peer)) val |= NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_NDO_XMIT_SG; xdp_set_features_flag(dev, val); } else { xdp_clear_features_flag(dev); } } static int veth_set_channels(struct net_device *dev, struct ethtool_channels *ch) { struct veth_priv *priv = netdev_priv(dev); unsigned int old_rx_count, new_rx_count; struct veth_priv *peer_priv; struct net_device *peer; int err; /* sanity check. Upper bounds are already enforced by the caller */ if (!ch->rx_count || !ch->tx_count) return -EINVAL; /* avoid breaking XDP, if that is enabled */ peer = rtnl_dereference(priv->peer); peer_priv = peer ? netdev_priv(peer) : NULL; if (priv->_xdp_prog && peer && ch->rx_count < peer->real_num_tx_queues) return -EINVAL; if (peer && peer_priv && peer_priv->_xdp_prog && ch->tx_count > peer->real_num_rx_queues) return -EINVAL; old_rx_count = dev->real_num_rx_queues; new_rx_count = ch->rx_count; if (netif_running(dev)) { /* turn device off */ netif_carrier_off(dev); if (peer) netif_carrier_off(peer); /* try to allocate new resources, as needed */ err = veth_enable_range_safe(dev, old_rx_count, new_rx_count); if (err) goto out; } err = netif_set_real_num_rx_queues(dev, ch->rx_count); if (err) goto revert; err = netif_set_real_num_tx_queues(dev, ch->tx_count); if (err) { int err2 = netif_set_real_num_rx_queues(dev, old_rx_count); /* this error condition could happen only if rx and tx change * in opposite directions (e.g.
tx nr rises, rx nr decreases) * and we can't do anything to fully restore the original * status */ if (err2) pr_warn("Can't restore rx queues config %d -> %d %d\n", new_rx_count, old_rx_count, err2); else goto revert; } out: if (netif_running(dev)) { /* note that we need to swap the arguments WRT the enable part * to identify the range we have to disable */ veth_disable_range_safe(dev, new_rx_count, old_rx_count); netif_carrier_on(dev); if (peer) netif_carrier_on(peer); } /* update XDP supported features */ veth_set_xdp_features(dev); if (peer) veth_set_xdp_features(peer); return err; revert: new_rx_count = old_rx_count; old_rx_count = ch->rx_count; goto out; } static int veth_open(struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); struct net_device *peer = rtnl_dereference(priv->peer); int err; if (!peer) return -ENOTCONN; if (priv->_xdp_prog) { err = veth_enable_xdp(dev); if (err) return err; } else if (veth_gro_requested(dev)) { err = veth_napi_enable(dev); if (err) return err; } if (peer->flags & IFF_UP) { netif_carrier_on(dev); netif_carrier_on(peer); } veth_set_xdp_features(dev); return 0; } static int veth_close(struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); struct net_device *peer = rtnl_dereference(priv->peer); netif_carrier_off(dev); if (peer) netif_carrier_off(peer); if (priv->_xdp_prog) veth_disable_xdp(dev); else if (veth_gro_requested(dev)) veth_napi_del(dev); return 0; } static int is_valid_veth_mtu(int mtu) { return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU; } static int veth_alloc_queues(struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); int i; priv->rq = kvcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!priv->rq) return -ENOMEM; for (i = 0; i < dev->num_rx_queues; i++) { priv->rq[i].dev = dev; u64_stats_init(&priv->rq[i].stats.syncp); } return 0; } static void veth_free_queues(struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); kvfree(priv->rq); } static int veth_dev_init(struct net_device *dev) { netdev_lockdep_set_classes(dev); return veth_alloc_queues(dev); } static void veth_dev_free(struct net_device *dev) { veth_free_queues(dev); } #ifdef CONFIG_NET_POLL_CONTROLLER static void veth_poll_controller(struct net_device *dev) { /* veth only receives frames when its peer sends one * Since it has nothing to do with disabling irqs, we are guaranteed * never to have pending data when we poll for it so * there is nothing to do here. * * We need this though so netpoll recognizes us as an interface that * supports polling, which enables bridge devices in virt setups to * still use netconsole */ } #endif /* CONFIG_NET_POLL_CONTROLLER */ static int veth_get_iflink(const struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); struct net_device *peer; int iflink; rcu_read_lock(); peer = rcu_dereference(priv->peer); iflink = peer ?
READ_ONCE(peer->ifindex) : 0; rcu_read_unlock(); return iflink; } static netdev_features_t veth_fix_features(struct net_device *dev, netdev_features_t features) { struct veth_priv *priv = netdev_priv(dev); struct net_device *peer; peer = rtnl_dereference(priv->peer); if (peer) { struct veth_priv *peer_priv = netdev_priv(peer); if (peer_priv->_xdp_prog) features &= ~NETIF_F_GSO_SOFTWARE; } return features; } static int veth_set_features(struct net_device *dev, netdev_features_t features) { netdev_features_t changed = features ^ dev->features; struct veth_priv *priv = netdev_priv(dev); struct net_device *peer; int err; if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog) return 0; peer = rtnl_dereference(priv->peer); if (features & NETIF_F_GRO) { err = veth_napi_enable(dev); if (err) return err; if (peer) xdp_features_set_redirect_target(peer, true); } else { if (peer) xdp_features_clear_redirect_target(peer); veth_napi_del(dev); } return 0; } static void veth_set_rx_headroom(struct net_device *dev, int new_hr) { struct veth_priv *peer_priv, *priv = netdev_priv(dev); struct net_device *peer; if (new_hr < 0) new_hr = 0; rcu_read_lock(); peer = rcu_dereference(priv->peer); if (unlikely(!peer)) goto out; peer_priv = netdev_priv(peer); priv->requested_headroom = new_hr; new_hr = max(priv->requested_headroom, peer_priv->requested_headroom); dev->needed_headroom = new_hr; peer->needed_headroom = new_hr; out: rcu_read_unlock(); } static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, struct netlink_ext_ack *extack) { struct veth_priv *priv = netdev_priv(dev); struct bpf_prog *old_prog; struct net_device *peer; unsigned int max_mtu; int err; old_prog = priv->_xdp_prog; priv->_xdp_prog = prog; peer = rtnl_dereference(priv->peer); if (prog) { if (!peer) { NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached"); err = -ENOTCONN; goto err; } max_mtu = SKB_WITH_OVERHEAD(PAGE_SIZE - VETH_XDP_HEADROOM) - peer->hard_header_len; /* Allow increasing the max_mtu if the program supports * XDP fragments. 
*/ if (prog->aux->xdp_has_frags) max_mtu += PAGE_SIZE * MAX_SKB_FRAGS; if (peer->mtu > max_mtu) { NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP"); err = -ERANGE; goto err; } if (dev->real_num_rx_queues < peer->real_num_tx_queues) { NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues"); err = -ENOSPC; goto err; } if (dev->flags & IFF_UP) { err = veth_enable_xdp(dev); if (err) { NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed"); goto err; } } if (!old_prog) { peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; peer->max_mtu = max_mtu; } xdp_features_set_redirect_target(peer, true); } if (old_prog) { if (!prog) { if (peer && !veth_gro_requested(dev)) xdp_features_clear_redirect_target(peer); if (dev->flags & IFF_UP) veth_disable_xdp(dev); if (peer) { peer->hw_features |= NETIF_F_GSO_SOFTWARE; peer->max_mtu = ETH_MAX_MTU; } } bpf_prog_put(old_prog); } if ((!!old_prog ^ !!prog) && peer) netdev_update_features(peer); return 0; err: priv->_xdp_prog = old_prog; return err; } static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: return veth_xdp_set(dev, xdp->prog, xdp->extack); default: return -EINVAL; } } static int veth_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) { struct veth_xdp_buff *_ctx = (void *)ctx; if (!_ctx->skb) return -ENODATA; *timestamp = skb_hwtstamps(_ctx->skb)->hwtstamp; return 0; } static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash, enum xdp_rss_hash_type *rss_type) { struct veth_xdp_buff *_ctx = (void *)ctx; struct sk_buff *skb = _ctx->skb; if (!skb) return -ENODATA; *hash = skb_get_hash(skb); *rss_type = skb->l4_hash ? XDP_RSS_TYPE_L4_ANY : XDP_RSS_TYPE_NONE; return 0; } static int veth_xdp_rx_vlan_tag(const struct xdp_md *ctx, __be16 *vlan_proto, u16 *vlan_tci) { const struct veth_xdp_buff *_ctx = (void *)ctx; const struct sk_buff *skb = _ctx->skb; int err; if (!skb) return -ENODATA; err = __vlan_hwaccel_get_tag(skb, vlan_tci); if (err) return err; *vlan_proto = skb->vlan_proto; return err; } static const struct net_device_ops veth_netdev_ops = { .ndo_init = veth_dev_init, .ndo_open = veth_open, .ndo_stop = veth_close, .ndo_start_xmit = veth_xmit, .ndo_get_stats64 = veth_get_stats64, .ndo_set_rx_mode = veth_set_multicast_list, .ndo_set_mac_address = eth_mac_addr, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = veth_poll_controller, #endif .ndo_get_iflink = veth_get_iflink, .ndo_fix_features = veth_fix_features, .ndo_set_features = veth_set_features, .ndo_features_check = passthru_features_check, .ndo_set_rx_headroom = veth_set_rx_headroom, .ndo_bpf = veth_xdp, .ndo_xdp_xmit = veth_ndo_xdp_xmit, .ndo_get_peer_dev = veth_peer_dev, }; static const struct xdp_metadata_ops veth_xdp_metadata_ops = { .xmo_rx_timestamp = veth_xdp_rx_timestamp, .xmo_rx_hash = veth_xdp_rx_hash, .xmo_rx_vlan_tag = veth_xdp_rx_vlan_tag, }; #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \ NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \ NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \ NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX ) static void veth_setup(struct net_device *dev) { ether_setup(dev); dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; dev->priv_flags |= IFF_NO_QUEUE; dev->priv_flags |= IFF_PHONY_HEADROOM; dev->priv_flags |= IFF_DISABLE_NETPOLL; dev->lltx = true; dev->netdev_ops = &veth_netdev_ops; dev->xdp_metadata_ops = 
&veth_xdp_metadata_ops; dev->ethtool_ops = &veth_ethtool_ops; dev->features |= VETH_FEATURES; dev->vlan_features = dev->features & ~(NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_STAG_RX); dev->needs_free_netdev = true; dev->priv_destructor = veth_dev_free; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->max_mtu = ETH_MAX_MTU; dev->hw_features = VETH_FEATURES; dev->hw_enc_features = VETH_FEATURES; dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; netif_set_tso_max_size(dev, GSO_MAX_SIZE); } /* * netlink interface */ static int veth_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) return -EADDRNOTAVAIL; } if (tb[IFLA_MTU]) { if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU]))) return -EINVAL; } return 0; } static struct rtnl_link_ops veth_link_ops; static void veth_disable_gro(struct net_device *dev) { dev->features &= ~NETIF_F_GRO; dev->wanted_features &= ~NETIF_F_GRO; netdev_update_features(dev); } static int veth_init_queues(struct net_device *dev, struct nlattr *tb[]) { int err; if (!tb[IFLA_NUM_TX_QUEUES] && dev->num_tx_queues > 1) { err = netif_set_real_num_tx_queues(dev, 1); if (err) return err; } if (!tb[IFLA_NUM_RX_QUEUES] && dev->num_rx_queues > 1) { err = netif_set_real_num_rx_queues(dev, 1); if (err) return err; } return 0; } static int veth_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net *peer_net = rtnl_newlink_peer_net(params); struct nlattr **data = params->data; struct nlattr **tb = params->tb; int err; struct net_device *peer; struct veth_priv *priv; char ifname[IFNAMSIZ]; struct nlattr *peer_tb[IFLA_MAX + 1], **tbp; unsigned char name_assign_type; struct ifinfomsg *ifmp; /* * create and register peer first */ if (data && data[VETH_INFO_PEER]) { struct nlattr *nla_peer = data[VETH_INFO_PEER]; ifmp = nla_data(nla_peer); rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack); tbp = peer_tb; } else { ifmp = NULL; tbp = tb; } if (ifmp && tbp[IFLA_IFNAME]) { nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); name_assign_type = NET_NAME_USER; } else { snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); name_assign_type = NET_NAME_ENUM; } peer = rtnl_create_link(peer_net, ifname, name_assign_type, &veth_link_ops, tbp, extack); if (IS_ERR(peer)) return PTR_ERR(peer); if (!ifmp || !tbp[IFLA_ADDRESS]) eth_hw_addr_random(peer); if (ifmp && (dev->ifindex != 0)) peer->ifindex = ifmp->ifi_index; netif_inherit_tso_max(peer, dev); err = register_netdevice(peer); if (err < 0) goto err_register_peer; /* keep GRO disabled by default to be consistent with the established * veth behavior */ veth_disable_gro(peer); netif_carrier_off(peer); err = rtnl_configure_link(peer, ifmp, 0, NULL); if (err < 0) goto err_configure_peer; /* * register dev last * * note that, since we've registered a new device, the dev's name * should be re-allocated */ if (tb[IFLA_ADDRESS] == NULL) eth_hw_addr_random(dev); if (tb[IFLA_IFNAME]) nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); else snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); err = register_netdevice(dev); if (err < 0) goto err_register_dev; netif_carrier_off(dev); /* * tie the devices together */ priv = netdev_priv(dev); rcu_assign_pointer(priv->peer, peer); err = veth_init_queues(dev, tb); if (err) goto err_queues; priv = netdev_priv(peer); rcu_assign_pointer(priv->peer,
dev); err = veth_init_queues(peer, tb); if (err) goto err_queues; veth_disable_gro(dev); /* update XDP supported features */ veth_set_xdp_features(dev); veth_set_xdp_features(peer); return 0; err_queues: unregister_netdevice(dev); err_register_dev: /* nothing to do */ err_configure_peer: unregister_netdevice(peer); return err; err_register_peer: free_netdev(peer); return err; } static void veth_dellink(struct net_device *dev, struct list_head *head) { struct veth_priv *priv; struct net_device *peer; priv = netdev_priv(dev); peer = rtnl_dereference(priv->peer); /* Note : dellink() is called from default_device_exit_batch(), * before a rcu_synchronize() point. The devices are guaranteed * not to be freed before one RCU grace period. */ RCU_INIT_POINTER(priv->peer, NULL); unregister_netdevice_queue(dev, head); if (peer) { priv = netdev_priv(peer); RCU_INIT_POINTER(priv->peer, NULL); unregister_netdevice_queue(peer, head); } } static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = { [VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) }, }; static struct net *veth_get_link_net(const struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); struct net_device *peer = rtnl_dereference(priv->peer); return peer ? dev_net(peer) : dev_net(dev); } static unsigned int veth_get_num_queues(void) { /* enforce the same queue limit as rtnl_create_link */ int queues = num_possible_cpus(); if (queues > 4096) queues = 4096; return queues; } static struct rtnl_link_ops veth_link_ops = { .kind = DRV_NAME, .priv_size = sizeof(struct veth_priv), .setup = veth_setup, .validate = veth_validate, .newlink = veth_newlink, .dellink = veth_dellink, .policy = veth_policy, .peer_type = VETH_INFO_PEER, .maxtype = VETH_INFO_MAX, .get_link_net = veth_get_link_net, .get_num_tx_queues = veth_get_num_queues, .get_num_rx_queues = veth_get_num_queues, }; /* * init/fini */ static __init int veth_init(void) { return rtnl_link_register(&veth_link_ops); } static __exit void veth_exit(void) { rtnl_link_unregister(&veth_link_ops); } module_init(veth_init); module_exit(veth_exit); MODULE_DESCRIPTION("Virtual Ethernet Tunnel"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_RTNL_LINK(DRV_NAME);
// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * This file implements the various access functions for the * PROC file system. This is very similar to the IPv4 version, * except it reports the sockets in the INET6 address family. * * Authors: David S. Miller (davem@caip.rutgers.edu) * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> */ #include <linux/socket.h> #include <linux/net.h> #include <linux/ipv6.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/stddef.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/sock.h> #include <net/tcp.h> #include <net/udp.h> #include <net/transp_v6.h> #include <net/ipv6.h> #define MAX4(a, b, c, d) \ MAX_T(u32, MAX_T(u32, a, b), MAX_T(u32, c, d)) #define SNMP_MIB_MAX MAX4(UDP_MIB_MAX, TCP_MIB_MAX, \ IPSTATS_MIB_MAX, ICMP_MIB_MAX) static int sockstat6_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; seq_printf(seq, "TCP6: inuse %d\n", sock_prot_inuse_get(net, &tcpv6_prot)); seq_printf(seq, "UDP6: inuse %d\n", sock_prot_inuse_get(net, &udpv6_prot)); seq_printf(seq, "UDPLITE6: inuse %d\n", sock_prot_inuse_get(net, &udplitev6_prot)); seq_printf(seq, "RAW6: inuse %d\n", sock_prot_inuse_get(net, &rawv6_prot)); seq_printf(seq, "FRAG6: inuse %u memory %lu\n", atomic_read(&net->ipv6.fqdir->rhashtable.nelems), frag_mem_limit(net->ipv6.fqdir)); return 0; } static const struct snmp_mib snmp6_ipstats_list[] = { /* ipv6 mib according to RFC 2465 */ SNMP_MIB_ITEM("Ip6InReceives", IPSTATS_MIB_INPKTS), SNMP_MIB_ITEM("Ip6InHdrErrors", IPSTATS_MIB_INHDRERRORS), SNMP_MIB_ITEM("Ip6InTooBigErrors", IPSTATS_MIB_INTOOBIGERRORS), SNMP_MIB_ITEM("Ip6InNoRoutes", IPSTATS_MIB_INNOROUTES), SNMP_MIB_ITEM("Ip6InAddrErrors", IPSTATS_MIB_INADDRERRORS), SNMP_MIB_ITEM("Ip6InUnknownProtos", IPSTATS_MIB_INUNKNOWNPROTOS), SNMP_MIB_ITEM("Ip6InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS), SNMP_MIB_ITEM("Ip6InDiscards", IPSTATS_MIB_INDISCARDS), SNMP_MIB_ITEM("Ip6InDelivers", IPSTATS_MIB_INDELIVERS), SNMP_MIB_ITEM("Ip6OutForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS),
SNMP_MIB_ITEM("Ip6OutRequests", IPSTATS_MIB_OUTREQUESTS), SNMP_MIB_ITEM("Ip6OutDiscards", IPSTATS_MIB_OUTDISCARDS), SNMP_MIB_ITEM("Ip6OutNoRoutes", IPSTATS_MIB_OUTNOROUTES), SNMP_MIB_ITEM("Ip6ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT), SNMP_MIB_ITEM("Ip6ReasmReqds", IPSTATS_MIB_REASMREQDS), SNMP_MIB_ITEM("Ip6ReasmOKs", IPSTATS_MIB_REASMOKS), SNMP_MIB_ITEM("Ip6ReasmFails", IPSTATS_MIB_REASMFAILS), SNMP_MIB_ITEM("Ip6FragOKs", IPSTATS_MIB_FRAGOKS), SNMP_MIB_ITEM("Ip6FragFails", IPSTATS_MIB_FRAGFAILS), SNMP_MIB_ITEM("Ip6FragCreates", IPSTATS_MIB_FRAGCREATES), SNMP_MIB_ITEM("Ip6InMcastPkts", IPSTATS_MIB_INMCASTPKTS), SNMP_MIB_ITEM("Ip6OutMcastPkts", IPSTATS_MIB_OUTMCASTPKTS), SNMP_MIB_ITEM("Ip6InOctets", IPSTATS_MIB_INOCTETS), SNMP_MIB_ITEM("Ip6OutOctets", IPSTATS_MIB_OUTOCTETS), SNMP_MIB_ITEM("Ip6InMcastOctets", IPSTATS_MIB_INMCASTOCTETS), SNMP_MIB_ITEM("Ip6OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS), SNMP_MIB_ITEM("Ip6InBcastOctets", IPSTATS_MIB_INBCASTOCTETS), SNMP_MIB_ITEM("Ip6OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS), /* IPSTATS_MIB_CSUMERRORS is not relevant in IPv6 (no checksum) */ SNMP_MIB_ITEM("Ip6InNoECTPkts", IPSTATS_MIB_NOECTPKTS), SNMP_MIB_ITEM("Ip6InECT1Pkts", IPSTATS_MIB_ECT1PKTS), SNMP_MIB_ITEM("Ip6InECT0Pkts", IPSTATS_MIB_ECT0PKTS), SNMP_MIB_ITEM("Ip6InCEPkts", IPSTATS_MIB_CEPKTS), SNMP_MIB_ITEM("Ip6OutTransmits", IPSTATS_MIB_OUTPKTS), SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp6_icmp6_list[] = { /* icmpv6 mib according to RFC 2466 */ SNMP_MIB_ITEM("Icmp6InMsgs", ICMP6_MIB_INMSGS), SNMP_MIB_ITEM("Icmp6InErrors", ICMP6_MIB_INERRORS), SNMP_MIB_ITEM("Icmp6OutMsgs", ICMP6_MIB_OUTMSGS), SNMP_MIB_ITEM("Icmp6OutErrors", ICMP6_MIB_OUTERRORS), SNMP_MIB_ITEM("Icmp6InCsumErrors", ICMP6_MIB_CSUMERRORS), SNMP_MIB_ITEM("Icmp6OutRateLimitHost", ICMP6_MIB_RATELIMITHOST), SNMP_MIB_SENTINEL }; /* RFC 4293 v6 ICMPMsgStatsTable; named items for RFC 2466 compatibility */ static const char *const icmp6type2name[256] = { [ICMPV6_DEST_UNREACH] = "DestUnreachs", [ICMPV6_PKT_TOOBIG] = "PktTooBigs", [ICMPV6_TIME_EXCEED] = "TimeExcds", [ICMPV6_PARAMPROB] = "ParmProblems", [ICMPV6_ECHO_REQUEST] = "Echos", [ICMPV6_ECHO_REPLY] = "EchoReplies", [ICMPV6_MGM_QUERY] = "GroupMembQueries", [ICMPV6_MGM_REPORT] = "GroupMembResponses", [ICMPV6_MGM_REDUCTION] = "GroupMembReductions", [ICMPV6_MLD2_REPORT] = "MLDv2Reports", [NDISC_ROUTER_ADVERTISEMENT] = "RouterAdvertisements", [NDISC_ROUTER_SOLICITATION] = "RouterSolicits", [NDISC_NEIGHBOUR_ADVERTISEMENT] = "NeighborAdvertisements", [NDISC_NEIGHBOUR_SOLICITATION] = "NeighborSolicits", [NDISC_REDIRECT] = "Redirects", }; static const struct snmp_mib snmp6_udp6_list[] = { SNMP_MIB_ITEM("Udp6InDatagrams", UDP_MIB_INDATAGRAMS), SNMP_MIB_ITEM("Udp6NoPorts", UDP_MIB_NOPORTS), SNMP_MIB_ITEM("Udp6InErrors", UDP_MIB_INERRORS), SNMP_MIB_ITEM("Udp6OutDatagrams", UDP_MIB_OUTDATAGRAMS), SNMP_MIB_ITEM("Udp6RcvbufErrors", UDP_MIB_RCVBUFERRORS), SNMP_MIB_ITEM("Udp6SndbufErrors", UDP_MIB_SNDBUFERRORS), SNMP_MIB_ITEM("Udp6InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_ITEM("Udp6IgnoredMulti", UDP_MIB_IGNOREDMULTI), SNMP_MIB_ITEM("Udp6MemErrors", UDP_MIB_MEMERRORS), SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp6_udplite6_list[] = { SNMP_MIB_ITEM("UdpLite6InDatagrams", UDP_MIB_INDATAGRAMS), SNMP_MIB_ITEM("UdpLite6NoPorts", UDP_MIB_NOPORTS), SNMP_MIB_ITEM("UdpLite6InErrors", UDP_MIB_INERRORS), SNMP_MIB_ITEM("UdpLite6OutDatagrams", UDP_MIB_OUTDATAGRAMS), SNMP_MIB_ITEM("UdpLite6RcvbufErrors", UDP_MIB_RCVBUFERRORS), SNMP_MIB_ITEM("UdpLite6SndbufErrors", 
UDP_MIB_SNDBUFERRORS), SNMP_MIB_ITEM("UdpLite6InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_ITEM("UdpLite6MemErrors", UDP_MIB_MEMERRORS), SNMP_MIB_SENTINEL }; static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib) { char name[32]; int i; /* print by name -- deprecated items */ for (i = 0; i < ICMP6MSG_MIB_MAX; i++) { int icmptype; const char *p; icmptype = i & 0xff; p = icmp6type2name[icmptype]; if (!p) /* don't print un-named types here */ continue; snprintf(name, sizeof(name), "Icmp6%s%s", i & 0x100 ? "Out" : "In", p); seq_printf(seq, "%-32s\t%lu\n", name, atomic_long_read(smib + i)); } /* print by number (nonzero only) - ICMPMsgStat format */ for (i = 0; i < ICMP6MSG_MIB_MAX; i++) { unsigned long val; val = atomic_long_read(smib + i); if (!val) continue; snprintf(name, sizeof(name), "Icmp6%sType%u", i & 0x100 ? "Out" : "In", i & 0xff); seq_printf(seq, "%-32s\t%lu\n", name, val); } } /* can be called either with percpu mib (pcpumib != NULL), * or shared one (smib != NULL) */ static void snmp6_seq_show_item(struct seq_file *seq, void __percpu *pcpumib, atomic_long_t *smib, const struct snmp_mib *itemlist) { unsigned long buff[SNMP_MIB_MAX]; int i; if (pcpumib) { memset(buff, 0, sizeof(unsigned long) * SNMP_MIB_MAX); snmp_get_cpu_field_batch(buff, itemlist, pcpumib); for (i = 0; itemlist[i].name; i++) seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, buff[i]); } else { for (i = 0; itemlist[i].name; i++) seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, atomic_long_read(smib + itemlist[i].entry)); } } static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu *mib, const struct snmp_mib *itemlist, size_t syncpoff) { u64 buff64[SNMP_MIB_MAX]; int i; memset(buff64, 0, sizeof(u64) * SNMP_MIB_MAX); snmp_get_cpu_field64_batch(buff64, itemlist, mib, syncpoff); for (i = 0; itemlist[i].name; i++) seq_printf(seq, "%-32s\t%llu\n", itemlist[i].name, buff64[i]); } static int snmp6_seq_show(struct seq_file *seq, void *v) { struct net *net = (struct net *)seq->private; snmp6_seq_show_item64(seq, net->mib.ipv6_statistics, snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp)); snmp6_seq_show_item(seq, net->mib.icmpv6_statistics, NULL, snmp6_icmp6_list); snmp6_seq_show_icmpv6msg(seq, net->mib.icmpv6msg_statistics->mibs); snmp6_seq_show_item(seq, net->mib.udp_stats_in6, NULL, snmp6_udp6_list); snmp6_seq_show_item(seq, net->mib.udplite_stats_in6, NULL, snmp6_udplite6_list); return 0; } static int snmp6_dev_seq_show(struct seq_file *seq, void *v) { struct inet6_dev *idev = (struct inet6_dev *)seq->private; seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex); snmp6_seq_show_item64(seq, idev->stats.ipv6, snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp)); snmp6_seq_show_item(seq, NULL, idev->stats.icmpv6dev->mibs, snmp6_icmp6_list); snmp6_seq_show_icmpv6msg(seq, idev->stats.icmpv6msgdev->mibs); return 0; } int snmp6_register_dev(struct inet6_dev *idev) { struct proc_dir_entry *p; struct net *net; if (!idev || !idev->dev) return -EINVAL; net = dev_net(idev->dev); if (!net->mib.proc_net_devsnmp6) return -ENOENT; p = proc_create_single_data(idev->dev->name, 0444, net->mib.proc_net_devsnmp6, snmp6_dev_seq_show, idev); if (!p) return -ENOMEM; idev->stats.proc_dir_entry = p; return 0; } int snmp6_unregister_dev(struct inet6_dev *idev) { struct net *net = dev_net(idev->dev); if (!net->mib.proc_net_devsnmp6) return -ENOENT; if (!idev->stats.proc_dir_entry) return -EINVAL; proc_remove(idev->stats.proc_dir_entry); idev->stats.proc_dir_entry = NULL; return 0; } static 
int __net_init ipv6_proc_init_net(struct net *net) { if (!proc_create_net_single("sockstat6", 0444, net->proc_net, sockstat6_seq_show, NULL)) return -ENOMEM; if (!proc_create_net_single("snmp6", 0444, net->proc_net, snmp6_seq_show, NULL)) goto proc_snmp6_fail; net->mib.proc_net_devsnmp6 = proc_mkdir("dev_snmp6", net->proc_net); if (!net->mib.proc_net_devsnmp6) goto proc_dev_snmp6_fail; return 0; proc_dev_snmp6_fail: remove_proc_entry("snmp6", net->proc_net); proc_snmp6_fail: remove_proc_entry("sockstat6", net->proc_net); return -ENOMEM; } static void __net_exit ipv6_proc_exit_net(struct net *net) { remove_proc_entry("sockstat6", net->proc_net); remove_proc_entry("dev_snmp6", net->proc_net); remove_proc_entry("snmp6", net->proc_net); } static struct pernet_operations ipv6_proc_ops = { .init = ipv6_proc_init_net, .exit = ipv6_proc_exit_net, }; int __init ipv6_misc_proc_init(void) { return register_pernet_subsys(&ipv6_proc_ops); } void ipv6_misc_proc_exit(void) { unregister_pernet_subsys(&ipv6_proc_ops); }
// SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/export.h> /* * fs on-disk file type to dirent file type conversion */ static const unsigned char fs_dtype_by_ftype[FT_MAX] = { [FT_UNKNOWN] = DT_UNKNOWN, [FT_REG_FILE] = DT_REG, [FT_DIR] = DT_DIR, [FT_CHRDEV] = DT_CHR, [FT_BLKDEV] = DT_BLK, [FT_FIFO] = DT_FIFO, [FT_SOCK] = DT_SOCK, [FT_SYMLINK] = DT_LNK }; /** * fs_ftype_to_dtype() - fs on-disk file type to dirent type. * @filetype: The on-disk file type to convert. * * This function converts the on-disk file type value (FT_*) to the directory * entry type (DT_*). * * Context: Any context. * Return: * * DT_UNKNOWN - Unknown type * * DT_FIFO - FIFO * * DT_CHR - Character device * * DT_DIR - Directory * * DT_BLK - Block device * * DT_REG - Regular file * * DT_LNK - Symbolic link * * DT_SOCK - Local-domain socket */ unsigned char fs_ftype_to_dtype(unsigned int filetype) { if (filetype >= FT_MAX) return DT_UNKNOWN; return fs_dtype_by_ftype[filetype]; } EXPORT_SYMBOL_GPL(fs_ftype_to_dtype); /* * dirent file type to fs on-disk file type conversion * Values not initialized explicitly are FT_UNKNOWN (0). */ static const unsigned char fs_ftype_by_dtype[DT_MAX] = { [DT_REG] = FT_REG_FILE, [DT_DIR] = FT_DIR, [DT_LNK] = FT_SYMLINK, [DT_CHR] = FT_CHRDEV, [DT_BLK] = FT_BLKDEV, [DT_FIFO] = FT_FIFO, [DT_SOCK] = FT_SOCK, }; /** * fs_umode_to_ftype() - file mode to on-disk file type. * @mode: The file mode to convert. * * This function converts the file mode value to the on-disk file type (FT_*). * * Context: Any context. * Return: * * FT_UNKNOWN - Unknown type * * FT_REG_FILE - Regular file * * FT_DIR - Directory * * FT_CHRDEV - Character device * * FT_BLKDEV - Block device * * FT_FIFO - FIFO * * FT_SOCK - Local-domain socket * * FT_SYMLINK - Symbolic link */ unsigned char fs_umode_to_ftype(umode_t mode) { return fs_ftype_by_dtype[S_DT(mode)]; } EXPORT_SYMBOL_GPL(fs_umode_to_ftype); /** * fs_umode_to_dtype() - file mode to dirent file type. * @mode: The file mode to convert. * * This function converts the file mode value to the directory * entry type (DT_*). * * Context: Any context. * Return: * * DT_UNKNOWN - Unknown type * * DT_FIFO - FIFO * * DT_CHR - Character device * * DT_DIR - Directory * * DT_BLK - Block device * * DT_REG - Regular file * * DT_LNK - Symbolic link * * DT_SOCK - Local-domain socket */ unsigned char fs_umode_to_dtype(umode_t mode) { return fs_ftype_to_dtype(fs_umode_to_ftype(mode)); } EXPORT_SYMBOL_GPL(fs_umode_to_dtype);
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
 * Author: Joerg Roedel <jroedel@suse.de>
 */

#define pr_fmt(fmt)	"iommu: " fmt

#include <linux/amba/bus.h>
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/bits.h>
#include <linux/bug.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/errno.h>
#include <linux/host1x_context_bus.h>
#include <linux/iommu.h>
#include <linux/iommufd.h>
#include <linux/idr.h>
#include <linux/err.h>
#include <linux/pci.h>
#include <linux/pci-ats.h>
#include <linux/bitops.h>
#include <linux/platform_device.h>
#include <linux/property.h>
#include <linux/fsl/mc.h>
#include <linux/module.h>
#include <linux/cc_platform.h>
#include <linux/cdx/cdx_bus.h>
#include <trace/events/iommu.h>
#include <linux/sched/mm.h>
#include <linux/msi.h>
#include <uapi/linux/iommufd.h>

#include "dma-iommu.h"
#include "iommu-priv.h"

static struct kset *iommu_group_kset;
static DEFINE_IDA(iommu_group_ida);
static DEFINE_IDA(iommu_global_pasid_ida);

static unsigned int iommu_def_domain_type __read_mostly;
static bool iommu_dma_strict __read_mostly = IS_ENABLED(CONFIG_IOMMU_DEFAULT_DMA_STRICT);
static u32 iommu_cmd_line __read_mostly;

/* Tags used with xa_tag_pointer() in group->pasid_array */
enum {
	IOMMU_PASID_ARRAY_DOMAIN = 0,
	IOMMU_PASID_ARRAY_HANDLE = 1
};

struct iommu_group {
	struct kobject kobj;
	struct kobject *devices_kobj;
	struct list_head devices;
	struct xarray pasid_array;
	struct mutex mutex;
	void *iommu_data;
	void (*iommu_data_release)(void *iommu_data);
	char *name;
	int id;
	struct iommu_domain *default_domain;
	struct iommu_domain *blocking_domain;
	struct iommu_domain *domain;
	struct list_head entry;
	unsigned int owner_cnt;
	void *owner;
};

struct group_device {
	struct list_head list;
	struct device *dev;
	char *name;
};

/* Iterate over each struct group_device in a struct iommu_group */
#define for_each_group_device(group, pos) \
	list_for_each_entry(pos, &(group)->devices, list)

struct iommu_group_attribute {
	struct attribute attr;
	ssize_t (*show)(struct iommu_group *group, char *buf);
	ssize_t (*store)(struct iommu_group *group,
			 const char *buf, size_t count);
};

static const char * const iommu_group_resv_type_string[] = {
	[IOMMU_RESV_DIRECT]			= "direct",
	[IOMMU_RESV_DIRECT_RELAXABLE]		= "direct-relaxable",
	[IOMMU_RESV_RESERVED]			= "reserved",
	[IOMMU_RESV_MSI]			= "msi",
	[IOMMU_RESV_SW_MSI]			= "msi",
};

#define IOMMU_CMD_LINE_DMA_API		BIT(0)
#define IOMMU_CMD_LINE_STRICT		BIT(1)

static int bus_iommu_probe(const struct bus_type *bus);
static int iommu_bus_notifier(struct notifier_block *nb,
			      unsigned long action, void *data);
static void iommu_release_device(struct device *dev);
static int __iommu_attach_device(struct iommu_domain *domain,
				 struct device *dev);
static int __iommu_attach_group(struct iommu_domain *domain,
				struct iommu_group *group);
static struct iommu_domain *__iommu_paging_domain_alloc_flags(struct device *dev, unsigned int type,
unsigned int flags); enum { IOMMU_SET_DOMAIN_MUST_SUCCEED = 1 << 0, }; static int __iommu_device_set_domain(struct iommu_group *group, struct device *dev, struct iommu_domain *new_domain, unsigned int flags); static int __iommu_group_set_domain_internal(struct iommu_group *group, struct iommu_domain *new_domain, unsigned int flags); static int __iommu_group_set_domain(struct iommu_group *group, struct iommu_domain *new_domain) { return __iommu_group_set_domain_internal(group, new_domain, 0); } static void __iommu_group_set_domain_nofail(struct iommu_group *group, struct iommu_domain *new_domain) { WARN_ON(__iommu_group_set_domain_internal( group, new_domain, IOMMU_SET_DOMAIN_MUST_SUCCEED)); } static int iommu_setup_default_domain(struct iommu_group *group, int target_type); static int iommu_create_device_direct_mappings(struct iommu_domain *domain, struct device *dev); static ssize_t iommu_group_store_type(struct iommu_group *group, const char *buf, size_t count); static struct group_device *iommu_group_alloc_device(struct iommu_group *group, struct device *dev); static void __iommu_group_free_device(struct iommu_group *group, struct group_device *grp_dev); static void iommu_domain_init(struct iommu_domain *domain, unsigned int type, const struct iommu_ops *ops); #define IOMMU_GROUP_ATTR(_name, _mode, _show, _store) \ struct iommu_group_attribute iommu_group_attr_##_name = \ __ATTR(_name, _mode, _show, _store) #define to_iommu_group_attr(_attr) \ container_of(_attr, struct iommu_group_attribute, attr) #define to_iommu_group(_kobj) \ container_of(_kobj, struct iommu_group, kobj) static LIST_HEAD(iommu_device_list); static DEFINE_SPINLOCK(iommu_device_lock); static const struct bus_type * const iommu_buses[] = { &platform_bus_type, #ifdef CONFIG_PCI &pci_bus_type, #endif #ifdef CONFIG_ARM_AMBA &amba_bustype, #endif #ifdef CONFIG_FSL_MC_BUS &fsl_mc_bus_type, #endif #ifdef CONFIG_TEGRA_HOST1X_CONTEXT_BUS &host1x_context_device_bus_type, #endif #ifdef CONFIG_CDX_BUS &cdx_bus_type, #endif }; /* * Use a function instead of an array here because the domain-type is a * bit-field, so an array would waste memory. */ static const char *iommu_domain_type_str(unsigned int t) { switch (t) { case IOMMU_DOMAIN_BLOCKED: return "Blocked"; case IOMMU_DOMAIN_IDENTITY: return "Passthrough"; case IOMMU_DOMAIN_UNMANAGED: return "Unmanaged"; case IOMMU_DOMAIN_DMA: case IOMMU_DOMAIN_DMA_FQ: return "Translated"; case IOMMU_DOMAIN_PLATFORM: return "Platform"; default: return "Unknown"; } } static int __init iommu_subsys_init(void) { struct notifier_block *nb; if (!(iommu_cmd_line & IOMMU_CMD_LINE_DMA_API)) { if (IS_ENABLED(CONFIG_IOMMU_DEFAULT_PASSTHROUGH)) iommu_set_default_passthrough(false); else iommu_set_default_translated(false); if (iommu_default_passthrough() && cc_platform_has(CC_ATTR_MEM_ENCRYPT)) { pr_info("Memory encryption detected - Disabling default IOMMU Passthrough\n"); iommu_set_default_translated(false); } } if (!iommu_default_passthrough() && !iommu_dma_strict) iommu_def_domain_type = IOMMU_DOMAIN_DMA_FQ; pr_info("Default domain type: %s%s\n", iommu_domain_type_str(iommu_def_domain_type), (iommu_cmd_line & IOMMU_CMD_LINE_DMA_API) ? " (set via kernel command line)" : ""); if (!iommu_default_passthrough()) pr_info("DMA domain TLB invalidation policy: %s mode%s\n", iommu_dma_strict ? "strict" : "lazy", (iommu_cmd_line & IOMMU_CMD_LINE_STRICT) ? 
" (set via kernel command line)" : ""); nb = kcalloc(ARRAY_SIZE(iommu_buses), sizeof(*nb), GFP_KERNEL); if (!nb) return -ENOMEM; for (int i = 0; i < ARRAY_SIZE(iommu_buses); i++) { nb[i].notifier_call = iommu_bus_notifier; bus_register_notifier(iommu_buses[i], &nb[i]); } return 0; } subsys_initcall(iommu_subsys_init); static int remove_iommu_group(struct device *dev, void *data) { if (dev->iommu && dev->iommu->iommu_dev == data) iommu_release_device(dev); return 0; } /** * iommu_device_register() - Register an IOMMU hardware instance * @iommu: IOMMU handle for the instance * @ops: IOMMU ops to associate with the instance * @hwdev: (optional) actual instance device, used for fwnode lookup * * Return: 0 on success, or an error. */ int iommu_device_register(struct iommu_device *iommu, const struct iommu_ops *ops, struct device *hwdev) { int err = 0; /* We need to be able to take module references appropriately */ if (WARN_ON(is_module_address((unsigned long)ops) && !ops->owner)) return -EINVAL; iommu->ops = ops; if (hwdev) iommu->fwnode = dev_fwnode(hwdev); spin_lock(&iommu_device_lock); list_add_tail(&iommu->list, &iommu_device_list); spin_unlock(&iommu_device_lock); for (int i = 0; i < ARRAY_SIZE(iommu_buses) && !err; i++) err = bus_iommu_probe(iommu_buses[i]); if (err) iommu_device_unregister(iommu); else WRITE_ONCE(iommu->ready, true); return err; } EXPORT_SYMBOL_GPL(iommu_device_register); void iommu_device_unregister(struct iommu_device *iommu) { for (int i = 0; i < ARRAY_SIZE(iommu_buses); i++) bus_for_each_dev(iommu_buses[i], NULL, iommu, remove_iommu_group); spin_lock(&iommu_device_lock); list_del(&iommu->list); spin_unlock(&iommu_device_lock); /* Pairs with the alloc in generic_single_device_group() */ iommu_group_put(iommu->singleton_group); iommu->singleton_group = NULL; } EXPORT_SYMBOL_GPL(iommu_device_unregister); #if IS_ENABLED(CONFIG_IOMMUFD_TEST) void iommu_device_unregister_bus(struct iommu_device *iommu, const struct bus_type *bus, struct notifier_block *nb) { bus_unregister_notifier(bus, nb); iommu_device_unregister(iommu); } EXPORT_SYMBOL_GPL(iommu_device_unregister_bus); /* * Register an iommu driver against a single bus. This is only used by iommufd * selftest to create a mock iommu driver. The caller must provide * some memory to hold a notifier_block. */ int iommu_device_register_bus(struct iommu_device *iommu, const struct iommu_ops *ops, const struct bus_type *bus, struct notifier_block *nb) { int err; iommu->ops = ops; nb->notifier_call = iommu_bus_notifier; err = bus_register_notifier(bus, nb); if (err) return err; spin_lock(&iommu_device_lock); list_add_tail(&iommu->list, &iommu_device_list); spin_unlock(&iommu_device_lock); err = bus_iommu_probe(bus); if (err) { iommu_device_unregister_bus(iommu, bus, nb); return err; } return 0; } EXPORT_SYMBOL_GPL(iommu_device_register_bus); #endif static struct dev_iommu *dev_iommu_get(struct device *dev) { struct dev_iommu *param = dev->iommu; lockdep_assert_held(&iommu_probe_device_lock); if (param) return param; param = kzalloc(sizeof(*param), GFP_KERNEL); if (!param) return NULL; mutex_init(¶m->lock); dev->iommu = param; return param; } void dev_iommu_free(struct device *dev) { struct dev_iommu *param = dev->iommu; dev->iommu = NULL; if (param->fwspec) { fwnode_handle_put(param->fwspec->iommu_fwnode); kfree(param->fwspec); } kfree(param); } /* * Internal equivalent of device_iommu_mapped() for when we care that a device * actually has API ops, and don't want false positives from VFIO-only groups. 
*/ static bool dev_has_iommu(struct device *dev) { return dev->iommu && dev->iommu->iommu_dev; } static u32 dev_iommu_get_max_pasids(struct device *dev) { u32 max_pasids = 0, bits = 0; int ret; if (dev_is_pci(dev)) { ret = pci_max_pasids(to_pci_dev(dev)); if (ret > 0) max_pasids = ret; } else { ret = device_property_read_u32(dev, "pasid-num-bits", &bits); if (!ret) max_pasids = 1UL << bits; } return min_t(u32, max_pasids, dev->iommu->iommu_dev->max_pasids); } void dev_iommu_priv_set(struct device *dev, void *priv) { /* FSL_PAMU does something weird */ if (!IS_ENABLED(CONFIG_FSL_PAMU)) lockdep_assert_held(&iommu_probe_device_lock); dev->iommu->priv = priv; } EXPORT_SYMBOL_GPL(dev_iommu_priv_set); /* * Init the dev->iommu and dev->iommu_group in the struct device and get the * driver probed */ static int iommu_init_device(struct device *dev) { const struct iommu_ops *ops; struct iommu_device *iommu_dev; struct iommu_group *group; int ret; if (!dev_iommu_get(dev)) return -ENOMEM; /* * For FDT-based systems and ACPI IORT/VIOT, the common firmware parsing * is buried in the bus dma_configure path. Properly unpicking that is * still a big job, so for now just invoke the whole thing. The device * already having a driver bound means dma_configure has already run and * found no IOMMU to wait for, so there's no point calling it again. */ if (!dev->iommu->fwspec && !dev->driver && dev->bus->dma_configure) { mutex_unlock(&iommu_probe_device_lock); dev->bus->dma_configure(dev); mutex_lock(&iommu_probe_device_lock); /* If another instance finished the job for us, skip it */ if (!dev->iommu || dev->iommu_group) return -ENODEV; } /* * At this point, relevant devices either now have a fwspec which will * match ops registered with a non-NULL fwnode, or we can reasonably * assume that only one of Intel, AMD, s390, PAMU or legacy SMMUv2 can * be present, and that any of their registered instances has suitable * ops for probing, and thus cheekily co-opt the same mechanism. */ ops = iommu_fwspec_ops(dev->iommu->fwspec); if (!ops) { ret = -ENODEV; goto err_free; } if (!try_module_get(ops->owner)) { ret = -EINVAL; goto err_free; } iommu_dev = ops->probe_device(dev); if (IS_ERR(iommu_dev)) { ret = PTR_ERR(iommu_dev); goto err_module_put; } dev->iommu->iommu_dev = iommu_dev; ret = iommu_device_link(iommu_dev, dev); if (ret) goto err_release; group = ops->device_group(dev); if (WARN_ON_ONCE(group == NULL)) group = ERR_PTR(-EINVAL); if (IS_ERR(group)) { ret = PTR_ERR(group); goto err_unlink; } dev->iommu_group = group; dev->iommu->max_pasids = dev_iommu_get_max_pasids(dev); if (ops->is_attach_deferred) dev->iommu->attach_deferred = ops->is_attach_deferred(dev); return 0; err_unlink: iommu_device_unlink(iommu_dev, dev); err_release: if (ops->release_device) ops->release_device(dev); err_module_put: module_put(ops->owner); err_free: dev->iommu->iommu_dev = NULL; dev_iommu_free(dev); return ret; } static void iommu_deinit_device(struct device *dev) { struct iommu_group *group = dev->iommu_group; const struct iommu_ops *ops = dev_iommu_ops(dev); lockdep_assert_held(&group->mutex); iommu_device_unlink(dev->iommu->iommu_dev, dev); /* * release_device() must stop using any attached domain on the device. * If there are still other devices in the group, they are not affected * by this callback. * * If the iommu driver provides release_domain, the core code ensures * that domain is attached prior to calling release_device. Drivers can * use this to enforce a translation on the idle iommu. 
Typically, the * global static blocked_domain is a good choice. * * Otherwise, the iommu driver must set the device to either an identity * or a blocking translation in release_device() and stop using any * domain pointer, as it is going to be freed. * * Regardless, if a delayed attach never occurred, then the release * should still avoid touching any hardware configuration either. */ if (!dev->iommu->attach_deferred && ops->release_domain) ops->release_domain->ops->attach_dev(ops->release_domain, dev); if (ops->release_device) ops->release_device(dev); /* * If this is the last driver to use the group then we must free the * domains before we do the module_put(). */ if (list_empty(&group->devices)) { if (group->default_domain) { iommu_domain_free(group->default_domain); group->default_domain = NULL; } if (group->blocking_domain) { iommu_domain_free(group->blocking_domain); group->blocking_domain = NULL; } group->domain = NULL; } /* Caller must put iommu_group */ dev->iommu_group = NULL; module_put(ops->owner); dev_iommu_free(dev); #ifdef CONFIG_IOMMU_DMA dev->dma_iommu = false; #endif } static struct iommu_domain *pasid_array_entry_to_domain(void *entry) { if (xa_pointer_tag(entry) == IOMMU_PASID_ARRAY_DOMAIN) return xa_untag_pointer(entry); return ((struct iommu_attach_handle *)xa_untag_pointer(entry))->domain; } DEFINE_MUTEX(iommu_probe_device_lock); static int __iommu_probe_device(struct device *dev, struct list_head *group_list) { struct iommu_group *group; struct group_device *gdev; int ret; /* * Serialise to avoid races between IOMMU drivers registering in * parallel and/or the "replay" calls from ACPI/OF code via client * driver probe. Once the latter have been cleaned up we should * probably be able to use device_lock() here to minimise the scope, * but for now enforcing a simple global ordering is fine. */ lockdep_assert_held(&iommu_probe_device_lock); /* Device is probed already if in a group */ if (dev->iommu_group) return 0; ret = iommu_init_device(dev); if (ret) return ret; /* * And if we do now see any replay calls, they would indicate someone * misusing the dma_configure path outside bus code. */ if (dev->driver) dev_WARN(dev, "late IOMMU probe at driver bind, something fishy here!\n"); group = dev->iommu_group; gdev = iommu_group_alloc_device(group, dev); mutex_lock(&group->mutex); if (IS_ERR(gdev)) { ret = PTR_ERR(gdev); goto err_put_group; } /* * The gdev must be in the list before calling * iommu_setup_default_domain() */ list_add_tail(&gdev->list, &group->devices); WARN_ON(group->default_domain && !group->domain); if (group->default_domain) iommu_create_device_direct_mappings(group->default_domain, dev); if (group->domain) { ret = __iommu_device_set_domain(group, dev, group->domain, 0); if (ret) goto err_remove_gdev; } else if (!group->default_domain && !group_list) { ret = iommu_setup_default_domain(group, 0); if (ret) goto err_remove_gdev; } else if (!group->default_domain) { /* * With a group_list argument we defer the default_domain setup * to the caller by providing a de-duplicated list of groups * that need further setup. 
*/ if (list_empty(&group->entry)) list_add_tail(&group->entry, group_list); } if (group->default_domain) iommu_setup_dma_ops(dev); mutex_unlock(&group->mutex); return 0; err_remove_gdev: list_del(&gdev->list); __iommu_group_free_device(group, gdev); err_put_group: iommu_deinit_device(dev); mutex_unlock(&group->mutex); iommu_group_put(group); return ret; } int iommu_probe_device(struct device *dev) { const struct iommu_ops *ops; int ret; mutex_lock(&iommu_probe_device_lock); ret = __iommu_probe_device(dev, NULL); mutex_unlock(&iommu_probe_device_lock); if (ret) return ret; ops = dev_iommu_ops(dev); if (ops->probe_finalize) ops->probe_finalize(dev); return 0; } static void __iommu_group_free_device(struct iommu_group *group, struct group_device *grp_dev) { struct device *dev = grp_dev->dev; sysfs_remove_link(group->devices_kobj, grp_dev->name); sysfs_remove_link(&dev->kobj, "iommu_group"); trace_remove_device_from_group(group->id, dev); /* * If the group has become empty then ownership must have been * released, and the current domain must be set back to NULL or * the default domain. */ if (list_empty(&group->devices)) WARN_ON(group->owner_cnt || group->domain != group->default_domain); kfree(grp_dev->name); kfree(grp_dev); } /* Remove the iommu_group from the struct device. */ static void __iommu_group_remove_device(struct device *dev) { struct iommu_group *group = dev->iommu_group; struct group_device *device; mutex_lock(&group->mutex); for_each_group_device(group, device) { if (device->dev != dev) continue; list_del(&device->list); __iommu_group_free_device(group, device); if (dev_has_iommu(dev)) iommu_deinit_device(dev); else dev->iommu_group = NULL; break; } mutex_unlock(&group->mutex); /* * Pairs with the get in iommu_init_device() or * iommu_group_add_device() */ iommu_group_put(group); } static void iommu_release_device(struct device *dev) { struct iommu_group *group = dev->iommu_group; if (group) __iommu_group_remove_device(dev); /* Free any fwspec if no iommu_driver was ever attached */ if (dev->iommu) dev_iommu_free(dev); } static int __init iommu_set_def_domain_type(char *str) { bool pt; int ret; ret = kstrtobool(str, &pt); if (ret) return ret; if (pt) iommu_set_default_passthrough(true); else iommu_set_default_translated(true); return 0; } early_param("iommu.passthrough", iommu_set_def_domain_type); static int __init iommu_dma_setup(char *str) { int ret = kstrtobool(str, &iommu_dma_strict); if (!ret) iommu_cmd_line |= IOMMU_CMD_LINE_STRICT; return ret; } early_param("iommu.strict", iommu_dma_setup); void iommu_set_dma_strict(void) { iommu_dma_strict = true; if (iommu_def_domain_type == IOMMU_DOMAIN_DMA_FQ) iommu_def_domain_type = IOMMU_DOMAIN_DMA; } static ssize_t iommu_group_attr_show(struct kobject *kobj, struct attribute *__attr, char *buf) { struct iommu_group_attribute *attr = to_iommu_group_attr(__attr); struct iommu_group *group = to_iommu_group(kobj); ssize_t ret = -EIO; if (attr->show) ret = attr->show(group, buf); return ret; } static ssize_t iommu_group_attr_store(struct kobject *kobj, struct attribute *__attr, const char *buf, size_t count) { struct iommu_group_attribute *attr = to_iommu_group_attr(__attr); struct iommu_group *group = to_iommu_group(kobj); ssize_t ret = -EIO; if (attr->store) ret = attr->store(group, buf, count); return ret; } static const struct sysfs_ops iommu_group_sysfs_ops = { .show = iommu_group_attr_show, .store = iommu_group_attr_store, }; static int iommu_group_create_file(struct iommu_group *group, struct iommu_group_attribute *attr) { 
return sysfs_create_file(&group->kobj, &attr->attr); } static void iommu_group_remove_file(struct iommu_group *group, struct iommu_group_attribute *attr) { sysfs_remove_file(&group->kobj, &attr->attr); } static ssize_t iommu_group_show_name(struct iommu_group *group, char *buf) { return sysfs_emit(buf, "%s\n", group->name); } /** * iommu_insert_resv_region - Insert a new region in the * list of reserved regions. * @new: new region to insert * @regions: list of regions * * Elements are sorted by start address and overlapping segments * of the same type are merged. */ static int iommu_insert_resv_region(struct iommu_resv_region *new, struct list_head *regions) { struct iommu_resv_region *iter, *tmp, *nr, *top; LIST_HEAD(stack); nr = iommu_alloc_resv_region(new->start, new->length, new->prot, new->type, GFP_KERNEL); if (!nr) return -ENOMEM; /* First add the new element based on start address sorting */ list_for_each_entry(iter, regions, list) { if (nr->start < iter->start || (nr->start == iter->start && nr->type <= iter->type)) break; } list_add_tail(&nr->list, &iter->list); /* Merge overlapping segments of type nr->type in @regions, if any */ list_for_each_entry_safe(iter, tmp, regions, list) { phys_addr_t top_end, iter_end = iter->start + iter->length - 1; /* no merge needed on elements of different types than @new */ if (iter->type != new->type) { list_move_tail(&iter->list, &stack); continue; } /* look for the last stack element of same type as @iter */ list_for_each_entry_reverse(top, &stack, list) if (top->type == iter->type) goto check_overlap; list_move_tail(&iter->list, &stack); continue; check_overlap: top_end = top->start + top->length - 1; if (iter->start > top_end + 1) { list_move_tail(&iter->list, &stack); } else { top->length = max(top_end, iter_end) - top->start + 1; list_del(&iter->list); kfree(iter); } } list_splice(&stack, regions); return 0; } static int iommu_insert_device_resv_regions(struct list_head *dev_resv_regions, struct list_head *group_resv_regions) { struct iommu_resv_region *entry; int ret = 0; list_for_each_entry(entry, dev_resv_regions, list) { ret = iommu_insert_resv_region(entry, group_resv_regions); if (ret) break; } return ret; } int iommu_get_group_resv_regions(struct iommu_group *group, struct list_head *head) { struct group_device *device; int ret = 0; mutex_lock(&group->mutex); for_each_group_device(group, device) { struct list_head dev_resv_regions; /* * Non-API groups still expose reserved_regions in sysfs, * so filter out calls that get here that way. 
*/ if (!dev_has_iommu(device->dev)) break; INIT_LIST_HEAD(&dev_resv_regions); iommu_get_resv_regions(device->dev, &dev_resv_regions); ret = iommu_insert_device_resv_regions(&dev_resv_regions, head); iommu_put_resv_regions(device->dev, &dev_resv_regions); if (ret) break; } mutex_unlock(&group->mutex); return ret; } EXPORT_SYMBOL_GPL(iommu_get_group_resv_regions); static ssize_t iommu_group_show_resv_regions(struct iommu_group *group, char *buf) { struct iommu_resv_region *region, *next; struct list_head group_resv_regions; int offset = 0; INIT_LIST_HEAD(&group_resv_regions); iommu_get_group_resv_regions(group, &group_resv_regions); list_for_each_entry_safe(region, next, &group_resv_regions, list) { offset += sysfs_emit_at(buf, offset, "0x%016llx 0x%016llx %s\n", (long long)region->start, (long long)(region->start + region->length - 1), iommu_group_resv_type_string[region->type]); kfree(region); } return offset; } static ssize_t iommu_group_show_type(struct iommu_group *group, char *buf) { char *type = "unknown"; mutex_lock(&group->mutex); if (group->default_domain) { switch (group->default_domain->type) { case IOMMU_DOMAIN_BLOCKED: type = "blocked"; break; case IOMMU_DOMAIN_IDENTITY: type = "identity"; break; case IOMMU_DOMAIN_UNMANAGED: type = "unmanaged"; break; case IOMMU_DOMAIN_DMA: type = "DMA"; break; case IOMMU_DOMAIN_DMA_FQ: type = "DMA-FQ"; break; } } mutex_unlock(&group->mutex); return sysfs_emit(buf, "%s\n", type); } static IOMMU_GROUP_ATTR(name, S_IRUGO, iommu_group_show_name, NULL); static IOMMU_GROUP_ATTR(reserved_regions, 0444, iommu_group_show_resv_regions, NULL); static IOMMU_GROUP_ATTR(type, 0644, iommu_group_show_type, iommu_group_store_type); static void iommu_group_release(struct kobject *kobj) { struct iommu_group *group = to_iommu_group(kobj); pr_debug("Releasing group %d\n", group->id); if (group->iommu_data_release) group->iommu_data_release(group->iommu_data); ida_free(&iommu_group_ida, group->id); /* Domains are free'd by iommu_deinit_device() */ WARN_ON(group->default_domain); WARN_ON(group->blocking_domain); kfree(group->name); kfree(group); } static const struct kobj_type iommu_group_ktype = { .sysfs_ops = &iommu_group_sysfs_ops, .release = iommu_group_release, }; /** * iommu_group_alloc - Allocate a new group * * This function is called by an iommu driver to allocate a new iommu * group. The iommu group represents the minimum granularity of the iommu. * Upon successful return, the caller holds a reference to the supplied * group in order to hold the group until devices are added. Use * iommu_group_put() to release this extra reference count, allowing the * group to be automatically reclaimed once it has no devices or external * references. 
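 */

/*
 * Example (illustrative sketch): a hypothetical driver's ->device_group()
 * callback that allocates a fresh group and hangs driver-private data off
 * it with iommu_group_set_iommudata(). "struct my_group_data" is an
 * assumed type.
 */
#if 0
static void my_group_data_release(void *data)
{
	kfree(data);
}

static struct iommu_group *my_iommu_device_group(struct device *dev)
{
	struct my_group_data *data = kzalloc(sizeof(*data), GFP_KERNEL);
	struct iommu_group *group;

	if (!data)
		return ERR_PTR(-ENOMEM);

	group = iommu_group_alloc();
	if (IS_ERR(group)) {
		kfree(data);
		return group;
	}

	/* The release callback runs when the group is finally freed */
	iommu_group_set_iommudata(group, data, my_group_data_release);
	return group;
}
#endif

/*
 * (iommu_group_alloc() below is documented by the kernel-doc preceding
 * the example.)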
*/ struct iommu_group *iommu_group_alloc(void) { struct iommu_group *group; int ret; group = kzalloc(sizeof(*group), GFP_KERNEL); if (!group) return ERR_PTR(-ENOMEM); group->kobj.kset = iommu_group_kset; mutex_init(&group->mutex); INIT_LIST_HEAD(&group->devices); INIT_LIST_HEAD(&group->entry); xa_init(&group->pasid_array); ret = ida_alloc(&iommu_group_ida, GFP_KERNEL); if (ret < 0) { kfree(group); return ERR_PTR(ret); } group->id = ret; ret = kobject_init_and_add(&group->kobj, &iommu_group_ktype, NULL, "%d", group->id); if (ret) { kobject_put(&group->kobj); return ERR_PTR(ret); } group->devices_kobj = kobject_create_and_add("devices", &group->kobj); if (!group->devices_kobj) { kobject_put(&group->kobj); /* triggers .release & free */ return ERR_PTR(-ENOMEM); } /* * The devices_kobj holds a reference on the group kobject, so * as long as that exists so will the group. We can therefore * use the devices_kobj for reference counting. */ kobject_put(&group->kobj); ret = iommu_group_create_file(group, &iommu_group_attr_reserved_regions); if (ret) { kobject_put(group->devices_kobj); return ERR_PTR(ret); } ret = iommu_group_create_file(group, &iommu_group_attr_type); if (ret) { kobject_put(group->devices_kobj); return ERR_PTR(ret); } pr_debug("Allocated group %d\n", group->id); return group; } EXPORT_SYMBOL_GPL(iommu_group_alloc); /** * iommu_group_get_iommudata - retrieve iommu_data registered for a group * @group: the group * * iommu drivers can store data in the group for use when doing iommu * operations. This function provides a way to retrieve it. Caller * should hold a group reference. */ void *iommu_group_get_iommudata(struct iommu_group *group) { return group->iommu_data; } EXPORT_SYMBOL_GPL(iommu_group_get_iommudata); /** * iommu_group_set_iommudata - set iommu_data for a group * @group: the group * @iommu_data: new data * @release: release function for iommu_data * * iommu drivers can store data in the group for use when doing iommu * operations. This function provides a way to set the data after * the group has been allocated. Caller should hold a group reference. */ void iommu_group_set_iommudata(struct iommu_group *group, void *iommu_data, void (*release)(void *iommu_data)) { group->iommu_data = iommu_data; group->iommu_data_release = release; } EXPORT_SYMBOL_GPL(iommu_group_set_iommudata); /** * iommu_group_set_name - set name for a group * @group: the group * @name: name * * Allow iommu driver to set a name for a group. When set it will * appear in a name attribute file under the group in sysfs. */ int iommu_group_set_name(struct iommu_group *group, const char *name) { int ret; if (group->name) { iommu_group_remove_file(group, &iommu_group_attr_name); kfree(group->name); group->name = NULL; if (!name) return 0; } group->name = kstrdup(name, GFP_KERNEL); if (!group->name) return -ENOMEM; ret = iommu_group_create_file(group, &iommu_group_attr_name); if (ret) { kfree(group->name); group->name = NULL; return ret; } return 0; } EXPORT_SYMBOL_GPL(iommu_group_set_name); static int iommu_create_device_direct_mappings(struct iommu_domain *domain, struct device *dev) { struct iommu_resv_region *entry; struct list_head mappings; unsigned long pg_size; int ret = 0; pg_size = domain->pgsize_bitmap ? 
		1UL << __ffs(domain->pgsize_bitmap) : 0;

	INIT_LIST_HEAD(&mappings);

	if (WARN_ON_ONCE(iommu_is_dma_domain(domain) && !pg_size))
		return -EINVAL;

	iommu_get_resv_regions(dev, &mappings);

	/* We need to consider overlapping regions for different devices */
	list_for_each_entry(entry, &mappings, list) {
		dma_addr_t start, end, addr;
		size_t map_size = 0;

		if (entry->type == IOMMU_RESV_DIRECT)
			dev->iommu->require_direct = 1;

		if ((entry->type != IOMMU_RESV_DIRECT &&
		     entry->type != IOMMU_RESV_DIRECT_RELAXABLE) ||
		    !iommu_is_dma_domain(domain))
			continue;

		start = ALIGN(entry->start, pg_size);
		end   = ALIGN(entry->start + entry->length, pg_size);

		for (addr = start; addr <= end; addr += pg_size) {
			phys_addr_t phys_addr;

			if (addr == end)
				goto map_end;

			phys_addr = iommu_iova_to_phys(domain, addr);
			if (!phys_addr) {
				map_size += pg_size;
				continue;
			}

map_end:
			if (map_size) {
				ret = iommu_map(domain, addr - map_size,
						addr - map_size, map_size,
						entry->prot, GFP_KERNEL);
				if (ret)
					goto out;
				map_size = 0;
			}
		}
	}

out:
	iommu_put_resv_regions(dev, &mappings);

	return ret;
}

/* This is undone by __iommu_group_free_device() */
static struct group_device *iommu_group_alloc_device(struct iommu_group *group,
						     struct device *dev)
{
	int ret, i = 0;
	struct group_device *device;

	device = kzalloc(sizeof(*device), GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	device->dev = dev;

	ret = sysfs_create_link(&dev->kobj, &group->kobj, "iommu_group");
	if (ret)
		goto err_free_device;

	device->name = kasprintf(GFP_KERNEL, "%s", kobject_name(&dev->kobj));
rename:
	if (!device->name) {
		ret = -ENOMEM;
		goto err_remove_link;
	}

	ret = sysfs_create_link_nowarn(group->devices_kobj,
				       &dev->kobj, device->name);
	if (ret) {
		if (ret == -EEXIST && i >= 0) {
			/*
			 * Account for the slim chance of collision
			 * and append an instance to the name.
			 */
			kfree(device->name);
			device->name = kasprintf(GFP_KERNEL, "%s.%d",
						 kobject_name(&dev->kobj), i++);
			goto rename;
		}
		goto err_free_name;
	}

	trace_add_device_to_group(group->id, dev);

	dev_info(dev, "Adding to iommu group %d\n", group->id);

	return device;

err_free_name:
	kfree(device->name);
err_remove_link:
	sysfs_remove_link(&dev->kobj, "iommu_group");
err_free_device:
	kfree(device);
	dev_err(dev, "Failed to add to iommu group %d: %d\n", group->id, ret);
	return ERR_PTR(ret);
}

/**
 * iommu_group_add_device - add a device to an iommu group
 * @group: the group into which to add the device (reference should be held)
 * @dev: the device
 *
 * This function is called by an iommu driver to add a device into a
 * group. Adding a device increments the group reference count.
 */
int iommu_group_add_device(struct iommu_group *group, struct device *dev)
{
	struct group_device *gdev;

	gdev = iommu_group_alloc_device(group, dev);
	if (IS_ERR(gdev))
		return PTR_ERR(gdev);

	iommu_group_ref_get(group);
	dev->iommu_group = group;

	mutex_lock(&group->mutex);
	list_add_tail(&gdev->list, &group->devices);
	mutex_unlock(&group->mutex);
	return 0;
}
EXPORT_SYMBOL_GPL(iommu_group_add_device);
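
/*
 * Example (illustrative sketch): how a legacy IOMMU driver might place a
 * newly probed device into a group it manages itself. "my_find_group" is
 * an assumed helper that returns a group with a reference held.
 */
#if 0
static int my_iommu_add_dev(struct device *dev)
{
	struct iommu_group *group = my_find_group(dev);
	int ret;

	if (IS_ERR(group))
		return PTR_ERR(group);

	/* Takes its own reference on the group for the device */
	ret = iommu_group_add_device(group, dev);

	/* Drop the lookup reference; the group lives on while devices remain */
	iommu_group_put(group);
	return ret;
}
#endif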
/**
 * iommu_group_remove_device - remove a device from its current group
 * @dev: device to be removed
 *
 * This function is called by an iommu driver to remove the device from
 * its current group. This decrements the iommu group reference count.
 */
void iommu_group_remove_device(struct device *dev)
{
	struct iommu_group *group = dev->iommu_group;

	if (!group)
		return;

	dev_info(dev, "Removing from iommu group %d\n", group->id);

	__iommu_group_remove_device(dev);
}
EXPORT_SYMBOL_GPL(iommu_group_remove_device);

#if IS_ENABLED(CONFIG_LOCKDEP) && IS_ENABLED(CONFIG_IOMMU_API)
/**
 * iommu_group_mutex_assert - Check device group mutex lock
 * @dev: the device that has group param set
 *
 * This function is called by an iommu driver to check whether it holds
 * group mutex lock for the given device or not.
 *
 * Note that this function must be called after device group param is set.
 */
void iommu_group_mutex_assert(struct device *dev)
{
	struct iommu_group *group = dev->iommu_group;

	lockdep_assert_held(&group->mutex);
}
EXPORT_SYMBOL_GPL(iommu_group_mutex_assert);
#endif

static struct device *iommu_group_first_dev(struct iommu_group *group)
{
	lockdep_assert_held(&group->mutex);

	return list_first_entry(&group->devices, struct group_device, list)->dev;
}

/**
 * iommu_group_for_each_dev - iterate over each device in the group
 * @group: the group
 * @data: caller opaque data to be passed to callback function
 * @fn: caller supplied callback function
 *
 * This function is called by group users to iterate over group devices.
 * Callers should hold a reference count to the group during callback.
 * The group->mutex is held across callbacks, which will block calls to
 * iommu_group_add/remove_device.
 */
int iommu_group_for_each_dev(struct iommu_group *group, void *data,
			     int (*fn)(struct device *, void *))
{
	struct group_device *device;
	int ret = 0;

	mutex_lock(&group->mutex);
	for_each_group_device(group, device) {
		ret = fn(device->dev, data);
		if (ret)
			break;
	}
	mutex_unlock(&group->mutex);

	return ret;
}
EXPORT_SYMBOL_GPL(iommu_group_for_each_dev);

/**
 * iommu_group_get - Return the group for a device and increment reference
 * @dev: get the group that this device belongs to
 *
 * This function is called by iommu drivers and users to get the group
 * for the specified device. If found, the group is returned and the group
 * reference is incremented, else NULL.
 */
struct iommu_group *iommu_group_get(struct device *dev)
{
	struct iommu_group *group = dev->iommu_group;

	if (group)
		kobject_get(group->devices_kobj);

	return group;
}
EXPORT_SYMBOL_GPL(iommu_group_get);

/**
 * iommu_group_ref_get - Increment reference on a group
 * @group: the group to use, must not be NULL
 *
 * This function is called by iommu drivers to take additional references on an
 * existing group. Returns the given group for convenience.
 */
struct iommu_group *iommu_group_ref_get(struct iommu_group *group)
{
	kobject_get(group->devices_kobj);
	return group;
}
EXPORT_SYMBOL_GPL(iommu_group_ref_get);

/**
 * iommu_group_put - Decrement group reference
 * @group: the group to use
 *
 * This function is called by iommu drivers and users to release the
 * iommu group. Once the reference count is zero, the group is released.
 */
void iommu_group_put(struct iommu_group *group)
{
	if (group)
		kobject_put(group->devices_kobj);
}
EXPORT_SYMBOL_GPL(iommu_group_put);
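
/*
 * Example (illustrative sketch): the usual lookup/use/release pattern
 * built on the reference-counting helpers above.
 */
#if 0
static int my_query_group_id(struct device *dev)
{
	struct iommu_group *group = iommu_group_get(dev);
	int id;

	if (!group)
		return -ENODEV;

	id = iommu_group_id(group);
	iommu_group_put(group);
	return id;
}
#endif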
/**
 * iommu_group_id - Return ID for a group
 * @group: the group to ID
 *
 * Return the unique ID for the group matching the sysfs group number.
 */
int iommu_group_id(struct iommu_group *group)
{
	return group->id;
}
EXPORT_SYMBOL_GPL(iommu_group_id);

static struct iommu_group *get_pci_alias_group(struct pci_dev *pdev,
					       unsigned long *devfns);

/*
 * To consider a PCI device isolated, we require ACS to support Source
 * Validation, Request Redirection, Completer Redirection, and Upstream
 * Forwarding. This effectively means that devices cannot spoof their
 * requester ID, requests and completions cannot be redirected, and all
 * transactions are forwarded upstream, even as it passes through a
 * bridge where the target device is downstream.
 */
#define REQ_ACS_FLAGS	(PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)

/*
 * For multifunction devices which are not isolated from each other, find
 * all the other non-isolated functions and look for existing groups. For
 * each function, we also need to look for aliases to or from other devices
 * that may already have a group.
 */
static struct iommu_group *get_pci_function_alias_group(struct pci_dev *pdev,
							unsigned long *devfns)
{
	struct pci_dev *tmp = NULL;
	struct iommu_group *group;

	if (!pdev->multifunction || pci_acs_enabled(pdev, REQ_ACS_FLAGS))
		return NULL;

	for_each_pci_dev(tmp) {
		if (tmp == pdev || tmp->bus != pdev->bus ||
		    PCI_SLOT(tmp->devfn) != PCI_SLOT(pdev->devfn) ||
		    pci_acs_enabled(tmp, REQ_ACS_FLAGS))
			continue;

		group = get_pci_alias_group(tmp, devfns);
		if (group) {
			pci_dev_put(tmp);
			return group;
		}
	}

	return NULL;
}

/*
 * Look for aliases to or from the given device for existing groups. DMA
 * aliases are only supported on the same bus, therefore the search
 * space is quite small (especially since we're really only looking at PCIe
 * devices, and therefore only expect multiple slots on the root complex or
 * downstream switch ports). It's conceivable though that a pair of
 * multifunction devices could have aliases between them that would cause a
 * loop. To prevent this, we use a bitmap to track where we've been.
 */
static struct iommu_group *get_pci_alias_group(struct pci_dev *pdev,
					       unsigned long *devfns)
{
	struct pci_dev *tmp = NULL;
	struct iommu_group *group;

	if (test_and_set_bit(pdev->devfn & 0xff, devfns))
		return NULL;

	group = iommu_group_get(&pdev->dev);
	if (group)
		return group;

	for_each_pci_dev(tmp) {
		if (tmp == pdev || tmp->bus != pdev->bus)
			continue;

		/* We alias them or they alias us */
		if (pci_devs_are_dma_aliases(pdev, tmp)) {
			group = get_pci_alias_group(tmp, devfns);
			if (group) {
				pci_dev_put(tmp);
				return group;
			}

			group = get_pci_function_alias_group(tmp, devfns);
			if (group) {
				pci_dev_put(tmp);
				return group;
			}
		}
	}

	return NULL;
}

struct group_for_pci_data {
	struct pci_dev *pdev;
	struct iommu_group *group;
};

/*
 * DMA alias iterator callback, return the last seen device. Stop and return
 * the IOMMU group if we find one along the way.
 */
static int get_pci_alias_or_group(struct pci_dev *pdev, u16 alias, void *opaque)
{
	struct group_for_pci_data *data = opaque;

	data->pdev = pdev;
	data->group = iommu_group_get(&pdev->dev);

	return data->group != NULL;
}

/*
 * Generic device_group call-back function. It just allocates one
 * iommu-group per device.
 */
struct iommu_group *generic_device_group(struct device *dev)
{
	return iommu_group_alloc();
}
EXPORT_SYMBOL_GPL(generic_device_group);
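
/*
 * Example (illustrative sketch): wiring the generic per-device grouping
 * policy above into a hypothetical driver's iommu_ops. Only the fields
 * shown here are assumed; real drivers fill in many more callbacks.
 */
#if 0
static const struct iommu_ops my_iommu_ops = {
	.device_group	= generic_device_group,
	.probe_device	= my_iommu_probe_device,
	.release_device	= my_iommu_release_device,
};
#endif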
/*
 * Generic device_group call-back function. It just allocates one
 * iommu-group per iommu driver instance shared by every device
 * probed by that iommu driver.
 */
struct iommu_group *generic_single_device_group(struct device *dev)
{
	struct iommu_device *iommu = dev->iommu->iommu_dev;

	if (!iommu->singleton_group) {
		struct iommu_group *group;

		group = iommu_group_alloc();
		if (IS_ERR(group))
			return group;
		iommu->singleton_group = group;
	}
	return iommu_group_ref_get(iommu->singleton_group);
}
EXPORT_SYMBOL_GPL(generic_single_device_group);

/*
 * Use standard PCI bus topology, isolation features, and DMA alias quirks
 * to find or create an IOMMU group for a device.
 */
struct iommu_group *pci_device_group(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct group_for_pci_data data;
	struct pci_bus *bus;
	struct iommu_group *group = NULL;
	u64 devfns[4] = { 0 };

	if (WARN_ON(!dev_is_pci(dev)))
		return ERR_PTR(-EINVAL);

	/*
	 * Find the upstream DMA alias for the device. A device must not
	 * be aliased due to topology in order to have its own IOMMU group.
	 * If we find an alias along the way that already belongs to a
	 * group, use it.
	 */
	if (pci_for_each_dma_alias(pdev, get_pci_alias_or_group, &data))
		return data.group;

	pdev = data.pdev;

	/*
	 * Continue upstream from the point of minimum IOMMU granularity
	 * due to aliases to the point where devices are protected from
	 * peer-to-peer DMA by PCI ACS. Again, if we find an existing
	 * group, use it.
	 */
	for (bus = pdev->bus; !pci_is_root_bus(bus); bus = bus->parent) {
		if (!bus->self)
			continue;

		if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
			break;

		pdev = bus->self;

		group = iommu_group_get(&pdev->dev);
		if (group)
			return group;
	}

	/*
	 * Look for existing groups on device aliases. If we alias another
	 * device or another device aliases us, use the same group.
	 */
	group = get_pci_alias_group(pdev, (unsigned long *)devfns);
	if (group)
		return group;

	/*
	 * Look for existing groups on non-isolated functions on the same
	 * slot and aliases of those functions, if any. No need to clear
	 * the search bitmap, the tested devfns are still valid.
	 */
	group = get_pci_function_alias_group(pdev, (unsigned long *)devfns);
	if (group)
		return group;

	/* No shared group found, allocate new */
	return iommu_group_alloc();
}
EXPORT_SYMBOL_GPL(pci_device_group);

/* Get the IOMMU group for device on fsl-mc bus */
struct iommu_group *fsl_mc_device_group(struct device *dev)
{
	struct device *cont_dev = fsl_mc_cont_dev(dev);
	struct iommu_group *group;

	group = iommu_group_get(cont_dev);
	if (!group)
		group = iommu_group_alloc();
	return group;
}
EXPORT_SYMBOL_GPL(fsl_mc_device_group);

static struct iommu_domain *__iommu_alloc_identity_domain(struct device *dev)
{
	const struct iommu_ops *ops = dev_iommu_ops(dev);
	struct iommu_domain *domain;

	if (ops->identity_domain)
		return ops->identity_domain;

	if (ops->domain_alloc_identity) {
		domain = ops->domain_alloc_identity(dev);
		if (IS_ERR(domain))
			return domain;
	} else {
		return ERR_PTR(-EOPNOTSUPP);
	}

	iommu_domain_init(domain, IOMMU_DOMAIN_IDENTITY, ops);
	return domain;
}

static struct iommu_domain *
__iommu_group_alloc_default_domain(struct iommu_group *group, int req_type)
{
	struct device *dev = iommu_group_first_dev(group);
	struct iommu_domain *dom;

	if (group->default_domain && group->default_domain->type == req_type)
		return group->default_domain;

	/*
	 * When allocating the DMA API domain assume that the driver is going to
	 * use PASID and make sure the RID's domain is PASID compatible.
	 */
	if (req_type & __IOMMU_DOMAIN_PAGING) {
		dom = __iommu_paging_domain_alloc_flags(dev, req_type,
			   dev->iommu->max_pasids ?
IOMMU_HWPT_ALLOC_PASID : 0); /* * If driver does not support PASID feature then * try to allocate non-PASID domain */ if (PTR_ERR(dom) == -EOPNOTSUPP) dom = __iommu_paging_domain_alloc_flags(dev, req_type, 0); return dom; } if (req_type == IOMMU_DOMAIN_IDENTITY) return __iommu_alloc_identity_domain(dev); return ERR_PTR(-EINVAL); } /* * req_type of 0 means "auto" which means to select a domain based on * iommu_def_domain_type or what the driver actually supports. */ static struct iommu_domain * iommu_group_alloc_default_domain(struct iommu_group *group, int req_type) { const struct iommu_ops *ops = dev_iommu_ops(iommu_group_first_dev(group)); struct iommu_domain *dom; lockdep_assert_held(&group->mutex); /* * Allow legacy drivers to specify the domain that will be the default * domain. This should always be either an IDENTITY/BLOCKED/PLATFORM * domain. Do not use in new drivers. */ if (ops->default_domain) { if (req_type != ops->default_domain->type) return ERR_PTR(-EINVAL); return ops->default_domain; } if (req_type) return __iommu_group_alloc_default_domain(group, req_type); /* The driver gave no guidance on what type to use, try the default */ dom = __iommu_group_alloc_default_domain(group, iommu_def_domain_type); if (!IS_ERR(dom)) return dom; /* Otherwise IDENTITY and DMA_FQ defaults will try DMA */ if (iommu_def_domain_type == IOMMU_DOMAIN_DMA) return ERR_PTR(-EINVAL); dom = __iommu_group_alloc_default_domain(group, IOMMU_DOMAIN_DMA); if (IS_ERR(dom)) return dom; pr_warn("Failed to allocate default IOMMU domain of type %u for group %s - Falling back to IOMMU_DOMAIN_DMA", iommu_def_domain_type, group->name); return dom; } struct iommu_domain *iommu_group_default_domain(struct iommu_group *group) { return group->default_domain; } static int probe_iommu_group(struct device *dev, void *data) { struct list_head *group_list = data; int ret; mutex_lock(&iommu_probe_device_lock); ret = __iommu_probe_device(dev, group_list); mutex_unlock(&iommu_probe_device_lock); if (ret == -ENODEV) ret = 0; return ret; } static int iommu_bus_notifier(struct notifier_block *nb, unsigned long action, void *data) { struct device *dev = data; if (action == BUS_NOTIFY_ADD_DEVICE) { int ret; ret = iommu_probe_device(dev); return (ret) ? NOTIFY_DONE : NOTIFY_OK; } else if (action == BUS_NOTIFY_REMOVED_DEVICE) { iommu_release_device(dev); return NOTIFY_OK; } return 0; } /* * Combine the driver's chosen def_domain_type across all the devices in a * group. Drivers must give a consistent result. */ static int iommu_get_def_domain_type(struct iommu_group *group, struct device *dev, int cur_type) { const struct iommu_ops *ops = dev_iommu_ops(dev); int type; if (ops->default_domain) { /* * Drivers that declare a global static default_domain will * always choose that. */ type = ops->default_domain->type; } else { if (ops->def_domain_type) type = ops->def_domain_type(dev); else return cur_type; } if (!type || cur_type == type) return cur_type; if (!cur_type) return type; dev_err_ratelimited( dev, "IOMMU driver error, requesting conflicting def_domain_type, %s and %s, for devices in group %u.\n", iommu_domain_type_str(cur_type), iommu_domain_type_str(type), group->id); /* * Try to recover, drivers are allowed to force IDENTITY or DMA, IDENTITY * takes precedence. */ if (type == IOMMU_DOMAIN_IDENTITY) return type; return cur_type; } /* * A target_type of 0 will select the best domain type. 0 can be returned in * this case meaning the global default should be used. 
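 */

/*
 * Example (illustrative sketch): a driver-side def_domain_type() hook of
 * the kind combined by iommu_get_def_domain_type() above. A quirky device
 * that can only DMA through a fixed 1:1 window would force IDENTITY;
 * "my_device_needs_identity" is an assumed helper.
 */
#if 0
static int my_iommu_def_domain_type(struct device *dev)
{
	if (my_device_needs_identity(dev))
		return IOMMU_DOMAIN_IDENTITY;

	/* 0 lets the core pick based on the global default */
	return 0;
}
#endif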
static int iommu_get_default_domain_type(struct iommu_group *group,
					 int target_type)
{
	struct device *untrusted = NULL;
	struct group_device *gdev;
	int driver_type = 0;

	lockdep_assert_held(&group->mutex);

	/*
	 * ARM32 drivers supporting CONFIG_ARM_DMA_USE_IOMMU can declare an
	 * identity_domain and it will automatically become their default
	 * domain. Later on ARM_DMA_USE_IOMMU will install its UNMANAGED domain.
	 * Override the selection to IDENTITY.
	 */
	if (IS_ENABLED(CONFIG_ARM_DMA_USE_IOMMU)) {
		static_assert(!(IS_ENABLED(CONFIG_ARM_DMA_USE_IOMMU) &&
				IS_ENABLED(CONFIG_IOMMU_DMA)));
		driver_type = IOMMU_DOMAIN_IDENTITY;
	}

	for_each_group_device(group, gdev) {
		driver_type = iommu_get_def_domain_type(group, gdev->dev,
							driver_type);

		if (dev_is_pci(gdev->dev) && to_pci_dev(gdev->dev)->untrusted) {
			/*
			 * No ARM32 using systems will set untrusted, it cannot
			 * work.
			 */
			if (WARN_ON(IS_ENABLED(CONFIG_ARM_DMA_USE_IOMMU)))
				return -1;
			untrusted = gdev->dev;
		}
	}

	/*
	 * If the common dma ops are not selected in kconfig then we cannot use
	 * IOMMU_DOMAIN_DMA at all. Force IDENTITY if nothing else has been
	 * selected.
	 */
	if (!IS_ENABLED(CONFIG_IOMMU_DMA)) {
		if (WARN_ON(driver_type == IOMMU_DOMAIN_DMA))
			return -1;
		if (!driver_type)
			driver_type = IOMMU_DOMAIN_IDENTITY;
	}

	if (untrusted) {
		if (driver_type && driver_type != IOMMU_DOMAIN_DMA) {
			dev_err_ratelimited(
				untrusted,
				"Device is not trusted, but driver is overriding group %u to %s, refusing to probe.\n",
				group->id, iommu_domain_type_str(driver_type));
			return -1;
		}
		driver_type = IOMMU_DOMAIN_DMA;
	}

	if (target_type) {
		if (driver_type && target_type != driver_type)
			return -1;
		return target_type;
	}
	return driver_type;
}

static void iommu_group_do_probe_finalize(struct device *dev)
{
	const struct iommu_ops *ops = dev_iommu_ops(dev);

	if (ops->probe_finalize)
		ops->probe_finalize(dev);
}

static int bus_iommu_probe(const struct bus_type *bus)
{
	struct iommu_group *group, *next;
	LIST_HEAD(group_list);
	int ret;

	ret = bus_for_each_dev(bus, NULL, &group_list, probe_iommu_group);
	if (ret)
		return ret;

	list_for_each_entry_safe(group, next, &group_list, entry) {
		struct group_device *gdev;

		mutex_lock(&group->mutex);

		/* Remove item from the list */
		list_del_init(&group->entry);

		/*
		 * We go to the trouble of deferred default domain creation so
		 * that the cross-group default domain type and the setup of the
		 * IOMMU_RESV_DIRECT will work correctly in non-hotplug scenarios.
		 */
		ret = iommu_setup_default_domain(group, 0);
		if (ret) {
			mutex_unlock(&group->mutex);
			return ret;
		}
		for_each_group_device(group, gdev)
			iommu_setup_dma_ops(gdev->dev);
		mutex_unlock(&group->mutex);

		/*
		 * FIXME: Mis-locked because the ops->probe_finalize() call-back
		 * of some IOMMU drivers calls arm_iommu_attach_device() which
		 * in-turn might call back into IOMMU core code, where it tries
		 * to take group->mutex, resulting in a deadlock.
		 */
		for_each_group_device(group, gdev)
			iommu_group_do_probe_finalize(gdev->dev);
	}

	return 0;
}

/**
 * device_iommu_capable() - check for a general IOMMU capability
 * @dev: device to which the capability would be relevant, if available
 * @cap: IOMMU capability
 *
 * Return: true if an IOMMU is present and supports the given capability
 * for the given device, otherwise false.
*/ bool device_iommu_capable(struct device *dev, enum iommu_cap cap) { const struct iommu_ops *ops; if (!dev_has_iommu(dev)) return false; ops = dev_iommu_ops(dev); if (!ops->capable) return false; return ops->capable(dev, cap); } EXPORT_SYMBOL_GPL(device_iommu_capable); /** * iommu_group_has_isolated_msi() - Compute msi_device_has_isolated_msi() * for a group * @group: Group to query * * IOMMU groups should not have differing values of * msi_device_has_isolated_msi() for devices in a group. However nothing * directly prevents this, so ensure mistakes don't result in isolation failures * by checking that all the devices are the same. */ bool iommu_group_has_isolated_msi(struct iommu_group *group) { struct group_device *group_dev; bool ret = true; mutex_lock(&group->mutex); for_each_group_device(group, group_dev) ret &= msi_device_has_isolated_msi(group_dev->dev); mutex_unlock(&group->mutex); return ret; } EXPORT_SYMBOL_GPL(iommu_group_has_isolated_msi); /** * iommu_set_fault_handler() - set a fault handler for an iommu domain * @domain: iommu domain * @handler: fault handler * @token: user data, will be passed back to the fault handler * * This function should be used by IOMMU users which want to be notified * whenever an IOMMU fault happens. * * The fault handler itself should return 0 on success, and an appropriate * error code otherwise. */ void iommu_set_fault_handler(struct iommu_domain *domain, iommu_fault_handler_t handler, void *token) { if (WARN_ON(!domain || domain->cookie_type != IOMMU_COOKIE_NONE)) return; domain->cookie_type = IOMMU_COOKIE_FAULT_HANDLER; domain->handler = handler; domain->handler_token = token; } EXPORT_SYMBOL_GPL(iommu_set_fault_handler); static void iommu_domain_init(struct iommu_domain *domain, unsigned int type, const struct iommu_ops *ops) { domain->type = type; domain->owner = ops; if (!domain->ops) domain->ops = ops->default_domain_ops; } static struct iommu_domain * __iommu_paging_domain_alloc_flags(struct device *dev, unsigned int type, unsigned int flags) { const struct iommu_ops *ops; struct iommu_domain *domain; if (!dev_has_iommu(dev)) return ERR_PTR(-ENODEV); ops = dev_iommu_ops(dev); if (ops->domain_alloc_paging && !flags) domain = ops->domain_alloc_paging(dev); else if (ops->domain_alloc_paging_flags) domain = ops->domain_alloc_paging_flags(dev, flags, NULL); #if IS_ENABLED(CONFIG_FSL_PAMU) else if (ops->domain_alloc && !flags) domain = ops->domain_alloc(IOMMU_DOMAIN_UNMANAGED); #endif else return ERR_PTR(-EOPNOTSUPP); if (IS_ERR(domain)) return domain; if (!domain) return ERR_PTR(-ENOMEM); iommu_domain_init(domain, type, ops); return domain; } /** * iommu_paging_domain_alloc_flags() - Allocate a paging domain * @dev: device for which the domain is allocated * @flags: Bitmap of iommufd_hwpt_alloc_flags * * Allocate a paging domain which will be managed by a kernel driver. Return * allocated domain if successful, or an ERR pointer for failure. 
*/ struct iommu_domain *iommu_paging_domain_alloc_flags(struct device *dev, unsigned int flags) { return __iommu_paging_domain_alloc_flags(dev, IOMMU_DOMAIN_UNMANAGED, flags); } EXPORT_SYMBOL_GPL(iommu_paging_domain_alloc_flags); void iommu_domain_free(struct iommu_domain *domain) { switch (domain->cookie_type) { case IOMMU_COOKIE_DMA_IOVA: iommu_put_dma_cookie(domain); break; case IOMMU_COOKIE_DMA_MSI: iommu_put_msi_cookie(domain); break; case IOMMU_COOKIE_SVA: mmdrop(domain->mm); break; default: break; } if (domain->ops->free) domain->ops->free(domain); } EXPORT_SYMBOL_GPL(iommu_domain_free); /* * Put the group's domain back to the appropriate core-owned domain - either the * standard kernel-mode DMA configuration or an all-DMA-blocked domain. */ static void __iommu_group_set_core_domain(struct iommu_group *group) { struct iommu_domain *new_domain; if (group->owner) new_domain = group->blocking_domain; else new_domain = group->default_domain; __iommu_group_set_domain_nofail(group, new_domain); } static int __iommu_attach_device(struct iommu_domain *domain, struct device *dev) { int ret; if (unlikely(domain->ops->attach_dev == NULL)) return -ENODEV; ret = domain->ops->attach_dev(domain, dev); if (ret) return ret; dev->iommu->attach_deferred = 0; trace_attach_device_to_domain(dev); return 0; } /** * iommu_attach_device - Attach an IOMMU domain to a device * @domain: IOMMU domain to attach * @dev: Device that will be attached * * Returns 0 on success and error code on failure * * Note that EINVAL can be treated as a soft failure, indicating * that certain configuration of the domain is incompatible with * the device. In this case attaching a different domain to the * device may succeed. */ int iommu_attach_device(struct iommu_domain *domain, struct device *dev) { /* Caller must be a probed driver on dev */ struct iommu_group *group = dev->iommu_group; int ret; if (!group) return -ENODEV; /* * Lock the group to make sure the device-count doesn't * change while we are attaching */ mutex_lock(&group->mutex); ret = -EINVAL; if (list_count_nodes(&group->devices) != 1) goto out_unlock; ret = __iommu_attach_group(domain, group); out_unlock: mutex_unlock(&group->mutex); return ret; } EXPORT_SYMBOL_GPL(iommu_attach_device); int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain) { if (dev->iommu && dev->iommu->attach_deferred) return __iommu_attach_device(domain, dev); return 0; } void iommu_detach_device(struct iommu_domain *domain, struct device *dev) { /* Caller must be a probed driver on dev */ struct iommu_group *group = dev->iommu_group; if (!group) return; mutex_lock(&group->mutex); if (WARN_ON(domain != group->domain) || WARN_ON(list_count_nodes(&group->devices) != 1)) goto out_unlock; __iommu_group_set_core_domain(group); out_unlock: mutex_unlock(&group->mutex); } EXPORT_SYMBOL_GPL(iommu_detach_device); struct iommu_domain *iommu_get_domain_for_dev(struct device *dev) { /* Caller must be a probed driver on dev */ struct iommu_group *group = dev->iommu_group; if (!group) return NULL; return group->domain; } EXPORT_SYMBOL_GPL(iommu_get_domain_for_dev); /* * For IOMMU_DOMAIN_DMA implementations which already provide their own * guarantees that the group and its default domain are valid and correct. 
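 */

/*
 * Example (illustrative sketch): the typical allocate/attach/detach/free
 * lifecycle a kernel-mode user of the API above might follow. Error
 * handling is abbreviated.
 */
#if 0
static int my_use_private_domain(struct device *dev)
{
	struct iommu_domain *domain;
	int ret;

	domain = iommu_paging_domain_alloc_flags(dev, 0);
	if (IS_ERR(domain))
		return PTR_ERR(domain);

	ret = iommu_attach_device(domain, dev);
	if (ret) {
		iommu_domain_free(domain);
		return ret;
	}

	/* ... iommu_map()/iommu_unmap() and DMA happen here ... */

	iommu_detach_device(domain, dev);
	iommu_domain_free(domain);
	return 0;
}
#endif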
struct iommu_domain *iommu_get_dma_domain(struct device *dev)
{
	return dev->iommu_group->default_domain;
}

static void *iommu_make_pasid_array_entry(struct iommu_domain *domain,
					  struct iommu_attach_handle *handle)
{
	if (handle) {
		handle->domain = domain;
		return xa_tag_pointer(handle, IOMMU_PASID_ARRAY_HANDLE);
	}

	return xa_tag_pointer(domain, IOMMU_PASID_ARRAY_DOMAIN);
}

static bool domain_iommu_ops_compatible(const struct iommu_ops *ops,
					struct iommu_domain *domain)
{
	if (domain->owner == ops)
		return true;

	/* For static domains, owner isn't set. */
	if (domain == ops->blocked_domain || domain == ops->identity_domain)
		return true;

	return false;
}

static int __iommu_attach_group(struct iommu_domain *domain,
				struct iommu_group *group)
{
	struct device *dev;

	if (group->domain && group->domain != group->default_domain &&
	    group->domain != group->blocking_domain)
		return -EBUSY;

	dev = iommu_group_first_dev(group);
	if (!dev_has_iommu(dev) ||
	    !domain_iommu_ops_compatible(dev_iommu_ops(dev), domain))
		return -EINVAL;

	return __iommu_group_set_domain(group, domain);
}

/**
 * iommu_attach_group - Attach an IOMMU domain to an IOMMU group
 * @domain: IOMMU domain to attach
 * @group: IOMMU group that will be attached
 *
 * Returns 0 on success and error code on failure
 *
 * Note that EINVAL can be treated as a soft failure, indicating
 * that certain configuration of the domain is incompatible with
 * the group. In this case attaching a different domain to the
 * group may succeed.
 */
int iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group)
{
	int ret;

	mutex_lock(&group->mutex);
	ret = __iommu_attach_group(domain, group);
	mutex_unlock(&group->mutex);

	return ret;
}
EXPORT_SYMBOL_GPL(iommu_attach_group);

static int __iommu_device_set_domain(struct iommu_group *group,
				     struct device *dev,
				     struct iommu_domain *new_domain,
				     unsigned int flags)
{
	int ret;

	/*
	 * If the device requires IOMMU_RESV_DIRECT then we cannot allow
	 * the blocking domain to be attached as it does not contain the
	 * required 1:1 mapping. This test effectively excludes the device
	 * being used with iommu_group_claim_dma_owner() which will block
	 * vfio and iommufd as well.
	 */
	if (dev->iommu->require_direct &&
	    (new_domain->type == IOMMU_DOMAIN_BLOCKED ||
	     new_domain == group->blocking_domain)) {
		dev_warn(dev,
			 "Firmware has requested this device have a 1:1 IOMMU mapping, rejecting configuring the device without a 1:1 mapping. Contact your platform vendor.\n");
		return -EINVAL;
	}

	if (dev->iommu->attach_deferred) {
		if (new_domain == group->default_domain)
			return 0;
		dev->iommu->attach_deferred = 0;
	}

	ret = __iommu_attach_device(new_domain, dev);
	if (ret) {
		/*
		 * If we have a blocking domain then try to attach that in hopes
		 * of avoiding a UAF. Modern drivers should implement blocking
		 * domains as global statics that cannot fail.
		 */
		if ((flags & IOMMU_SET_DOMAIN_MUST_SUCCEED) &&
		    group->blocking_domain &&
		    group->blocking_domain != new_domain)
			__iommu_attach_device(group->blocking_domain, dev);
		return ret;
	}
	return 0;
}

/*
 * If 0 is returned the group's domain is new_domain. If an error is returned
 * then the group's domain will be set back to the existing domain unless
 * IOMMU_SET_DOMAIN_MUST_SUCCEED, otherwise an error is returned and the group's
 * domain is left inconsistent. This is a driver bug to fail attach with a
 * previously good domain. We try to avoid a kernel UAF because of this.
 *
 * IOMMU groups are really the natural working unit of the IOMMU, but the IOMMU
 * API works on domains and devices.
Bridge that gap by iterating over the * devices in a group. Ideally we'd have a single device which represents the * requestor ID of the group, but we also allow IOMMU drivers to create policy * defined minimum sets, where the physical hardware may be able to distinguish * members, but we wish to group them at a higher level (ex. untrusted * multi-function PCI devices). Thus we attach each device. */ static int __iommu_group_set_domain_internal(struct iommu_group *group, struct iommu_domain *new_domain, unsigned int flags) { struct group_device *last_gdev; struct group_device *gdev; int result; int ret; lockdep_assert_held(&group->mutex); if (group->domain == new_domain) return 0; if (WARN_ON(!new_domain)) return -EINVAL; /* * Changing the domain is done by calling attach_dev() on the new * domain. This switch does not have to be atomic and DMA can be * discarded during the transition. DMA must only be able to access * either new_domain or group->domain, never something else. */ result = 0; for_each_group_device(group, gdev) { ret = __iommu_device_set_domain(group, gdev->dev, new_domain, flags); if (ret) { result = ret; /* * Keep trying the other devices in the group. If a * driver fails attach to an otherwise good domain, and * does not support blocking domains, it should at least * drop its reference on the current domain so we don't * UAF. */ if (flags & IOMMU_SET_DOMAIN_MUST_SUCCEED) continue; goto err_revert; } } group->domain = new_domain; return result; err_revert: /* * This is called in error unwind paths. A well-behaved driver should * always allow us to attach to a domain that was already attached. */ last_gdev = gdev; for_each_group_device(group, gdev) { /* * A NULL domain can happen only for first probe, in which case * we leave group->domain as NULL and let release clean * everything up.
*/ if (group->domain) WARN_ON(__iommu_device_set_domain( group, gdev->dev, group->domain, IOMMU_SET_DOMAIN_MUST_SUCCEED)); if (gdev == last_gdev) break; } return ret; } void iommu_detach_group(struct iommu_domain *domain, struct iommu_group *group) { mutex_lock(&group->mutex); __iommu_group_set_core_domain(group); mutex_unlock(&group->mutex); } EXPORT_SYMBOL_GPL(iommu_detach_group); phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) { if (domain->type == IOMMU_DOMAIN_IDENTITY) return iova; if (domain->type == IOMMU_DOMAIN_BLOCKED) return 0; return domain->ops->iova_to_phys(domain, iova); } EXPORT_SYMBOL_GPL(iommu_iova_to_phys); static size_t iommu_pgsize(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, size_t *count) { unsigned int pgsize_idx, pgsize_idx_next; unsigned long pgsizes; size_t offset, pgsize, pgsize_next; size_t offset_end; unsigned long addr_merge = paddr | iova; /* Page sizes supported by the hardware and small enough for @size */ pgsizes = domain->pgsize_bitmap & GENMASK(__fls(size), 0); /* Constrain the page sizes further based on the maximum alignment */ if (likely(addr_merge)) pgsizes &= GENMASK(__ffs(addr_merge), 0); /* Make sure we have at least one suitable page size */ BUG_ON(!pgsizes); /* Pick the biggest page size remaining */ pgsize_idx = __fls(pgsizes); pgsize = BIT(pgsize_idx); if (!count) return pgsize; /* Find the next biggest supported page size, if it exists */ pgsizes = domain->pgsize_bitmap & ~GENMASK(pgsize_idx, 0); if (!pgsizes) goto out_set_count; pgsize_idx_next = __ffs(pgsizes); pgsize_next = BIT(pgsize_idx_next); /* * There's no point trying a bigger page size unless the virtual * and physical addresses are similarly offset within the larger page. */ if ((iova ^ paddr) & (pgsize_next - 1)) goto out_set_count; /* Calculate the offset to the next page size alignment boundary */ offset = pgsize_next - (addr_merge & (pgsize_next - 1)); /* * If size is big enough to accommodate the larger page, reduce * the number of smaller pages.
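 *
 * Worked example (hypothetical sizes, assuming a pgsize_bitmap with 4K and
 * 2M set): with iova = paddr = 0x1ff000 and size = 0x400000, pgsize is 4K
 * and pgsize_next is 2M, so offset = 0x200000 - 0x1ff000 = 0x1000. Since
 * offset + pgsize_next fits within size, we return one 4K page
 * (*count = 1); the following call starts 2M-aligned and can use 2M pages.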
*/ if (!check_add_overflow(offset, pgsize_next, &offset_end) && offset_end <= size) size = offset; out_set_count: *count = size >> pgsize_idx; return pgsize; } int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp) { const struct iommu_domain_ops *ops = domain->ops; unsigned long orig_iova = iova; unsigned int min_pagesz; size_t orig_size = size; phys_addr_t orig_paddr = paddr; int ret = 0; might_sleep_if(gfpflags_allow_blocking(gfp)); if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING))) return -EINVAL; if (WARN_ON(!ops->map_pages || domain->pgsize_bitmap == 0UL)) return -ENODEV; /* Discourage passing strange GFP flags */ if (WARN_ON_ONCE(gfp & (__GFP_COMP | __GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM))) return -EINVAL; /* find out the minimum page size supported */ min_pagesz = 1 << __ffs(domain->pgsize_bitmap); /* * both the virtual address and the physical one, as well as * the size of the mapping, must be aligned (at least) to the * size of the smallest page supported by the hardware */ if (!IS_ALIGNED(iova | paddr | size, min_pagesz)) { pr_err("unaligned: iova 0x%lx pa %pa size 0x%zx min_pagesz 0x%x\n", iova, &paddr, size, min_pagesz); return -EINVAL; } pr_debug("map: iova 0x%lx pa %pa size 0x%zx\n", iova, &paddr, size); while (size) { size_t pgsize, count, mapped = 0; pgsize = iommu_pgsize(domain, iova, paddr, size, &count); pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx count %zu\n", iova, &paddr, pgsize, count); ret = ops->map_pages(domain, iova, paddr, pgsize, count, prot, gfp, &mapped); /* * Some pages may have been mapped, even if an error occurred, * so we should account for those so they can be unmapped. */ size -= mapped; if (ret) break; iova += mapped; paddr += mapped; } /* unroll mapping in case something went wrong */ if (ret) iommu_unmap(domain, orig_iova, orig_size - size); else trace_map(orig_iova, orig_paddr, orig_size); return ret; } int iommu_sync_map(struct iommu_domain *domain, unsigned long iova, size_t size) { const struct iommu_domain_ops *ops = domain->ops; if (!ops->iotlb_sync_map) return 0; return ops->iotlb_sync_map(domain, iova, size); } int iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp) { int ret; ret = iommu_map_nosync(domain, iova, paddr, size, prot, gfp); if (ret) return ret; ret = iommu_sync_map(domain, iova, size); if (ret) iommu_unmap(domain, iova, size); return ret; } EXPORT_SYMBOL_GPL(iommu_map); static size_t __iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *iotlb_gather) { const struct iommu_domain_ops *ops = domain->ops; size_t unmapped_page, unmapped = 0; unsigned long orig_iova = iova; unsigned int min_pagesz; if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING))) return 0; if (WARN_ON(!ops->unmap_pages || domain->pgsize_bitmap == 0UL)) return 0; /* find out the minimum page size supported */ min_pagesz = 1 << __ffs(domain->pgsize_bitmap); /* * The virtual address, as well as the size of the mapping, must be * aligned (at least) to the size of the smallest page supported * by the hardware */ if (!IS_ALIGNED(iova | size, min_pagesz)) { pr_err("unaligned: iova 0x%lx size 0x%zx min_pagesz 0x%x\n", iova, size, min_pagesz); return 0; } pr_debug("unmap this: iova 0x%lx size 0x%zx\n", iova, size); /* * Keep iterating until we either unmap 'size' bytes (or more) * or we hit an area that isn't mapped. 
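 *
 * (Callers that batch their invalidations drive this loop through
 * iommu_unmap_fast() instead; a hypothetical two-range sketch:
 *
 *	struct iommu_iotlb_gather gather;
 *
 *	iommu_iotlb_gather_init(&gather);
 *	iommu_unmap_fast(domain, iova1, len1, &gather);
 *	iommu_unmap_fast(domain, iova2, len2, &gather);
 *	iommu_iotlb_sync(domain, &gather);
 *
 * whereas iommu_unmap() below performs the final sync itself.)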
*/ while (unmapped < size) { size_t pgsize, count; pgsize = iommu_pgsize(domain, iova, iova, size - unmapped, &count); unmapped_page = ops->unmap_pages(domain, iova, pgsize, count, iotlb_gather); if (!unmapped_page) break; pr_debug("unmapped: iova 0x%lx size 0x%zx\n", iova, unmapped_page); iova += unmapped_page; unmapped += unmapped_page; } trace_unmap(orig_iova, size, unmapped); return unmapped; } /** * iommu_unmap() - Remove mappings from a range of IOVA * @domain: Domain to manipulate * @iova: IO virtual address to start * @size: Length of the range starting from @iova * * iommu_unmap() will remove a translation created by iommu_map(). It cannot * subdivide a mapping created by iommu_map(), so it should be called with IOVA * ranges that match what was passed to iommu_map(). The range can aggregate * contiguous iommu_map() calls so long as no individual range is split. * * Returns: Number of bytes of IOVA unmapped. iova + res will be the point * unmapping stopped. */ size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size) { struct iommu_iotlb_gather iotlb_gather; size_t ret; iommu_iotlb_gather_init(&iotlb_gather); ret = __iommu_unmap(domain, iova, size, &iotlb_gather); iommu_iotlb_sync(domain, &iotlb_gather); return ret; } EXPORT_SYMBOL_GPL(iommu_unmap); /** * iommu_unmap_fast() - Remove mappings from a range of IOVA without IOTLB sync * @domain: Domain to manipulate * @iova: IO virtual address to start * @size: Length of the range starting from @iova * @iotlb_gather: range information for a pending IOTLB flush * * iommu_unmap_fast() will remove a translation created by iommu_map(). * It can't subdivide a mapping created by iommu_map(), so it should be * called with IOVA ranges that match what was passed to iommu_map(). The * range can aggregate contiguous iommu_map() calls so long as no individual * range is split. * * Basically iommu_unmap_fast() is the same as iommu_unmap() but for callers * which manage the IOTLB flushing externally to perform a batched sync. * * Returns: Number of bytes of IOVA unmapped. iova + res will be the point * unmapping stopped. */ size_t iommu_unmap_fast(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *iotlb_gather) { return __iommu_unmap(domain, iova, size, iotlb_gather); } EXPORT_SYMBOL_GPL(iommu_unmap_fast); ssize_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, struct scatterlist *sg, unsigned int nents, int prot, gfp_t gfp) { size_t len = 0, mapped = 0; phys_addr_t start; unsigned int i = 0; int ret; while (i <= nents) { phys_addr_t s_phys = sg_phys(sg); if (len && s_phys != start + len) { ret = iommu_map_nosync(domain, iova + mapped, start, len, prot, gfp); if (ret) goto out_err; mapped += len; len = 0; } if (sg_dma_is_bus_address(sg)) goto next; if (len) { len += sg->length; } else { len = sg->length; start = s_phys; } next: if (++i < nents) sg = sg_next(sg); } ret = iommu_sync_map(domain, iova, mapped); if (ret) goto out_err; return mapped; out_err: /* undo mappings already done */ iommu_unmap(domain, iova, mapped); return ret; } EXPORT_SYMBOL_GPL(iommu_map_sg); /** * report_iommu_fault() - report about an IOMMU fault to the IOMMU framework * @domain: the iommu domain where the fault has happened * @dev: the device where the fault has happened * @iova: the faulting address * @flags: mmu fault flags (e.g. IOMMU_FAULT_READ/IOMMU_FAULT_WRITE/...) 
* * This function should be called by the low-level IOMMU implementations * whenever IOMMU faults happen, to allow high-level users, that are * interested in such events, to know about them. * * This event may be useful for several possible use cases: * - mere logging of the event * - dynamic TLB/PTE loading * - if restarting of the faulting device is required * * Returns 0 on success and an appropriate error code otherwise (if dynamic * PTE/TLB loading will one day be supported, implementations will be able * to tell whether it succeeded or not according to this return value). * * Specifically, -ENOSYS is returned if a fault handler isn't installed * (though fault handlers can also return -ENOSYS, in case they want to * elicit the default behavior of the IOMMU drivers). */ int report_iommu_fault(struct iommu_domain *domain, struct device *dev, unsigned long iova, int flags) { int ret = -ENOSYS; /* * if upper layers showed interest and installed a fault handler, * invoke it. */ if (domain->cookie_type == IOMMU_COOKIE_FAULT_HANDLER && domain->handler) ret = domain->handler(domain, dev, iova, flags, domain->handler_token); trace_io_page_fault(dev, iova, flags); return ret; } EXPORT_SYMBOL_GPL(report_iommu_fault); static int __init iommu_init(void) { iommu_group_kset = kset_create_and_add("iommu_groups", NULL, kernel_kobj); BUG_ON(!iommu_group_kset); iommu_debugfs_setup(); return 0; } core_initcall(iommu_init); int iommu_set_pgtable_quirks(struct iommu_domain *domain, unsigned long quirk) { if (domain->type != IOMMU_DOMAIN_UNMANAGED) return -EINVAL; if (!domain->ops->set_pgtable_quirks) return -EINVAL; return domain->ops->set_pgtable_quirks(domain, quirk); } EXPORT_SYMBOL_GPL(iommu_set_pgtable_quirks); /** * iommu_get_resv_regions - get reserved regions * @dev: device for which to get reserved regions * @list: reserved region list for device * * This returns a list of reserved IOVA regions specific to this device. * A domain user should not map IOVA in these ranges. */ void iommu_get_resv_regions(struct device *dev, struct list_head *list) { const struct iommu_ops *ops = dev_iommu_ops(dev); if (ops->get_resv_regions) ops->get_resv_regions(dev, list); } EXPORT_SYMBOL_GPL(iommu_get_resv_regions); /** * iommu_put_resv_regions - release reserved regions * @dev: device for which to free reserved regions * @list: reserved region list for device * * This releases a reserved region list acquired by iommu_get_resv_regions(). 
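 *
 * A minimal usage sketch (hypothetical caller; the entries are owned by the
 * IOMMU core/driver, so they must be returned through this function rather
 * than freed directly):
 *
 *	struct iommu_resv_region *region;
 *	LIST_HEAD(resv_regions);
 *
 *	iommu_get_resv_regions(dev, &resv_regions);
 *	list_for_each_entry(region, &resv_regions, list)
 *		... e.g. carve [region->start, region->start + region->length)
 *		    out of the caller's IOVA space ...
 *	iommu_put_resv_regions(dev, &resv_regions);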
*/ void iommu_put_resv_regions(struct device *dev, struct list_head *list) { struct iommu_resv_region *entry, *next; list_for_each_entry_safe(entry, next, list, list) { if (entry->free) entry->free(dev, entry); else kfree(entry); } } EXPORT_SYMBOL(iommu_put_resv_regions); struct iommu_resv_region *iommu_alloc_resv_region(phys_addr_t start, size_t length, int prot, enum iommu_resv_type type, gfp_t gfp) { struct iommu_resv_region *region; region = kzalloc(sizeof(*region), gfp); if (!region) return NULL; INIT_LIST_HEAD(&region->list); region->start = start; region->length = length; region->prot = prot; region->type = type; return region; } EXPORT_SYMBOL_GPL(iommu_alloc_resv_region); void iommu_set_default_passthrough(bool cmd_line) { if (cmd_line) iommu_cmd_line |= IOMMU_CMD_LINE_DMA_API; iommu_def_domain_type = IOMMU_DOMAIN_IDENTITY; } void iommu_set_default_translated(bool cmd_line) { if (cmd_line) iommu_cmd_line |= IOMMU_CMD_LINE_DMA_API; iommu_def_domain_type = IOMMU_DOMAIN_DMA; } bool iommu_default_passthrough(void) { return iommu_def_domain_type == IOMMU_DOMAIN_IDENTITY; } EXPORT_SYMBOL_GPL(iommu_default_passthrough); static const struct iommu_device *iommu_from_fwnode(const struct fwnode_handle *fwnode) { const struct iommu_device *iommu, *ret = NULL; spin_lock(&iommu_device_lock); list_for_each_entry(iommu, &iommu_device_list, list) if (iommu->fwnode == fwnode) { ret = iommu; break; } spin_unlock(&iommu_device_lock); return ret; } const struct iommu_ops *iommu_ops_from_fwnode(const struct fwnode_handle *fwnode) { const struct iommu_device *iommu = iommu_from_fwnode(fwnode); return iommu ? iommu->ops : NULL; } int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode) { const struct iommu_device *iommu = iommu_from_fwnode(iommu_fwnode); struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); if (!iommu) return driver_deferred_probe_check_state(dev); if (!dev->iommu && !READ_ONCE(iommu->ready)) return -EPROBE_DEFER; if (fwspec) return iommu->ops == iommu_fwspec_ops(fwspec) ? 0 : -EINVAL; if (!dev_iommu_get(dev)) return -ENOMEM; /* Preallocate for the overwhelmingly common case of 1 ID */ fwspec = kzalloc(struct_size(fwspec, ids, 1), GFP_KERNEL); if (!fwspec) return -ENOMEM; fwnode_handle_get(iommu_fwnode); fwspec->iommu_fwnode = iommu_fwnode; dev_iommu_fwspec_set(dev, fwspec); return 0; } EXPORT_SYMBOL_GPL(iommu_fwspec_init); void iommu_fwspec_free(struct device *dev) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); if (fwspec) { fwnode_handle_put(fwspec->iommu_fwnode); kfree(fwspec); dev_iommu_fwspec_set(dev, NULL); } } int iommu_fwspec_add_ids(struct device *dev, const u32 *ids, int num_ids) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); int i, new_num; if (!fwspec) return -EINVAL; new_num = fwspec->num_ids + num_ids; if (new_num > 1) { fwspec = krealloc(fwspec, struct_size(fwspec, ids, new_num), GFP_KERNEL); if (!fwspec) return -ENOMEM; dev_iommu_fwspec_set(dev, fwspec); } for (i = 0; i < num_ids; i++) fwspec->ids[fwspec->num_ids + i] = ids[i]; fwspec->num_ids = new_num; return 0; } EXPORT_SYMBOL_GPL(iommu_fwspec_add_ids); /** * iommu_setup_default_domain - Set the default_domain for the group * @group: Group to change * @target_type: Domain type to set as the default_domain * * Allocate a default domain and set it as the current domain on the group. If * the group already has a default domain it will be changed to the target_type. * When target_type is 0 the default domain is selected based on driver and * system preferences.
*/ static int iommu_setup_default_domain(struct iommu_group *group, int target_type) { struct iommu_domain *old_dom = group->default_domain; struct group_device *gdev; struct iommu_domain *dom; bool direct_failed; int req_type; int ret; lockdep_assert_held(&group->mutex); req_type = iommu_get_default_domain_type(group, target_type); if (req_type < 0) return -EINVAL; dom = iommu_group_alloc_default_domain(group, req_type); if (IS_ERR(dom)) return PTR_ERR(dom); if (group->default_domain == dom) return 0; if (iommu_is_dma_domain(dom)) { ret = iommu_get_dma_cookie(dom); if (ret) { iommu_domain_free(dom); return ret; } } /* * IOMMU_RESV_DIRECT and IOMMU_RESV_DIRECT_RELAXABLE regions must be * mapped before their device is attached, in order to guarantee * continuity with any FW activity */ direct_failed = false; for_each_group_device(group, gdev) { if (iommu_create_device_direct_mappings(dom, gdev->dev)) { direct_failed = true; dev_warn_once( gdev->dev->iommu->iommu_dev->dev, "IOMMU driver was not able to establish FW requested direct mapping."); } } /* We must set default_domain early for __iommu_device_set_domain */ group->default_domain = dom; if (!group->domain) { /* * Drivers are not allowed to fail the first domain attach. * The only way to recover from this is to fail attaching the * iommu driver and call ops->release_device. Put the domain * in group->default_domain so it is freed after. */ ret = __iommu_group_set_domain_internal( group, dom, IOMMU_SET_DOMAIN_MUST_SUCCEED); if (WARN_ON(ret)) goto out_free_old; } else { ret = __iommu_group_set_domain(group, dom); if (ret) goto err_restore_def_domain; } /* * Drivers are supposed to allow mappings to be installed in a domain * before device attachment, but some don't. Hack around this defect by * trying again after attaching. If this happens it means the device * will not continuously have the IOMMU_RESV_DIRECT map. */ if (direct_failed) { for_each_group_device(group, gdev) { ret = iommu_create_device_direct_mappings(dom, gdev->dev); if (ret) goto err_restore_domain; } } out_free_old: if (old_dom) iommu_domain_free(old_dom); return ret; err_restore_domain: if (old_dom) __iommu_group_set_domain_internal( group, old_dom, IOMMU_SET_DOMAIN_MUST_SUCCEED); err_restore_def_domain: if (old_dom) { iommu_domain_free(dom); group->default_domain = old_dom; } return ret; } /* * Changing the default domain through sysfs requires the users to unbind the * drivers from the devices in the iommu group, except for a DMA -> DMA-FQ * transition. Return failure if this isn't met. * * We need to consider the race between this and the device release path. * group->mutex is used here to guarantee that the device release path * will not be entered at the same time. */ static ssize_t iommu_group_store_type(struct iommu_group *group, const char *buf, size_t count) { struct group_device *gdev; int ret, req_type; if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RAWIO)) return -EACCES; if (WARN_ON(!group) || !group->default_domain) return -EINVAL; if (sysfs_streq(buf, "identity")) req_type = IOMMU_DOMAIN_IDENTITY; else if (sysfs_streq(buf, "DMA")) req_type = IOMMU_DOMAIN_DMA; else if (sysfs_streq(buf, "DMA-FQ")) req_type = IOMMU_DOMAIN_DMA_FQ; else if (sysfs_streq(buf, "auto")) req_type = 0; else return -EINVAL; mutex_lock(&group->mutex); /* We can bring up a flush queue without tearing down the domain. 
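 *
 * (Illustratively, this is the path userspace hits with something like
 * "echo DMA-FQ > /sys/kernel/iommu_groups/<N>/type".)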
*/ if (req_type == IOMMU_DOMAIN_DMA_FQ && group->default_domain->type == IOMMU_DOMAIN_DMA) { ret = iommu_dma_init_fq(group->default_domain); if (ret) goto out_unlock; group->default_domain->type = IOMMU_DOMAIN_DMA_FQ; ret = count; goto out_unlock; } /* Otherwise, ensure that device exists and no driver is bound. */ if (list_empty(&group->devices) || group->owner_cnt) { ret = -EPERM; goto out_unlock; } ret = iommu_setup_default_domain(group, req_type); if (ret) goto out_unlock; /* Make sure dma_ops is appropriately set */ for_each_group_device(group, gdev) iommu_setup_dma_ops(gdev->dev); out_unlock: mutex_unlock(&group->mutex); return ret ?: count; } /** * iommu_device_use_default_domain() - Device driver wants to handle device * DMA through the kernel DMA API. * @dev: The device. * * The device driver about to bind @dev wants to do DMA through the kernel * DMA API. Return 0 if it is allowed, otherwise an error. */ int iommu_device_use_default_domain(struct device *dev) { /* Caller is the driver core during the pre-probe path */ struct iommu_group *group = dev->iommu_group; int ret = 0; if (!group) return 0; mutex_lock(&group->mutex); /* We may race against bus_iommu_probe() finalising groups here */ if (!group->default_domain) { ret = -EPROBE_DEFER; goto unlock_out; } if (group->owner_cnt) { if (group->domain != group->default_domain || group->owner || !xa_empty(&group->pasid_array)) { ret = -EBUSY; goto unlock_out; } } group->owner_cnt++; unlock_out: mutex_unlock(&group->mutex); return ret; } /** * iommu_device_unuse_default_domain() - Device driver stops handling device * DMA through the kernel DMA API. * @dev: The device. * * The device driver doesn't want to do DMA through kernel DMA API anymore. * It must be called after iommu_device_use_default_domain(). */ void iommu_device_unuse_default_domain(struct device *dev) { /* Caller is the driver core during the post-probe path */ struct iommu_group *group = dev->iommu_group; if (!group) return; mutex_lock(&group->mutex); if (!WARN_ON(!group->owner_cnt || !xa_empty(&group->pasid_array))) group->owner_cnt--; mutex_unlock(&group->mutex); } static int __iommu_group_alloc_blocking_domain(struct iommu_group *group) { struct device *dev = iommu_group_first_dev(group); const struct iommu_ops *ops = dev_iommu_ops(dev); struct iommu_domain *domain; if (group->blocking_domain) return 0; if (ops->blocked_domain) { group->blocking_domain = ops->blocked_domain; return 0; } /* * For drivers that do not yet understand IOMMU_DOMAIN_BLOCKED create an * empty PAGING domain instead. */ domain = iommu_paging_domain_alloc(dev); if (IS_ERR(domain)) return PTR_ERR(domain); group->blocking_domain = domain; return 0; } static int __iommu_take_dma_ownership(struct iommu_group *group, void *owner) { int ret; if ((group->domain && group->domain != group->default_domain) || !xa_empty(&group->pasid_array)) return -EBUSY; ret = __iommu_group_alloc_blocking_domain(group); if (ret) return ret; ret = __iommu_group_set_domain(group, group->blocking_domain); if (ret) return ret; group->owner = owner; group->owner_cnt++; return 0; } /** * iommu_group_claim_dma_owner() - Set DMA ownership of a group * @group: The group. * @owner: Caller specified pointer. Used for exclusive ownership. * * This is to support backward compatibility for vfio which manages the dma * ownership at the iommu_group level. New invocations on this interface should * be prohibited. Only a single owner may exist for a group.
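 *
 * A minimal sketch of the expected pairing (hypothetical caller; the owner
 * cookie only needs to be a stable, unique pointer):
 *
 *	ret = iommu_group_claim_dma_owner(group, my_owner_cookie);
 *	if (ret)
 *		return ret;
 *	... group sits on the blocking domain; set up user-controlled DMA ...
 *	iommu_group_release_dma_owner(group);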
*/ int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner) { int ret = 0; if (WARN_ON(!owner)) return -EINVAL; mutex_lock(&group->mutex); if (group->owner_cnt) { ret = -EPERM; goto unlock_out; } ret = __iommu_take_dma_ownership(group, owner); unlock_out: mutex_unlock(&group->mutex); return ret; } EXPORT_SYMBOL_GPL(iommu_group_claim_dma_owner); /** * iommu_device_claim_dma_owner() - Set DMA ownership of a device * @dev: The device. * @owner: Caller specified pointer. Used for exclusive ownership. * * Claim the DMA ownership of a device. Multiple devices in the same group may * concurrently claim ownership if they present the same owner value. Returns 0 * on success and error code on failure */ int iommu_device_claim_dma_owner(struct device *dev, void *owner) { /* Caller must be a probed driver on dev */ struct iommu_group *group = dev->iommu_group; int ret = 0; if (WARN_ON(!owner)) return -EINVAL; if (!group) return -ENODEV; mutex_lock(&group->mutex); if (group->owner_cnt) { if (group->owner != owner) { ret = -EPERM; goto unlock_out; } group->owner_cnt++; goto unlock_out; } ret = __iommu_take_dma_ownership(group, owner); unlock_out: mutex_unlock(&group->mutex); return ret; } EXPORT_SYMBOL_GPL(iommu_device_claim_dma_owner); static void __iommu_release_dma_ownership(struct iommu_group *group) { if (WARN_ON(!group->owner_cnt || !group->owner || !xa_empty(&group->pasid_array))) return; group->owner_cnt = 0; group->owner = NULL; __iommu_group_set_domain_nofail(group, group->default_domain); } /** * iommu_group_release_dma_owner() - Release DMA ownership of a group * @group: The group * * Release the DMA ownership claimed by iommu_group_claim_dma_owner(). */ void iommu_group_release_dma_owner(struct iommu_group *group) { mutex_lock(&group->mutex); __iommu_release_dma_ownership(group); mutex_unlock(&group->mutex); } EXPORT_SYMBOL_GPL(iommu_group_release_dma_owner); /** * iommu_device_release_dma_owner() - Release DMA ownership of a device * @dev: The device. * * Release the DMA ownership claimed by iommu_device_claim_dma_owner(). */ void iommu_device_release_dma_owner(struct device *dev) { /* Caller must be a probed driver on dev */ struct iommu_group *group = dev->iommu_group; mutex_lock(&group->mutex); if (group->owner_cnt > 1) group->owner_cnt--; else __iommu_release_dma_ownership(group); mutex_unlock(&group->mutex); } EXPORT_SYMBOL_GPL(iommu_device_release_dma_owner); /** * iommu_group_dma_owner_claimed() - Query group dma ownership status * @group: The group. * * This provides status query on a given group. It is racy and only for * non-binding status reporting. 
*/ bool iommu_group_dma_owner_claimed(struct iommu_group *group) { unsigned int user; mutex_lock(&group->mutex); user = group->owner_cnt; mutex_unlock(&group->mutex); return user; } EXPORT_SYMBOL_GPL(iommu_group_dma_owner_claimed); static void iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, struct iommu_domain *domain) { const struct iommu_ops *ops = dev_iommu_ops(dev); struct iommu_domain *blocked_domain = ops->blocked_domain; WARN_ON(blocked_domain->ops->set_dev_pasid(blocked_domain, dev, pasid, domain)); } static int __iommu_set_group_pasid(struct iommu_domain *domain, struct iommu_group *group, ioasid_t pasid, struct iommu_domain *old) { struct group_device *device, *last_gdev; int ret; for_each_group_device(group, device) { if (device->dev->iommu->max_pasids > 0) { ret = domain->ops->set_dev_pasid(domain, device->dev, pasid, old); if (ret) goto err_revert; } } return 0; err_revert: last_gdev = device; for_each_group_device(group, device) { if (device == last_gdev) break; if (device->dev->iommu->max_pasids > 0) { /* * If no old domain, undo the succeeded devices/pasid. * Otherwise, rollback the succeeded devices/pasid to * the old domain. And it is a driver bug to fail * attaching with a previously good domain. */ if (!old || WARN_ON(old->ops->set_dev_pasid(old, device->dev, pasid, domain))) iommu_remove_dev_pasid(device->dev, pasid, domain); } } return ret; } static void __iommu_remove_group_pasid(struct iommu_group *group, ioasid_t pasid, struct iommu_domain *domain) { struct group_device *device; for_each_group_device(group, device) { if (device->dev->iommu->max_pasids > 0) iommu_remove_dev_pasid(device->dev, pasid, domain); } } /* * iommu_attach_device_pasid() - Attach a domain to pasid of device * @domain: the iommu domain. * @dev: the attached device. * @pasid: the pasid of the device. * @handle: the attach handle. * * Caller should always provide a new handle to avoid race with the paths * that have lockless reference to handle if it intends to pass a valid handle. * * Return: 0 on success, or an error. */ int iommu_attach_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid, struct iommu_attach_handle *handle) { /* Caller must be a probed driver on dev */ struct iommu_group *group = dev->iommu_group; struct group_device *device; const struct iommu_ops *ops; void *entry; int ret; if (!group) return -ENODEV; ops = dev_iommu_ops(dev); if (!domain->ops->set_dev_pasid || !ops->blocked_domain || !ops->blocked_domain->ops->set_dev_pasid) return -EOPNOTSUPP; if (!domain_iommu_ops_compatible(ops, domain) || pasid == IOMMU_NO_PASID) return -EINVAL; mutex_lock(&group->mutex); for_each_group_device(group, device) { /* * Skip PASID validation for devices without PASID support * (max_pasids = 0). These devices cannot issue transactions * with PASID, so they don't affect group's PASID usage. */ if ((device->dev->iommu->max_pasids > 0) && (pasid >= device->dev->iommu->max_pasids)) { ret = -EINVAL; goto out_unlock; } } entry = iommu_make_pasid_array_entry(domain, handle); /* * Entry present is a failure case. Use xa_insert() instead of * xa_reserve(). */ ret = xa_insert(&group->pasid_array, pasid, XA_ZERO_ENTRY, GFP_KERNEL); if (ret) goto out_unlock; ret = __iommu_set_group_pasid(domain, group, pasid, NULL); if (ret) { xa_release(&group->pasid_array, pasid); goto out_unlock; } /* * The xa_insert() above reserved the memory, and the group->mutex is * held, this cannot fail. 
The new domain cannot be visible until the * operation succeeds as we cannot tolerate PRIs becoming concurrently * queued and then failing attach. */ WARN_ON(xa_is_err(xa_store(&group->pasid_array, pasid, entry, GFP_KERNEL))); out_unlock: mutex_unlock(&group->mutex); return ret; } EXPORT_SYMBOL_GPL(iommu_attach_device_pasid); /** * iommu_replace_device_pasid - Replace the domain that a specific pasid * of the device is attached to * @domain: the new iommu domain * @dev: the attached device. * @pasid: the pasid of the device. * @handle: the attach handle. * * This API allows the pasid to switch domains. The @pasid should have been * attached. Otherwise, this fails. The pasid will keep the old configuration * if replacement fails. * * Caller should always provide a new handle to avoid race with the paths * that have lockless reference to handle if it intends to pass a valid handle. * * Return 0 on success, or an error. */ int iommu_replace_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid, struct iommu_attach_handle *handle) { /* Caller must be a probed driver on dev */ struct iommu_group *group = dev->iommu_group; struct iommu_attach_handle *entry; struct iommu_domain *curr_domain; void *curr; int ret; if (!group) return -ENODEV; if (!domain->ops->set_dev_pasid) return -EOPNOTSUPP; if (!domain_iommu_ops_compatible(dev_iommu_ops(dev), domain) || pasid == IOMMU_NO_PASID || !handle) return -EINVAL; mutex_lock(&group->mutex); entry = iommu_make_pasid_array_entry(domain, handle); curr = xa_cmpxchg(&group->pasid_array, pasid, NULL, XA_ZERO_ENTRY, GFP_KERNEL); if (xa_is_err(curr)) { ret = xa_err(curr); goto out_unlock; } /* * No domain (with or without handle) attached, hence not * a replace case. */ if (!curr) { xa_release(&group->pasid_array, pasid); ret = -EINVAL; goto out_unlock; } /* * Reusing the handle is problematic as there are paths that reference * the handle without locking. To avoid races, reject callers that * attempt it. */ if (curr == entry) { WARN_ON(1); ret = -EINVAL; goto out_unlock; } curr_domain = pasid_array_entry_to_domain(curr); ret = 0; if (curr_domain != domain) { ret = __iommu_set_group_pasid(domain, group, pasid, curr_domain); if (ret) goto out_unlock; } /* * The above xa_cmpxchg() reserved the memory, and the * group->mutex is held, this cannot fail. */ WARN_ON(xa_is_err(xa_store(&group->pasid_array, pasid, entry, GFP_KERNEL))); out_unlock: mutex_unlock(&group->mutex); return ret; } EXPORT_SYMBOL_NS_GPL(iommu_replace_device_pasid, "IOMMUFD_INTERNAL"); /* * iommu_detach_device_pasid() - Detach the domain from pasid of device * @domain: the iommu domain. * @dev: the attached device. * @pasid: the pasid of the device. * * The @domain must have been attached to @pasid of the @dev with * iommu_attach_device_pasid(). */ void iommu_detach_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid) { /* Caller must be a probed driver on dev */ struct iommu_group *group = dev->iommu_group; mutex_lock(&group->mutex); __iommu_remove_group_pasid(group, pasid, domain); xa_erase(&group->pasid_array, pasid); mutex_unlock(&group->mutex); } EXPORT_SYMBOL_GPL(iommu_detach_device_pasid); ioasid_t iommu_alloc_global_pasid(struct device *dev) { int ret; /* max_pasids == 0 means that the device does not support PASID */ if (!dev->iommu->max_pasids) return IOMMU_PASID_INVALID; /* * max_pasids is set up by the vendor driver based on the number of PASID * bits supported, but the IDA allocation is inclusive.
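 * E.g. a device advertising 20 PASID bits would typically get
 * max_pasids = 1 << 20, so the range handed to the allocator below is
 * [IOMMU_FIRST_GLOBAL_PASID, max_pasids - 1], both ends inclusive.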
*/ ret = ida_alloc_range(&iommu_global_pasid_ida, IOMMU_FIRST_GLOBAL_PASID, dev->iommu->max_pasids - 1, GFP_KERNEL); return ret < 0 ? IOMMU_PASID_INVALID : ret; } EXPORT_SYMBOL_GPL(iommu_alloc_global_pasid); void iommu_free_global_pasid(ioasid_t pasid) { if (WARN_ON(pasid == IOMMU_PASID_INVALID)) return; ida_free(&iommu_global_pasid_ida, pasid); } EXPORT_SYMBOL_GPL(iommu_free_global_pasid); /** * iommu_attach_handle_get - Return the attach handle * @group: the iommu group that domain was attached to * @pasid: the pasid within the group * @type: matched domain type, 0 for any match * * Return handle or ERR_PTR(-ENOENT) on none, ERR_PTR(-EBUSY) on mismatch. * * Return the attach handle to the caller. The life cycle of an iommu attach * handle is from the time when the domain is attached to the time when the * domain is detached. Callers are required to synchronize the call of * iommu_attach_handle_get() with domain attachment and detachment. The attach * handle can only be used during its life cycle. */ struct iommu_attach_handle * iommu_attach_handle_get(struct iommu_group *group, ioasid_t pasid, unsigned int type) { struct iommu_attach_handle *handle; void *entry; xa_lock(&group->pasid_array); entry = xa_load(&group->pasid_array, pasid); if (!entry || xa_pointer_tag(entry) != IOMMU_PASID_ARRAY_HANDLE) { handle = ERR_PTR(-ENOENT); } else { handle = xa_untag_pointer(entry); if (type && handle->domain->type != type) handle = ERR_PTR(-EBUSY); } xa_unlock(&group->pasid_array); return handle; } EXPORT_SYMBOL_NS_GPL(iommu_attach_handle_get, "IOMMUFD_INTERNAL"); /** * iommu_attach_group_handle - Attach an IOMMU domain to an IOMMU group * @domain: IOMMU domain to attach * @group: IOMMU group that will be attached * @handle: attach handle * * Returns 0 on success and error code on failure. * * This is a variant of iommu_attach_group(). It allows the caller to provide * an attach handle and use it when the domain is attached. This is currently * used by IOMMUFD to deliver I/O page faults. * * Caller should always provide a new handle to avoid race with the paths * that have lockless reference to handle. */ int iommu_attach_group_handle(struct iommu_domain *domain, struct iommu_group *group, struct iommu_attach_handle *handle) { void *entry; int ret; if (!handle) return -EINVAL; mutex_lock(&group->mutex); entry = iommu_make_pasid_array_entry(domain, handle); ret = xa_insert(&group->pasid_array, IOMMU_NO_PASID, XA_ZERO_ENTRY, GFP_KERNEL); if (ret) goto out_unlock; ret = __iommu_attach_group(domain, group); if (ret) { xa_release(&group->pasid_array, IOMMU_NO_PASID); goto out_unlock; } /* * The xa_insert() above reserved the memory, and the group->mutex is * held, this cannot fail. The new domain cannot be visible until the * operation succeeds as we cannot tolerate PRIs becoming concurrently * queued and then failing attach. */ WARN_ON(xa_is_err(xa_store(&group->pasid_array, IOMMU_NO_PASID, entry, GFP_KERNEL))); out_unlock: mutex_unlock(&group->mutex); return ret; } EXPORT_SYMBOL_NS_GPL(iommu_attach_group_handle, "IOMMUFD_INTERNAL"); /** * iommu_detach_group_handle - Detach an IOMMU domain from an IOMMU group * @domain: IOMMU domain to detach * @group: IOMMU group that will be detached * * Detach the specified IOMMU domain from the specified IOMMU group. * It must be used in conjunction with iommu_attach_group_handle().
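 *
 * A minimal pairing sketch (hypothetical caller-owned ctx holding the
 * handle):
 *
 *	ret = iommu_attach_group_handle(domain, group, &ctx->handle);
 *	if (ret)
 *		return ret;
 *	... handle-based fault delivery may run here ...
 *	iommu_detach_group_handle(domain, group);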
*/ void iommu_detach_group_handle(struct iommu_domain *domain, struct iommu_group *group) { mutex_lock(&group->mutex); __iommu_group_set_core_domain(group); xa_erase(&group->pasid_array, IOMMU_NO_PASID); mutex_unlock(&group->mutex); } EXPORT_SYMBOL_NS_GPL(iommu_detach_group_handle, "IOMMUFD_INTERNAL"); /** * iommu_replace_group_handle - replace the domain that a group is attached to * @group: IOMMU group that will be attached to the new domain * @new_domain: new IOMMU domain to replace with * @handle: attach handle * * This API allows the group to switch domains without being forced to go to * the blocking domain in-between. It allows the caller to provide an attach * handle for the new domain and use it when the domain is attached. * * If the currently attached domain is a core domain (e.g. a default_domain), * it will act just like iommu_attach_group_handle(). * * Caller should always provide a new handle to avoid race with the paths * that have lockless reference to handle. */ int iommu_replace_group_handle(struct iommu_group *group, struct iommu_domain *new_domain, struct iommu_attach_handle *handle) { void *curr, *entry; int ret; if (!new_domain || !handle) return -EINVAL; mutex_lock(&group->mutex); entry = iommu_make_pasid_array_entry(new_domain, handle); ret = xa_reserve(&group->pasid_array, IOMMU_NO_PASID, GFP_KERNEL); if (ret) goto err_unlock; ret = __iommu_group_set_domain(group, new_domain); if (ret) goto err_release; curr = xa_store(&group->pasid_array, IOMMU_NO_PASID, entry, GFP_KERNEL); WARN_ON(xa_is_err(curr)); mutex_unlock(&group->mutex); return 0; err_release: xa_release(&group->pasid_array, IOMMU_NO_PASID); err_unlock: mutex_unlock(&group->mutex); return ret; } EXPORT_SYMBOL_NS_GPL(iommu_replace_group_handle, "IOMMUFD_INTERNAL"); #if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) /** * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain * @desc: MSI descriptor, will store the MSI page * @msi_addr: MSI target address to be mapped * * The implementation of sw_msi() should take msi_addr and map it to * an IOVA in the domain and call msi_desc_set_iommu_msi_iova() with the * mapping information. * * Return: 0 on success or negative error code if the mapping failed. */ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) { struct device *dev = msi_desc_to_dev(desc); struct iommu_group *group = dev->iommu_group; int ret = 0; if (!group) return 0; mutex_lock(&group->mutex); /* An IDENTITY domain must pass through */ if (group->domain && group->domain->type != IOMMU_DOMAIN_IDENTITY) { switch (group->domain->cookie_type) { case IOMMU_COOKIE_DMA_MSI: case IOMMU_COOKIE_DMA_IOVA: ret = iommu_dma_sw_msi(group->domain, desc, msi_addr); break; case IOMMU_COOKIE_IOMMUFD: ret = iommufd_sw_msi(group->domain, desc, msi_addr); break; default: ret = -EOPNOTSUPP; break; } } mutex_unlock(&group->mutex); return ret; } #endif /* CONFIG_IRQ_MSI_IOMMU */
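/*
 * Illustrative sketch (not part of the file above): one way a driver-style
 * consumer might combine the attach and map/unmap APIs from this file. All
 * my_*-prefixed names, the IOVA and the buffer are hypothetical; iova,
 * buf_phys and buf_size are assumed aligned to the domain's minimum page
 * size, and error handling is reduced to the minimum.
 */
static int my_iommu_map_buffer(struct device *dev, phys_addr_t buf_phys,
			       size_t buf_size, unsigned long iova)
{
	struct iommu_domain *domain;
	int ret;

	/* Allocate a caller-managed paging domain for this device. */
	domain = iommu_paging_domain_alloc(dev);
	if (IS_ERR(domain))
		return PTR_ERR(domain);

	/*
	 * -EINVAL is a soft failure: this domain and device are merely
	 * incompatible, and a different domain may still work.
	 */
	ret = iommu_attach_device(domain, dev);
	if (ret)
		goto out_free;

	/* iommu_map() also performs the IOTLB sync on success. */
	ret = iommu_map(domain, iova, buf_phys, buf_size,
			IOMMU_READ | IOMMU_WRITE, GFP_KERNEL);
	if (ret)
		goto out_detach;

	/* ... device DMA against [iova, iova + buf_size) would run here ... */

	iommu_unmap(domain, iova, buf_size);
out_detach:
	iommu_detach_device(domain, dev);
out_free:
	iommu_domain_free(domain);
	return ret;
}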
// SPDX-License-Identifier: GPL-2.0-or-later /* * Linux INET6 implementation * Forwarding Information Database * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Changes: * Yuji SEKIYA @USAGI: Support default route on router node; * remove ip6_null_entry from the top of * routing table. * Ville Nuorvala: Fixed routing subtrees. */ #define pr_fmt(fmt) "IPv6: " fmt #include <linux/bpf.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/net.h> #include <linux/route.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/lwtunnel.h> #include <net/fib_notifier.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> static struct kmem_cache *fib6_node_kmem __read_mostly; struct fib6_cleaner { struct fib6_walker w; struct net *net; int (*func)(struct fib6_info *, void *arg); int sernum; void *arg; bool skip_notify; }; #ifdef CONFIG_IPV6_SUBTREES #define FWS_INIT FWS_S #else #define FWS_INIT FWS_L #endif static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn); static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_table *table, struct fib6_node *fn); static int fib6_walk(struct net *net, struct fib6_walker *w); static int fib6_walk_continue(struct fib6_walker *w); /* * A routing update causes an increase of the serial number on the * affected subtree. This allows for cached routes to be asynchronously * tested when modifications are made to the destination cache as a * result of redirects, path MTU changes, etc. */ static void fib6_gc_timer_cb(struct timer_list *t); #define FOR_WALKERS(net, w) \ list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh) static void fib6_walker_link(struct net *net, struct fib6_walker *w) { write_lock_bh(&net->ipv6.fib6_walker_lock); list_add(&w->lh, &net->ipv6.fib6_walkers); write_unlock_bh(&net->ipv6.fib6_walker_lock); } static void fib6_walker_unlink(struct net *net, struct fib6_walker *w) { write_lock_bh(&net->ipv6.fib6_walker_lock); list_del(&w->lh); write_unlock_bh(&net->ipv6.fib6_walker_lock); } static int fib6_new_sernum(struct net *net) { int new, old = atomic_read(&net->ipv6.fib6_sernum); do { new = old < INT_MAX ? old + 1 : 1; } while (!atomic_try_cmpxchg(&net->ipv6.fib6_sernum, &old, new)); return new; } enum { FIB6_NO_SERNUM_CHANGE = 0, }; void fib6_update_sernum(struct net *net, struct fib6_info *f6i) { struct fib6_node *fn; fn = rcu_dereference_protected(f6i->fib6_node, lockdep_is_held(&f6i->fib6_table->tb6_lock)); if (fn) WRITE_ONCE(fn->fn_sernum, fib6_new_sernum(net)); } /* * Auxiliary address test functions for the radix tree. * * These assume a 32bit processor (although they will work on * 64bit processors) */ /* * test bit */ #if defined(__LITTLE_ENDIAN) # define BITOP_BE32_SWIZZLE (0x1F & ~7) #else # define BITOP_BE32_SWIZZLE 0 #endif static __be32 addr_bit_set(const void *token, int fn_bit) { const __be32 *addr = token; /* * Here, * 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f) * is an optimized version of * htonl(1 << ((~fn_bit)&0x1F)) * See include/asm-generic/bitops/le.h.
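 *
 * E.g. on little-endian, fn_bit = 0 yields (~0 ^ 0x18) & 0x1f = 7, i.e. a
 * mask of 1 << 7: the most significant bit of the first byte of addr[0],
 * which is bit 0 of the address in network byte order.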
*/ return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) & addr[fn_bit >> 5]; } struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh) { struct fib6_info *f6i; size_t sz = sizeof(*f6i); if (with_fib6_nh) sz += sizeof(struct fib6_nh); f6i = kzalloc(sz, gfp_flags); if (!f6i) return NULL; /* fib6_siblings is a union with nh_list, so this initializes both */ INIT_LIST_HEAD(&f6i->fib6_siblings); refcount_set(&f6i->fib6_ref, 1); INIT_HLIST_NODE(&f6i->gc_link); return f6i; } void fib6_info_destroy_rcu(struct rcu_head *head) { struct fib6_info *f6i = container_of(head, struct fib6_info, rcu); WARN_ON(f6i->fib6_node); if (f6i->nh) nexthop_put(f6i->nh); else fib6_nh_release(f6i->fib6_nh); ip_fib_metrics_put(f6i->fib6_metrics); kfree(f6i); } EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu); static struct fib6_node *node_alloc(struct net *net) { struct fib6_node *fn; fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC); if (fn) net->ipv6.rt6_stats->fib_nodes++; return fn; } static void node_free_immediate(struct net *net, struct fib6_node *fn) { kmem_cache_free(fib6_node_kmem, fn); net->ipv6.rt6_stats->fib_nodes--; } static void node_free(struct net *net, struct fib6_node *fn) { kfree_rcu(fn, rcu); net->ipv6.rt6_stats->fib_nodes--; } static void fib6_free_table(struct fib6_table *table) { inetpeer_invalidate_tree(&table->tb6_peers); kfree(table); } static void fib6_link_table(struct net *net, struct fib6_table *tb) { unsigned int h; /* * Initialize the table lock at a single place to give lockdep a key; * tables aren't visible prior to being linked to the list. */ spin_lock_init(&tb->tb6_lock); h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1); /* * No protection necessary, this is the only list mutation * operation; tables never disappear once they exist. */ hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]); } #ifdef CONFIG_IPV6_MULTIPLE_TABLES static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) { struct fib6_table *table; table = kzalloc(sizeof(*table), GFP_ATOMIC); if (table) { table->tb6_id = id; rcu_assign_pointer(table->tb6_root.leaf, net->ipv6.fib6_null_entry); table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); INIT_HLIST_HEAD(&table->tb6_gc_hlist); } return table; } struct fib6_table *fib6_new_table(struct net *net, u32 id) { struct fib6_table *tb, *new_tb; if (id == 0) id = RT6_TABLE_MAIN; tb = fib6_get_table(net, id); if (tb) return tb; new_tb = fib6_alloc_table(net, id); if (!new_tb) return NULL; spin_lock_bh(&net->ipv6.fib_table_hash_lock); tb = fib6_get_table(net, id); if (unlikely(tb)) { spin_unlock_bh(&net->ipv6.fib_table_hash_lock); kfree(new_tb); return tb; } fib6_link_table(net, new_tb); spin_unlock_bh(&net->ipv6.fib_table_hash_lock); return new_tb; } EXPORT_SYMBOL_GPL(fib6_new_table); struct fib6_table *fib6_get_table(struct net *net, u32 id) { struct hlist_head *head; struct fib6_table *tb; if (!id) id = RT6_TABLE_MAIN; head = &net->ipv6.fib_table_hash[id & (FIB6_TABLE_HASHSZ - 1)]; /* See comment in fib6_link_table(). RCU is not required, * but rcu_dereference_raw() is used to avoid a data race.
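* (the 'true' passed to hlist_for_each_entry_rcu() below is the lockdep * condition: the caller guarantees the list cannot go away, so the * unprotected-access warning is suppressed)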
*/ hlist_for_each_entry_rcu(tb, head, tb6_hlist, true) if (tb->tb6_id == id) return tb; return NULL; } EXPORT_SYMBOL_GPL(fib6_get_table); static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); fib6_link_table(net, net->ipv6.fib6_local_tbl); } #else struct fib6_table *fib6_new_table(struct net *net, u32 id) { return fib6_get_table(net, id); } struct fib6_table *fib6_get_table(struct net *net, u32 id) { return net->ipv6.fib6_main_tbl; } struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup) { struct rt6_info *rt; rt = pol_lookup_func(lookup, net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error == -EAGAIN) { ip6_rt_put_flags(rt, flags); rt = net->ipv6.ip6_null_entry; if (!(flags & RT6_LOOKUP_F_DST_NOREF)) dst_hold(&rt->dst); } return &rt->dst; } /* called with rcu lock held; no reference taken on fib6_info */ int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, struct fib6_result *res, int flags) { return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, res, flags); } static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); } #endif unsigned int fib6_tables_seq_read(const struct net *net) { unsigned int h, fib_seq = 0; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { const struct hlist_head *head = &net->ipv6.fib_table_hash[h]; const struct fib6_table *tb; hlist_for_each_entry_rcu(tb, head, tb6_hlist) fib_seq += READ_ONCE(tb->fib_seq); } rcu_read_unlock(); return fib_seq; } static int call_fib6_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib6_info *rt, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, }; return call_fib6_notifier(nb, event_type, &info.info); } static int call_fib6_multipath_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, .nsiblings = nsiblings, }; return call_fib6_notifier(nb, event_type, &info.info); } int call_fib6_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, }; WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); return call_fib6_notifiers(net, event_type, &info.info); } int call_fib6_multipath_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, .nsiblings = nsiblings, }; WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); return call_fib6_notifiers(net, event_type, &info.info); } int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt) { struct fib6_entry_notifier_info info = { .rt = rt, .nsiblings = rt->fib6_nsiblings, }; WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info); } struct fib6_dump_arg { struct net *net; struct notifier_block *nb; struct netlink_ext_ack *extack; }; static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) { enum fib_event_type fib_event = FIB_EVENT_ENTRY_REPLACE; unsigned int nsiblings; int err; if (!rt || 
rt == arg->net->ipv6.fib6_null_entry) return 0; nsiblings = READ_ONCE(rt->fib6_nsiblings); if (nsiblings) err = call_fib6_multipath_entry_notifier(arg->nb, fib_event, rt, nsiblings, arg->extack); else err = call_fib6_entry_notifier(arg->nb, fib_event, rt, arg->extack); return err; } static int fib6_node_dump(struct fib6_walker *w) { int err; err = fib6_rt_dump(w->leaf, w->args); w->leaf = NULL; return err; } static int fib6_table_dump(struct net *net, struct fib6_table *tb, struct fib6_walker *w) { int err; w->root = &tb->tb6_root; spin_lock_bh(&tb->tb6_lock); err = fib6_walk(net, w); spin_unlock_bh(&tb->tb6_lock); return err; } /* Called with rcu_read_lock() */ int fib6_tables_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct fib6_dump_arg arg; struct fib6_walker *w; unsigned int h; int err = 0; w = kzalloc(sizeof(*w), GFP_ATOMIC); if (!w) return -ENOMEM; w->func = fib6_node_dump; arg.net = net; arg.nb = nb; arg.extack = extack; w->args = &arg; for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv6.fib_table_hash[h]; struct fib6_table *tb; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { err = fib6_table_dump(net, tb, w); if (err) goto out; } } out: kfree(w); /* The tree traversal function should never return a positive value. */ return err > 0 ? -EINVAL : err; } static int fib6_dump_node(struct fib6_walker *w) { int res; struct fib6_info *rt; for_each_fib6_walker_rt(w) { res = rt6_dump_route(rt, w->args, w->skip_in_node); if (res >= 0) { /* Frame is full, suspend walking */ w->leaf = rt; /* We'll restart from this node, so if some routes were * already dumped, skip them next time. */ w->skip_in_node += res; return 1; } w->skip_in_node = 0; /* Multipath routes are dumped in one route with the * RTA_MULTIPATH attribute. Jump 'rt' to point to the * last sibling of this route (no need to dump the * sibling routes again) */ if (rt->fib6_nsiblings) rt = list_last_entry(&rt->fib6_siblings, struct fib6_info, fib6_siblings); } w->leaf = NULL; return 0; } static void fib6_dump_end(struct netlink_callback *cb) { struct net *net = sock_net(cb->skb->sk); struct fib6_walker *w = (void *)cb->args[2]; if (w) { if (cb->args[4]) { cb->args[4] = 0; fib6_walker_unlink(net, w); } cb->args[2] = 0; kfree(w); } cb->done = (void *)cb->args[3]; cb->args[1] = 3; } static int fib6_dump_done(struct netlink_callback *cb) { fib6_dump_end(cb); return cb->done ? 
cb->done(cb) : 0; } static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct fib6_walker *w; int res; w = (void *)cb->args[2]; w->root = &table->tb6_root; if (cb->args[4] == 0) { w->count = 0; w->skip = 0; w->skip_in_node = 0; spin_lock_bh(&table->tb6_lock); res = fib6_walk(net, w); spin_unlock_bh(&table->tb6_lock); if (res > 0) { cb->args[4] = 1; cb->args[5] = READ_ONCE(w->root->fn_sernum); } } else { int sernum = READ_ONCE(w->root->fn_sernum); if (cb->args[5] != sernum) { /* Begin at the root if the tree changed */ cb->args[5] = sernum; w->state = FWS_INIT; w->node = w->root; w->skip = w->count; w->skip_in_node = 0; } else w->skip = 0; spin_lock_bh(&table->tb6_lock); res = fib6_walk_continue(w); spin_unlock_bh(&table->tb6_lock); if (res <= 0) { fib6_walker_unlink(net, w); cb->args[4] = 0; } } return res; } static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true, .filter.dump_routes = true, .filter.rtnl_held = false, }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int e = 0, s_e; struct hlist_head *head; struct fib6_walker *w; struct fib6_table *tb; unsigned int h, s_h; int err = 0; rcu_read_lock(); if (cb->strict_check) { err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb); if (err < 0) goto unlock; } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); if (rtm->rtm_flags & RTM_F_PREFIX) arg.filter.flags = RTM_F_PREFIX; } w = (void *)cb->args[2]; if (!w) { /* New dump: * * 1. allocate and initialize walker. */ w = kzalloc(sizeof(*w), GFP_ATOMIC); if (!w) { err = -ENOMEM; goto unlock; } w->func = fib6_dump_node; cb->args[2] = (long)w; /* 2. hook callback destructor. */ cb->args[3] = (long)cb->done; cb->done = fib6_dump_done; } arg.skb = skb; arg.cb = cb; arg.net = net; w->args = &arg; if (arg.filter.table_id) { tb = fib6_get_table(net, arg.filter.table_id); if (!tb) { if (rtnl_msg_family(cb->nlh) != PF_INET6) goto unlock; NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist"); err = -ENOENT; goto unlock; } if (!cb->args[0]) { err = fib6_dump_table(tb, skb, cb); if (!err) cb->args[0] = 1; } goto unlock; } s_h = cb->args[0]; s_e = cb->args[1]; for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (e < s_e) goto next; err = fib6_dump_table(tb, skb, cb); if (err != 0) goto out; next: e++; } } out: cb->args[1] = e; cb->args[0] = h; unlock: rcu_read_unlock(); if (err <= 0) fib6_dump_end(cb); return err; } void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val) { if (!f6i) return; if (f6i->fib6_metrics == &dst_default_metrics) { struct dst_metrics *p = kzalloc(sizeof(*p), GFP_ATOMIC); if (!p) return; refcount_set(&p->refcnt, 1); f6i->fib6_metrics = p; } f6i->fib6_metrics->metrics[metric - 1] = val; } /* * Routing Table * * return the appropriate node for a routing tree "add" operation * by either creating and inserting or by returning an existing * node. 
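* * Three outcomes are possible: the walk ends on a node that carries exactly * this prefix (the node is returned as-is), the walk falls off the bottom * of the tree (a new leaf is linked under the last node visited), or the * prefix diverges from an existing node (an intermediate node is spliced * in above it; see insert_above below).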
*/ static struct fib6_node *fib6_add_1(struct net *net, struct fib6_table *table, struct fib6_node *root, struct in6_addr *addr, int plen, int offset, int allow_create, int replace_required, struct netlink_ext_ack *extack) { struct fib6_node *fn, *in, *ln; struct fib6_node *pn = NULL; struct rt6key *key; int bit; __be32 dir = 0; /* insert node in tree */ fn = root; do { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) { if (!allow_create) { if (replace_required) { NL_SET_ERR_MSG(extack, "Can not replace route - no match found"); pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } pr_warn("NLM_F_CREATE should be set when creating new route\n"); } goto insert_above; } /* * Exact match? */ if (plen == fn->fn_bit) { /* clean up an intermediate node */ if (!(fn->fn_flags & RTN_RTINFO)) { RCU_INIT_POINTER(fn->leaf, NULL); fib6_info_release(leaf); /* remove null_entry in the root node */ } else if (fn->fn_flags & RTN_TL_ROOT && rcu_access_pointer(fn->leaf) == net->ipv6.fib6_null_entry) { RCU_INIT_POINTER(fn->leaf, NULL); } return fn; } /* * We have more bits to go */ /* Try to walk down the tree. */ dir = addr_bit_set(addr, fn->fn_bit); pn = fn; fn = dir ? rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)) : rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); } while (fn); if (!allow_create) { /* We should not create a new node, because NLM_F_REPLACE * was specified without NLM_F_CREATE. I assume it is safe * to require NLM_F_CREATE when the REPLACE flag is used! * Later we may want to remove the check for replace_required, * because according to the netlink specification, NLM_F_CREATE * MUST be specified if a new route is created. * That would keep IPv6 consistent with IPv4. */ if (replace_required) { NL_SET_ERR_MSG(extack, "Can not replace route - no match found"); pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } pr_warn("NLM_F_CREATE should be set when creating new route\n"); } /* * We walked to the bottom of the tree. * Create a new leaf node without children. */ ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, pn); if (dir) rcu_assign_pointer(pn->right, ln); else rcu_assign_pointer(pn->left, ln); return ln; insert_above: /* * split, since we don't have a common prefix anymore, or we have * a less significant route. We have to insert an intermediate node * on the path; this new node will point to the one we need to * create and to the current one. */ pn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); /* find the first bit of difference between the two addrs. See the comment in __ipv6_addr_diff: bit may be an invalid value, but if it is >= plen, the value is ignored in any case. */ bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr)); /* * (intermediate)[in] * / \ * (new leaf node)[ln] (old node)[fn] */ if (plen > bit) { in = node_alloc(net); ln = node_alloc(net); if (!in || !ln) { if (in) node_free_immediate(net, in); if (ln) node_free_immediate(net, ln); return ERR_PTR(-ENOMEM); } /* * new intermediate node.
* RTN_RTINFO will * be off, since an address that chooses one of * the branches would not match less specific routes * in the other branch */ in->fn_bit = bit; RCU_INIT_POINTER(in->parent, pn); in->leaf = fn->leaf; fib6_info_hold(rcu_dereference_protected(in->leaf, lockdep_is_held(&table->tb6_lock))); /* update parent pointer */ if (dir) rcu_assign_pointer(pn->right, in); else rcu_assign_pointer(pn->left, in); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, in); rcu_assign_pointer(fn->parent, in); if (addr_bit_set(addr, bit)) { rcu_assign_pointer(in->right, ln); rcu_assign_pointer(in->left, fn); } else { rcu_assign_pointer(in->left, ln); rcu_assign_pointer(in->right, fn); } } else { /* plen <= bit */ /* * (new leaf node)[ln] * / \ * (old node)[fn] NULL */ ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, pn); if (addr_bit_set(&key->addr, plen)) RCU_INIT_POINTER(ln->right, fn); else RCU_INIT_POINTER(ln->left, fn); rcu_assign_pointer(fn->parent, ln); if (dir) rcu_assign_pointer(pn->right, ln); else rcu_assign_pointer(pn->left, ln); } return ln; } static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, const struct fib6_info *match) { int cpu; if (!fib6_nh->rt6i_pcpu) return; rcu_read_lock(); /* release the reference to this fib entry from * all of its cached pcpu routes */ for_each_possible_cpu(cpu) { struct rt6_info **ppcpu_rt; struct rt6_info *pcpu_rt; ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); /* Paired with xchg() in rt6_get_pcpu_route() */ pcpu_rt = READ_ONCE(*ppcpu_rt); /* only dropping the 'from' reference if the cached route * is using 'match'. The cached pcpu_rt->from only changes * from a fib6_info to NULL (ip6_dst_destroy); it can never * change from one fib6_info reference to another */ if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) { struct fib6_info *from; from = unrcu_pointer(xchg(&pcpu_rt->from, NULL)); fib6_info_release(from); } } rcu_read_unlock(); } static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg) { struct fib6_info *arg = _arg; __fib6_drop_pcpu_from(nh, arg); return 0; } static void fib6_drop_pcpu_from(struct fib6_info *f6i) { /* Make sure rt6_make_pcpu_route() won't add other percpu routes * while we are cleaning them here. */ f6i->fib6_destroying = 1; mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ if (f6i->nh) { rcu_read_lock(); nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, f6i); rcu_read_unlock(); } else { struct fib6_nh *fib6_nh; fib6_nh = f6i->fib6_nh; __fib6_drop_pcpu_from(fib6_nh, f6i); } } static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct net *net) { struct fib6_table *table = rt->fib6_table; /* Flush all cached dst in exception table */ rt6_flush_exceptions(rt); fib6_drop_pcpu_from(rt); if (rt->nh) { spin_lock(&rt->nh->lock); if (!list_empty(&rt->nh_list)) list_del_init(&rt->nh_list); spin_unlock(&rt->nh->lock); } if (refcount_read(&rt->fib6_ref) != 1) { /* This route is used as a dummy address holder in some split * nodes. It is not leaked, but it still holds other resources, * which must be released in time. So, scan ascendant nodes * and replace dummy references to this route with references * to still-alive ones.
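* (the loop below does exactly that: it climbs from fn towards the root, * swapping the dummy leaf for a prefix found by fib6_find_prefix())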
*/ while (fn) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *new_leaf; if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { new_leaf = fib6_find_prefix(net, table, fn); fib6_info_hold(new_leaf); rcu_assign_pointer(fn->leaf, new_leaf); fib6_info_release(rt); } fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); } } fib6_clean_expires(rt); fib6_remove_gc_list(rt); } /* * Insert routing information in a node. */ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack, struct list_head *purge_list) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); struct fib6_info *iter = NULL; struct fib6_info __rcu **ins; struct fib6_info __rcu **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); int add = (!info->nlh || (info->nlh->nlmsg_flags & NLM_F_CREATE)); int found = 0; bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); bool notify_sibling_rt = false; u16 nlflags = NLM_F_EXCL; int err; if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND)) nlflags |= NLM_F_APPEND; ins = &fn->leaf; for (iter = leaf; iter; iter = rcu_dereference_protected(iter->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock))) { /* * Search for duplicates */ if (iter->fib6_metric == rt->fib6_metric) { /* * Same priority level */ if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_EXCL)) return -EEXIST; nlflags &= ~NLM_F_EXCL; if (replace) { if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) { found++; break; } fallback_ins = fallback_ins ?: ins; goto next_iter; } if (rt6_duplicate_nexthop(iter, rt)) { if (rt->fib6_nsiblings) WRITE_ONCE(rt->fib6_nsiblings, 0); if (!(iter->fib6_flags & RTF_EXPIRES)) return -EEXIST; if (!(rt->fib6_flags & RTF_EXPIRES)) { fib6_clean_expires(iter); fib6_remove_gc_list(iter); } else { fib6_set_expires(iter, rt->expires); fib6_add_gc_list(iter); } if (rt->fib6_pmtu) fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu); return -EEXIST; } /* If we have the same destination and the same metric, * but not the same gateway, then the route we try to * add is a sibling of this route; increment our counter * of siblings, and later we will add our route to the * list. * Only static routes (which don't have flag * RTF_EXPIRES) are used for ECMPv6. * * To avoid a long list, we only add siblings if the * route has a gateway. */ if (rt_can_ecmp && rt6_qualify_for_ecmp(iter)) WRITE_ONCE(rt->fib6_nsiblings, rt->fib6_nsiblings + 1); } if (iter->fib6_metric > rt->fib6_metric) break; next_iter: ins = &iter->fib6_next; } if (fallback_ins && !found) { /* No matching route with same ecmp-able-ness found, replace * first matching route */ ins = fallback_ins; iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); found++; } /* Reset round-robin state, if necessary */ if (ins == &fn->leaf) fn->rr_ptr = NULL; /* Link this route to the other routes with the same destination.
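* (ECMP siblings are chained on the circular fib6_siblings list; every * entry on that list shares the same destination and metric)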
*/ if (rt->fib6_nsiblings) { unsigned int fib6_nsiblings; struct fib6_info *sibling, *temp_sibling; /* Find the first route that has the same metric */ sibling = leaf; notify_sibling_rt = true; while (sibling) { if (sibling->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(sibling)) { list_add_tail_rcu(&rt->fib6_siblings, &sibling->fib6_siblings); break; } sibling = rcu_dereference_protected(sibling->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock)); notify_sibling_rt = false; } /* For each sibling in the list, increment the counter of * siblings. BUG() if the counters do not match; the list of * siblings is broken! */ fib6_nsiblings = 0; list_for_each_entry_safe(sibling, temp_sibling, &rt->fib6_siblings, fib6_siblings) { WRITE_ONCE(sibling->fib6_nsiblings, sibling->fib6_nsiblings + 1); BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings); fib6_nsiblings++; } BUG_ON(fib6_nsiblings != rt->fib6_nsiblings); rcu_read_lock(); rt6_multipath_rebalance(temp_sibling); rcu_read_unlock(); } /* * insert node */ if (!replace) { if (!add) pr_warn("NLM_F_CREATE should be set when creating new route\n"); add: nlflags |= NLM_F_CREATE; /* The route should only be notified if it is the first * route in the node or if it is added as a sibling * route to the first route in the node. */ if (!info->skip_notify_kernel && (notify_sibling_rt || ins == &fn->leaf)) { enum fib_event_type fib_event; if (notify_sibling_rt) fib_event = FIB_EVENT_ENTRY_APPEND; else fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib6_entry_notifiers(info->nl_net, fib_event, rt, extack); if (err) { struct fib6_info *sibling, *next_sibling; /* If the route has siblings, then it first * needs to be unlinked from them. */ if (!rt->fib6_nsiblings) return err; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) WRITE_ONCE(sibling->fib6_nsiblings, sibling->fib6_nsiblings - 1); WRITE_ONCE(rt->fib6_nsiblings, 0); list_del_rcu(&rt->fib6_siblings); rcu_read_lock(); rt6_multipath_rebalance(next_sibling); rcu_read_unlock(); return err; } } rcu_assign_pointer(rt->fib6_next, iter); fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); rcu_assign_pointer(*ins, rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } } else { int nsiblings; if (!found) { if (add) goto add; pr_warn("NLM_F_REPLACE set, but no existing node found!\n"); return -ENOENT; } if (!info->skip_notify_kernel && ins == &fn->leaf) { err = call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, rt, extack); if (err) return err; } fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); rt->fib6_next = iter->fib6_next; rcu_assign_pointer(*ins, rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } nsiblings = iter->fib6_nsiblings; iter->fib6_node = NULL; list_add(&iter->purge_link, purge_list); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ ins = &rt->fib6_next; iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { if (iter->fib6_metric > rt->fib6_metric) break; if (rt6_qualify_for_ecmp(iter)) { *ins = iter->fib6_next; iter->fib6_node = NULL; list_add(&iter->purge_link, purge_list);
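/* don't leave the node's round-robin pointer at the just-unlinked sibling */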
if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; nsiblings--; info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { ins = &iter->fib6_next; } iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); } WARN_ON(nsiblings != 0); } } return 0; } static int fib6_add_rt2node_nh(struct fib6_node *fn, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack, struct list_head *purge_list) { int err; spin_lock(&rt->nh->lock); if (rt->nh->dead) { NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); err = -EINVAL; } else { err = fib6_add_rt2node(fn, rt, info, extack, purge_list); if (!err) list_add(&rt->nh_list, &rt->nh->f6i_list); } spin_unlock(&rt->nh->lock); return err; } static void fib6_start_gc(struct net *net, struct fib6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && (rt->fib6_flags & RTF_EXPIRES)) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } void fib6_force_start_gc(struct net *net) { if (!timer_pending(&net->ipv6.ip6_fib_timer)) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } static void __fib6_update_sernum_upto_root(struct fib6_info *rt, int sernum) { struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&rt->fib6_table->tb6_lock)); /* paired with smp_rmb() in fib6_get_cookie_safe() */ smp_wmb(); while (fn) { WRITE_ONCE(fn->fn_sernum, sernum); fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&rt->fib6_table->tb6_lock)); } } void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt) { __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net)); } /* allow ipv4 to update sernum via ipv6_stub */ void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i) { spin_lock_bh(&f6i->fib6_table->tb6_lock); fib6_update_sernum_upto_root(net, f6i); spin_unlock_bh(&f6i->fib6_table->tb6_lock); } /* * Add routing information to the routing tree. * <destination addr>/<source addr> * with source addr info in sub-trees * Need to own table->tb6_lock */ int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_table *table = rt->fib6_table; LIST_HEAD(purge_list); struct fib6_node *fn; #ifdef CONFIG_IPV6_SUBTREES struct fib6_node *pn = NULL; #endif int err = -ENOMEM; int allow_create = 1; int replace_required = 0; if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) allow_create = 0; if (info->nlh->nlmsg_flags & NLM_F_REPLACE) replace_required = 1; } if (!allow_create && !replace_required) pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); fn = fib6_add_1(info->nl_net, table, root, &rt->fib6_dst.addr, rt->fib6_dst.plen, offsetof(struct fib6_info, fib6_dst), allow_create, replace_required, extack); if (IS_ERR(fn)) { err = PTR_ERR(fn); fn = NULL; goto out; } #ifdef CONFIG_IPV6_SUBTREES pn = fn; if (rt->fib6_src.plen) { struct fib6_node *sn; if (!rcu_access_pointer(fn->subtree)) { struct fib6_node *sfn; /* * Create subtree. 
* * fn[main tree] * | * sfn[subtree root] * \ * sn[new leaf node] */ /* Create subtree root node */ sfn = node_alloc(info->nl_net); if (!sfn) goto failure; fib6_info_hold(info->nl_net->ipv6.fib6_null_entry); rcu_assign_pointer(sfn->leaf, info->nl_net->ipv6.fib6_null_entry); sfn->fn_flags = RTN_ROOT; /* Now add the first leaf node to new subtree */ sn = fib6_add_1(info->nl_net, table, sfn, &rt->fib6_src.addr, rt->fib6_src.plen, offsetof(struct fib6_info, fib6_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { /* If it failed, discard the just-allocated root, and then (at the failure label) the stale node in the main tree. */ node_free_immediate(info->nl_net, sfn); err = PTR_ERR(sn); goto failure; } /* Now link new subtree to main tree */ rcu_assign_pointer(sfn->parent, fn); rcu_assign_pointer(fn->subtree, sfn); } else { sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn), &rt->fib6_src.addr, rt->fib6_src.plen, offsetof(struct fib6_info, fib6_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { err = PTR_ERR(sn); goto failure; } } if (!rcu_access_pointer(fn->leaf)) { if (fn->fn_flags & RTN_TL_ROOT) { /* put back null_entry for root node */ rcu_assign_pointer(fn->leaf, info->nl_net->ipv6.fib6_null_entry); } else { fib6_info_hold(rt); rcu_assign_pointer(fn->leaf, rt); } } fn = sn; } #endif if (rt->nh) err = fib6_add_rt2node_nh(fn, rt, info, extack, &purge_list); else err = fib6_add_rt2node(fn, rt, info, extack, &purge_list); if (!err) { struct fib6_info *iter, *next; list_for_each_entry_safe(iter, next, &purge_list, purge_link) { list_del(&iter->purge_link); fib6_purge_rt(iter, fn, info->nl_net); fib6_info_release(iter); } __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net)); if (rt->fib6_flags & RTF_EXPIRES) fib6_add_gc_list(rt); fib6_start_gc(info->nl_net, rt); } out: if (err) { #ifdef CONFIG_IPV6_SUBTREES /* * If fib6_add_1 has cleared the old leaf pointer in the * super-tree leaf node, we have to find a new one for it. */ if (pn != fn) { struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); if (pn_leaf == rt) { pn_leaf = NULL; RCU_INIT_POINTER(pn->leaf, NULL); fib6_info_release(rt); } if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { pn_leaf = fib6_find_prefix(info->nl_net, table, pn); if (!pn_leaf) pn_leaf = info->nl_net->ipv6.fib6_null_entry; fib6_info_hold(pn_leaf); rcu_assign_pointer(pn->leaf, pn_leaf); } } #endif goto failure; } else if (fib6_requires_src(rt)) { fib6_routes_require_src_inc(info->nl_net); } return err; failure: /* fn->leaf could be NULL and fib6_repair_tree() needs to be called if: * 1. fn is an intermediate node and we failed to add the new * route to it, in both the subtree-creation failure and the * fib6_add_rt2node() failure cases. * 2. fn is the root node in the table and we fail to add the first * default route to it. */ if (fn && (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) || (fn->fn_flags & RTN_TL_ROOT && !rcu_access_pointer(fn->leaf)))) fib6_repair_tree(info->nl_net, table, fn); return err; } /* * Routing tree lookup * */ struct lookup_args { int offset; /* key offset on fib6_info */ const struct in6_addr *addr; /* search key */ }; static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root, struct lookup_args *args) { struct fib6_node *fn; __be32 dir; if (unlikely(args->offset == 0)) return NULL; /* * Descend the tree */ fn = root; for (;;) { struct fib6_node *next; dir = addr_bit_set(args->addr, fn->fn_bit); next = dir ?
rcu_dereference(fn->right) : rcu_dereference(fn->left); if (next) { fn = next; continue; } break; } while (fn) { struct fib6_node *subtree = FIB6_SUBTREE(fn); if (subtree || fn->fn_flags & RTN_RTINFO) { struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; if (!leaf) goto backtrack; key = (struct rt6key *) ((u8 *)leaf + args->offset); if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { #ifdef CONFIG_IPV6_SUBTREES if (subtree) { struct fib6_node *sfn; sfn = fib6_node_lookup_1(subtree, args + 1); if (!sfn) goto backtrack; fn = sfn; } #endif if (fn->fn_flags & RTN_RTINFO) return fn; } } backtrack: if (fn->fn_flags & RTN_ROOT) break; fn = rcu_dereference(fn->parent); } return NULL; } /* called with rcu_read_lock() held */ struct fib6_node *fib6_node_lookup(struct fib6_node *root, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct fib6_node *fn; struct lookup_args args[] = { { .offset = offsetof(struct fib6_info, fib6_dst), .addr = daddr, }, #ifdef CONFIG_IPV6_SUBTREES { .offset = offsetof(struct fib6_info, fib6_src), .addr = saddr, }, #endif { .offset = 0, /* sentinel */ } }; fn = fib6_node_lookup_1(root, daddr ? args : args + 1); if (!fn || fn->fn_flags & RTN_TL_ROOT) fn = root; return fn; } /* * Get node with specified destination prefix (and source prefix, * if subtrees are used) * exact_match == true means we try to find fn with exact match of * the passed in prefix addr * exact_match == false means we try to find fn with longest prefix * match of the passed in prefix addr. This is useful for finding fn * for cached route as it will be stored in the exception table under * the node with longest prefix length. */ static struct fib6_node *fib6_locate_1(struct fib6_node *root, const struct in6_addr *addr, int plen, int offset, bool exact_match) { struct fib6_node *fn, *prev = NULL; for (fn = root; fn ; ) { struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; /* This node is being deleted */ if (!leaf) { if (plen <= fn->fn_bit) goto out; else goto next; } key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) goto out; if (plen == fn->fn_bit) return fn; if (fn->fn_flags & RTN_RTINFO) prev = fn; next: /* * We have more bits to go */ if (addr_bit_set(addr, fn->fn_bit)) fn = rcu_dereference(fn->right); else fn = rcu_dereference(fn->left); } out: if (exact_match) return NULL; else return prev; } struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, const struct in6_addr *saddr, int src_len, bool exact_match) { struct fib6_node *fn; fn = fib6_locate_1(root, daddr, dst_len, offsetof(struct fib6_info, fib6_dst), exact_match); #ifdef CONFIG_IPV6_SUBTREES if (src_len) { WARN_ON(saddr == NULL); if (fn) { struct fib6_node *subtree = FIB6_SUBTREE(fn); if (subtree) { fn = fib6_locate_1(subtree, saddr, src_len, offsetof(struct fib6_info, fib6_src), exact_match); } } } #endif if (fn && fn->fn_flags & RTN_RTINFO) return fn; return NULL; } /* * Deletion * */ static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn) { struct fib6_node *child_left, *child_right; if (fn->fn_flags & RTN_ROOT) return net->ipv6.fib6_null_entry; while (fn) { child_left = rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); child_right = rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)); if (child_left) return rcu_dereference_protected(child_left->leaf, 
lockdep_is_held(&table->tb6_lock)); if (child_right) return rcu_dereference_protected(child_right->leaf, lockdep_is_held(&table->tb6_lock)); fn = FIB6_SUBTREE(fn); } return NULL; } /* * Called to trim the tree of intermediate nodes when possible. "fn" * is the node we want to try and remove. * Need to own table->tb6_lock */ static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_table *table, struct fib6_node *fn) { int children; int nstate; struct fib6_node *child; struct fib6_walker *w; int iter = 0; /* Set fn->leaf to null_entry for root node. */ if (fn->fn_flags & RTN_TL_ROOT) { rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry); return fn; } for (;;) { struct fib6_node *fn_r = rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)); struct fib6_node *fn_l = rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn_r = rcu_dereference_protected(pn->right, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn_l = rcu_dereference_protected(pn->left, lockdep_is_held(&table->tb6_lock)); struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *new_fn_leaf; pr_debug("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); iter++; WARN_ON(fn->fn_flags & RTN_RTINFO); WARN_ON(fn->fn_flags & RTN_TL_ROOT); WARN_ON(fn_leaf); children = 0; child = NULL; if (fn_r) { child = fn_r; children |= 1; } if (fn_l) { child = fn_l; children |= 2; } if (children == 3 || FIB6_SUBTREE(fn) #ifdef CONFIG_IPV6_SUBTREES /* Subtree root (i.e. fn) may have one child */ || (children && fn->fn_flags & RTN_ROOT) #endif ) { new_fn_leaf = fib6_find_prefix(net, table, fn); #if RT6_DEBUG >= 2 if (!new_fn_leaf) { WARN_ON(!new_fn_leaf); new_fn_leaf = net->ipv6.fib6_null_entry; } #endif fib6_info_hold(new_fn_leaf); rcu_assign_pointer(fn->leaf, new_fn_leaf); return pn; } #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); RCU_INIT_POINTER(pn->subtree, NULL); nstate = FWS_L; } else { WARN_ON(fn->fn_flags & RTN_ROOT); #endif if (pn_r == fn) rcu_assign_pointer(pn->right, child); else if (pn_l == fn) rcu_assign_pointer(pn->left, child); #if RT6_DEBUG >= 2 else WARN_ON(1); #endif if (child) rcu_assign_pointer(child->parent, pn); nstate = FWS_R; #ifdef CONFIG_IPV6_SUBTREES } #endif read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (!child) { if (w->node == fn) { pr_debug("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); w->node = pn; w->state = nstate; } } else { if (w->node == fn) { w->node = child; if (children&2) { pr_debug("W %p adjusted by delnode 2, s=%d\n", w, w->state); w->state = w->state >= FWS_R ? FWS_U : FWS_INIT; } else { pr_debug("W %p adjusted by delnode 2, s=%d\n", w, w->state); w->state = w->state >= FWS_C ? 
FWS_U : FWS_INIT; } } } } read_unlock(&net->ipv6.fib6_walker_lock); node_free(net, fn); if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) return pn; RCU_INIT_POINTER(pn->leaf, NULL); fib6_info_release(pn_leaf); fn = pn; } } static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, struct fib6_info __rcu **rtp, struct nl_info *info) { struct fib6_info *leaf, *replace_rt = NULL; struct fib6_walker *w; struct fib6_info *rt = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); struct net *net = info->nl_net; bool notify_del = false; /* If the deleted route is the first in the node and it is not part of * a multipath route, then we need to replace it with the next route * in the node, if one exists. */ leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); if (leaf == rt && !rt->fib6_nsiblings) { if (rcu_access_pointer(rt->fib6_next)) replace_rt = rcu_dereference_protected(rt->fib6_next, lockdep_is_held(&table->tb6_lock)); else notify_del = true; } /* Unlink it */ *rtp = rt->fib6_next; rt->fib6_node = NULL; net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; /* Reset round-robin state, if necessary */ if (rcu_access_pointer(fn->rr_ptr) == rt) fn->rr_ptr = NULL; /* Remove this entry from other siblings */ if (rt->fib6_nsiblings) { struct fib6_info *sibling, *next_sibling; /* The route is deleted from a multipath route. If this * multipath route is the first route in the node, then we need * to emit a delete notification. Otherwise, we need to skip * the notification. */ if (rt->fib6_metric == leaf->fib6_metric && rt6_qualify_for_ecmp(leaf)) notify_del = true; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) WRITE_ONCE(sibling->fib6_nsiblings, sibling->fib6_nsiblings - 1); WRITE_ONCE(rt->fib6_nsiblings, 0); list_del_rcu(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); } /* Adjust walkers */ read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { pr_debug("walker %p adjusted by delroute\n", w); w->leaf = rcu_dereference_protected(rt->fib6_next, lockdep_is_held(&table->tb6_lock)); if (!w->leaf) w->state = FWS_U; } } read_unlock(&net->ipv6.fib6_walker_lock); /* If it was the last route, call fib6_repair_tree() to: * 1. For the root node, put back the null_entry, as when the table * was created. * 2. For other nodes, expunge the radix tree node.
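* (fib6_repair_tree() iterates towards the root, freeing intermediate * nodes that are left without routes and with at most one child)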
*/ if (!rcu_access_pointer(fn->leaf)) { if (!(fn->fn_flags & RTN_TL_ROOT)) { fn->fn_flags &= ~RTN_RTINFO; net->ipv6.rt6_stats->fib_route_nodes--; } fn = fib6_repair_tree(net, table, fn); } fib6_purge_rt(rt, fn, net); if (!info->skip_notify_kernel) { if (notify_del) call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); else if (replace_rt) call_fib6_entry_notifiers_replace(net, replace_rt); } if (!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, info, 0); fib6_info_release(rt); } /* Need to own table->tb6_lock */ int fib6_del(struct fib6_info *rt, struct nl_info *info) { struct net *net = info->nl_net; struct fib6_info __rcu **rtp; struct fib6_info __rcu **rtp_next; struct fib6_table *table; struct fib6_node *fn; if (rt == net->ipv6.fib6_null_entry) return -ENOENT; table = rt->fib6_table; fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&table->tb6_lock)); if (!fn) return -ENOENT; WARN_ON(!(fn->fn_flags & RTN_RTINFO)); /* * Walk the leaf entries looking for ourselves */ for (rtp = &fn->leaf; *rtp; rtp = rtp_next) { struct fib6_info *cur = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); if (rt == cur) { if (fib6_requires_src(cur)) fib6_routes_require_src_dec(info->nl_net); fib6_del_route(table, fn, rtp, info); return 0; } rtp_next = &cur->fib6_next; } return -ENOENT; } /* * Tree traversal function. * * Certainly, it is not interrupt safe. * However, it is internally re-enterable wrt itself and fib6_add/fib6_del. * This means that we can modify the tree during walking, * and use this function for garbage collection, clone pruning, * cleaning the tree when a device goes down, etc. * * It guarantees that every node will be traversed, * and that it will be traversed only once. * * Callback function w->func may return: * 0 -> continue walking. * positive value -> walking is suspended (used by tree dumps, * and possibly by gc, if it is ever split into several slices) * negative value -> terminate walking. * * The function itself returns: * 0 -> walk is complete. * >0 -> walk is incomplete (i.e. suspended) * <0 -> walk is terminated by an error. * * This function is called with tb6_lock held.
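* * Per node the walker runs a small state machine: FWS_S visits the node's * subtree (when CONFIG_IPV6_SUBTREES is enabled), FWS_L the left child, * FWS_R the right child, FWS_C the node's own leaf chain (where w->func * is invoked), and FWS_U climbs back up to the parent.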
*/ static int fib6_walk_continue(struct fib6_walker *w) { struct fib6_node *fn, *pn, *left, *right; /* w->root should always be table->tb6_root */ WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT)); for (;;) { fn = w->node; if (!fn) return 0; switch (w->state) { #ifdef CONFIG_IPV6_SUBTREES case FWS_S: if (FIB6_SUBTREE(fn)) { w->node = FIB6_SUBTREE(fn); continue; } w->state = FWS_L; fallthrough; #endif case FWS_L: left = rcu_dereference_protected(fn->left, 1); if (left) { w->node = left; w->state = FWS_INIT; continue; } w->state = FWS_R; fallthrough; case FWS_R: right = rcu_dereference_protected(fn->right, 1); if (right) { w->node = right; w->state = FWS_INIT; continue; } w->state = FWS_C; w->leaf = rcu_dereference_protected(fn->leaf, 1); fallthrough; case FWS_C: if (w->leaf && fn->fn_flags & RTN_RTINFO) { int err; if (w->skip) { w->skip--; goto skip; } err = w->func(w); if (err) return err; w->count++; continue; } skip: w->state = FWS_U; fallthrough; case FWS_U: if (fn == w->root) return 0; pn = rcu_dereference_protected(fn->parent, 1); left = rcu_dereference_protected(pn->left, 1); right = rcu_dereference_protected(pn->right, 1); w->node = pn; #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); w->state = FWS_L; continue; } #endif if (left == fn) { w->state = FWS_R; continue; } if (right == fn) { w->state = FWS_C; w->leaf = rcu_dereference_protected(w->node->leaf, 1); continue; } #if RT6_DEBUG >= 2 WARN_ON(1); #endif } } } static int fib6_walk(struct net *net, struct fib6_walker *w) { int res; w->state = FWS_INIT; w->node = w->root; fib6_walker_link(net, w); res = fib6_walk_continue(w); if (res <= 0) fib6_walker_unlink(net, w); return res; } static int fib6_clean_node(struct fib6_walker *w) { int res; struct fib6_info *rt; struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w); struct nl_info info = { .nl_net = c->net, .skip_notify = c->skip_notify, }; if (c->sernum != FIB6_NO_SERNUM_CHANGE && READ_ONCE(w->node->fn_sernum) != c->sernum) WRITE_ONCE(w->node->fn_sernum, c->sernum); if (!c->func) { WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE); w->leaf = NULL; return 0; } for_each_fib6_walker_rt(w) { res = c->func(rt, c->arg); if (res == -1) { w->leaf = rt; res = fib6_del(rt, &info); if (res) { #if RT6_DEBUG >= 2 pr_debug("%s: del failed: rt=%p@%p err=%d\n", __func__, rt, rcu_access_pointer(rt->fib6_node), res); #endif continue; } return 0; } else if (res == -2) { if (WARN_ON(!rt->fib6_nsiblings)) continue; rt = list_last_entry(&rt->fib6_siblings, struct fib6_info, fib6_siblings); continue; } WARN_ON(res != 0); } w->leaf = rt; return 0; } /* * Convenient frontend to tree walker. * * func is called on each route. * It may return -2 -> skip multipath route. * -1 -> delete this route. 
* 0 -> continue walking */ static void fib6_clean_tree(struct net *net, struct fib6_node *root, int (*func)(struct fib6_info *, void *arg), int sernum, void *arg, bool skip_notify) { struct fib6_cleaner c; c.w.root = root; c.w.func = fib6_clean_node; c.w.count = 0; c.w.skip = 0; c.w.skip_in_node = 0; c.func = func; c.sernum = sernum; c.arg = arg; c.net = net; c.skip_notify = skip_notify; fib6_walk(net, &c.w); } static void __fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), int sernum, void *arg, bool skip_notify) { struct fib6_table *table; struct hlist_head *head; unsigned int h; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { spin_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, func, sernum, arg, skip_notify); spin_unlock_bh(&table->tb6_lock); } } rcu_read_unlock(); } void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), void *arg) { __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false); } void fib6_clean_all_skip_notify(struct net *net, int (*func)(struct fib6_info *, void *), void *arg) { __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true); } static void fib6_flush_trees(struct net *net) { int new_sernum = fib6_new_sernum(net); __fib6_clean_all(net, NULL, new_sernum, NULL, false); } /* * Garbage collection */ static int fib6_age(struct fib6_info *rt, struct fib6_gc_args *gc_args) { unsigned long now = jiffies; /* * check addrconf expiration here. * Routes are expired even if they are in use. */ if (rt->fib6_flags & RTF_EXPIRES && rt->expires) { if (time_after(now, rt->expires)) { pr_debug("expiring %p\n", rt); return -1; } gc_args->more++; } /* Also age clones in the exception table. * Note that clones are aged out * only if they are not in use now. */ rt6_age_exceptions(rt, gc_args, now); return 0; } static void fib6_gc_table(struct net *net, struct fib6_table *tb6, struct fib6_gc_args *gc_args) { struct fib6_info *rt; struct hlist_node *n; struct nl_info info = { .nl_net = net, .skip_notify = false, }; hlist_for_each_entry_safe(rt, n, &tb6->tb6_gc_hlist, gc_link) if (fib6_age(rt, gc_args) == -1) fib6_del(rt, &info); } static void fib6_gc_all(struct net *net, struct fib6_gc_args *gc_args) { struct fib6_table *table; struct hlist_head *head; unsigned int h; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { spin_lock_bh(&table->tb6_lock); fib6_gc_table(net, table, gc_args); spin_unlock_bh(&table->tb6_lock); } } rcu_read_unlock(); } void fib6_run_gc(unsigned long expires, struct net *net, bool force) { struct fib6_gc_args gc_args; unsigned long now; if (force) { spin_lock_bh(&net->ipv6.fib6_gc_lock); } else if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) { mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ); return; } gc_args.timeout = expires ?
(int)expires : net->ipv6.sysctl.ip6_rt_gc_interval; gc_args.more = 0; fib6_gc_all(net, &gc_args); now = jiffies; net->ipv6.ip6_rt_last_gc = now; if (gc_args.more) mod_timer(&net->ipv6.ip6_fib_timer, round_jiffies(now + net->ipv6.sysctl.ip6_rt_gc_interval)); else timer_delete(&net->ipv6.ip6_fib_timer); spin_unlock_bh(&net->ipv6.fib6_gc_lock); } static void fib6_gc_timer_cb(struct timer_list *t) { struct net *arg = timer_container_of(arg, t, ipv6.ip6_fib_timer); fib6_run_gc(0, arg, true); } static int __net_init fib6_net_init(struct net *net) { size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ; int err; err = fib6_notifier_init(net); if (err) return err; /* Default to 3-tuple */ net->ipv6.sysctl.multipath_hash_fields = FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK; spin_lock_init(&net->ipv6.fib6_gc_lock); rwlock_init(&net->ipv6.fib6_walker_lock); INIT_LIST_HEAD(&net->ipv6.fib6_walkers); timer_setup(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, 0); net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL); if (!net->ipv6.rt6_stats) goto out_notifier; /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL); if (!net->ipv6.fib_table_hash) goto out_rt6_stats; spin_lock_init(&net->ipv6.fib_table_hash_lock); net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl), GFP_KERNEL); if (!net->ipv6.fib6_main_tbl) goto out_fib_table_hash; net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf, net->ipv6.fib6_null_entry); net->ipv6.fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); INIT_HLIST_HEAD(&net->ipv6.fib6_main_tbl->tb6_gc_hlist); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl), GFP_KERNEL); if (!net->ipv6.fib6_local_tbl) goto out_fib6_main_tbl; net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf, net->ipv6.fib6_null_entry); net->ipv6.fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); INIT_HLIST_HEAD(&net->ipv6.fib6_local_tbl->tb6_gc_hlist); #endif fib6_tables_init(net); return 0; #ifdef CONFIG_IPV6_MULTIPLE_TABLES out_fib6_main_tbl: kfree(net->ipv6.fib6_main_tbl); #endif out_fib_table_hash: kfree(net->ipv6.fib_table_hash); out_rt6_stats: kfree(net->ipv6.rt6_stats); out_notifier: fib6_notifier_exit(net); return -ENOMEM; } static void fib6_net_exit(struct net *net) { unsigned int i; timer_delete_sync(&net->ipv6.ip6_fib_timer); for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { struct hlist_head *head = &net->ipv6.fib_table_hash[i]; struct hlist_node *tmp; struct fib6_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) { hlist_del(&tb->tb6_hlist); fib6_free_table(tb); } } kfree(net->ipv6.fib_table_hash); kfree(net->ipv6.rt6_stats); fib6_notifier_exit(net); } static struct pernet_operations fib6_net_ops = { .init = fib6_net_init, .exit = fib6_net_exit, }; static const struct rtnl_msg_handler fib6_rtnl_msg_handlers[] __initconst_or_module = { {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE, .dumpit = inet6_dump_fib, .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, }; int __init fib6_init(void) { int ret = -ENOMEM; fib6_node_kmem = KMEM_CACHE(fib6_node, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT); if (!fib6_node_kmem) goto out; 
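/* register the per-netns init/exit hooks and the RTM_GETROUTE dump handler */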
ret = register_pernet_subsys(&fib6_net_ops); if (ret) goto out_kmem_cache_create; ret = rtnl_register_many(fib6_rtnl_msg_handlers); if (ret) goto out_unregister_subsys; __fib6_flush_trees = fib6_flush_trees; out: return ret; out_unregister_subsys: unregister_pernet_subsys(&fib6_net_ops); out_kmem_cache_create: kmem_cache_destroy(fib6_node_kmem); goto out; } void fib6_gc_cleanup(void) { unregister_pernet_subsys(&fib6_net_ops); kmem_cache_destroy(fib6_node_kmem); } #ifdef CONFIG_PROC_FS static int ipv6_route_native_seq_show(struct seq_file *seq, void *v) { struct fib6_info *rt = v; struct ipv6_route_iter *iter = seq->private; struct fib6_nh *fib6_nh = rt->fib6_nh; unsigned int flags = rt->fib6_flags; const struct net_device *dev; if (rt->nh) fib6_nh = nexthop_fib6_nh(rt->nh); seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); #ifdef CONFIG_IPV6_SUBTREES seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen); #else seq_puts(seq, "00000000000000000000000000000000 00 "); #endif if (fib6_nh->fib_nh_gw_family) { flags |= RTF_GATEWAY; seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6); } else { seq_puts(seq, "00000000000000000000000000000000"); } dev = fib6_nh->fib_nh_dev; seq_printf(seq, " %08x %08x %08x %08x %8s\n", rt->fib6_metric, refcount_read(&rt->fib6_ref), 0, flags, dev ? dev->name : ""); iter->w.leaf = NULL; return 0; } static int ipv6_route_yield(struct fib6_walker *w) { struct ipv6_route_iter *iter = w->args; if (!iter->skip) return 1; do { iter->w.leaf = rcu_dereference_protected( iter->w.leaf->fib6_next, lockdep_is_held(&iter->tbl->tb6_lock)); iter->skip--; if (!iter->skip && iter->w.leaf) return 1; } while (iter->w.leaf); return 0; } static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter, struct net *net) { memset(&iter->w, 0, sizeof(iter->w)); iter->w.func = ipv6_route_yield; iter->w.root = &iter->tbl->tb6_root; iter->w.state = FWS_INIT; iter->w.node = iter->w.root; iter->w.args = iter; iter->sernum = READ_ONCE(iter->w.root->fn_sernum); INIT_LIST_HEAD(&iter->w.lh); fib6_walker_link(net, &iter->w); } static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl, struct net *net) { unsigned int h; struct hlist_node *node; if (tbl) { h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1; node = rcu_dereference(hlist_next_rcu(&tbl->tb6_hlist)); } else { h = 0; node = NULL; } while (!node && h < FIB6_TABLE_HASHSZ) { node = rcu_dereference( hlist_first_rcu(&net->ipv6.fib_table_hash[h++])); } return hlist_entry_safe(node, struct fib6_table, tb6_hlist); } static void ipv6_route_check_sernum(struct ipv6_route_iter *iter) { int sernum = READ_ONCE(iter->w.root->fn_sernum); if (iter->sernum != sernum) { iter->sernum = sernum; iter->w.state = FWS_INIT; iter->w.node = iter->w.root; WARN_ON(iter->w.skip); iter->w.skip = iter->w.count; } } static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) { int r; struct fib6_info *n; struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; ++(*pos); if (!v) goto iter_table; n = rcu_dereference(((struct fib6_info *)v)->fib6_next); if (n) return n; iter_table: ipv6_route_check_sernum(iter); spin_lock_bh(&iter->tbl->tb6_lock); r = fib6_walk_continue(&iter->w); spin_unlock_bh(&iter->tbl->tb6_lock); if (r > 0) { return iter->w.leaf; } else if (r < 0) { fib6_walker_unlink(net, &iter->w); return NULL; } fib6_walker_unlink(net, &iter->w); iter->tbl = ipv6_route_seq_next_table(iter->tbl, net); if (!iter->tbl) return NULL; ipv6_route_seq_setup_walk(iter, net); goto iter_table; 
} static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; rcu_read_lock(); iter->tbl = ipv6_route_seq_next_table(NULL, net); iter->skip = *pos; if (iter->tbl) { loff_t p = 0; ipv6_route_seq_setup_walk(iter, net); return ipv6_route_seq_next(seq, NULL, &p); } else { return NULL; } } static bool ipv6_route_iter_active(struct ipv6_route_iter *iter) { struct fib6_walker *w = &iter->w; return w->node && !(w->state == FWS_U && w->node == w->root); } static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; if (ipv6_route_iter_active(iter)) fib6_walker_unlink(net, &iter->w); rcu_read_unlock(); } #if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) static int ipv6_route_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, void *v) { struct bpf_iter__ipv6_route ctx; ctx.meta = meta; ctx.rt = v; return bpf_iter_run_prog(prog, &ctx); } static int ipv6_route_seq_show(struct seq_file *seq, void *v) { struct ipv6_route_iter *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; int ret; meta.seq = seq; prog = bpf_iter_get_info(&meta, false); if (!prog) return ipv6_route_native_seq_show(seq, v); ret = ipv6_route_prog_seq_show(prog, &meta, v); iter->w.leaf = NULL; return ret; } static void ipv6_route_seq_stop(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; if (!v) { meta.seq = seq; prog = bpf_iter_get_info(&meta, true); if (prog) (void)ipv6_route_prog_seq_show(prog, &meta, v); } ipv6_route_native_seq_stop(seq, v); } #else static int ipv6_route_seq_show(struct seq_file *seq, void *v) { return ipv6_route_native_seq_show(seq, v); } static void ipv6_route_seq_stop(struct seq_file *seq, void *v) { ipv6_route_native_seq_stop(seq, v); } #endif const struct seq_operations ipv6_route_seq_ops = { .start = ipv6_route_seq_start, .next = ipv6_route_seq_next, .stop = ipv6_route_seq_stop, .show = ipv6_route_seq_show }; #endif /* CONFIG_PROC_FS */
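/*
 * A toy userspace model of the fib6_add_1() walk above, for illustration
 * only: descend while the node's prefix still matches, attach a new leaf
 * when the walk falls off the tree, or splice an intermediate node in at
 * the first differing bit. The names (tnode, bit_set, trie_add) are made
 * up for this sketch; there is no RCU, no locking and no subtree handling,
 * and keys are 32-bit prefixes rather than IPv6 addresses.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct tnode {
	uint32_t key;		/* prefix bits, left-aligned */
	int bit;		/* prefix length of this node */
	struct tnode *child[2];
};

/* test bit 'bit' of 'key', counting from the most significant bit */
static int bit_set(uint32_t key, int bit)
{
	return (key >> (31 - bit)) & 1;
}

/* index of the first bit in which 'a' and 'b' differ (32 if equal) */
static int first_diff(uint32_t a, uint32_t b)
{
	uint32_t x = a ^ b;
	int i;

	for (i = 0; i < 32; i++)
		if (x & (1u << (31 - i)))
			break;
	return i;
}

static int prefix_equal(uint32_t a, uint32_t b, int bits)
{
	return bits == 0 || first_diff(a, b) >= bits;
}

static struct tnode *node_new(uint32_t key, int bit)
{
	struct tnode *n = calloc(1, sizeof(*n));

	if (!n)
		exit(1);
	n->key = key;
	n->bit = bit;
	return n;
}

/* mirrors fib6_add_1()'s three outcomes on a plain binary trie */
static struct tnode *trie_add(struct tnode **root, uint32_t key, int plen)
{
	struct tnode *fn = *root, *pn = NULL, *ln, *in;
	int dir = 0, bit;

	while (fn) {
		if (plen < fn->bit || !prefix_equal(fn->key, key, fn->bit))
			goto insert_above;
		if (plen == fn->bit)
			return fn;	/* exact match: reuse the node */
		dir = bit_set(key, fn->bit);
		pn = fn;
		fn = fn->child[dir];
	}
	/* walked off the bottom: new leaf under the last node visited */
	ln = node_new(key, plen);
	if (pn)
		pn->child[dir] = ln;
	else
		*root = ln;
	return ln;

insert_above:
	bit = first_diff(key, fn->key);
	ln = node_new(key, plen);
	if (plen > bit) {
		/* split: intermediate node at the first differing bit */
		in = node_new(key, bit);
		in->child[bit_set(key, bit)] = ln;
		in->child[!bit_set(key, bit)] = fn;
		if (pn)
			pn->child[dir] = in;
		else
			*root = in;
	} else {
		/* the new prefix covers fn: fn becomes a child of ln */
		ln->child[bit_set(fn->key, plen)] = fn;
		if (pn)
			pn->child[dir] = ln;
		else
			*root = ln;
	}
	return ln;
}

int main(void)
{
	struct tnode *root = NULL;

	trie_add(&root, 0x0a010100, 24);	/* 10.1.1.0/24 */
	trie_add(&root, 0x0a010200, 24);	/* 10.1.2.0/24: forces a split */
	printf("root is now an intermediate /%d node\n", root->bit);
	return 0;
}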
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/mempolicy.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
#include <linux/huge_mm.h>
#include <linux/shmem_fs.h>
#include "internal.h"
#include "swap.h"

/*
 * swapper_space is a fiction, retained to simplify the path through
 * vmscan's shrink_folio_list.
 */
static const struct address_space_operations swap_aops = {
	.dirty_folio	= noop_dirty_folio,
#ifdef CONFIG_MIGRATION
	.migrate_folio	= migrate_folio,
#endif
};

struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
static bool enable_vma_readahead __read_mostly = true;

#define SWAP_RA_ORDER_CEILING	5

#define SWAP_RA_WIN_SHIFT	(PAGE_SHIFT / 2)
#define SWAP_RA_HITS_MASK	((1UL << SWAP_RA_WIN_SHIFT) - 1)
#define SWAP_RA_HITS_MAX	SWAP_RA_HITS_MASK
#define SWAP_RA_WIN_MASK	(~PAGE_MASK & ~SWAP_RA_HITS_MASK)

#define SWAP_RA_HITS(v)		((v) & SWAP_RA_HITS_MASK)
#define SWAP_RA_WIN(v)		(((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
#define SWAP_RA_ADDR(v)		((v) & PAGE_MASK)

#define SWAP_RA_VAL(addr, win, hits)				\
	(((addr) & PAGE_MASK) |					\
	 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |	\
	 ((hits) & SWAP_RA_HITS_MASK))

/* Initial readahead hits is 4 to start up with a small window */
#define GET_SWAP_RA_VAL(vma)					\
	(atomic_long_read(&(vma)->swap_readahead_info) ? : 4)

static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);

void show_swap_cache_info(void)
{
	printk("%lu pages in swap cache\n", total_swapcache_pages());
	printk("Free swap = %ldkB\n", K(get_nr_swap_pages()));
	printk("Total swap = %lukB\n", K(total_swap_pages));
}

void *get_shadow_from_swap_cache(swp_entry_t entry)
{
	struct address_space *address_space = swap_address_space(entry);
	pgoff_t idx = swap_cache_index(entry);
	void *shadow;

	shadow = xa_load(&address_space->i_pages, idx);
	if (xa_is_value(shadow))
		return shadow;
	return NULL;
}

/*
 * add_to_swap_cache resembles filemap_add_folio on swapper_space,
 * but sets SwapCache flag and 'swap' instead of mapping and index.
*/ int add_to_swap_cache(struct folio *folio, swp_entry_t entry, gfp_t gfp, void **shadowp) { struct address_space *address_space = swap_address_space(entry); pgoff_t idx = swap_cache_index(entry); XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio)); unsigned long i, nr = folio_nr_pages(folio); void *old; xas_set_update(&xas, workingset_update_node); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); folio_ref_add(folio, nr); folio_set_swapcache(folio); folio->swap = entry; do { xas_lock_irq(&xas); xas_create_range(&xas); if (xas_error(&xas)) goto unlock; for (i = 0; i < nr; i++) { VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio); if (shadowp) { old = xas_load(&xas); if (xa_is_value(old)) *shadowp = old; } xas_store(&xas, folio); xas_next(&xas); } address_space->nrpages += nr; __node_stat_mod_folio(folio, NR_FILE_PAGES, nr); __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr); unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); if (!xas_error(&xas)) return 0; folio_clear_swapcache(folio); folio_ref_sub(folio, nr); return xas_error(&xas); } /* * This must be called only on folios that have * been verified to be in the swap cache. */ void __delete_from_swap_cache(struct folio *folio, swp_entry_t entry, void *shadow) { struct address_space *address_space = swap_address_space(entry); int i; long nr = folio_nr_pages(folio); pgoff_t idx = swap_cache_index(entry); XA_STATE(xas, &address_space->i_pages, idx); xas_set_update(&xas, workingset_update_node); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); for (i = 0; i < nr; i++) { void *entry = xas_store(&xas, shadow); VM_BUG_ON_PAGE(entry != folio, entry); xas_next(&xas); } folio->swap.val = 0; folio_clear_swapcache(folio); address_space->nrpages -= nr; __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr); __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr); } /* * This must be called only on folios that have * been verified to be in the swap cache and locked. * It will never put the folio into the free list, * the caller has a reference on the folio. */ void delete_from_swap_cache(struct folio *folio) { swp_entry_t entry = folio->swap; struct address_space *address_space = swap_address_space(entry); xa_lock_irq(&address_space->i_pages); __delete_from_swap_cache(folio, entry, NULL); xa_unlock_irq(&address_space->i_pages); put_swap_folio(folio, entry); folio_ref_sub(folio, folio_nr_pages(folio)); } void clear_shadow_from_swap_cache(int type, unsigned long begin, unsigned long end) { unsigned long curr = begin; void *old; for (;;) { swp_entry_t entry = swp_entry(type, curr); unsigned long index = curr & SWAP_ADDRESS_SPACE_MASK; struct address_space *address_space = swap_address_space(entry); XA_STATE(xas, &address_space->i_pages, index); xas_set_update(&xas, workingset_update_node); xa_lock_irq(&address_space->i_pages); xas_for_each(&xas, old, min(index + (end - curr), SWAP_ADDRESS_SPACE_PAGES)) { if (!xa_is_value(old)) continue; xas_store(&xas, NULL); } xa_unlock_irq(&address_space->i_pages); /* search the next swapcache until we meet end */ curr = ALIGN((curr + 1), SWAP_ADDRESS_SPACE_PAGES); if (curr > end) break; } } /* * If we are the only user, then try to free up the swap cache. 
* * Its ok to check the swapcache flag without the folio lock * here because we are going to recheck again inside * folio_free_swap() _with_ the lock. * - Marcelo */ void free_swap_cache(struct folio *folio) { if (folio_test_swapcache(folio) && !folio_mapped(folio) && folio_trylock(folio)) { folio_free_swap(folio); folio_unlock(folio); } } /* * Freeing a folio and also freeing any swap cache associated with * this folio if it is the last user. */ void free_folio_and_swap_cache(struct folio *folio) { free_swap_cache(folio); if (!is_huge_zero_folio(folio)) folio_put(folio); } /* * Passed an array of pages, drop them all from swapcache and then release * them. They are removed from the LRU and freed if this is their last use. */ void free_pages_and_swap_cache(struct encoded_page **pages, int nr) { struct folio_batch folios; unsigned int refs[PAGEVEC_SIZE]; folio_batch_init(&folios); for (int i = 0; i < nr; i++) { struct folio *folio = page_folio(encoded_page_ptr(pages[i])); free_swap_cache(folio); refs[folios.nr] = 1; if (unlikely(encoded_page_flags(pages[i]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) refs[folios.nr] = encoded_nr_pages(pages[++i]); if (folio_batch_add(&folios, folio) == 0) folios_put_refs(&folios, refs); } if (folios.nr) folios_put_refs(&folios, refs); } static inline bool swap_use_vma_readahead(void) { return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap); } /* * Lookup a swap entry in the swap cache. A found folio will be returned * unlocked and with its refcount incremented - we rely on the kernel * lock getting page table operations atomic even if we drop the folio * lock before returning. * * Caller must lock the swap device or hold a reference to keep it valid. */ struct folio *swap_cache_get_folio(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr) { struct folio *folio; folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); if (!IS_ERR(folio)) { bool vma_ra = swap_use_vma_readahead(); bool readahead; /* * At the moment, we don't support PG_readahead for anon THP * so let's bail out rather than confusing the readahead stat. */ if (unlikely(folio_test_large(folio))) return folio; readahead = folio_test_clear_readahead(folio); if (vma && vma_ra) { unsigned long ra_val; int win, hits; ra_val = GET_SWAP_RA_VAL(vma); win = SWAP_RA_WIN(ra_val); hits = SWAP_RA_HITS(ra_val); if (readahead) hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX); atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(addr, win, hits)); } if (readahead) { count_vm_event(SWAP_RA_HIT); if (!vma || !vma_ra) atomic_inc(&swapin_readahead_hits); } } else { folio = NULL; } return folio; } /** * filemap_get_incore_folio - Find and get a folio from the page or swap caches. * @mapping: The address_space to search. * @index: The page cache index. * * This differs from filemap_get_folio() in that it will also look for the * folio in the swap cache. * * Return: The found folio or %NULL. */ struct folio *filemap_get_incore_folio(struct address_space *mapping, pgoff_t index) { swp_entry_t swp; struct swap_info_struct *si; struct folio *folio = filemap_get_entry(mapping, index); if (!folio) return ERR_PTR(-ENOENT); if (!xa_is_value(folio)) return folio; if (!shmem_mapping(mapping)) return ERR_PTR(-ENOENT); swp = radix_to_swp_entry(folio); /* There might be swapin error entries in shmem mapping. 
*/ if (non_swap_entry(swp)) return ERR_PTR(-ENOENT); /* Prevent swapoff from happening to us */ si = get_swap_device(swp); if (!si) return ERR_PTR(-ENOENT); index = swap_cache_index(swp); folio = filemap_get_folio(swap_address_space(swp), index); put_swap_device(si); return folio; } struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated, bool skip_if_exists) { struct swap_info_struct *si = swp_swap_info(entry); struct folio *folio; struct folio *new_folio = NULL; struct folio *result = NULL; void *shadow = NULL; *new_page_allocated = false; for (;;) { int err; /* * First check the swap cache. Since this is normally * called after swap_cache_get_folio() failed, re-calling * that would confuse statistics. */ folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); if (!IS_ERR(folio)) goto got_folio; /* * Just skip read ahead for unused swap slot. */ if (!swap_entry_swapped(si, entry)) goto put_and_return; /* * Get a new folio to read into from swap. Allocate it now if * new_folio not exist, before marking swap_map SWAP_HAS_CACHE, * when -EEXIST will cause any racers to loop around until we * add it to cache. */ if (!new_folio) { new_folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id()); if (!new_folio) goto put_and_return; } /* * Swap entry may have been freed since our caller observed it. */ err = swapcache_prepare(entry, 1); if (!err) break; else if (err != -EEXIST) goto put_and_return; /* * Protect against a recursive call to __read_swap_cache_async() * on the same entry waiting forever here because SWAP_HAS_CACHE * is set but the folio is not the swap cache yet. This can * happen today if mem_cgroup_swapin_charge_folio() below * triggers reclaim through zswap, which may call * __read_swap_cache_async() in the writeback path. */ if (skip_if_exists) goto put_and_return; /* * We might race against __delete_from_swap_cache(), and * stumble across a swap_map entry whose SWAP_HAS_CACHE * has not yet been cleared. Or race against another * __read_swap_cache_async(), which has set SWAP_HAS_CACHE * in swap_map, but not yet added its folio to swap cache. */ schedule_timeout_uninterruptible(1); } /* * The swap entry is ours to swap in. Prepare the new folio. */ __folio_set_locked(new_folio); __folio_set_swapbacked(new_folio); if (mem_cgroup_swapin_charge_folio(new_folio, NULL, gfp_mask, entry)) goto fail_unlock; /* May fail (-ENOMEM) if XArray node allocation failed. */ if (add_to_swap_cache(new_folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) goto fail_unlock; memcg1_swapin(entry, 1); if (shadow) workingset_refault(new_folio, shadow); /* Caller will initiate read into locked new_folio */ folio_add_lru(new_folio); *new_page_allocated = true; folio = new_folio; got_folio: result = folio; goto put_and_return; fail_unlock: put_swap_folio(new_folio, entry); folio_unlock(new_folio); put_and_return: if (!(*new_page_allocated) && new_folio) folio_put(new_folio); return result; } /* * Locate a page of swap in physical memory, reserving swap cache space * and reading the disk if it is not already cached. * A failure return means that either the page allocation failed or that * the swap entry is no longer in use. * * get/put_swap_device() aren't needed to call this function, because * __read_swap_cache_async() call them and swap_read_folio() holds the * swap cache folio lock. 
*/ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, struct swap_iocb **plug) { struct swap_info_struct *si; bool page_allocated; struct mempolicy *mpol; pgoff_t ilx; struct folio *folio; si = get_swap_device(entry); if (!si) return NULL; mpol = get_vma_policy(vma, addr, 0, &ilx); folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); mpol_cond_put(mpol); if (page_allocated) swap_read_folio(folio, plug); put_swap_device(si); return folio; } static unsigned int __swapin_nr_pages(unsigned long prev_offset, unsigned long offset, int hits, int max_pages, int prev_win) { unsigned int pages, last_ra; /* * This heuristic has been found to work well on both sequential and * random loads, swapping to hard disk or to SSD: please don't ask * what the "+ 2" means, it just happens to work well, that's all. */ pages = hits + 2; if (pages == 2) { /* * We can have no readahead hits to judge by: but must not get * stuck here forever, so check for an adjacent offset instead * (and don't even bother to check whether swap type is same). */ if (offset != prev_offset + 1 && offset != prev_offset - 1) pages = 1; } else { unsigned int roundup = 4; while (roundup < pages) roundup <<= 1; pages = roundup; } if (pages > max_pages) pages = max_pages; /* Don't shrink readahead too fast */ last_ra = prev_win / 2; if (pages < last_ra) pages = last_ra; return pages; } static unsigned long swapin_nr_pages(unsigned long offset) { static unsigned long prev_offset; unsigned int hits, pages, max_pages; static atomic_t last_readahead_pages; max_pages = 1 << READ_ONCE(page_cluster); if (max_pages <= 1) return 1; hits = atomic_xchg(&swapin_readahead_hits, 0); pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits, max_pages, atomic_read(&last_readahead_pages)); if (!hits) WRITE_ONCE(prev_offset, offset); atomic_set(&last_readahead_pages, pages); return pages; } /** * swap_cluster_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory * @gfp_mask: memory allocation flags * @mpol: NUMA memory allocation policy to be applied * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE * * Returns the struct folio for entry and addr, after queueing swapin. * * Primitive swap readahead code. We simply read an aligned block of * (1 << page_cluster) entries in the swap area. This method is chosen * because it doesn't cost us any seek time. We also make sure to queue * the 'original' request together with the readahead ones... * * Note: it is intentional that the same NUMA policy and interleave index * are used for every page of the readahead: neighbouring pages on swap * are fairly likely to have been swapped out from the same node. */ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx) { struct folio *folio; unsigned long entry_offset = swp_offset(entry); unsigned long offset = entry_offset; unsigned long start_offset, end_offset; unsigned long mask; struct swap_info_struct *si = swp_swap_info(entry); struct blk_plug plug; struct swap_iocb *splug = NULL; bool page_allocated; mask = swapin_nr_pages(offset) - 1; if (!mask) goto skip; /* Read a page_cluster sized and aligned cluster around offset. */ start_offset = offset & ~mask; end_offset = offset | mask; if (!start_offset) /* First page is swap header. 
*/ start_offset++; if (end_offset >= si->max) end_offset = si->max - 1; blk_start_plug(&plug); for (offset = start_offset; offset <= end_offset ; offset++) { /* Ok, do the async read-ahead now */ folio = __read_swap_cache_async( swp_entry(swp_type(entry), offset), gfp_mask, mpol, ilx, &page_allocated, false); if (!folio) continue; if (page_allocated) { swap_read_folio(folio, &splug); if (offset != entry_offset) { folio_set_readahead(folio); count_vm_event(SWAP_RA); } } folio_put(folio); } blk_finish_plug(&plug); swap_read_unplug(splug); lru_add_drain(); /* Push any new pages onto the LRU now */ skip: /* The page was likely read above, so no need for plugging here */ folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); if (unlikely(page_allocated)) swap_read_folio(folio, NULL); return folio; } int init_swap_address_space(unsigned int type, unsigned long nr_pages) { struct address_space *spaces, *space; unsigned int i, nr; nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL); if (!spaces) return -ENOMEM; for (i = 0; i < nr; i++) { space = spaces + i; xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ); atomic_set(&space->i_mmap_writable, 0); space->a_ops = &swap_aops; /* swap cache doesn't use writeback related tags */ mapping_set_no_writeback_tags(space); } nr_swapper_spaces[type] = nr; swapper_spaces[type] = spaces; return 0; } void exit_swap_address_space(unsigned int type) { int i; struct address_space *spaces = swapper_spaces[type]; for (i = 0; i < nr_swapper_spaces[type]; i++) VM_WARN_ON_ONCE(!mapping_empty(&spaces[i])); kvfree(spaces); nr_swapper_spaces[type] = 0; swapper_spaces[type] = NULL; } static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start, unsigned long *end) { struct vm_area_struct *vma = vmf->vma; unsigned long ra_val; unsigned long faddr, prev_faddr, left, right; unsigned int max_win, hits, prev_win, win; max_win = 1 << min(READ_ONCE(page_cluster), SWAP_RA_ORDER_CEILING); if (max_win == 1) return 1; faddr = vmf->address; ra_val = GET_SWAP_RA_VAL(vma); prev_faddr = SWAP_RA_ADDR(ra_val); prev_win = SWAP_RA_WIN(ra_val); hits = SWAP_RA_HITS(ra_val); win = __swapin_nr_pages(PFN_DOWN(prev_faddr), PFN_DOWN(faddr), hits, max_win, prev_win); atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(faddr, win, 0)); if (win == 1) return 1; if (faddr == prev_faddr + PAGE_SIZE) left = faddr; else if (prev_faddr == faddr + PAGE_SIZE) left = faddr - (win << PAGE_SHIFT) + PAGE_SIZE; else left = faddr - (((win - 1) / 2) << PAGE_SHIFT); right = left + (win << PAGE_SHIFT); if ((long)left < 0) left = 0; *start = max3(left, vma->vm_start, faddr & PMD_MASK); *end = min3(right, vma->vm_end, (faddr & PMD_MASK) + PMD_SIZE); return win; } /** * swap_vma_readahead - swap in pages in hope we need them soon * @targ_entry: swap entry of the targeted memory * @gfp_mask: memory allocation flags * @mpol: NUMA memory allocation policy to be applied * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE * @vmf: fault information * * Returns the struct folio for entry and addr, after queueing swapin. * * Primitive swap readahead code. We simply read in a few pages whose * virtual addresses are around the fault address in the same vma. * * Caller must hold read mmap_lock if vmf->vma is not NULL. 
 *
 */
static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
		struct mempolicy *mpol, pgoff_t targ_ilx, struct vm_fault *vmf)
{
	struct blk_plug plug;
	struct swap_iocb *splug = NULL;
	struct folio *folio;
	pte_t *pte = NULL, pentry;
	int win;
	unsigned long start, end, addr;
	swp_entry_t entry;
	pgoff_t ilx;
	bool page_allocated;

	win = swap_vma_ra_win(vmf, &start, &end);
	if (win == 1)
		goto skip;

	ilx = targ_ilx - PFN_DOWN(vmf->address - start);

	blk_start_plug(&plug);
	for (addr = start; addr < end; ilx++, addr += PAGE_SIZE) {
		if (!pte++) {
			pte = pte_offset_map(vmf->pmd, addr);
			if (!pte)
				break;
		}
		pentry = ptep_get_lockless(pte);
		if (!is_swap_pte(pentry))
			continue;
		entry = pte_to_swp_entry(pentry);
		if (unlikely(non_swap_entry(entry)))
			continue;
		pte_unmap(pte);
		pte = NULL;
		folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
						&page_allocated, false);
		if (!folio)
			continue;
		if (page_allocated) {
			swap_read_folio(folio, &splug);
			if (addr != vmf->address) {
				folio_set_readahead(folio);
				count_vm_event(SWAP_RA);
			}
		}
		folio_put(folio);
	}
	if (pte)
		pte_unmap(pte);
	blk_finish_plug(&plug);
	swap_read_unplug(splug);
	lru_add_drain();
skip:
	/* The folio was likely read above, so no need for plugging here */
	folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx,
					&page_allocated, false);
	if (unlikely(page_allocated))
		swap_read_folio(folio, NULL);
	return folio;
}

/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct folio for entry and addr, after queueing swapin.
 *
 * This is the main entry point for swap readahead. Depending on the
 * configuration, it reads ahead either cluster-based (i.e. aligned blocks
 * of the physical swap area) or vma-based (i.e. virtual addresses around
 * the faulting address).
 */
struct folio *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
				struct vm_fault *vmf)
{
	struct mempolicy *mpol;
	pgoff_t ilx;
	struct folio *folio;

	mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
	folio = swap_use_vma_readahead() ?
		swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) :
		swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
	mpol_cond_put(mpol);

	return folio;
}

#ifdef CONFIG_SYSFS
static ssize_t vma_ra_enabled_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n", str_true_false(enable_vma_readahead));
}

static ssize_t vma_ra_enabled_store(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    const char *buf, size_t count)
{
	ssize_t ret;

	ret = kstrtobool(buf, &enable_vma_readahead);
	if (ret)
		return ret;

	return count;
}

static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled);

static struct attribute *swap_attrs[] = {
	&vma_ra_enabled_attr.attr,
	NULL,
};

static const struct attribute_group swap_attr_group = {
	.attrs = swap_attrs,
};

static int __init swap_init_sysfs(void)
{
	int err;
	struct kobject *swap_kobj;

	swap_kobj = kobject_create_and_add("swap", mm_kobj);
	if (!swap_kobj) {
		pr_err("failed to create swap kobject\n");
		return -ENOMEM;
	}
	err = sysfs_create_group(swap_kobj, &swap_attr_group);
	if (err) {
		pr_err("failed to register swap group\n");
		goto delete_obj;
	}
	return 0;

delete_obj:
	kobject_put(swap_kobj);
	return err;
}
subsys_initcall(swap_init_sysfs);
#endif
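/*
 * Editor's illustration, not part of swap_state.c: a userspace-only mock
 * that round-trips the addr/window/hits packing which the SWAP_RA_* macros
 * above fold into vma->swap_readahead_info. PAGE_SHIFT is assumed to be 12
 * (4 KiB pages) purely for the demonstration. At runtime, vma-based
 * readahead itself is toggled via /sys/kernel/mm/swap/vma_ra_enabled,
 * per the sysfs attribute registered above.
 */
#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT		12			/* assumption: 4 KiB pages */
#define PAGE_MASK		(~((1UL << PAGE_SHIFT) - 1))
#define SWAP_RA_WIN_SHIFT	(PAGE_SHIFT / 2)
#define SWAP_RA_HITS_MASK	((1UL << SWAP_RA_WIN_SHIFT) - 1)
#define SWAP_RA_WIN_MASK	(~PAGE_MASK & ~SWAP_RA_HITS_MASK)

#define SWAP_RA_HITS(v)	((v) & SWAP_RA_HITS_MASK)
#define SWAP_RA_WIN(v)	(((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
#define SWAP_RA_ADDR(v)	((v) & PAGE_MASK)

#define SWAP_RA_VAL(addr, win, hits)				\
	(((addr) & PAGE_MASK) |					\
	 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |	\
	 ((hits) & SWAP_RA_HITS_MASK))

int main(void)
{
	unsigned long v = SWAP_RA_VAL(0x7f1234567000UL, 8, 3);

	/* All three fields survive packing into the one word. */
	assert(SWAP_RA_ADDR(v) == 0x7f1234567000UL);
	assert(SWAP_RA_WIN(v) == 8);
	assert(SWAP_RA_HITS(v) == 3);
	printf("addr=%#lx win=%lu hits=%lu\n",
	       SWAP_RA_ADDR(v), SWAP_RA_WIN(v), SWAP_RA_HITS(v));
	return 0;
}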
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PTRACE_H
#define _LINUX_PTRACE_H

#include <linux/compiler.h>		/* For unlikely. */
#include <linux/sched.h>		/* For struct task_struct. */
#include <linux/sched/signal.h>		/* For send_sig(), same_thread_group(), etc. */
#include <linux/err.h>			/* for IS_ERR_VALUE */
#include <linux/bug.h>			/* For BUG_ON. */
#include <linux/pid_namespace.h>	/* For task_active_pid_ns. */
#include <uapi/linux/ptrace.h>
#include <linux/seccomp.h>

/* Add sp to seccomp_data, as seccomp is user API, we don't want to modify it */
struct syscall_info {
	__u64			sp;
	struct seccomp_data	data;
};

extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
			    void *buf, int len, unsigned int gup_flags);

/*
 * Ptrace flags
 *
 * The ownership rules for task->ptrace, which holds the ptrace flags,
 * are simple.  When a task is running it owns its own task->ptrace
 * flags.  When a task is stopped the ptracer owns task->ptrace.
 */
#define PT_SEIZED	0x00010000	/* SEIZE used, enable new behavior */
#define PT_PTRACED	0x00000001

#define PT_OPT_FLAG_SHIFT	3
/* PT_TRACE_* event enable flags */
#define PT_EVENT_FLAG(event)	(1 << (PT_OPT_FLAG_SHIFT + (event)))
#define PT_TRACESYSGOOD		PT_EVENT_FLAG(0)
#define PT_TRACE_FORK		PT_EVENT_FLAG(PTRACE_EVENT_FORK)
#define PT_TRACE_VFORK		PT_EVENT_FLAG(PTRACE_EVENT_VFORK)
#define PT_TRACE_CLONE		PT_EVENT_FLAG(PTRACE_EVENT_CLONE)
#define PT_TRACE_EXEC		PT_EVENT_FLAG(PTRACE_EVENT_EXEC)
#define PT_TRACE_VFORK_DONE	PT_EVENT_FLAG(PTRACE_EVENT_VFORK_DONE)
#define PT_TRACE_EXIT		PT_EVENT_FLAG(PTRACE_EVENT_EXIT)
#define PT_TRACE_SECCOMP	PT_EVENT_FLAG(PTRACE_EVENT_SECCOMP)

#define PT_EXITKILL		(PTRACE_O_EXITKILL << PT_OPT_FLAG_SHIFT)
#define PT_SUSPEND_SECCOMP	(PTRACE_O_SUSPEND_SECCOMP << PT_OPT_FLAG_SHIFT)

extern long arch_ptrace(struct task_struct *child, long request,
			unsigned long addr, unsigned long data);
extern int ptrace_readdata(struct task_struct *tsk, unsigned long src,
			   char __user *dst, int len);
extern int ptrace_writedata(struct task_struct *tsk, char __user *src,
			    unsigned long dst, int len);
extern void ptrace_disable(struct task_struct *);
extern int ptrace_request(struct task_struct *child, long request,
			  unsigned long addr, unsigned long data);
extern int ptrace_notify(int exit_code, unsigned long message);
extern void __ptrace_link(struct task_struct *child,
			  struct task_struct *new_parent,
			  const struct cred *ptracer_cred);
extern void __ptrace_unlink(struct task_struct *child);
extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead);

#define PTRACE_MODE_READ	0x01
#define PTRACE_MODE_ATTACH	0x02
#define PTRACE_MODE_NOAUDIT	0x04
#define PTRACE_MODE_FSCREDS	0x08
#define PTRACE_MODE_REALCREDS	0x10

/* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */
#define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS)
#define PTRACE_MODE_READ_REALCREDS (PTRACE_MODE_READ | PTRACE_MODE_REALCREDS)
#define PTRACE_MODE_ATTACH_FSCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS)
#define PTRACE_MODE_ATTACH_REALCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS)

/**
 * ptrace_may_access - check whether the caller is permitted to access
 * a target task.
 * @task: target task
 * @mode: selects type of access and caller credentials
 *
 * Returns true on success, false on denial.
 *
 * One of the flags PTRACE_MODE_FSCREDS and PTRACE_MODE_REALCREDS must
 * be set in @mode to specify whether the access was requested through
 * a filesystem syscall (should use effective capabilities and fsuid
 * of the caller) or through an explicit syscall such as
 * process_vm_writev or ptrace (and should use the real credentials).
 */
extern bool ptrace_may_access(struct task_struct *task, unsigned int mode);

static inline int ptrace_reparented(struct task_struct *child)
{
	return !same_thread_group(child->real_parent, child->parent);
}

static inline void ptrace_unlink(struct task_struct *child)
{
	if (unlikely(child->ptrace))
		__ptrace_unlink(child);
}

int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
			    unsigned long data);
int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
			    unsigned long data);

/**
 * ptrace_parent - return the task that is tracing the given task
 * @task: task to consider
 *
 * Returns %NULL if no one is tracing @task, or the &struct task_struct
 * pointer to its tracer.
 *
 * Must be called under rcu_read_lock().  The pointer returned might be kept
 * live only by RCU.
During exec, this may be called with task_lock() held * on @task, still held from when check_unsafe_exec() was called. */ static inline struct task_struct *ptrace_parent(struct task_struct *task) { if (unlikely(task->ptrace)) return rcu_dereference(task->parent); return NULL; } /** * ptrace_event_enabled - test whether a ptrace event is enabled * @task: ptracee of interest * @event: %PTRACE_EVENT_* to test * * Test whether @event is enabled for ptracee @task. * * Returns %true if @event is enabled, %false otherwise. */ static inline bool ptrace_event_enabled(struct task_struct *task, int event) { return task->ptrace & PT_EVENT_FLAG(event); } /** * ptrace_event - possibly stop for a ptrace event notification * @event: %PTRACE_EVENT_* value to report * @message: value for %PTRACE_GETEVENTMSG to return * * Check whether @event is enabled and, if so, report @event and @message * to the ptrace parent. * * Called without locks. */ static inline void ptrace_event(int event, unsigned long message) { if (unlikely(ptrace_event_enabled(current, event))) { ptrace_notify((event << 8) | SIGTRAP, message); } else if (event == PTRACE_EVENT_EXEC) { /* legacy EXEC report via SIGTRAP */ if ((current->ptrace & (PT_PTRACED|PT_SEIZED)) == PT_PTRACED) send_sig(SIGTRAP, current, 0); } } /** * ptrace_event_pid - possibly stop for a ptrace event notification * @event: %PTRACE_EVENT_* value to report * @pid: process identifier for %PTRACE_GETEVENTMSG to return * * Check whether @event is enabled and, if so, report @event and @pid * to the ptrace parent. @pid is reported as the pid_t seen from the * ptrace parent's pid namespace. * * Called without locks. */ static inline void ptrace_event_pid(int event, struct pid *pid) { /* * FIXME: There's a potential race if a ptracer in a different pid * namespace than parent attaches between computing message below and * when we acquire tasklist_lock in ptrace_stop(). If this happens, * the ptracer will get a bogus pid from PTRACE_GETEVENTMSG. */ unsigned long message = 0; struct pid_namespace *ns; rcu_read_lock(); ns = task_active_pid_ns(rcu_dereference(current->parent)); if (ns) message = pid_nr_ns(pid, ns); rcu_read_unlock(); ptrace_event(event, message); } /** * ptrace_init_task - initialize ptrace state for a new child * @child: new child task * @ptrace: true if child should be ptrace'd by parent's tracer * * This is called immediately after adding @child to its parent's children * list. @ptrace is false in the normal case, and true to ptrace @child. * * Called with current's siglock and write_lock_irq(&tasklist_lock) held. */ static inline void ptrace_init_task(struct task_struct *child, bool ptrace) { INIT_LIST_HEAD(&child->ptrace_entry); INIT_LIST_HEAD(&child->ptraced); child->jobctl = 0; child->ptrace = 0; child->parent = child->real_parent; if (unlikely(ptrace) && current->ptrace) { child->ptrace = current->ptrace; __ptrace_link(child, current->parent, current->ptracer_cred); if (child->ptrace & PT_SEIZED) task_set_jobctl_pending(child, JOBCTL_TRAP_STOP); else sigaddset(&child->pending.signal, SIGSTOP); } else child->ptracer_cred = NULL; } /** * ptrace_release_task - final ptrace-related cleanup of a zombie being reaped * @task: task in %EXIT_DEAD state * * Called with write_lock(&tasklist_lock) held. 
*/ static inline void ptrace_release_task(struct task_struct *task) { BUG_ON(!list_empty(&task->ptraced)); ptrace_unlink(task); BUG_ON(!list_empty(&task->ptrace_entry)); } #ifndef force_successful_syscall_return /* * System call handlers that, upon successful completion, need to return a * negative value should call force_successful_syscall_return() right before * returning. On architectures where the syscall convention provides for a * separate error flag (e.g., alpha, ia64, ppc{,64}, sparc{,64}, possibly * others), this macro can be used to ensure that the error flag will not get * set. On architectures which do not support a separate error flag, the macro * is a no-op and the spurious error condition needs to be filtered out by some * other means (e.g., in user-level, by passing an extra argument to the * syscall handler, or something along those lines). */ #define force_successful_syscall_return() do { } while (0) #endif #ifndef is_syscall_success /* * On most systems we can tell if a syscall is a success based on if the retval * is an error value. On some systems like ia64 and powerpc they have different * indicators of success/failure and must define their own. */ #define is_syscall_success(regs) (!IS_ERR_VALUE((unsigned long)(regs_return_value(regs)))) #endif /* * <asm/ptrace.h> should define the following things inside #ifdef __KERNEL__. * * These do-nothing inlines are used when the arch does not * implement single-step. The kerneldoc comments are here * to document the interface for all arch definitions. */ #ifndef arch_has_single_step /** * arch_has_single_step - does this CPU support user-mode single-step? * * If this is defined, then there must be function declarations or * inlines for user_enable_single_step() and user_disable_single_step(). * arch_has_single_step() should evaluate to nonzero iff the machine * supports instruction single-step for user mode. * It can be a constant or it can test a CPU feature bit. */ #define arch_has_single_step() (0) /** * user_enable_single_step - single-step in user-mode task * @task: either current or a task stopped in %TASK_TRACED * * This can only be called when arch_has_single_step() has returned nonzero. * Set @task so that when it returns to user mode, it will trap after the * next single instruction executes. If arch_has_block_step() is defined, * this must clear the effects of user_enable_block_step() too. */ static inline void user_enable_single_step(struct task_struct *task) { BUG(); /* This can never be called. */ } /** * user_disable_single_step - cancel user-mode single-step * @task: either current or a task stopped in %TASK_TRACED * * Clear @task of the effects of user_enable_single_step() and * user_enable_block_step(). This can be called whether or not either * of those was ever called on @task, and even if arch_has_single_step() * returned zero. */ static inline void user_disable_single_step(struct task_struct *task) { } #else extern void user_enable_single_step(struct task_struct *); extern void user_disable_single_step(struct task_struct *); #endif /* arch_has_single_step */ #ifndef arch_has_block_step /** * arch_has_block_step - does this CPU support user-mode block-step? * * If this is defined, then there must be a function declaration or inline * for user_enable_block_step(), and arch_has_single_step() must be defined * too. arch_has_block_step() should evaluate to nonzero iff the machine * supports step-until-branch for user mode. It can be a constant or it * can test a CPU feature bit. 
*/ #define arch_has_block_step() (0) /** * user_enable_block_step - step until branch in user-mode task * @task: either current or a task stopped in %TASK_TRACED * * This can only be called when arch_has_block_step() has returned nonzero, * and will never be called when single-instruction stepping is being used. * Set @task so that when it returns to user mode, it will trap after the * next branch or trap taken. */ static inline void user_enable_block_step(struct task_struct *task) { BUG(); /* This can never be called. */ } #else extern void user_enable_block_step(struct task_struct *); #endif /* arch_has_block_step */ #ifdef ARCH_HAS_USER_SINGLE_STEP_REPORT extern void user_single_step_report(struct pt_regs *regs); #else static inline void user_single_step_report(struct pt_regs *regs) { kernel_siginfo_t info; clear_siginfo(&info); info.si_signo = SIGTRAP; info.si_errno = 0; info.si_code = SI_USER; info.si_pid = 0; info.si_uid = 0; force_sig_info(&info); } #endif #ifndef arch_ptrace_stop_needed /** * arch_ptrace_stop_needed - Decide whether arch_ptrace_stop() should be called * * This is called with the siglock held, to decide whether or not it's * necessary to release the siglock and call arch_ptrace_stop(). It can be * defined to a constant if arch_ptrace_stop() is never required, or always * is. On machines where this makes sense, it should be defined to a quick * test to optimize out calling arch_ptrace_stop() when it would be * superfluous. For example, if the thread has not been back to user mode * since the last stop, the thread state might indicate that nothing needs * to be done. * * This is guaranteed to be invoked once before a task stops for ptrace and * may include arch-specific operations necessary prior to a ptrace stop. */ #define arch_ptrace_stop_needed() (0) #endif #ifndef arch_ptrace_stop /** * arch_ptrace_stop - Do machine-specific work before stopping for ptrace * * This is called with no locks held when arch_ptrace_stop_needed() has * just returned nonzero. It is allowed to block, e.g. for user memory * access. The arch can have machine-specific work to be done before * ptrace stops. On ia64, register backing store gets written back to user * memory here. Since this can be costly (requires dropping the siglock), * we only do it when the arch requires it for this particular stop, as * indicated by arch_ptrace_stop_needed(). */ #define arch_ptrace_stop() do { } while (0) #endif #ifndef current_pt_regs #define current_pt_regs() task_pt_regs(current) #endif #ifndef current_user_stack_pointer #define current_user_stack_pointer() user_stack_pointer(current_pt_regs()) #endif #ifndef exception_ip #define exception_ip(x) instruction_pointer(x) #endif extern int task_current_syscall(struct task_struct *target, struct syscall_info *info); extern void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact); /* * ptrace report for syscall entry and exit looks identical. */ static inline int ptrace_report_syscall(unsigned long message) { int ptrace = current->ptrace; int signr; if (!(ptrace & PT_PTRACED)) return 0; signr = ptrace_notify(SIGTRAP | ((ptrace & PT_TRACESYSGOOD) ? 0x80 : 0), message); /* * this isn't the same as continuing with a signal, but it will do * for normal use. strace only continues with a signal if the * stopping signal is not SIGTRAP. 
 -brl
	 */
	if (signr)
		send_sig(signr, current, 1);
	return fatal_signal_pending(current);
}

/**
 * ptrace_report_syscall_entry - task is about to attempt a system call
 * @regs: user register state of current task
 *
 * This will be called if %SYSCALL_WORK_SYSCALL_TRACE or
 * %SYSCALL_WORK_SYSCALL_EMU have been set, when the current task has just
 * entered the kernel for a system call.  Full user register state is
 * available here.  Changing the values in @regs can affect the system
 * call number and arguments to be tried.  It is safe to block here,
 * preventing the system call from beginning.
 *
 * Returns zero normally, or nonzero if the calling arch code should abort
 * the system call.  That must prevent normal entry so no system call is
 * made.  If @task ever returns to user mode after this, its register state
 * is unspecified, but should be something harmless like an %ENOSYS error
 * return.  It should preserve enough information so that syscall_rollback()
 * can work (see asm-generic/syscall.h).
 *
 * Called without locks, just after entering kernel mode.
 */
static inline __must_check int ptrace_report_syscall_entry(
	struct pt_regs *regs)
{
	return ptrace_report_syscall(PTRACE_EVENTMSG_SYSCALL_ENTRY);
}

/**
 * ptrace_report_syscall_exit - task has just finished a system call
 * @regs: user register state of current task
 * @step: nonzero if simulating single-step or block-step
 *
 * This will be called if %SYSCALL_WORK_SYSCALL_TRACE has been set, when
 * the current task has just finished an attempted system call.  Full
 * user register state is available here.  It is safe to block here,
 * preventing signals from being processed.
 *
 * If @step is nonzero, this report is also in lieu of the normal
 * trap that would follow the system call instruction because
 * user_enable_block_step() or user_enable_single_step() was used.
 * In this case, %SYSCALL_WORK_SYSCALL_TRACE might not be set.
 *
 * Called without locks, just before checking for pending signals.
 */
static inline void ptrace_report_syscall_exit(struct pt_regs *regs, int step)
{
	if (step)
		user_single_step_report(regs);
	else
		ptrace_report_syscall(PTRACE_EVENTMSG_SYSCALL_EXIT);
}
#endif
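/*
 * Editor's illustration, not part of this header: from userspace, the
 * SIGTRAP | 0x80 stop signal produced via PT_TRACESYSGOOD above is what a
 * tracer observes after setting PTRACE_O_TRACESYSGOOD, letting it tell
 * syscall stops apart from genuine SIGTRAPs. A minimal tracer sketch,
 * tracing "true" and error handling omitted for brevity:
 */
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/wait.h>

int main(void)
{
	pid_t pid = fork();

	if (pid == 0) {
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);		/* wait for the parent to set options */
		execlp("true", "true", (char *)NULL);
		_exit(127);
	}

	int status;
	waitpid(pid, &status, 0);	/* the initial SIGSTOP */
	ptrace(PTRACE_SETOPTIONS, pid, NULL, (void *)PTRACE_O_TRACESYSGOOD);

	for (;;) {
		ptrace(PTRACE_SYSCALL, pid, NULL, NULL);
		waitpid(pid, &status, 0);
		if (WIFEXITED(status))
			break;
		/* 0x80 marks a syscall entry/exit stop, per PT_TRACESYSGOOD */
		if (WIFSTOPPED(status) && WSTOPSIG(status) == (SIGTRAP | 0x80))
			printf("syscall stop\n");
	}
	return 0;
}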
// SPDX-License-Identifier: GPL-2.0-only
/* Kernel thread helper functions.
 *   Copyright (C) 2004 IBM Corporation, Rusty Russell.
 *   Copyright (C) 2009 Red Hat, Inc.
 *
 * Creation is done via kthreadd, so that we get a clean environment
 * even if we're invoked from userspace (think modprobe, hotplug cpu,
 * etc.).
 */
#include <uapi/linux/sched/types.h>
#include <linux/mm.h>
#include <linux/mmu_context.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/err.h>
#include <linux/cgroup.h>
#include <linux/cpuset.h>
#include <linux/unistd.h>
#include <linux/file.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/freezer.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
#include <linux/numa.h>
#include <linux/sched/isolation.h>
#include <trace/events/sched.h>

static DEFINE_SPINLOCK(kthread_create_lock);
static LIST_HEAD(kthread_create_list);
struct task_struct *kthreadd_task;

static LIST_HEAD(kthreads_hotplug);
static DEFINE_MUTEX(kthreads_hotplug_lock);

struct kthread_create_info {
	/* Information passed to kthread() from kthreadd. */
	char *full_name;
	int (*threadfn)(void *data);
	void *data;
	int node;

	/* Result passed back to kthread_create() from kthreadd. */
	struct task_struct *result;
	struct completion *done;

	struct list_head list;
};

struct kthread {
	unsigned long flags;
	unsigned int cpu;
	unsigned int node;
	int started;
	int result;
	int (*threadfn)(void *);
	void *data;
	struct completion parked;
	struct completion exited;
#ifdef CONFIG_BLK_CGROUP
	struct cgroup_subsys_state *blkcg_css;
#endif
	/* To store the full name if task comm is truncated. */
	char *full_name;
	struct task_struct *task;
	struct list_head hotplug_node;
	struct cpumask *preferred_affinity;
};

enum KTHREAD_BITS {
	KTHREAD_IS_PER_CPU = 0,
	KTHREAD_SHOULD_STOP,
	KTHREAD_SHOULD_PARK,
};

static inline struct kthread *to_kthread(struct task_struct *k)
{
	WARN_ON(!(k->flags & PF_KTHREAD));
	return k->worker_private;
}

/*
 * Variant of to_kthread() that doesn't assume @p is a kthread.
 *
 * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will
 * always remain a kthread.  For kthreads p->worker_private always
 * points to a struct kthread.  For tasks that are not kthreads
 * p->worker_private is used to point to other things.
 *
 * Return NULL for any task that is not a kthread.
*/ static inline struct kthread *__to_kthread(struct task_struct *p) { void *kthread = p->worker_private; if (kthread && !(p->flags & PF_KTHREAD)) kthread = NULL; return kthread; } void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk) { struct kthread *kthread = to_kthread(tsk); if (!kthread || !kthread->full_name) { strscpy(buf, tsk->comm, buf_size); return; } strscpy_pad(buf, kthread->full_name, buf_size); } bool set_kthread_struct(struct task_struct *p) { struct kthread *kthread; if (WARN_ON_ONCE(to_kthread(p))) return false; kthread = kzalloc(sizeof(*kthread), GFP_KERNEL); if (!kthread) return false; init_completion(&kthread->exited); init_completion(&kthread->parked); INIT_LIST_HEAD(&kthread->hotplug_node); p->vfork_done = &kthread->exited; kthread->task = p; kthread->node = tsk_fork_get_node(current); p->worker_private = kthread; return true; } void free_kthread_struct(struct task_struct *k) { struct kthread *kthread; /* * Can be NULL if kmalloc() in set_kthread_struct() failed. */ kthread = to_kthread(k); if (!kthread) return; #ifdef CONFIG_BLK_CGROUP WARN_ON_ONCE(kthread->blkcg_css); #endif k->worker_private = NULL; kfree(kthread->full_name); kfree(kthread); } /** * kthread_should_stop - should this kthread return now? * * When someone calls kthread_stop() on your kthread, it will be woken * and this will return true. You should then return, and your return * value will be passed through to kthread_stop(). */ bool kthread_should_stop(void) { return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags); } EXPORT_SYMBOL(kthread_should_stop); static bool __kthread_should_park(struct task_struct *k) { return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags); } /** * kthread_should_park - should this kthread park now? * * When someone calls kthread_park() on your kthread, it will be woken * and this will return true. You should then do the necessary * cleanup and call kthread_parkme() * * Similar to kthread_should_stop(), but this keeps the thread alive * and in a park position. kthread_unpark() "restarts" the thread and * calls the thread function again. */ bool kthread_should_park(void) { return __kthread_should_park(current); } EXPORT_SYMBOL_GPL(kthread_should_park); bool kthread_should_stop_or_park(void) { struct kthread *kthread = __to_kthread(current); if (!kthread) return false; return kthread->flags & (BIT(KTHREAD_SHOULD_STOP) | BIT(KTHREAD_SHOULD_PARK)); } /** * kthread_freezable_should_stop - should this freezable kthread return now? * @was_frozen: optional out parameter, indicates whether %current was frozen * * kthread_should_stop() for freezable kthreads, which will enter * refrigerator if necessary. This function is safe from kthread_stop() / * freezer deadlock and freezable kthreads should use this function instead * of calling try_to_freeze() directly. */ bool kthread_freezable_should_stop(bool *was_frozen) { bool frozen = false; might_sleep(); if (unlikely(freezing(current))) frozen = __refrigerator(true); if (was_frozen) *was_frozen = frozen; return kthread_should_stop(); } EXPORT_SYMBOL_GPL(kthread_freezable_should_stop); /** * kthread_func - return the function specified on kthread creation * @task: kthread task in question * * Returns NULL if the task is not a kthread. 
*/ void *kthread_func(struct task_struct *task) { struct kthread *kthread = __to_kthread(task); if (kthread) return kthread->threadfn; return NULL; } EXPORT_SYMBOL_GPL(kthread_func); /** * kthread_data - return data value specified on kthread creation * @task: kthread task in question * * Return the data value specified when kthread @task was created. * The caller is responsible for ensuring the validity of @task when * calling this function. */ void *kthread_data(struct task_struct *task) { return to_kthread(task)->data; } EXPORT_SYMBOL_GPL(kthread_data); /** * kthread_probe_data - speculative version of kthread_data() * @task: possible kthread task in question * * @task could be a kthread task. Return the data value specified when it * was created if accessible. If @task isn't a kthread task or its data is * inaccessible for any reason, %NULL is returned. This function requires * that @task itself is safe to dereference. */ void *kthread_probe_data(struct task_struct *task) { struct kthread *kthread = __to_kthread(task); void *data = NULL; if (kthread) copy_from_kernel_nofault(&data, &kthread->data, sizeof(data)); return data; } static void __kthread_parkme(struct kthread *self) { for (;;) { /* * TASK_PARKED is a special state; we must serialize against * possible pending wakeups to avoid store-store collisions on * task->state. * * Such a collision might possibly result in the task state * changing from TASK_PARKED and us failing the * wait_task_inactive() in kthread_park(). */ set_special_state(TASK_PARKED); if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) break; /* * Thread is going to call schedule(), do not preempt it, * or the caller of kthread_park() may spend more time in * wait_task_inactive(). */ preempt_disable(); complete(&self->parked); schedule_preempt_disabled(); preempt_enable(); } __set_current_state(TASK_RUNNING); } void kthread_parkme(void) { __kthread_parkme(to_kthread(current)); } EXPORT_SYMBOL_GPL(kthread_parkme); /** * kthread_exit - Cause the current kthread to return @result to kthread_stop(). * @result: The integer value to return to kthread_stop(). * * While kthread_exit can be called directly, it exists so that * functions which do some additional work in non-modular code such as * module_put_and_kthread_exit can be implemented. * * Does not return. */ void __noreturn kthread_exit(long result) { struct kthread *kthread = to_kthread(current); kthread->result = result; if (!list_empty(&kthread->hotplug_node)) { mutex_lock(&kthreads_hotplug_lock); list_del(&kthread->hotplug_node); mutex_unlock(&kthreads_hotplug_lock); if (kthread->preferred_affinity) { kfree(kthread->preferred_affinity); kthread->preferred_affinity = NULL; } } do_exit(0); } EXPORT_SYMBOL(kthread_exit); /** * kthread_complete_and_exit - Exit the current kthread. * @comp: Completion to complete * @code: The integer value to return to kthread_stop(). * * If present, complete @comp and then return @code to kthread_stop(). * * A kernel thread whose module may be removed after the completion of * @comp can use this function to exit safely. * * Does not return.
*/ void __noreturn kthread_complete_and_exit(struct completion *comp, long code) { if (comp) complete(comp); kthread_exit(code); } EXPORT_SYMBOL(kthread_complete_and_exit); static void kthread_fetch_affinity(struct kthread *kthread, struct cpumask *cpumask) { const struct cpumask *pref; if (kthread->preferred_affinity) { pref = kthread->preferred_affinity; } else { if (WARN_ON_ONCE(kthread->node == NUMA_NO_NODE)) return; pref = cpumask_of_node(kthread->node); } cpumask_and(cpumask, pref, housekeeping_cpumask(HK_TYPE_KTHREAD)); if (cpumask_empty(cpumask)) cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_KTHREAD)); } static void kthread_affine_node(void) { struct kthread *kthread = to_kthread(current); cpumask_var_t affinity; WARN_ON_ONCE(kthread_is_per_cpu(current)); if (kthread->node == NUMA_NO_NODE) { housekeeping_affine(current, HK_TYPE_KTHREAD); } else { if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) { WARN_ON_ONCE(1); return; } mutex_lock(&kthreads_hotplug_lock); WARN_ON_ONCE(!list_empty(&kthread->hotplug_node)); list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); /* * The node cpumask is racy when read from kthread() but: * - a racing CPU going down will either fail on the subsequent * call to set_cpus_allowed_ptr() or be migrated to housekeepers * afterwards by the scheduler. * - a racing CPU going up will be handled by kthreads_online_cpu() */ kthread_fetch_affinity(kthread, affinity); set_cpus_allowed_ptr(current, affinity); mutex_unlock(&kthreads_hotplug_lock); free_cpumask_var(affinity); } } static int kthread(void *_create) { static const struct sched_param param = { .sched_priority = 0 }; /* Copy data: it's on kthread's stack */ struct kthread_create_info *create = _create; int (*threadfn)(void *data) = create->threadfn; void *data = create->data; struct completion *done; struct kthread *self; int ret; self = to_kthread(current); /* Release the structure when the caller is killed by a fatal signal. */ done = xchg(&create->done, NULL); if (!done) { kfree(create->full_name); kfree(create); kthread_exit(-EINTR); } self->full_name = create->full_name; self->threadfn = threadfn; self->data = data; /* * The new thread inherited kthreadd's priority and CPU mask. Reset * back to default in case they have been changed. */ sched_setscheduler_nocheck(current, SCHED_NORMAL, &param); /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); create->result = current; /* * Thread is going to call schedule(), do not preempt it, * or the creator may spend more time in wait_task_inactive(). */ preempt_disable(); complete(done); schedule_preempt_disabled(); preempt_enable(); self->started = 1; if (!(current->flags & PF_NO_SETAFFINITY) && !self->preferred_affinity) kthread_affine_node(); ret = -EINTR; if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { cgroup_kthread_ready(); __kthread_parkme(self); ret = threadfn(data); } kthread_exit(ret); } /* Called from kernel_clone() to get node information for the task about to be created. */ int tsk_fork_get_node(struct task_struct *tsk) { #ifdef CONFIG_NUMA if (tsk == kthreadd_task) return tsk->pref_node_fork; #endif return NUMA_NO_NODE; } static void create_kthread(struct kthread_create_info *create) { int pid; #ifdef CONFIG_NUMA current->pref_node_fork = create->node; #endif /* We want our own signal handler (we take no signals by default). */ pid = kernel_thread(kthread, create, create->full_name, CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) { /* Release the structure when the caller is killed by a fatal signal.
*/ struct completion *done = xchg(&create->done, NULL); kfree(create->full_name); if (!done) { kfree(create); return; } create->result = ERR_PTR(pid); complete(done); } } static __printf(4, 0) struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], va_list args) { DECLARE_COMPLETION_ONSTACK(done); struct task_struct *task; struct kthread_create_info *create = kmalloc(sizeof(*create), GFP_KERNEL); if (!create) return ERR_PTR(-ENOMEM); create->threadfn = threadfn; create->data = data; create->node = node; create->done = &done; create->full_name = kvasprintf(GFP_KERNEL, namefmt, args); if (!create->full_name) { task = ERR_PTR(-ENOMEM); goto free_create; } spin_lock(&kthread_create_lock); list_add_tail(&create->list, &kthread_create_list); spin_unlock(&kthread_create_lock); wake_up_process(kthreadd_task); /* * Wait for completion in killable state, for I might be chosen by * the OOM killer while kthreadd is trying to allocate memory for * new kernel thread. */ if (unlikely(wait_for_completion_killable(&done))) { /* * If I was killed by a fatal signal before kthreadd (or new * kernel thread) calls complete(), leave the cleanup of this * structure to that thread. */ if (xchg(&create->done, NULL)) return ERR_PTR(-EINTR); /* * kthreadd (or new kernel thread) will call complete() * shortly. */ wait_for_completion(&done); } task = create->result; free_create: kfree(create); return task; } /** * kthread_create_on_node - create a kthread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @node: task and thread structures for the thread are allocated on this node * @namefmt: printf-style name for the thread. * * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and * is affine to all CPUs. * * If thread is going to be bound on a particular cpu, give its node * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. * When woken, the thread will run @threadfn() with @data as its * argument. @threadfn() can either return directly if it is a * standalone thread for which no one will call kthread_stop(), or * return when 'kthread_should_stop()' is true (which means * kthread_stop() has been called). The return value should be zero * or a negative error number; it will be passed to kthread_stop(). * * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). */ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], ...) { struct task_struct *task; va_list args; va_start(args, namefmt); task = __kthread_create_on_node(threadfn, data, node, namefmt, args); va_end(args); return task; } EXPORT_SYMBOL(kthread_create_on_node); static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state) { unsigned long flags; if (!wait_task_inactive(p, state)) { WARN_ON(1); return; } /* It's safe because the task is inactive. 
*/ raw_spin_lock_irqsave(&p->pi_lock, flags); do_set_cpus_allowed(p, mask); p->flags |= PF_NO_SETAFFINITY; raw_spin_unlock_irqrestore(&p->pi_lock, flags); } static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state) { __kthread_bind_mask(p, cpumask_of(cpu), state); } void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask) { struct kthread *kthread = to_kthread(p); __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE); WARN_ON_ONCE(kthread->started); } /** * kthread_bind - bind a just-created kthread to a cpu. * @p: thread created by kthread_create(). * @cpu: cpu (might not be online, must be possible) for @p to run on. * * Description: This function is equivalent to set_cpus_allowed(), * except that @cpu doesn't need to be online, and the thread must be * stopped (i.e., just returned from kthread_create()). */ void kthread_bind(struct task_struct *p, unsigned int cpu) { struct kthread *kthread = to_kthread(p); __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE); WARN_ON_ONCE(kthread->started); } EXPORT_SYMBOL(kthread_bind); /** * kthread_create_on_cpu - Create a cpu bound kthread * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @cpu: The cpu on which the thread should be bound. * @namefmt: printf-style name for the thread. Format is restricted * to "name.*%u". Code fills in cpu number. * * Description: This helper function creates and names a kernel thread. */ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), void *data, unsigned int cpu, const char *namefmt) { struct task_struct *p; p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, cpu); if (IS_ERR(p)) return p; kthread_bind(p, cpu); /* CPU hotplug needs to bind once again when unparking the thread. */ to_kthread(p)->cpu = cpu; return p; } EXPORT_SYMBOL(kthread_create_on_cpu); void kthread_set_per_cpu(struct task_struct *k, int cpu) { struct kthread *kthread = to_kthread(k); if (!kthread) return; WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY)); if (cpu < 0) { clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags); return; } kthread->cpu = cpu; set_bit(KTHREAD_IS_PER_CPU, &kthread->flags); } bool kthread_is_per_cpu(struct task_struct *p) { struct kthread *kthread = __to_kthread(p); if (!kthread) return false; return test_bit(KTHREAD_IS_PER_CPU, &kthread->flags); } /** * kthread_unpark - unpark a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_park() for @k to return false and wakes it. If the * thread is marked percpu then it's bound to the cpu again. */ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_kthread(k); if (!test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)) return; /* * Newly created kthread was parked when the CPU was offline. * The binding was lost and we need to set it again. */ if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) __kthread_bind(k, kthread->cpu, TASK_PARKED); clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); /* * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup. */ wake_up_state(k, TASK_PARKED); } EXPORT_SYMBOL_GPL(kthread_unpark); /** * kthread_park - park a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_park() for @k to return true, wakes it, and * waits for it to return. This can also be called after kthread_create() * instead of calling wake_up_process(): the thread will park without * calling threadfn().
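 *
 * An illustrative park cycle (editor's sketch, not from the original
 * source; assumes @k was created by kthread_create() and woken, and
 * reconfigure_while_parked() stands in for arbitrary caller-side work):
 *
 *	kthread_park(k);
 *	reconfigure_while_parked(k);
 *	kthread_unpark(k);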
* * Returns 0 if the thread is parked, -ENOSYS if the thread exited. * If called by the kthread itself, just the park bit is set. */ int kthread_park(struct task_struct *k) { struct kthread *kthread = to_kthread(k); if (WARN_ON(k->flags & PF_EXITING)) return -ENOSYS; if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags))) return -EBUSY; set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); if (k != current) { wake_up_process(k); /* * Wait for __kthread_parkme() to complete(), this means we * _will_ have TASK_PARKED and are about to call schedule(). */ wait_for_completion(&kthread->parked); /* * Now wait for that schedule() to complete and the task to * get scheduled out. */ WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED)); } return 0; } EXPORT_SYMBOL_GPL(kthread_park); /** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_stop() for @k to return true, wakes it, and * waits for it to exit. This can also be called after kthread_create() * instead of calling wake_up_process(): the thread will exit without * calling threadfn(). * * If threadfn() may call kthread_exit() itself, the caller must ensure * task_struct can't go away. * * Returns the result of threadfn(), or %-EINTR if wake_up_process() * was never called. */ int kthread_stop(struct task_struct *k) { struct kthread *kthread; int ret; trace_sched_kthread_stop(k); get_task_struct(k); kthread = to_kthread(k); set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); kthread_unpark(k); set_tsk_thread_flag(k, TIF_NOTIFY_SIGNAL); wake_up_process(k); wait_for_completion(&kthread->exited); ret = kthread->result; put_task_struct(k); trace_sched_kthread_stop_ret(ret); return ret; } EXPORT_SYMBOL(kthread_stop); /** * kthread_stop_put - stop a thread and put its task struct * @k: thread created by kthread_create(). * * Stops a thread created by kthread_create() and puts its task_struct. * Only use when holding an extra task struct reference obtained by * calling get_task_struct(). */ int kthread_stop_put(struct task_struct *k) { int ret; ret = kthread_stop(k); put_task_struct(k); return ret; } EXPORT_SYMBOL(kthread_stop_put); int kthreadd(void *unused) { static const char comm[TASK_COMM_LEN] = "kthreadd"; struct task_struct *tsk = current; /* Set up a clean context for our children to inherit.
*/ set_task_comm(tsk, comm); ignore_signals(tsk); set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD)); set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; cgroup_init_kthreadd(); for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (list_empty(&kthread_create_list)) schedule(); __set_current_state(TASK_RUNNING); spin_lock(&kthread_create_lock); while (!list_empty(&kthread_create_list)) { struct kthread_create_info *create; create = list_entry(kthread_create_list.next, struct kthread_create_info, list); list_del_init(&create->list); spin_unlock(&kthread_create_lock); create_kthread(create); spin_lock(&kthread_create_lock); } spin_unlock(&kthread_create_lock); } return 0; } int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) { struct kthread *kthread = to_kthread(p); cpumask_var_t affinity; unsigned long flags; int ret = 0; if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) { WARN_ON(1); return -EINVAL; } WARN_ON_ONCE(kthread->preferred_affinity); if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) return -ENOMEM; kthread->preferred_affinity = kzalloc(sizeof(struct cpumask), GFP_KERNEL); if (!kthread->preferred_affinity) { ret = -ENOMEM; goto out; } mutex_lock(&kthreads_hotplug_lock); cpumask_copy(kthread->preferred_affinity, mask); WARN_ON_ONCE(!list_empty(&kthread->hotplug_node)); list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); kthread_fetch_affinity(kthread, affinity); /* It's safe because the task is inactive. */ raw_spin_lock_irqsave(&p->pi_lock, flags); do_set_cpus_allowed(p, affinity); raw_spin_unlock_irqrestore(&p->pi_lock, flags); mutex_unlock(&kthreads_hotplug_lock); out: free_cpumask_var(affinity); return ret; } /* * Re-affine kthreads according to their preferences * and the newly online CPU. The CPU down part is handled * by select_fallback_rq() which by default re-affines to * housekeepers from other nodes in case the preferred * affinity doesn't apply anymore. */ static int kthreads_online_cpu(unsigned int cpu) { cpumask_var_t affinity; struct kthread *k; int ret; guard(mutex)(&kthreads_hotplug_lock); if (list_empty(&kthreads_hotplug)) return 0; if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) return -ENOMEM; ret = 0; list_for_each_entry(k, &kthreads_hotplug, hotplug_node) { if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) || kthread_is_per_cpu(k->task))) { ret = -EINVAL; continue; } kthread_fetch_affinity(k, affinity); set_cpus_allowed_ptr(k->task, affinity); } free_cpumask_var(affinity); return ret; } static int kthreads_init(void) { return cpuhp_setup_state(CPUHP_AP_KTHREADS_ONLINE, "kthreads:online", kthreads_online_cpu, NULL); } early_initcall(kthreads_init); void __kthread_init_worker(struct kthread_worker *worker, const char *name, struct lock_class_key *key) { memset(worker, 0, sizeof(struct kthread_worker)); raw_spin_lock_init(&worker->lock); lockdep_set_class_and_name(&worker->lock, key, name); INIT_LIST_HEAD(&worker->work_list); INIT_LIST_HEAD(&worker->delayed_work_list); } EXPORT_SYMBOL_GPL(__kthread_init_worker); /** * kthread_worker_fn - kthread function to process kthread_worker * @worker_ptr: pointer to initialized kthread_worker * * This function implements the main cycle of a kthread worker. It processes * work_list until it is stopped with kthread_stop(). It sleeps when the queue * is empty. * * The works are not allowed to keep any locks, disable preemption or interrupts * when they finish.
A safe point for freezing is defined when one work * finishes and before a new one is started. * * Also, a work must not be handled by more than one worker at the same time; * see also kthread_queue_work(). */ int kthread_worker_fn(void *worker_ptr) { struct kthread_worker *worker = worker_ptr; struct kthread_work *work; /* * FIXME: Update the check and remove the assignment when all kthread * worker users are created using kthread_create_worker*() functions. */ WARN_ON(worker->task && worker->task != current); worker->task = current; if (worker->flags & KTW_FREEZABLE) set_freezable(); repeat: set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); raw_spin_lock_irq(&worker->lock); worker->task = NULL; raw_spin_unlock_irq(&worker->lock); return 0; } work = NULL; raw_spin_lock_irq(&worker->lock); if (!list_empty(&worker->work_list)) { work = list_first_entry(&worker->work_list, struct kthread_work, node); list_del_init(&work->node); } worker->current_work = work; raw_spin_unlock_irq(&worker->lock); if (work) { kthread_work_func_t func = work->func; __set_current_state(TASK_RUNNING); trace_sched_kthread_work_execute_start(work); work->func(work); /* * Avoid dereferencing work after this point. The trace * event only cares about the address. */ trace_sched_kthread_work_execute_end(work, func); } else if (!freezing(current)) { schedule(); } else { /* * Handle the case where current remains * TASK_INTERRUPTIBLE. try_to_freeze() expects * current to be TASK_RUNNING. */ __set_current_state(TASK_RUNNING); } try_to_freeze(); cond_resched(); goto repeat; } EXPORT_SYMBOL_GPL(kthread_worker_fn); static __printf(3, 0) struct kthread_worker * __kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[], va_list args) { struct kthread_worker *worker; struct task_struct *task; worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) return ERR_PTR(-ENOMEM); kthread_init_worker(worker); task = __kthread_create_on_node(kthread_worker_fn, worker, node, namefmt, args); if (IS_ERR(task)) goto fail_task; worker->flags = flags; worker->task = task; return worker; fail_task: kfree(worker); return ERR_CAST(task); } /** * kthread_create_worker_on_node - create a kthread worker * @flags: flags modifying the default behavior of the worker * @node: task structure for the thread is allocated on this node * @namefmt: printf-style name for the kthread worker (task). * * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[], ...) { struct kthread_worker *worker; va_list args; va_start(args, namefmt); worker = __kthread_create_worker_on_node(flags, node, namefmt, args); va_end(args); return worker; } EXPORT_SYMBOL(kthread_create_worker_on_node); /** * kthread_create_worker_on_cpu - create a kthread worker and bind it * to a given CPU and the associated NUMA node. * @cpu: CPU number * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the thread. Format is restricted * to "name.*%u". Code fills in cpu number. * * Use a valid CPU number if you want to bind the kthread worker * to the given CPU and the associated NUMA node. * * A good practice is to also include the cpu number in the worker name.
* For example, use kthread_create_worker_on_cpu(cpu, "helper/%d", cpu). * * CPU hotplug: * The kthread worker API is simple and generic. It just provides a way * to create, use, and destroy workers. * * It is up to the API user how to handle CPU hotplug. They have to decide * how to handle pending work items, prevent queuing new ones, and * restore the functionality when the CPU goes off and on. There are a * few catches: * * - The worker's CPU affinity is lost when it gets scheduled on an offline CPU. * * - The worker might not exist if the CPU was offline when the user * created the workers. * * Good practice is to implement two CPU hotplug callbacks and to * destroy/create the worker when the CPU goes down/up. * * Return: * The pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, const char namefmt[]) { struct kthread_worker *worker; worker = kthread_create_worker_on_node(flags, cpu_to_node(cpu), namefmt, cpu); if (!IS_ERR(worker)) kthread_bind(worker->task, cpu); return worker; } EXPORT_SYMBOL(kthread_create_worker_on_cpu); /* * Returns true when the work could not be queued at the moment. * It happens when it is already pending in a worker list * or when it is being cancelled. */ static inline bool queuing_blocked(struct kthread_worker *worker, struct kthread_work *work) { lockdep_assert_held(&worker->lock); return !list_empty(&work->node) || work->canceling; } static void kthread_insert_work_sanity_check(struct kthread_worker *worker, struct kthread_work *work) { lockdep_assert_held(&worker->lock); WARN_ON_ONCE(!list_empty(&work->node)); /* Do not use a work with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker && work->worker != worker); } /* insert @work before @pos in @worker */ static void kthread_insert_work(struct kthread_worker *worker, struct kthread_work *work, struct list_head *pos) { kthread_insert_work_sanity_check(worker, work); trace_sched_kthread_work_queue_work(worker, work); list_add_tail(&work->node, pos); work->worker = worker; if (!worker->current_work && likely(worker->task)) wake_up_process(worker->task); } /** * kthread_queue_work - queue a kthread_work * @worker: target kthread_worker * @work: kthread_work to queue * * Queue @work to work processor @worker for async execution. @worker * must have been created with kthread_create_worker(). Returns %true * if @work was successfully queued, %false if it was already pending. * * Reinitialize the work if it needs to be used by another worker. * For example, when the worker was stopped and started again. */ bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work) { bool ret = false; unsigned long flags; raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { kthread_insert_work(worker, work, &worker->work_list); ret = true; } raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_work); /** * kthread_delayed_work_timer_fn - callback that queues the associated kthread * delayed work when the timer expires. * @t: pointer to the expired timer * * The format of the function is defined by struct timer_list. * It is expected to be called from an irqsafe timer with irqs already off.
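 *
 * Producer side, for context (editor's sketch; @worker is an existing
 * kthread_worker, dwork a caller-declared struct kthread_delayed_work and
 * example_fn() a hypothetical kthread_work_func_t):
 *
 *	kthread_init_delayed_work(&dwork, example_fn);
 *	kthread_queue_delayed_work(worker, &dwork, HZ);
 *
 * The timer armed by the call above fires this callback, which moves
 * the work from the delayed list onto the worker's run list.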
*/ void kthread_delayed_work_timer_fn(struct timer_list *t) { struct kthread_delayed_work *dwork = timer_container_of(dwork, t, timer); struct kthread_work *work = &dwork->work; struct kthread_worker *worker = work->worker; unsigned long flags; /* * This might happen when a pending work is reinitialized. * It means that it is used in a wrong way. */ if (WARN_ON_ONCE(!worker)) return; raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); /* Move the work from worker->delayed_work_list. */ WARN_ON_ONCE(list_empty(&work->node)); list_del_init(&work->node); if (!work->canceling) kthread_insert_work(worker, work, &worker->work_list); raw_spin_unlock_irqrestore(&worker->lock, flags); } EXPORT_SYMBOL(kthread_delayed_work_timer_fn); static void __kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct timer_list *timer = &dwork->timer; struct kthread_work *work = &dwork->work; WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn); /* * If @delay is 0, queue @dwork->work immediately. This is for * both optimization and correctness. The earliest @timer can * expire is on the closest next tick and delayed_work users depend * on there being no such delay when @delay is 0. */ if (!delay) { kthread_insert_work(worker, work, &worker->work_list); return; } /* Be paranoid and try to detect possible races already now. */ kthread_insert_work_sanity_check(worker, work); list_add(&work->node, &worker->delayed_work_list); work->worker = worker; timer->expires = jiffies + delay; add_timer(timer); } /** * kthread_queue_delayed_work - queue the associated kthread work * after a delay. * @worker: target kthread_worker * @dwork: kthread_delayed_work to queue * @delay: number of jiffies to wait before queuing * * If the work has not been pending, it starts a timer that will queue * the work after the given @delay. If @delay is zero, it queues the * work immediately. * * Return: %false if @work was already pending. It means that * either the timer was running or the work was queued. It returns %true * otherwise. */ bool kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct kthread_work *work = &dwork->work; unsigned long flags; bool ret = false; raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { __kthread_queue_delayed_work(worker, dwork, delay); ret = true; } raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_delayed_work); struct kthread_flush_work { struct kthread_work work; struct completion done; }; static void kthread_flush_work_fn(struct kthread_work *work) { struct kthread_flush_work *fwork = container_of(work, struct kthread_flush_work, work); complete(&fwork->done); } /** * kthread_flush_work - flush a kthread_work * @work: work to flush * * If @work is queued or executing, wait for it to finish execution. */ void kthread_flush_work(struct kthread_work *work) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; struct kthread_worker *worker; bool noop = false; worker = work->worker; if (!worker) return; raw_spin_lock_irq(&worker->lock); /* Work must not be used with >1 worker, see kthread_queue_work().
*/ WARN_ON_ONCE(work->worker != worker); if (!list_empty(&work->node)) kthread_insert_work(worker, &fwork.work, work->node.next); else if (worker->current_work == work) kthread_insert_work(worker, &fwork.work, worker->work_list.next); else noop = true; raw_spin_unlock_irq(&worker->lock); if (!noop) wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(kthread_flush_work); /* * Make sure that the timer is neither set nor running and could * not manipulate the work list_head any longer. * * The function is called under worker->lock. The lock is temporarily * released but the timer can't be set again in the meantime. */ static void kthread_cancel_delayed_work_timer(struct kthread_work *work, unsigned long *flags) { struct kthread_delayed_work *dwork = container_of(work, struct kthread_delayed_work, work); struct kthread_worker *worker = work->worker; /* * timer_delete_sync() must be called to make sure that the timer * callback is not running. The lock must be temporarily released * to avoid a deadlock with the callback. In the meantime, * any queuing is blocked by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, *flags); timer_delete_sync(&dwork->timer); raw_spin_lock_irqsave(&worker->lock, *flags); work->canceling--; } /* * This function removes the work from the worker queue. * * It is called under worker->lock. The caller must make sure that * the timer used by delayed work is not running, e.g. by calling * kthread_cancel_delayed_work_timer(). * * The work might still be in use when this function finishes. See * current_work being processed by the worker. * * Return: %true if @work was pending and successfully canceled, * %false if @work was not pending */ static bool __kthread_cancel_work(struct kthread_work *work) { /* * Try to remove the work from a worker list. It might either * be from worker->work_list or from worker->delayed_work_list. */ if (!list_empty(&work->node)) { list_del_init(&work->node); return true; } return false; } /** * kthread_mod_delayed_work - modify delay of or queue a kthread delayed work * @worker: kthread worker to use * @dwork: kthread delayed work to queue * @delay: number of jiffies to wait before queuing * * If @dwork is idle, equivalent to kthread_queue_delayed_work(). Otherwise, * modify @dwork's timer so that it expires after @delay. If @delay is zero, * the work is guaranteed to be queued immediately. * * Return: %false if @dwork was idle and queued, %true otherwise. * * A special case is when the work is being canceled in parallel. * It might be caused either by the real kthread_cancel_delayed_work_sync() * or yet another kthread_mod_delayed_work() call. We let the other command * win and return %true here. The return value can be used for reference * counting and the number of queued works stays the same. Anyway, the caller * is supposed to synchronize these operations in a reasonable way. * * This function is safe to call from any context including IRQ handlers. * See __kthread_cancel_work() and kthread_delayed_work_timer_fn() * for details. */ bool kthread_mod_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct kthread_work *work = &dwork->work; unsigned long flags; int ret; raw_spin_lock_irqsave(&worker->lock, flags); /* Do not bother with canceling when never queued.
*/ if (!work->worker) { ret = false; goto fast_queue; } /* Work must not be used with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker != worker); /* * Temporarily cancel the work, but do not fight with another command * that is canceling the work as well. * * It is a bit tricky because of possible races with other * mod_delayed_work() and cancel_delayed_work() callers. * * The timer must be canceled first because worker->lock is released * when doing so. But the work can be removed from the queue (list) * only when it can be queued again so that the return value can * be used for reference counting. */ kthread_cancel_delayed_work_timer(work, &flags); if (work->canceling) { /* The number of works in the queue does not change. */ ret = true; goto out; } ret = __kthread_cancel_work(work); fast_queue: __kthread_queue_delayed_work(worker, dwork, delay); out: raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_mod_delayed_work); static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) { struct kthread_worker *worker = work->worker; unsigned long flags; int ret = false; if (!worker) goto out; raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); if (is_dwork) kthread_cancel_delayed_work_timer(work, &flags); ret = __kthread_cancel_work(work); if (worker->current_work != work) goto out_fast; /* * The work is in progress and we need to wait with the lock released. * In the meantime, block any queuing by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, flags); kthread_flush_work(work); raw_spin_lock_irqsave(&worker->lock, flags); work->canceling--; out_fast: raw_spin_unlock_irqrestore(&worker->lock, flags); out: return ret; } /** * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish * @work: the kthread work to cancel * * Cancel @work and wait for its execution to finish. This function * can be used even if the work re-queues itself. On return from this * function, @work is guaranteed to be not pending or executing on any CPU. * * kthread_cancel_work_sync(&delayed_work->work) must not be used for * delayed works. Use kthread_cancel_delayed_work_sync() instead. * * The caller must ensure that the worker on which @work was last * queued can't be destroyed before this function returns. * * Return: %true if @work was pending, %false otherwise. */ bool kthread_cancel_work_sync(struct kthread_work *work) { return __kthread_cancel_work_sync(work, false); } EXPORT_SYMBOL_GPL(kthread_cancel_work_sync); /** * kthread_cancel_delayed_work_sync - cancel a kthread delayed work and * wait for it to finish. * @dwork: the kthread delayed work to cancel * * This is kthread_cancel_work_sync() for delayed works. * * Return: %true if @dwork was pending, %false otherwise. */ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *dwork) { return __kthread_cancel_work_sync(&dwork->work, true); } EXPORT_SYMBOL_GPL(kthread_cancel_delayed_work_sync); /** * kthread_flush_worker - flush all current works on a kthread_worker * @worker: worker to flush * * Wait until all currently executing or pending works on @worker are * finished.
*/ void kthread_flush_worker(struct kthread_worker *worker) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; kthread_queue_work(worker, &fwork.work); wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(kthread_flush_worker); /** * kthread_destroy_worker - destroy a kthread worker * @worker: worker to be destroyed * * Flush and destroy @worker. The simple flush is enough because the kthread * worker API is used only in trivial scenarios. There are no multi-step state * machines needed. * * Note that this function is not responsible for handling delayed work, so the * caller is responsible for queuing or canceling all delayed work items * before invoking this function. */ void kthread_destroy_worker(struct kthread_worker *worker) { struct task_struct *task; task = worker->task; if (WARN_ON(!task)) return; kthread_flush_worker(worker); kthread_stop(task); WARN_ON(!list_empty(&worker->delayed_work_list)); WARN_ON(!list_empty(&worker->work_list)); kfree(worker); } EXPORT_SYMBOL(kthread_destroy_worker); /** * kthread_use_mm - make the calling kthread operate on an address space * @mm: address space to operate on */ void kthread_use_mm(struct mm_struct *mm) { struct mm_struct *active_mm; struct task_struct *tsk = current; WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(tsk->mm); /* * It is possible for mm to be the same as tsk->active_mm, but * we must still mmgrab(mm) and mmdrop_lazy_tlb(active_mm), * because these references are not equivalent. */ mmgrab(mm); task_lock(tsk); /* Hold off tlb flush IPIs while switching mm's */ local_irq_disable(); active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; membarrier_update_current_mm(mm); switch_mm_irqs_off(active_mm, mm, tsk); local_irq_enable(); task_unlock(tsk); #ifdef finish_arch_post_lock_switch finish_arch_post_lock_switch(); #endif /* * When a kthread starts operating on an address space, the loop * in membarrier_{private,global}_expedited() may not observe * that tsk->mm has been set, and thus not issue an IPI. Membarrier * requires a memory barrier after storing to tsk->mm, before * accessing user-space memory. A full memory barrier for membarrier * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by * mmdrop_lazy_tlb(). */ mmdrop_lazy_tlb(active_mm); } EXPORT_SYMBOL_GPL(kthread_use_mm); /** * kthread_unuse_mm - reverse the effect of kthread_use_mm() * @mm: address space to operate on */ void kthread_unuse_mm(struct mm_struct *mm) { struct task_struct *tsk = current; WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(!tsk->mm); task_lock(tsk); /* * When a kthread stops operating on an address space, the loop * in membarrier_{private,global}_expedited() may not observe * that tsk->mm has been cleared, and thus not issue an IPI. * Membarrier requires a memory barrier after accessing user-space * memory, before clearing tsk->mm. */ smp_mb__after_spinlock(); local_irq_disable(); tsk->mm = NULL; membarrier_update_current_mm(NULL); mmgrab_lazy_tlb(mm); /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); local_irq_enable(); task_unlock(tsk); mmdrop(mm); } EXPORT_SYMBOL_GPL(kthread_unuse_mm); #ifdef CONFIG_BLK_CGROUP /** * kthread_associate_blkcg - associate blkcg to current kthread * @css: the cgroup info * * Current thread must be a kthread. The thread is running jobs on behalf of * other threads. In some cases, we expect the jobs to attach the cgroup info * of the original threads instead of that of the current thread.
This function stores the * original thread's cgroup info in the current kthread's context for later * retrieval. */ void kthread_associate_blkcg(struct cgroup_subsys_state *css) { struct kthread *kthread; if (!(current->flags & PF_KTHREAD)) return; kthread = to_kthread(current); if (!kthread) return; if (kthread->blkcg_css) { css_put(kthread->blkcg_css); kthread->blkcg_css = NULL; } if (css) { css_get(css); kthread->blkcg_css = css; } } EXPORT_SYMBOL(kthread_associate_blkcg); /** * kthread_blkcg - get associated blkcg css of current kthread * * Current thread must be a kthread. */ struct cgroup_subsys_state *kthread_blkcg(void) { struct kthread *kthread; if (current->flags & PF_KTHREAD) { kthread = to_kthread(current); if (kthread) return kthread->blkcg_css; } return NULL; } #endif
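/*
 * Editor's addition: a minimal, self-contained sketch of the two usage
 * patterns this file exports: a raw kthread loop and a kthread_worker.
 * All "example_*" identifiers are hypothetical. kthread_run_worker() is
 * assumed to exist, as in recent kernels where worker creation and the
 * first wake-up were split; older kernels woke the worker inside
 * kthread_create_worker() itself.
 */
#include <linux/delay.h>
#include <linux/err.h>
#include <linux/kthread.h>

static int example_poll_thread(void *data)
{
	/* Loop until kthread_stop() sets KTHREAD_SHOULD_STOP. */
	while (!kthread_should_stop()) {
		/* ... periodic work on @data would go here ... */
		msleep_interruptible(1000);
	}
	/* This value is handed back to kthread_stop(). */
	return 0;
}

static void example_work_fn(struct kthread_work *work)
{
	/* Runs in the worker thread, one work item at a time. */
}

static int example_usage(void)
{
	struct kthread_worker *worker;
	struct kthread_work work;
	struct task_struct *tsk;

	/* Raw pattern: kthread_run() creates the thread already woken. */
	tsk = kthread_run(example_poll_thread, NULL, "example-poll");
	if (IS_ERR(tsk))
		return PTR_ERR(tsk);

	/* Worker pattern: one thread serializing queued work items. */
	worker = kthread_run_worker(0, "example-worker");
	if (IS_ERR(worker)) {
		kthread_stop(tsk);
		return PTR_ERR(worker);
	}
	kthread_init_work(&work, example_work_fn);
	kthread_queue_work(worker, &work);
	kthread_flush_work(&work);	/* wait for example_work_fn() */
	kthread_destroy_worker(worker);	/* flushes, stops, frees */

	return kthread_stop(tsk);	/* returns example_poll_thread()'s 0 */
}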
// SPDX-License-Identifier: GPL-2.0-or-later /* DataCenter TCP (DCTCP) congestion control. * * http://simula.stanford.edu/~alizade/Site/DCTCP.html * * This is an implementation of DCTCP over Reno, an enhancement to the * TCP congestion control algorithm designed for data centers. DCTCP * leverages Explicit Congestion Notification (ECN) in the network to * provide multi-bit feedback to the end hosts. DCTCP's goal is to meet * the following three data center transport requirements: * * - High burst tolerance (incast due to partition/aggregate) * - Low latency (short flows, queries) * - High throughput (continuous data updates, large file transfers) * with commodity shallow buffered switches * * The algorithm is described in detail in the following two papers: * * 1) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye, * Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan: * "Data Center TCP (DCTCP)", Data Center Networks session * Proc. ACM SIGCOMM, New Delhi, 2010. * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf * * 2) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar: * "Analysis of DCTCP: Stability, Convergence, and Fairness" * Proc. ACM SIGMETRICS, San Jose, 2011. * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf * * Initial prototype from Abdul Kabbani, Masato Yasuda and Mohammad Alizadeh.
* * Authors: * * Daniel Borkmann <dborkman@redhat.com> * Florian Westphal <fw@strlen.de> * Glenn Judd <glenn.judd@morganstanley.com> */ #include <linux/btf.h> #include <linux/btf_ids.h> #include <linux/module.h> #include <linux/mm.h> #include <net/tcp.h> #include <linux/inet_diag.h> #include "tcp_dctcp.h" #define DCTCP_MAX_ALPHA 1024U struct dctcp { u32 old_delivered; u32 old_delivered_ce; u32 prior_rcv_nxt; u32 dctcp_alpha; u32 next_seq; u32 ce_state; u32 loss_cwnd; struct tcp_plb_state plb; }; static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */ static int dctcp_shift_g_set(const char *val, const struct kernel_param *kp) { return param_set_uint_minmax(val, kp, 0, 10); } static const struct kernel_param_ops dctcp_shift_g_ops = { .set = dctcp_shift_g_set, .get = param_get_uint, }; module_param_cb(dctcp_shift_g, &dctcp_shift_g_ops, &dctcp_shift_g, 0644); MODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha"); static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA; module_param(dctcp_alpha_on_init, uint, 0644); MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value"); static struct tcp_congestion_ops dctcp_reno; static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca) { ca->next_seq = tp->snd_nxt; ca->old_delivered = tp->delivered; ca->old_delivered_ce = tp->delivered_ce; } __bpf_kfunc static void dctcp_init(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); if (tcp_ecn_mode_any(tp) || (sk->sk_state == TCP_LISTEN || sk->sk_state == TCP_CLOSE)) { struct dctcp *ca = inet_csk_ca(sk); ca->prior_rcv_nxt = tp->rcv_nxt; ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); ca->loss_cwnd = 0; ca->ce_state = 0; dctcp_reset(tp, ca); tcp_plb_init(sk, &ca->plb); return; } /* No ECN support? Fall back to Reno. Also need to clear * ECT from sk since it is set during 3WHS for DCTCP. */ inet_csk(sk)->icsk_ca_ops = &dctcp_reno; INET_ECN_dontxmit(sk); } __bpf_kfunc static u32 dctcp_ssthresh(struct sock *sk) { struct dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); ca->loss_cwnd = tcp_snd_cwnd(tp); return max(tcp_snd_cwnd(tp) - ((tcp_snd_cwnd(tp) * ca->dctcp_alpha) >> 11U), 2U); } __bpf_kfunc static void dctcp_update_alpha(struct sock *sk, u32 flags) { const struct tcp_sock *tp = tcp_sk(sk); struct dctcp *ca = inet_csk_ca(sk); /* Expired RTT */ if (!before(tp->snd_una, ca->next_seq)) { u32 delivered = tp->delivered - ca->old_delivered; u32 delivered_ce = tp->delivered_ce - ca->old_delivered_ce; u32 alpha = ca->dctcp_alpha; u32 ce_ratio = 0; if (delivered > 0) { /* dctcp_alpha keeps EWMA of fraction of ECN marked * packets. Because of EWMA smoothing, PLB reaction can * be slow so we use ce_ratio which is an instantaneous * measure of congestion. ce_ratio is the fraction of * ECN marked packets in the previous RTT. */ if (delivered_ce > 0) ce_ratio = (delivered_ce << TCP_PLB_SCALE) / delivered; tcp_plb_update_state(sk, &ca->plb, (int)ce_ratio); tcp_plb_check_rehash(sk, &ca->plb); } /* alpha = (1 - g) * alpha + g * F */ alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g); if (delivered_ce) { /* If dctcp_shift_g == 1, a 32bit value would overflow * after 8 M packets. */ delivered_ce <<= (10 - dctcp_shift_g); delivered_ce /= max(1U, delivered); alpha = min(alpha + delivered_ce, DCTCP_MAX_ALPHA); } /* dctcp_alpha can be read from dctcp_get_info() without * synchro, so we ask compiler to not use dctcp_alpha * as a temporary variable in prior operations. 
*/ WRITE_ONCE(ca->dctcp_alpha, alpha); dctcp_reset(tp, ca); } } static void dctcp_react_to_loss(struct sock *sk) { struct dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); ca->loss_cwnd = tcp_snd_cwnd(tp); tp->snd_ssthresh = max(tcp_snd_cwnd(tp) >> 1U, 2U); } __bpf_kfunc static void dctcp_state(struct sock *sk, u8 new_state) { if (new_state == TCP_CA_Recovery && new_state != inet_csk(sk)->icsk_ca_state) dctcp_react_to_loss(sk); /* We handle RTO in dctcp_cwnd_event to ensure that we perform only * one loss-adjustment per RTT. */ } __bpf_kfunc static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) { struct dctcp *ca = inet_csk_ca(sk); switch (ev) { case CA_EVENT_ECN_IS_CE: case CA_EVENT_ECN_NO_CE: dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state); break; case CA_EVENT_LOSS: tcp_plb_update_state_upon_rto(sk, &ca->plb); dctcp_react_to_loss(sk); break; case CA_EVENT_TX_START: tcp_plb_check_rehash(sk, &ca->plb); /* Maybe rehash when inflight is 0 */ break; default: /* Don't care for the rest. */ break; } } static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info) { const struct dctcp *ca = inet_csk_ca(sk); const struct tcp_sock *tp = tcp_sk(sk); /* Fill it also in case of VEGASINFO due to req struct limits. * We can still correctly retrieve it later. */ if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) || ext & (1 << (INET_DIAG_VEGASINFO - 1))) { memset(&info->dctcp, 0, sizeof(info->dctcp)); if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) { info->dctcp.dctcp_enabled = 1; info->dctcp.dctcp_ce_state = (u16) ca->ce_state; info->dctcp.dctcp_alpha = ca->dctcp_alpha; info->dctcp.dctcp_ab_ecn = tp->mss_cache * (tp->delivered_ce - ca->old_delivered_ce); info->dctcp.dctcp_ab_tot = tp->mss_cache * (tp->delivered - ca->old_delivered); } *attr = INET_DIAG_DCTCPINFO; return sizeof(info->dctcp); } return 0; } __bpf_kfunc static u32 dctcp_cwnd_undo(struct sock *sk) { const struct dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); return max(tcp_snd_cwnd(tp), ca->loss_cwnd); } static struct tcp_congestion_ops dctcp __read_mostly = { .init = dctcp_init, .in_ack_event = dctcp_update_alpha, .cwnd_event = dctcp_cwnd_event, .ssthresh = dctcp_ssthresh, .cong_avoid = tcp_reno_cong_avoid, .undo_cwnd = dctcp_cwnd_undo, .set_state = dctcp_state, .get_info = dctcp_get_info, .flags = TCP_CONG_NEEDS_ECN, .owner = THIS_MODULE, .name = "dctcp", }; static struct tcp_congestion_ops dctcp_reno __read_mostly = { .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, .undo_cwnd = tcp_reno_undo_cwnd, .get_info = dctcp_get_info, .owner = THIS_MODULE, .name = "dctcp-reno", }; BTF_KFUNCS_START(tcp_dctcp_check_kfunc_ids) BTF_ID_FLAGS(func, dctcp_init) BTF_ID_FLAGS(func, dctcp_update_alpha) BTF_ID_FLAGS(func, dctcp_cwnd_event) BTF_ID_FLAGS(func, dctcp_ssthresh) BTF_ID_FLAGS(func, dctcp_cwnd_undo) BTF_ID_FLAGS(func, dctcp_state) BTF_KFUNCS_END(tcp_dctcp_check_kfunc_ids) static const struct btf_kfunc_id_set tcp_dctcp_kfunc_set = { .owner = THIS_MODULE, .set = &tcp_dctcp_check_kfunc_ids, }; static int __init dctcp_register(void) { int ret; BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE); ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_dctcp_kfunc_set); if (ret < 0) return ret; return tcp_register_congestion_control(&dctcp); } static void __exit dctcp_unregister(void) { tcp_unregister_congestion_control(&dctcp); } module_init(dctcp_register); module_exit(dctcp_unregister); MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); 
MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); MODULE_AUTHOR("Glenn Judd <glenn.judd@morganstanley.com>"); MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("DataCenter TCP (DCTCP)"); |
// SPDX-License-Identifier: GPL-2.0-or-later /* Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com> */ #include <net/inet_dscp.h> #include <net/ip.h> #include "ipvlan.h" static u32 ipvlan_jhash_secret __read_mostly; void ipvlan_init_secret(void) { net_get_random_once(&ipvlan_jhash_secret, sizeof(ipvlan_jhash_secret)); } void ipvlan_count_rx(const struct ipvl_dev *ipvlan, unsigned int len, bool success, bool mcast) { if (likely(success)) { struct
ipvl_pcpu_stats *pcptr; pcptr = this_cpu_ptr(ipvlan->pcpu_stats); u64_stats_update_begin(&pcptr->syncp); u64_stats_inc(&pcptr->rx_pkts); u64_stats_add(&pcptr->rx_bytes, len); if (mcast) u64_stats_inc(&pcptr->rx_mcast); u64_stats_update_end(&pcptr->syncp); } else { this_cpu_inc(ipvlan->pcpu_stats->rx_errs); } } EXPORT_SYMBOL_GPL(ipvlan_count_rx); #if IS_ENABLED(CONFIG_IPV6) static u8 ipvlan_get_v6_hash(const void *iaddr) { const struct in6_addr *ip6_addr = iaddr; return __ipv6_addr_jhash(ip6_addr, ipvlan_jhash_secret) & IPVLAN_HASH_MASK; } #else static u8 ipvlan_get_v6_hash(const void *iaddr) { return 0; } #endif static u8 ipvlan_get_v4_hash(const void *iaddr) { const struct in_addr *ip4_addr = iaddr; return jhash_1word(ip4_addr->s_addr, ipvlan_jhash_secret) & IPVLAN_HASH_MASK; } static bool addr_equal(bool is_v6, struct ipvl_addr *addr, const void *iaddr) { if (!is_v6 && addr->atype == IPVL_IPV4) { struct in_addr *i4addr = (struct in_addr *)iaddr; return addr->ip4addr.s_addr == i4addr->s_addr; #if IS_ENABLED(CONFIG_IPV6) } else if (is_v6 && addr->atype == IPVL_IPV6) { struct in6_addr *i6addr = (struct in6_addr *)iaddr; return ipv6_addr_equal(&addr->ip6addr, i6addr); #endif } return false; } static struct ipvl_addr *ipvlan_ht_addr_lookup(const struct ipvl_port *port, const void *iaddr, bool is_v6) { struct ipvl_addr *addr; u8 hash; hash = is_v6 ? ipvlan_get_v6_hash(iaddr) : ipvlan_get_v4_hash(iaddr); hlist_for_each_entry_rcu(addr, &port->hlhead[hash], hlnode) if (addr_equal(is_v6, addr, iaddr)) return addr; return NULL; } void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr) { struct ipvl_port *port = ipvlan->port; u8 hash; hash = (addr->atype == IPVL_IPV6) ? ipvlan_get_v6_hash(&addr->ip6addr) : ipvlan_get_v4_hash(&addr->ip4addr); if (hlist_unhashed(&addr->hlnode)) hlist_add_head_rcu(&addr->hlnode, &port->hlhead[hash]); } void ipvlan_ht_addr_del(struct ipvl_addr *addr) { hlist_del_init_rcu(&addr->hlnode); } struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan, const void *iaddr, bool is_v6) { struct ipvl_addr *addr, *ret = NULL; rcu_read_lock(); list_for_each_entry_rcu(addr, &ipvlan->addrs, anode) { if (addr_equal(is_v6, addr, iaddr)) { ret = addr; break; } } rcu_read_unlock(); return ret; } bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6) { struct ipvl_dev *ipvlan; bool ret = false; rcu_read_lock(); list_for_each_entry_rcu(ipvlan, &port->ipvlans, pnode) { if (ipvlan_find_addr(ipvlan, iaddr, is_v6)) { ret = true; break; } } rcu_read_unlock(); return ret; } void *ipvlan_get_L3_hdr(struct ipvl_port *port, struct sk_buff *skb, int *type) { void *lyr3h = NULL; switch (skb->protocol) { case htons(ETH_P_ARP): { struct arphdr *arph; if (unlikely(!pskb_may_pull(skb, arp_hdr_len(port->dev)))) return NULL; arph = arp_hdr(skb); *type = IPVL_ARP; lyr3h = arph; break; } case htons(ETH_P_IP): { u32 pktlen; struct iphdr *ip4h; if (unlikely(!pskb_may_pull(skb, sizeof(*ip4h)))) return NULL; ip4h = ip_hdr(skb); pktlen = skb_ip_totlen(skb); if (ip4h->ihl < 5 || ip4h->version != 4) return NULL; if (skb->len < pktlen || pktlen < (ip4h->ihl * 4)) return NULL; *type = IPVL_IPV4; lyr3h = ip4h; break; } #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): { struct ipv6hdr *ip6h; if (unlikely(!pskb_may_pull(skb, sizeof(*ip6h)))) return NULL; ip6h = ipv6_hdr(skb); if (ip6h->version != 6) return NULL; *type = IPVL_IPV6; lyr3h = ip6h; /* Only Neighbour Solicitation pkts need different treatment */ if (ipv6_addr_any(&ip6h->saddr) && ip6h->nexthdr == NEXTHDR_ICMP) { 
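			/*
			 * Editor's note: an unspecified source address plus an
			 * ICMPv6 next header suggests DAD-time traffic; the
			 * Neighbour Solicitation target address lives in the
			 * ICMPv6 body, so the code below pulls that in before
			 * the later address lookup.
			 */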
struct icmp6hdr *icmph; if (unlikely(!pskb_may_pull(skb, sizeof(*ip6h) + sizeof(*icmph)))) return NULL; ip6h = ipv6_hdr(skb); icmph = (struct icmp6hdr *)(ip6h + 1); if (icmph->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) { /* Need to access the ipv6 address in body */ if (unlikely(!pskb_may_pull(skb, sizeof(*ip6h) + sizeof(*icmph) + sizeof(struct in6_addr)))) return NULL; ip6h = ipv6_hdr(skb); icmph = (struct icmp6hdr *)(ip6h + 1); } *type = IPVL_ICMPV6; lyr3h = icmph; } break; } #endif default: return NULL; } return lyr3h; } unsigned int ipvlan_mac_hash(const unsigned char *addr) { u32 hash = jhash_1word(get_unaligned((u32 *)(addr + 2)), ipvlan_jhash_secret); return hash & IPVLAN_MAC_FILTER_MASK; } void ipvlan_process_multicast(struct work_struct *work) { struct ipvl_port *port = container_of(work, struct ipvl_port, wq); struct ethhdr *ethh; struct ipvl_dev *ipvlan; struct sk_buff *skb, *nskb; struct sk_buff_head list; unsigned int len; unsigned int mac_hash; int ret; u8 pkt_type; bool tx_pkt; __skb_queue_head_init(&list); spin_lock_bh(&port->backlog.lock); skb_queue_splice_tail_init(&port->backlog, &list); spin_unlock_bh(&port->backlog.lock); while ((skb = __skb_dequeue(&list)) != NULL) { struct net_device *dev = skb->dev; bool consumed = false; ethh = eth_hdr(skb); tx_pkt = IPVL_SKB_CB(skb)->tx_pkt; mac_hash = ipvlan_mac_hash(ethh->h_dest); if (ether_addr_equal(ethh->h_dest, port->dev->broadcast)) pkt_type = PACKET_BROADCAST; else pkt_type = PACKET_MULTICAST; rcu_read_lock(); list_for_each_entry_rcu(ipvlan, &port->ipvlans, pnode) { if (tx_pkt && (ipvlan->dev == skb->dev)) continue; if (!test_bit(mac_hash, ipvlan->mac_filters)) continue; if (!(ipvlan->dev->flags & IFF_UP)) continue; ret = NET_RX_DROP; len = skb->len + ETH_HLEN; nskb = skb_clone(skb, GFP_ATOMIC); local_bh_disable(); if (nskb) { consumed = true; nskb->pkt_type = pkt_type; nskb->dev = ipvlan->dev; if (tx_pkt) ret = dev_forward_skb(ipvlan->dev, nskb); else ret = netif_rx(nskb); } ipvlan_count_rx(ipvlan, len, ret == NET_RX_SUCCESS, true); local_bh_enable(); } rcu_read_unlock(); if (tx_pkt) { /* If the packet originated here, send it out. */ skb->dev = port->dev; skb->pkt_type = pkt_type; dev_queue_xmit(skb); } else { if (consumed) consume_skb(skb); else kfree_skb(skb); } dev_put(dev); cond_resched(); } } static void ipvlan_skb_crossing_ns(struct sk_buff *skb, struct net_device *dev) { bool xnet = true; if (dev) xnet = !net_eq(dev_net(skb->dev), dev_net(dev)); skb_scrub_packet(skb, xnet); if (dev) skb->dev = dev; } static int ipvlan_rcv_frame(struct ipvl_addr *addr, struct sk_buff **pskb, bool local) { struct ipvl_dev *ipvlan = addr->master; struct net_device *dev = ipvlan->dev; unsigned int len; rx_handler_result_t ret = RX_HANDLER_CONSUMED; bool success = false; struct sk_buff *skb = *pskb; len = skb->len + ETH_HLEN; /* Only packets exchanged between two local slaves need to have * device-up check as well as skb-share check. 
*/ if (local) { if (unlikely(!(dev->flags & IFF_UP))) { kfree_skb(skb); goto out; } skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) goto out; *pskb = skb; } if (local) { skb->pkt_type = PACKET_HOST; if (dev_forward_skb(ipvlan->dev, skb) == NET_RX_SUCCESS) success = true; } else { skb->dev = dev; ret = RX_HANDLER_ANOTHER; success = true; } out: ipvlan_count_rx(ipvlan, len, success, false); return ret; } struct ipvl_addr *ipvlan_addr_lookup(struct ipvl_port *port, void *lyr3h, int addr_type, bool use_dest) { struct ipvl_addr *addr = NULL; switch (addr_type) { #if IS_ENABLED(CONFIG_IPV6) case IPVL_IPV6: { struct ipv6hdr *ip6h; struct in6_addr *i6addr; ip6h = (struct ipv6hdr *)lyr3h; i6addr = use_dest ? &ip6h->daddr : &ip6h->saddr; addr = ipvlan_ht_addr_lookup(port, i6addr, true); break; } case IPVL_ICMPV6: { struct nd_msg *ndmh; struct in6_addr *i6addr; /* Make sure that the NeighborSolicitation ICMPv6 packets * are handled to avoid DAD issue. */ ndmh = (struct nd_msg *)lyr3h; if (ndmh->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) { i6addr = &ndmh->target; addr = ipvlan_ht_addr_lookup(port, i6addr, true); } break; } #endif case IPVL_IPV4: { struct iphdr *ip4h; __be32 *i4addr; ip4h = (struct iphdr *)lyr3h; i4addr = use_dest ? &ip4h->daddr : &ip4h->saddr; addr = ipvlan_ht_addr_lookup(port, i4addr, false); break; } case IPVL_ARP: { struct arphdr *arph; unsigned char *arp_ptr; __be32 dip; arph = (struct arphdr *)lyr3h; arp_ptr = (unsigned char *)(arph + 1); if (use_dest) arp_ptr += (2 * port->dev->addr_len) + 4; else arp_ptr += port->dev->addr_len; memcpy(&dip, arp_ptr, 4); addr = ipvlan_ht_addr_lookup(port, &dip, false); break; } } return addr; } static noinline_for_stack int ipvlan_process_v4_outbound(struct sk_buff *skb) { struct net_device *dev = skb->dev; struct net *net = dev_net(dev); int err, ret = NET_XMIT_DROP; const struct iphdr *ip4h; struct rtable *rt; struct flowi4 fl4 = { .flowi4_oif = dev->ifindex, .flowi4_flags = FLOWI_FLAG_ANYSRC, .flowi4_mark = skb->mark, }; if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) goto err; ip4h = ip_hdr(skb); fl4.daddr = ip4h->daddr; fl4.saddr = ip4h->saddr; fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)); rt = ip_route_output_flow(net, &fl4, NULL); if (IS_ERR(rt)) goto err; if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { ip_rt_put(rt); goto err; } skb_dst_set(skb, &rt->dst); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); err = ip_local_out(net, NULL, skb); if (unlikely(net_xmit_eval(err))) DEV_STATS_INC(dev, tx_errors); else ret = NET_XMIT_SUCCESS; goto out; err: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); out: return ret; } #if IS_ENABLED(CONFIG_IPV6) static noinline_for_stack int ipvlan_route_v6_outbound(struct net_device *dev, struct sk_buff *skb) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct flowi6 fl6 = { .flowi6_oif = dev->ifindex, .daddr = ip6h->daddr, .saddr = ip6h->saddr, .flowi6_flags = FLOWI_FLAG_ANYSRC, .flowlabel = ip6_flowinfo(ip6h), .flowi6_mark = skb->mark, .flowi6_proto = ip6h->nexthdr, }; struct dst_entry *dst; int err; dst = ip6_route_output(dev_net(dev), NULL, &fl6); err = dst->error; if (err) { dst_release(dst); return err; } skb_dst_set(skb, dst); return 0; } static int ipvlan_process_v6_outbound(struct sk_buff *skb) { struct net_device *dev = skb->dev; int err, ret = NET_XMIT_DROP; if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) { DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return ret; } err = ipvlan_route_v6_outbound(dev, skb); if (unlikely(err)) { DEV_STATS_INC(dev, 
tx_errors); kfree_skb(skb); return err; } memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); err = ip6_local_out(dev_net(dev), NULL, skb); if (unlikely(net_xmit_eval(err))) DEV_STATS_INC(dev, tx_errors); else ret = NET_XMIT_SUCCESS; return ret; } #else static int ipvlan_process_v6_outbound(struct sk_buff *skb) { return NET_XMIT_DROP; } #endif static int ipvlan_process_outbound(struct sk_buff *skb) { int ret = NET_XMIT_DROP; /* The ipvlan is a pseudo-L2 device, so the packets that we receive * will have L2; which need to discarded and processed further * in the net-ns of the main-device. */ if (skb_mac_header_was_set(skb)) { /* In this mode we dont care about * multicast and broadcast traffic */ struct ethhdr *ethh = eth_hdr(skb); if (is_multicast_ether_addr(ethh->h_dest)) { pr_debug_ratelimited( "Dropped {multi|broad}cast of type=[%x]\n", ntohs(skb->protocol)); kfree_skb(skb); goto out; } skb_pull(skb, sizeof(*ethh)); skb->mac_header = (typeof(skb->mac_header))~0U; skb_reset_network_header(skb); } if (skb->protocol == htons(ETH_P_IPV6)) ret = ipvlan_process_v6_outbound(skb); else if (skb->protocol == htons(ETH_P_IP)) ret = ipvlan_process_v4_outbound(skb); else { pr_warn_ratelimited("Dropped outbound packet type=%x\n", ntohs(skb->protocol)); kfree_skb(skb); } out: return ret; } static void ipvlan_multicast_enqueue(struct ipvl_port *port, struct sk_buff *skb, bool tx_pkt) { if (skb->protocol == htons(ETH_P_PAUSE)) { kfree_skb(skb); return; } /* Record that the deferred packet is from TX or RX path. By * looking at mac-addresses on packet will lead to erronus decisions. * (This would be true for a loopback-mode on master device or a * hair-pin mode of the switch.) */ IPVL_SKB_CB(skb)->tx_pkt = tx_pkt; spin_lock(&port->backlog.lock); if (skb_queue_len(&port->backlog) < IPVLAN_QBACKLOG_LIMIT) { dev_hold(skb->dev); __skb_queue_tail(&port->backlog, skb); spin_unlock(&port->backlog.lock); schedule_work(&port->wq); } else { spin_unlock(&port->backlog.lock); dev_core_stats_rx_dropped_inc(skb->dev); kfree_skb(skb); } } static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct net_device *dev) { const struct ipvl_dev *ipvlan = netdev_priv(dev); void *lyr3h; struct ipvl_addr *addr; int addr_type; lyr3h = ipvlan_get_L3_hdr(ipvlan->port, skb, &addr_type); if (!lyr3h) goto out; if (!ipvlan_is_vepa(ipvlan->port)) { addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true); if (addr) { if (ipvlan_is_private(ipvlan->port)) { consume_skb(skb); return NET_XMIT_DROP; } ipvlan_rcv_frame(addr, &skb, true); return NET_XMIT_SUCCESS; } } out: ipvlan_skb_crossing_ns(skb, ipvlan->phy_dev); return ipvlan_process_outbound(skb); } static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev) { const struct ipvl_dev *ipvlan = netdev_priv(dev); struct ethhdr *eth = skb_eth_hdr(skb); struct ipvl_addr *addr; void *lyr3h; int addr_type; if (!ipvlan_is_vepa(ipvlan->port) && ether_addr_equal(eth->h_dest, eth->h_source)) { lyr3h = ipvlan_get_L3_hdr(ipvlan->port, skb, &addr_type); if (lyr3h) { addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true); if (addr) { if (ipvlan_is_private(ipvlan->port)) { consume_skb(skb); return NET_XMIT_DROP; } ipvlan_rcv_frame(addr, &skb, true); return NET_XMIT_SUCCESS; } } skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return NET_XMIT_DROP; /* Packet definitely does not belong to any of the * virtual devices, but the dest is local. So forward * the skb for the main-dev. At the RX side we just return * RX_PASS for it to be processed further on the stack. 
*/ dev_forward_skb(ipvlan->phy_dev, skb); return NET_XMIT_SUCCESS; } else if (is_multicast_ether_addr(eth->h_dest)) { skb_reset_mac_header(skb); ipvlan_skb_crossing_ns(skb, NULL); ipvlan_multicast_enqueue(ipvlan->port, skb, true); return NET_XMIT_SUCCESS; } skb->dev = ipvlan->phy_dev; return dev_queue_xmit(skb); } int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); struct ipvl_port *port = ipvlan_port_get_rcu_bh(ipvlan->phy_dev); if (!port) goto out; if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) goto out; switch(port->mode) { case IPVLAN_MODE_L2: return ipvlan_xmit_mode_l2(skb, dev); case IPVLAN_MODE_L3: #ifdef CONFIG_IPVLAN_L3S case IPVLAN_MODE_L3S: #endif return ipvlan_xmit_mode_l3(skb, dev); } /* Should not reach here */ WARN_ONCE(true, "%s called for mode = [%x]\n", __func__, port->mode); out: kfree_skb(skb); return NET_XMIT_DROP; } static bool ipvlan_external_frame(struct sk_buff *skb, struct ipvl_port *port) { struct ethhdr *eth = eth_hdr(skb); struct ipvl_addr *addr; void *lyr3h; int addr_type; if (ether_addr_equal(eth->h_source, skb->dev->dev_addr)) { lyr3h = ipvlan_get_L3_hdr(port, skb, &addr_type); if (!lyr3h) return true; addr = ipvlan_addr_lookup(port, lyr3h, addr_type, false); if (addr) return false; } return true; } static rx_handler_result_t ipvlan_handle_mode_l3(struct sk_buff **pskb, struct ipvl_port *port) { void *lyr3h; int addr_type; struct ipvl_addr *addr; struct sk_buff *skb = *pskb; rx_handler_result_t ret = RX_HANDLER_PASS; lyr3h = ipvlan_get_L3_hdr(port, skb, &addr_type); if (!lyr3h) goto out; addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true); if (addr) ret = ipvlan_rcv_frame(addr, pskb, false); out: return ret; } static rx_handler_result_t ipvlan_handle_mode_l2(struct sk_buff **pskb, struct ipvl_port *port) { struct sk_buff *skb = *pskb; struct ethhdr *eth = eth_hdr(skb); rx_handler_result_t ret = RX_HANDLER_PASS; if (is_multicast_ether_addr(eth->h_dest)) { if (ipvlan_external_frame(skb, port)) { struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); /* External frames are queued for device local * distribution, but a copy is given to master * straight away to avoid sending duplicates later * when work-queue processes this frame. This is * achieved by returning RX_HANDLER_PASS. */ if (nskb) { ipvlan_skb_crossing_ns(nskb, NULL); ipvlan_multicast_enqueue(port, nskb, false); } } } else { /* Perform like l3 mode for non-multicast packet */ ret = ipvlan_handle_mode_l3(pskb, port); } return ret; } rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct ipvl_port *port = ipvlan_port_get_rcu(skb->dev); if (!port) return RX_HANDLER_PASS; switch (port->mode) { case IPVLAN_MODE_L2: return ipvlan_handle_mode_l2(pskb, port); case IPVLAN_MODE_L3: return ipvlan_handle_mode_l3(pskb, port); #ifdef CONFIG_IPVLAN_L3S case IPVLAN_MODE_L3S: return RX_HANDLER_PASS; #endif } /* Should not reach here */ WARN_ONCE(true, "%s called for mode = [%x]\n", __func__, port->mode); kfree_skb(skb); return RX_HANDLER_CONSUMED; } |
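/*
 * Editor's illustration (not part of ipvlan_core.c): the address table above
 * keys each IP address into one of IPVLAN_HASH_MASK + 1 buckets using a
 * per-boot random secret, so bucket placement is not guessable across boots.
 * A minimal userspace sketch of the same pattern; mix32() is a trivial
 * stand-in for the kernel's jhash_1word(), and all names here are invented
 * for illustration.
 */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define EX_HASH_BITS	8
#define EX_HASH_MASK	((1u << EX_HASH_BITS) - 1)

static uint32_t ex_hash_secret;

/* stand-in for jhash_1word(): any decent 32-bit mixer works for the demo */
static uint32_t mix32(uint32_t v, uint32_t seed)
{
	v ^= seed;
	v *= 0x9e3779b1u;	/* golden-ratio multiplier */
	return v ^ (v >> 16);
}

static unsigned int bucket_of(uint32_t ipv4_be)
{
	return mix32(ipv4_be, ex_hash_secret) & EX_HASH_MASK;
}

int main(void)
{
	/* the kernel seeds once via net_get_random_once() instead */
	ex_hash_secret = (uint32_t)time(NULL);
	printf("10.0.0.1 -> bucket %u\n", bucket_of(0x0a000001u));
	return 0;
}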
// SPDX-License-Identifier: GPL-2.0+
/*
 * USB Networking Link Interface
 *
 * Copyright (C) 2000-2005 by David Brownell <dbrownell@users.sourceforge.net>
 * Copyright (C) 2003-2005 David Hollis <dhollis@davehollis.com>
 */

#ifndef	__LINUX_USB_USBNET_H
#define	__LINUX_USB_USBNET_H

#include <linux/mii.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <linux/usb.h>

/* interface from usbnet core to each USB networking link we handle */
struct usbnet {
	/* housekeeping */
	struct usb_device	*udev;
	struct usb_interface	*intf;
	const struct driver_info *driver_info;
	const char		*driver_name;
	void			*driver_priv;
	wait_queue_head_t	wait;
	struct mutex		phy_mutex;
	unsigned char		suspend_count;
	unsigned char		pkt_cnt, pkt_err;
	unsigned short		rx_qlen, tx_qlen;
	unsigned		can_dma_sg:1;

	/* i/o info: pipes etc */
	unsigned		in, out;
	struct usb_host_endpoint *status;
	unsigned		maxpacket;
	struct timer_list	delay;
	const char		*padding_pkt;

	/* protocol/interface state */
	struct net_device	*net;
	int			msg_enable;
	unsigned long		data[5];
	u32			xid;
	u32			hard_mtu;	/* count any extra framing */
	size_t			rx_urb_size;	/* size for rx urbs */
	struct mii_if_info	mii;
	long			rx_speed;	/* If MII not used */
	long			tx_speed;	/* If MII not used */
#	define SPEED_UNSET	-1

	/* various kinds of pending driver work */
	struct sk_buff_head	rxq;
	struct sk_buff_head	txq;
	struct sk_buff_head	done;
	struct sk_buff_head	rxq_pause;
	struct urb		*interrupt;
	unsigned		interrupt_count;
	struct mutex		interrupt_mutex;
	struct usb_anchor	deferred;
	struct work_struct	bh_work;

	struct work_struct	kevent;
	unsigned long		flags;
#		define EVENT_TX_HALT	0
#		define EVENT_RX_HALT	1
#		define EVENT_RX_MEMORY	2
#		define EVENT_STS_SPLIT	3
#		define EVENT_LINK_RESET	4
#		define EVENT_RX_PAUSED	5
#		define EVENT_DEV_ASLEEP	6
#		define EVENT_DEV_OPEN	7
#		define EVENT_DEVICE_REPORT_IDLE	8
#		define EVENT_NO_RUNTIME_PM	9
#		define EVENT_RX_KILL	10
#		define EVENT_LINK_CHANGE	11
#		define EVENT_SET_RX_MODE	12
#		define EVENT_NO_IP_ALIGN	13
#		define EVENT_LINK_CARRIER_ON	14
/* This one is special, as it indicates that the device is going away;
 * there are cyclic dependencies between tasklet, timer and bh
 * that must be broken.
 */
#		define EVENT_UNPLUG	31
};

static inline bool usbnet_going_away(struct usbnet *ubn)
{
	return test_bit(EVENT_UNPLUG, &ubn->flags);
}

static inline void usbnet_mark_going_away(struct usbnet *ubn)
{
	set_bit(EVENT_UNPLUG, &ubn->flags);
}

static inline struct
usb_driver *driver_of(struct usb_interface *intf) { return to_usb_driver(intf->dev.driver); } /* interface from the device/framing level "minidriver" to core */ struct driver_info { char *description; int flags; /* framing is CDC Ethernet, not writing ZLPs (hw issues), or optionally: */ #define FLAG_FRAMING_NC 0x0001 /* guard against device dropouts */ #define FLAG_FRAMING_GL 0x0002 /* genelink batches packets */ #define FLAG_FRAMING_Z 0x0004 /* zaurus adds a trailer */ #define FLAG_FRAMING_RN 0x0008 /* RNDIS batches, plus huge header */ #define FLAG_NO_SETINT 0x0010 /* device can't set_interface() */ #define FLAG_ETHER 0x0020 /* maybe use "eth%d" names */ #define FLAG_FRAMING_AX 0x0040 /* AX88772/178 packets */ #define FLAG_WLAN 0x0080 /* use "wlan%d" names */ #define FLAG_AVOID_UNLINK_URBS 0x0100 /* don't unlink urbs at usbnet_stop() */ #define FLAG_SEND_ZLP 0x0200 /* hw requires ZLPs are sent */ #define FLAG_WWAN 0x0400 /* use "wwan%d" names */ #define FLAG_LINK_INTR 0x0800 /* updates link (carrier) status */ #define FLAG_POINTTOPOINT 0x1000 /* possibly use "usb%d" names */ /* * Indicates to usbnet, that USB driver accumulates multiple IP packets. * Affects statistic (counters) and short packet handling. */ #define FLAG_MULTI_PACKET 0x2000 #define FLAG_RX_ASSEMBLE 0x4000 /* rx packets may span >1 frames */ #define FLAG_NOARP 0x8000 /* device can't do ARP */ /* init device ... can sleep, or cause probe() failure */ int (*bind)(struct usbnet *, struct usb_interface *); /* cleanup device ... can sleep, but can't fail */ void (*unbind)(struct usbnet *, struct usb_interface *); /* reset device ... can sleep */ int (*reset)(struct usbnet *); /* stop device ... can sleep */ int (*stop)(struct usbnet *); /* see if peer is connected ... can sleep */ int (*check_connect)(struct usbnet *); /* (dis)activate runtime power management */ int (*manage_power)(struct usbnet *, int); /* for status polling */ void (*status)(struct usbnet *, struct urb *); /* link reset handling, called from defer_kevent */ int (*link_reset)(struct usbnet *); /* fixup rx packet (strip framing) */ int (*rx_fixup)(struct usbnet *dev, struct sk_buff *skb); /* fixup tx packet (add framing) */ struct sk_buff *(*tx_fixup)(struct usbnet *dev, struct sk_buff *skb, gfp_t flags); /* recover from timeout */ void (*recover)(struct usbnet *dev); /* early initialization code, can sleep. This is for minidrivers * having 'subminidrivers' that need to do extra initialization * right after minidriver have initialized hardware. */ int (*early_init)(struct usbnet *dev); /* called by minidriver when receiving indication */ void (*indication)(struct usbnet *dev, void *ind, int indlen); /* rx mode change (device changes address list filtering) */ void (*set_rx_mode)(struct usbnet *dev); /* for new devices, use the descriptor-reading code instead */ int in; /* rx endpoint */ int out; /* tx endpoint */ unsigned long data; /* Misc driver specific data */ }; /* Minidrivers are just drivers using the "usbnet" core as a powerful * network-specific subroutine library ... that happens to do pretty * much everything except custom framing and chip-specific stuff. 
*/ extern int usbnet_probe(struct usb_interface *, const struct usb_device_id *); extern int usbnet_suspend(struct usb_interface *, pm_message_t); extern int usbnet_resume(struct usb_interface *); extern void usbnet_disconnect(struct usb_interface *); extern void usbnet_device_suggests_idle(struct usbnet *dev); extern int usbnet_read_cmd(struct usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, void *data, u16 size); extern int usbnet_write_cmd(struct usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, const void *data, u16 size); extern int usbnet_read_cmd_nopm(struct usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, void *data, u16 size); extern int usbnet_write_cmd_nopm(struct usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, const void *data, u16 size); extern int usbnet_write_cmd_async(struct usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, const void *data, u16 size); /* Drivers that reuse some of the standard USB CDC infrastructure * (notably, using multiple interfaces according to the CDC * union descriptor) get some helper code. */ struct cdc_state { struct usb_cdc_header_desc *header; struct usb_cdc_union_desc *u; struct usb_cdc_ether_desc *ether; struct usb_interface *control; struct usb_interface *data; }; extern void usbnet_cdc_update_filter(struct usbnet *dev); extern int usbnet_generic_cdc_bind(struct usbnet *, struct usb_interface *); extern int usbnet_ether_cdc_bind(struct usbnet *dev, struct usb_interface *intf); extern int usbnet_cdc_bind(struct usbnet *, struct usb_interface *); extern void usbnet_cdc_unbind(struct usbnet *, struct usb_interface *); extern void usbnet_cdc_status(struct usbnet *, struct urb *); extern int usbnet_cdc_zte_rx_fixup(struct usbnet *dev, struct sk_buff *skb); /* CDC and RNDIS support the same host-chosen packet filters for IN transfers */ #define DEFAULT_FILTER (USB_CDC_PACKET_TYPE_BROADCAST \ |USB_CDC_PACKET_TYPE_ALL_MULTICAST \ |USB_CDC_PACKET_TYPE_PROMISCUOUS \ |USB_CDC_PACKET_TYPE_DIRECTED) /* we record the state for each of our queued skbs */ enum skb_state { illegal = 0, tx_start, tx_done, rx_start, rx_done, rx_cleanup, unlink_start }; struct skb_data { /* skb->cb is one of these */ struct urb *urb; struct usbnet *dev; enum skb_state state; long length; unsigned long packets; }; /* Drivers that set FLAG_MULTI_PACKET must call this in their * tx_fixup method before returning an skb. 
*/ static inline void usbnet_set_skb_tx_stats(struct sk_buff *skb, unsigned long packets, long bytes_delta) { struct skb_data *entry = (struct skb_data *) skb->cb; entry->packets = packets; entry->length = bytes_delta; } extern int usbnet_open(struct net_device *net); extern int usbnet_stop(struct net_device *net); extern netdev_tx_t usbnet_start_xmit(struct sk_buff *skb, struct net_device *net); extern void usbnet_tx_timeout(struct net_device *net, unsigned int txqueue); extern int usbnet_change_mtu(struct net_device *net, int new_mtu); extern int usbnet_get_endpoints(struct usbnet *, struct usb_interface *); extern int usbnet_get_ethernet_addr(struct usbnet *, int); extern void usbnet_defer_kevent(struct usbnet *, int); extern void usbnet_skb_return(struct usbnet *, struct sk_buff *); extern void usbnet_unlink_rx_urbs(struct usbnet *); extern void usbnet_pause_rx(struct usbnet *); extern void usbnet_resume_rx(struct usbnet *); extern void usbnet_purge_paused_rxq(struct usbnet *); extern int usbnet_get_link_ksettings_mii(struct net_device *net, struct ethtool_link_ksettings *cmd); extern int usbnet_set_link_ksettings_mii(struct net_device *net, const struct ethtool_link_ksettings *cmd); extern int usbnet_get_link_ksettings_internal(struct net_device *net, struct ethtool_link_ksettings *cmd); extern u32 usbnet_get_link(struct net_device *net); extern u32 usbnet_get_msglevel(struct net_device *); extern void usbnet_set_msglevel(struct net_device *, u32); extern void usbnet_set_rx_mode(struct net_device *net); extern void usbnet_get_drvinfo(struct net_device *, struct ethtool_drvinfo *); extern int usbnet_nway_reset(struct net_device *net); extern int usbnet_manage_power(struct usbnet *, int); extern void usbnet_link_change(struct usbnet *, bool, bool); extern int usbnet_status_start(struct usbnet *dev, gfp_t mem_flags); extern void usbnet_status_stop(struct usbnet *dev); extern void usbnet_update_max_qlen(struct usbnet *dev); #endif /* __LINUX_USB_USBNET_H */ |
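/*
 * Editor's sketch (not part of usbnet.h): how a hypothetical minidriver that
 * sets FLAG_MULTI_PACKET might honour the usbnet_set_skb_tx_stats() rule
 * documented above. Everything prefixed "example_" is invented for
 * illustration; the types and helpers are the ones declared in this header.
 * The packet/byte accounting shown (one packet, no net length change) is an
 * assumption for the sketch, not taken from any real driver.
 */
#include <linux/usb/usbnet.h>

static struct sk_buff *example_tx_fixup(struct usbnet *dev,
					struct sk_buff *skb, gfp_t flags)
{
	/* ... a real driver would add its device framing around skb here ... */

	/* one IP packet in this urb; no net length change in this sketch */
	usbnet_set_skb_tx_stats(skb, 1, 0);
	return skb;
}

static const struct driver_info example_info = {
	.description	= "example multi-packet device",
	.flags		= FLAG_MULTI_PACKET,
	.tx_fixup	= example_tx_fixup,
};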
// SPDX-License-Identifier: GPL-2.0
/*
 * Lockless hierarchical page accounting & limiting
 *
 * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner
 */

#include <linux/page_counter.h>
#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/bug.h>
#include <asm/page.h>

static bool track_protection(struct page_counter *c)
{
	return c->protection_support;
}

static void propagate_protected_usage(struct page_counter *c,
				      unsigned long usage)
{
	unsigned long protected, old_protected;
	long delta;

	if (!c->parent)
		return;

	protected = min(usage, READ_ONCE(c->min));
	old_protected = atomic_long_read(&c->min_usage);
	if (protected != old_protected) {
		old_protected = atomic_long_xchg(&c->min_usage, protected);
		delta = protected - old_protected;
		if (delta)
			atomic_long_add(delta, &c->parent->children_min_usage);
	}

	protected = min(usage, READ_ONCE(c->low));
	old_protected = atomic_long_read(&c->low_usage);
	if (protected != old_protected) {
		old_protected = atomic_long_xchg(&c->low_usage, protected);
		delta = protected - old_protected;
		if (delta)
			atomic_long_add(delta, &c->parent->children_low_usage);
	}
}

/**
 * page_counter_cancel - take pages out of the local counter
 * @counter: counter
 * @nr_pages: number of pages to cancel
 */
void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
{
	long new;

	new = atomic_long_sub_return(nr_pages, &counter->usage);
	/* More uncharges than charges?
*/ if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n", new, nr_pages)) { new = 0; atomic_long_set(&counter->usage, new); } if (track_protection(counter)) propagate_protected_usage(counter, new); } /** * page_counter_charge - hierarchically charge pages * @counter: counter * @nr_pages: number of pages to charge * * NOTE: This does not consider any configured counter limits. */ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; bool protection = track_protection(counter); for (c = counter; c; c = c->parent) { long new; new = atomic_long_add_return(nr_pages, &c->usage); if (protection) propagate_protected_usage(c, new); /* * This is indeed racy, but we can live with some * inaccuracy in the watermark. * * Notably, we have two watermarks to allow for both a globally * visible peak and one that can be reset at a smaller scope. * * Since we reset both watermarks when the global reset occurs, * we can guarantee that watermark >= local_watermark, so we * don't need to do both comparisons every time. * * On systems with branch predictors, the inner condition should * be almost free. */ if (new > READ_ONCE(c->local_watermark)) { WRITE_ONCE(c->local_watermark, new); if (new > READ_ONCE(c->watermark)) WRITE_ONCE(c->watermark, new); } } } /** * page_counter_try_charge - try to hierarchically charge pages * @counter: counter * @nr_pages: number of pages to charge * @fail: points first counter to hit its limit, if any * * Returns %true on success, or %false and @fail if the counter or one * of its ancestors has hit its configured limit. */ bool page_counter_try_charge(struct page_counter *counter, unsigned long nr_pages, struct page_counter **fail) { struct page_counter *c; bool protection = track_protection(counter); bool track_failcnt = counter->track_failcnt; for (c = counter; c; c = c->parent) { long new; /* * Charge speculatively to avoid an expensive CAS. If * a bigger charge fails, it might falsely lock out a * racing smaller charge and send it into reclaim * early, but the error is limited to the difference * between the two sizes, which is less than 2M/4M in * case of a THP locking out a regular page charge. * * The atomic_long_add_return() implies a full memory * barrier between incrementing the count and reading * the limit. When racing with page_counter_set_max(), * we either see the new limit or the setter sees the * counter has changed and retries. */ new = atomic_long_add_return(nr_pages, &c->usage); if (new > c->max) { atomic_long_sub(nr_pages, &c->usage); /* * This is racy, but we can live with some * inaccuracy in the failcnt which is only used * to report stats. 
*/ if (track_failcnt) data_race(c->failcnt++); *fail = c; goto failed; } if (protection) propagate_protected_usage(c, new); /* see comment on page_counter_charge */ if (new > READ_ONCE(c->local_watermark)) { WRITE_ONCE(c->local_watermark, new); if (new > READ_ONCE(c->watermark)) WRITE_ONCE(c->watermark, new); } } return true; failed: for (c = counter; c != *fail; c = c->parent) page_counter_cancel(c, nr_pages); return false; } /** * page_counter_uncharge - hierarchically uncharge pages * @counter: counter * @nr_pages: number of pages to uncharge */ void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; for (c = counter; c; c = c->parent) page_counter_cancel(c, nr_pages); } /** * page_counter_set_max - set the maximum number of pages allowed * @counter: counter * @nr_pages: limit to set * * Returns 0 on success, -EBUSY if the current number of pages on the * counter already exceeds the specified limit. * * The caller must serialize invocations on the same counter. */ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) { for (;;) { unsigned long old; long usage; /* * Update the limit while making sure that it's not * below the concurrently-changing counter value. * * The xchg implies two full memory barriers before * and after, so the read-swap-read is ordered and * ensures coherency with page_counter_try_charge(): * that function modifies the count before checking * the limit, so if it sees the old limit, we see the * modified counter and retry. */ usage = page_counter_read(counter); if (usage > nr_pages) return -EBUSY; old = xchg(&counter->max, nr_pages); if (page_counter_read(counter) <= usage || nr_pages >= old) return 0; counter->max = old; cond_resched(); } } /** * page_counter_set_min - set the amount of protected memory * @counter: counter * @nr_pages: value to set * * The caller must serialize invocations on the same counter. */ void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; WRITE_ONCE(counter->min, nr_pages); for (c = counter; c; c = c->parent) propagate_protected_usage(c, atomic_long_read(&c->usage)); } /** * page_counter_set_low - set the amount of protected memory * @counter: counter * @nr_pages: value to set * * The caller must serialize invocations on the same counter. */ void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; WRITE_ONCE(counter->low, nr_pages); for (c = counter; c; c = c->parent) propagate_protected_usage(c, atomic_long_read(&c->usage)); } /** * page_counter_memparse - memparse() for page counter limits * @buf: string to parse * @max: string meaning maximum possible value * @nr_pages: returns the result in number of pages * * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be * limited to %PAGE_COUNTER_MAX. */ int page_counter_memparse(const char *buf, const char *max, unsigned long *nr_pages) { char *end; u64 bytes; if (!strcmp(buf, max)) { *nr_pages = PAGE_COUNTER_MAX; return 0; } bytes = memparse(buf, &end); if (*end != '\0') return -EINVAL; *nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX); return 0; } #if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM) /* * This function calculates an individual page counter's effective * protection which is derived from its own memory.min/low, its * parent's and siblings' settings, as well as the actual memory * distribution in the tree. 
* * The following rules apply to the effective protection values: * * 1. At the first level of reclaim, effective protection is equal to * the declared protection in memory.min and memory.low. * * 2. To enable safe delegation of the protection configuration, at * subsequent levels the effective protection is capped to the * parent's effective protection. * * 3. To make complex and dynamic subtrees easier to configure, the * user is allowed to overcommit the declared protection at a given * level. If that is the case, the parent's effective protection is * distributed to the children in proportion to how much protection * they have declared and how much of it they are utilizing. * * This makes distribution proportional, but also work-conserving: * if one counter claims much more protection than it uses memory, * the unused remainder is available to its siblings. * * 4. Conversely, when the declared protection is undercommitted at a * given level, the distribution of the larger parental protection * budget is NOT proportional. A counter's protection from a sibling * is capped to its own memory.min/low setting. * * 5. However, to allow protecting recursive subtrees from each other * without having to declare each individual counter's fixed share * of the ancestor's claim to protection, any unutilized - * "floating" - protection from up the tree is distributed in * proportion to each counter's *usage*. This makes the protection * neutral wrt sibling cgroups and lets them compete freely over * the shared parental protection budget, but it protects the * subtree as a whole from neighboring subtrees. * * Note that 4. and 5. are not in conflict: 4. is about protecting * against immediate siblings whereas 5. is about protecting against * neighboring subtrees. */ static unsigned long effective_protection(unsigned long usage, unsigned long parent_usage, unsigned long setting, unsigned long parent_effective, unsigned long siblings_protected, bool recursive_protection) { unsigned long protected; unsigned long ep; protected = min(usage, setting); /* * If all cgroups at this level combined claim and use more * protection than what the parent affords them, distribute * shares in proportion to utilization. * * We are using actual utilization rather than the statically * claimed protection in order to be work-conserving: claimed * but unused protection is available to siblings that would * otherwise get a smaller chunk than what they claimed. */ if (siblings_protected > parent_effective) return protected * parent_effective / siblings_protected; /* * Ok, utilized protection of all children is within what the * parent affords them, so we know whatever this child claims * and utilizes is effectively protected. * * If there is unprotected usage beyond this value, reclaim * will apply pressure in proportion to that amount. * * If there is unutilized protection, the cgroup will be fully * shielded from reclaim, but we do return a smaller value for * protection than what the group could enjoy in theory. This * is okay. With the overcommit distribution above, effective * protection is always dependent on how memory is actually * consumed among the siblings anyway. */ ep = protected; /* * If the children aren't claiming (all of) the protection * afforded to them by the parent, distribute the remainder in * proportion to the (unprotected) memory of each cgroup. 
That * way, cgroups that aren't explicitly prioritized wrt each * other compete freely over the allowance, but they are * collectively protected from neighboring trees. * * We're using unprotected memory for the weight so that if * some cgroups DO claim explicit protection, we don't protect * the same bytes twice. * * Check both usage and parent_usage against the respective * protected values. One should imply the other, but they * aren't read atomically - make sure the division is sane. */ if (!recursive_protection) return ep; if (parent_effective > siblings_protected && parent_usage > siblings_protected && usage > protected) { unsigned long unclaimed; unclaimed = parent_effective - siblings_protected; unclaimed *= usage - protected; unclaimed /= parent_usage - siblings_protected; ep += unclaimed; } return ep; } /** * page_counter_calculate_protection - check if memory consumption is in the normal range * @root: the top ancestor of the sub-tree being checked * @counter: the page_counter the counter to update * @recursive_protection: Whether to use memory_recursiveprot behavior. * * Calculates elow/emin thresholds for given page_counter. * * WARNING: This function is not stateless! It can only be used as part * of a top-down tree iteration, not for isolated queries. */ void page_counter_calculate_protection(struct page_counter *root, struct page_counter *counter, bool recursive_protection) { unsigned long usage, parent_usage; struct page_counter *parent = counter->parent; /* * Effective values of the reclaim targets are ignored so they * can be stale. Have a look at mem_cgroup_protection for more * details. * TODO: calculation should be more robust so that we do not need * that special casing. */ if (root == counter) return; usage = page_counter_read(counter); if (!usage) return; if (parent == root) { counter->emin = READ_ONCE(counter->min); counter->elow = READ_ONCE(counter->low); return; } parent_usage = page_counter_read(parent); WRITE_ONCE(counter->emin, effective_protection(usage, parent_usage, READ_ONCE(counter->min), READ_ONCE(parent->emin), atomic_long_read(&parent->children_min_usage), recursive_protection)); WRITE_ONCE(counter->elow, effective_protection(usage, parent_usage, READ_ONCE(counter->low), READ_ONCE(parent->elow), atomic_long_read(&parent->children_low_usage), recursive_protection)); } #endif /* CONFIG_MEMCG || CONFIG_CGROUP_DMEM */ |
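/*
 * Editor's illustration (not part of page_counter.c): a worked example of the
 * overcommit rule in effective_protection() above. Two siblings claim 80 and
 * 40 pages of protection and fully use them, but the parent only affords 60,
 * so each child's effective protection is scaled by 60/120. The helper below
 * reimplements just that first clause in userspace, for illustration only.
 */
#include <stdio.h>

static unsigned long eff(unsigned long usage, unsigned long setting,
			 unsigned long parent_effective,
			 unsigned long siblings_protected)
{
	unsigned long protected = usage < setting ? usage : setting;

	/* overcommitted: distribute the parent's budget proportionally */
	if (siblings_protected > parent_effective)
		return protected * parent_effective / siblings_protected;
	return protected;
}

int main(void)
{
	/* both children fully use what they claim; claims sum to 120 */
	printf("child A: %lu\n", eff(80, 80, 60, 120));	/* 80*60/120 = 40 */
	printf("child B: %lu\n", eff(40, 40, 60, 120));	/* 40*60/120 = 20 */
	return 0;
}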
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

#ifndef _WG_QUEUEING_H
#define _WG_QUEUEING_H

#include "peer.h"
#include <linux/types.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <net/ip_tunnels.h>

struct wg_device;
struct wg_peer;
struct multicore_worker;
struct crypt_queue;
struct prev_queue;
struct sk_buff;

/* queueing.c APIs: */
int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function,
			 unsigned int len);
void wg_packet_queue_free(struct crypt_queue *queue, bool purge);
struct multicore_worker __percpu *
wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr);

/* receive.c APIs: */
void wg_packet_receive(struct wg_device *wg, struct sk_buff *skb);
void wg_packet_handshake_receive_worker(struct work_struct *work);
/* NAPI poll function: */
int wg_packet_rx_poll(struct napi_struct *napi, int budget);
/* Workqueue worker: */
void wg_packet_decrypt_worker(struct work_struct *work);

/* send.c APIs: */
void wg_packet_send_queued_handshake_initiation(struct wg_peer *peer,
						bool is_retry);
void wg_packet_send_handshake_response(struct wg_peer *peer);
void wg_packet_send_handshake_cookie(struct wg_device *wg,
				     struct sk_buff *initiating_skb,
				     __le32 sender_index);
void wg_packet_send_keepalive(struct wg_peer *peer);
void wg_packet_purge_staged_packets(struct wg_peer *peer);
void wg_packet_send_staged_packets(struct wg_peer *peer);
/* Workqueue workers: */
void wg_packet_handshake_send_worker(struct work_struct *work);
void wg_packet_tx_worker(struct work_struct *work);
void wg_packet_encrypt_worker(struct work_struct *work);

enum packet_state {
	PACKET_STATE_UNCRYPTED,
	PACKET_STATE_CRYPTED,
	PACKET_STATE_DEAD
};

struct packet_cb {
	u64 nonce;
	struct noise_keypair *keypair;
	atomic_t state;
	u32 mtu;
	u8 ds;
};

#define PACKET_CB(skb) ((struct packet_cb *)((skb)->cb))
#define PACKET_PEER(skb) (PACKET_CB(skb)->keypair->entry.peer)

static inline bool wg_check_packet_protocol(struct sk_buff *skb)
{
	__be16 real_protocol = ip_tunnel_parse_protocol(skb);

	return real_protocol && skb->protocol == real_protocol;
}

static inline void wg_reset_packet(struct sk_buff *skb, bool encapsulating)
{
	u8 l4_hash = skb->l4_hash;
	u8 sw_hash = skb->sw_hash;
	u32 hash = skb->hash;

	skb_scrub_packet(skb, true);
	memset(&skb->headers, 0, sizeof(skb->headers));
	if (encapsulating) {
		skb->l4_hash = l4_hash;
		skb->sw_hash = sw_hash;
		skb->hash = hash;
	}
	skb->queue_mapping = 0;
	skb->nohdr = 0;
	skb->peeked = 0;
	skb->mac_len = 0;
	skb->dev = NULL;
#ifdef CONFIG_NET_SCHED
	skb->tc_index = 0;
#endif
	skb_reset_redirect(skb);
	skb->hdr_len = skb_headroom(skb);
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
skb_probe_transport_header(skb); skb_reset_inner_headers(skb); } static inline int wg_cpumask_choose_online(int *stored_cpu, unsigned int id) { unsigned int cpu = *stored_cpu, cpu_index, i; if (unlikely(cpu >= nr_cpu_ids || !cpumask_test_cpu(cpu, cpu_online_mask))) { cpu_index = id % cpumask_weight(cpu_online_mask); cpu = cpumask_first(cpu_online_mask); for (i = 0; i < cpu_index; ++i) cpu = cpumask_next(cpu, cpu_online_mask); *stored_cpu = cpu; } return cpu; } /* This function is racy, in the sense that it's called while last_cpu is * unlocked, so it could return the same CPU twice. Adding locking or using * atomic sequence numbers is slower though, and the consequences of racing are * harmless, so live with it. */ static inline int wg_cpumask_next_online(int *last_cpu) { int cpu = cpumask_next(READ_ONCE(*last_cpu), cpu_online_mask); if (cpu >= nr_cpu_ids) cpu = cpumask_first(cpu_online_mask); WRITE_ONCE(*last_cpu, cpu); return cpu; } void wg_prev_queue_init(struct prev_queue *queue); /* Multi producer */ bool wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb); /* Single consumer */ struct sk_buff *wg_prev_queue_dequeue(struct prev_queue *queue); /* Single consumer */ static inline struct sk_buff *wg_prev_queue_peek(struct prev_queue *queue) { if (queue->peeked) return queue->peeked; queue->peeked = wg_prev_queue_dequeue(queue); return queue->peeked; } /* Single consumer */ static inline void wg_prev_queue_drop_peeked(struct prev_queue *queue) { queue->peeked = NULL; } static inline int wg_queue_enqueue_per_device_and_peer( struct crypt_queue *device_queue, struct prev_queue *peer_queue, struct sk_buff *skb, struct workqueue_struct *wq) { int cpu; atomic_set_release(&PACKET_CB(skb)->state, PACKET_STATE_UNCRYPTED); /* We first queue this up for the peer ingestion, but the consumer * will wait for the state to change to CRYPTED or DEAD before. */ if (unlikely(!wg_prev_queue_enqueue(peer_queue, skb))) return -ENOSPC; /* Then we queue it up in the device queue, which consumes the * packet as soon as it can. */ cpu = wg_cpumask_next_online(&device_queue->last_cpu); if (unlikely(ptr_ring_produce_bh(&device_queue->ring, skb))) return -EPIPE; queue_work_on(cpu, wq, &per_cpu_ptr(device_queue->worker, cpu)->work); return 0; } static inline void wg_queue_enqueue_per_peer_tx(struct sk_buff *skb, enum packet_state state) { /* We take a reference, because as soon as we call atomic_set, the * peer can be freed from below us. */ struct wg_peer *peer = wg_peer_get(PACKET_PEER(skb)); atomic_set_release(&PACKET_CB(skb)->state, state); queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu, peer->internal_id), peer->device->packet_crypt_wq, &peer->transmit_packet_work); wg_peer_put(peer); } static inline void wg_queue_enqueue_per_peer_rx(struct sk_buff *skb, enum packet_state state) { /* We take a reference, because as soon as we call atomic_set, the * peer can be freed from below us. */ struct wg_peer *peer = wg_peer_get(PACKET_PEER(skb)); atomic_set_release(&PACKET_CB(skb)->state, state); napi_schedule(&peer->napi); wg_peer_put(peer); } #ifdef DEBUG bool wg_packet_counter_selftest(void); #endif #endif /* _WG_QUEUEING_H */ |
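/*
 * Editor's sketch (not part of queueing.h): the round-robin CPU picker in
 * wg_cpumask_next_online() above, in miniature. The shared cursor is read
 * and written without a lock, so two concurrent callers could momentarily
 * pick the same slot; as the comment above notes, that race is tolerated
 * because the queued work is correct either way. Userspace demo names are
 * invented for illustration.
 */
#include <stdio.h>

#define NSLOTS 4

static int last_slot = -1;

static int next_slot(void)
{
	/* cpumask_next()/cpumask_first() analogue over a fixed slot set */
	int slot = (last_slot + 1) % NSLOTS;

	last_slot = slot;	/* the kernel uses WRITE_ONCE() here */
	return slot;
}

int main(void)
{
	for (int i = 0; i < 6; i++)
		printf("packet %d -> worker %d\n", i, next_slot());
	return 0;
}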
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
 * Copyright (c) 2013 Eric Leblond <eric@regit.org>
 *
 * Development of this code funded by Astaro AG (http://www.astaro.com/)
 */

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nft_reject.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>

const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = {
	[NFTA_REJECT_TYPE]		= NLA_POLICY_MAX(NLA_BE32, 255),
	[NFTA_REJECT_ICMP_CODE]		= { .type = NLA_U8 },
};
EXPORT_SYMBOL_GPL(nft_reject_policy);

int nft_reject_validate(const struct nft_ctx *ctx,
			const struct nft_expr *expr)
{
	return nft_chain_validate_hooks(ctx->chain,
					(1 << NF_INET_LOCAL_IN) |
					(1 << NF_INET_FORWARD) |
					(1 << NF_INET_LOCAL_OUT) |
					(1 << NF_INET_PRE_ROUTING));
}
EXPORT_SYMBOL_GPL(nft_reject_validate);

int nft_reject_init(const struct nft_ctx *ctx,
		    const struct nft_expr *expr,
		    const struct nlattr * const tb[])
{
	struct nft_reject *priv = nft_expr_priv(expr);
	int icmp_code;

	if (tb[NFTA_REJECT_TYPE] == NULL)
		return -EINVAL;

	priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE]));
	switch (priv->type) {
	case NFT_REJECT_ICMP_UNREACH:
	case NFT_REJECT_ICMPX_UNREACH:
		if (tb[NFTA_REJECT_ICMP_CODE] == NULL)
			return -EINVAL;

		icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]);
		if (priv->type == NFT_REJECT_ICMPX_UNREACH &&
		    icmp_code > NFT_REJECT_ICMPX_MAX)
			return -EINVAL;

		priv->icmp_code = icmp_code;
		break;
	case NFT_REJECT_TCP_RST:
		break;
	default:
		return -EINVAL;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(nft_reject_init);

int nft_reject_dump(struct sk_buff *skb,
		    const struct nft_expr *expr, bool reset)
{
	const struct nft_reject *priv = nft_expr_priv(expr);

	if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type)))
		goto nla_put_failure;

	switch (priv->type) {
	case NFT_REJECT_ICMP_UNREACH:
	case NFT_REJECT_ICMPX_UNREACH:
		if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code))
			goto nla_put_failure;
		break;
	default:
		break;
	}

	return 0;

nla_put_failure:
	return -1;
}
EXPORT_SYMBOL_GPL(nft_reject_dump);

static u8 icmp_code_v4[NFT_REJECT_ICMPX_MAX + 1] = {
	[NFT_REJECT_ICMPX_NO_ROUTE]		= ICMP_NET_UNREACH,
	[NFT_REJECT_ICMPX_PORT_UNREACH]		= ICMP_PORT_UNREACH,
	[NFT_REJECT_ICMPX_HOST_UNREACH]		= ICMP_HOST_UNREACH,
	[NFT_REJECT_ICMPX_ADMIN_PROHIBITED]	= ICMP_PKT_FILTERED,
};

int nft_reject_icmp_code(u8 code)
{
	if (WARN_ON_ONCE(code > NFT_REJECT_ICMPX_MAX))
		return ICMP_NET_UNREACH;

	return icmp_code_v4[code];
}
EXPORT_SYMBOL_GPL(nft_reject_icmp_code);

static u8 icmp_code_v6[NFT_REJECT_ICMPX_MAX + 1] = {
	[NFT_REJECT_ICMPX_NO_ROUTE]		= ICMPV6_NOROUTE,
	[NFT_REJECT_ICMPX_PORT_UNREACH]		= ICMPV6_PORT_UNREACH,
	[NFT_REJECT_ICMPX_HOST_UNREACH]		= ICMPV6_ADDR_UNREACH,
	[NFT_REJECT_ICMPX_ADMIN_PROHIBITED]	= ICMPV6_ADM_PROHIBITED,
};

int nft_reject_icmpv6_code(u8 code)
{
	if (WARN_ON_ONCE(code > NFT_REJECT_ICMPX_MAX))
		return ICMPV6_NOROUTE;

	return icmp_code_v6[code];
}
EXPORT_SYMBOL_GPL(nft_reject_icmpv6_code);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_DESCRIPTION("Netfilter x_tables over nftables module");
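/*
 * Editor's illustration (not part of nft_reject.c): the two tables above give
 * one abstract NFT_REJECT_ICMPX_* code a concrete meaning per address family.
 * A compact userspace view of the same mapping; the numeric values are copied
 * from the kernel's ICMP/ICMPv6 headers, everything else is invented for
 * illustration.
 */
#include <stdio.h>

static const struct {
	const char *name;
	unsigned char v4;	/* ICMP unreachable code */
	unsigned char v6;	/* ICMPv6 unreachable code */
} icmpx[] = {
	{ "no-route",		0 /* NET_UNREACH */,	0 /* NOROUTE */ },
	{ "port-unreachable",	3 /* PORT_UNREACH */,	4 /* PORT_UNREACH */ },
	{ "host-unreachable",	1 /* HOST_UNREACH */,	3 /* ADDR_UNREACH */ },
	{ "admin-prohibited",	13 /* PKT_FILTERED */,	1 /* ADM_PROHIBITED */ },
};

int main(void)
{
	for (unsigned int i = 0; i < sizeof(icmpx) / sizeof(icmpx[0]); i++)
		printf("%-18s v4=%u v6=%u\n", icmpx[i].name,
		       icmpx[i].v4, icmpx[i].v6);
	return 0;
}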
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2010 Werner Fink, Jiri Slaby
 */

#include <linux/console.h>
#include <linux/kernel.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/tty_driver.h>

/*
 * This is the handler for /proc/consoles
 */
static int show_console_dev(struct seq_file *m, void *v)
{
	static const struct {
		short flag;
		char name;
	} con_flags[] = {
		{ CON_ENABLED,		'E' },
		{ CON_CONSDEV,		'C' },
		{ CON_BOOT,		'B' },
		{ CON_NBCON,		'N' },
		{ CON_PRINTBUFFER,	'p' },
		{ CON_BRL,		'b' },
		{ CON_ANYTIME,		'a' },
	};
	char flags[ARRAY_SIZE(con_flags) + 1];
	struct console *con = v;
	unsigned int a;
	dev_t dev = 0;

	if (con->device) {
		const struct tty_driver *driver;
		int index;

		/*
		 * Take console_lock to serialize device() callback with
		 * other console operations. For example, fg_console is
		 * modified under console_lock when switching vt.
		 */
		console_lock();
		driver = con->device(con, &index);
		console_unlock();

		if (driver) {
			dev = MKDEV(driver->major, driver->minor_start);
			dev += index;
		}
	}

	for (a = 0; a < ARRAY_SIZE(con_flags); a++)
		flags[a] = (con->flags & con_flags[a].flag) ?
			con_flags[a].name : ' ';
	flags[a] = 0;

	seq_setwidth(m, 21 - 1);
	seq_printf(m, "%s%d", con->name, con->index);
	seq_pad(m, ' ');
	seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-',
		   ((con->flags & CON_NBCON) || con->write) ? 'W' : '-',
		   con->unblank ? 'U' : '-', flags);
	if (dev)
		seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev));

	seq_putc(m, '\n');
	return 0;
}

static void *c_start(struct seq_file *m, loff_t *pos)
	__acquires(&console_mutex)
{
	struct console *con;
	loff_t off = 0;

	/*
	 * Hold the console_list_lock to guarantee safe traversal of the
	 * console list. SRCU cannot be used because there is no
	 * place to store the SRCU cookie.
	 */
	console_list_lock();
	for_each_console(con)
		if (off++ == *pos)
			break;

	return con;
}

static void *c_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct console *con = v;

	++*pos;
	return hlist_entry_safe(con->node.next, struct console, node);
}

static void c_stop(struct seq_file *m, void *v)
	__releases(&console_mutex)
{
	console_list_unlock();
}

static const struct seq_operations consoles_op = {
	.start	= c_start,
	.next	= c_next,
	.stop	= c_stop,
	.show	= show_console_dev
};

static int __init proc_consoles_init(void)
{
	proc_create_seq("consoles", 0, NULL, &consoles_op);
	return 0;
}
fs_initcall(proc_consoles_init);
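/*
 * Editor's sketch (not part of consoles.c): the seq_file start/next/stop/show
 * pattern above reduced to its skeleton, iterating a plain array instead of
 * the console list. Hypothetical module code for illustration only; the
 * procfs and seq_file calls are the same ones used above.
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static const char *const items[] = { "alpha", "beta", "gamma" };

static void *ex_start(struct seq_file *m, loff_t *pos)
{
	return *pos < ARRAY_SIZE(items) ? (void *)&items[*pos] : NULL;
}

static void *ex_next(struct seq_file *m, void *v, loff_t *pos)
{
	++*pos;
	return ex_start(m, pos);
}

static void ex_stop(struct seq_file *m, void *v)
{
	/* nothing to unlock in this sketch */
}

static int ex_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%s\n", *(const char *const *)v);
	return 0;
}

static const struct seq_operations ex_op = {
	.start	= ex_start,
	.next	= ex_next,
	.stop	= ex_stop,
	.show	= ex_show
};

static int __init ex_init(void)
{
	proc_create_seq("example_items", 0, NULL, &ex_op);
	return 0;
}
module_init(ex_init);
MODULE_LICENSE("GPL");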
// SPDX-License-Identifier: GPL-2.0-or-later
/* mpihelp-mul_3.c  -  MPI helper functions
 * Copyright (C) 1994, 1996, 1997, 1998, 2001 Free Software Foundation, Inc.
 *
 * This file is part of GnuPG.
 *
 * Note: This code is heavily based on the GNU MP Library.
 *	 Actually it's the same code with only minor changes in the
 *	 way the data is stored; this is to support the abstraction
 *	 of an optional secure memory allocation which may be used
 *	 to avoid revealing of sensitive data due to paging etc.
 *	 The GNU MP Library itself is published under the LGPL;
 *	 however I decided to publish this code under the plain GPL.
 */

#include "mpi-internal.h"
#include "longlong.h"

mpi_limb_t
mpihelp_submul_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
		 mpi_size_t s1_size, mpi_limb_t s2_limb)
{
	mpi_limb_t cy_limb;
	mpi_size_t j;
	mpi_limb_t prod_high, prod_low;
	mpi_limb_t x;

	/* The loop counter and index J go from -SIZE to -1.  This way
	 * the loop becomes faster.  */
	j = -s1_size;
	res_ptr -= j;
	s1_ptr -= j;

	cy_limb = 0;
	do {
		umul_ppmm(prod_high, prod_low, s1_ptr[j], s2_limb);

		prod_low += cy_limb;
		cy_limb = (prod_low < cy_limb ? 1 : 0) + prod_high;

		x = res_ptr[j];
		prod_low = x - prod_low;
		cy_limb += prod_low > x ? 1 : 0;
		res_ptr[j] = prod_low;
	} while (++j);

	return cy_limb;
}
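/*
 * Editor's illustration (not part of mpihelp-mul_3.c): the negative-index
 * idiom used above, in isolation. Offsetting the pointer once and counting
 * j up toward zero makes the loop condition "++j != 0", which compilers can
 * turn into a single increment-and-branch. Userspace demo, names invented.
 */
#include <stdio.h>

static unsigned long sum_limbs(const unsigned long *p, long size)
{
	unsigned long acc = 0;
	long j = -size;

	p -= j;		/* p now points one past the array; p[j] walks it */
	do {
		acc += p[j];
	} while (++j);
	return acc;
}

int main(void)
{
	unsigned long v[] = { 1, 2, 3, 4 };

	printf("%lu\n", sum_limbs(v, 4));	/* prints 10 */
	return 0;
}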
/*
 * llc_sap.c - driver routines for SAP component.
 *
 * Copyright (c) 1997 by Procom Technology, Inc.
 *		 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
 *
 * This program can be redistributed or modified under the terms of the
 * GNU General Public License as published by the Free Software Foundation.
 * This program is distributed without any warranty or implied warranty
 * of merchantability or fitness for a particular purpose.
 *
 * See the GNU General Public License for more details.
 */

#include <net/llc.h>
#include <net/llc_if.h>
#include <net/llc_conn.h>
#include <net/llc_pdu.h>
#include <net/llc_sap.h>
#include <net/llc_s_ac.h>
#include <net/llc_s_ev.h>
#include <net/llc_s_st.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <linux/llc.h>
#include <linux/slab.h>

static int llc_mac_header_len(unsigned short devtype)
{
	switch (devtype) {
	case ARPHRD_ETHER:
	case ARPHRD_LOOPBACK:
		return sizeof(struct ethhdr);
	}
	return 0;
}

/**
 * llc_alloc_frame - allocates sk_buff for frame
 * @sk:  socket to allocate frame to
 * @dev: network device this skb will be sent over
 * @type: pdu type to allocate
 * @data_size: data size to allocate
 *
 * Allocates an sk_buff for frame and initializes sk_buff fields.
 * Returns allocated skb or %NULL when out of memory.
 */
struct sk_buff *llc_alloc_frame(struct sock *sk, struct net_device *dev,
				u8 type, u32 data_size)
{
	int hlen = type == LLC_PDU_TYPE_U ?
3 : 4; struct sk_buff *skb; hlen += llc_mac_header_len(dev->type); skb = alloc_skb(hlen + data_size, GFP_ATOMIC); if (skb) { skb_reset_mac_header(skb); skb_reserve(skb, hlen); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb->protocol = htons(ETH_P_802_2); skb->dev = dev; if (sk != NULL) skb_set_owner_w(skb, sk); } return skb; } void llc_save_primitive(struct sock *sk, struct sk_buff *skb, u8 prim) { struct sockaddr_llc *addr; /* save primitive for use by the user. */ addr = llc_ui_skb_cb(skb); memset(addr, 0, sizeof(*addr)); addr->sllc_family = sk->sk_family; addr->sllc_arphrd = skb->dev->type; addr->sllc_test = prim == LLC_TEST_PRIM; addr->sllc_xid = prim == LLC_XID_PRIM; addr->sllc_ua = prim == LLC_DATAUNIT_PRIM; llc_pdu_decode_sa(skb, addr->sllc_mac); llc_pdu_decode_ssap(skb, &addr->sllc_sap); } /** * llc_sap_rtn_pdu - Informs upper layer on rx of an UI, XID or TEST pdu. * @sap: pointer to SAP * @skb: received pdu */ void llc_sap_rtn_pdu(struct llc_sap *sap, struct sk_buff *skb) { struct llc_sap_state_ev *ev = llc_sap_ev(skb); struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); switch (LLC_U_PDU_RSP(pdu)) { case LLC_1_PDU_CMD_TEST: ev->prim = LLC_TEST_PRIM; break; case LLC_1_PDU_CMD_XID: ev->prim = LLC_XID_PRIM; break; case LLC_1_PDU_CMD_UI: ev->prim = LLC_DATAUNIT_PRIM; break; } ev->ind_cfm_flag = LLC_IND; } /** * llc_find_sap_trans - finds transition for event * @sap: pointer to SAP * @skb: happened event * * This function finds transition that matches with happened event. * Returns the pointer to found transition on success or %NULL for * failure. */ static const struct llc_sap_state_trans *llc_find_sap_trans(struct llc_sap *sap, struct sk_buff *skb) { int i = 0; const struct llc_sap_state_trans *rc = NULL; const struct llc_sap_state_trans **next_trans; struct llc_sap_state *curr_state = &llc_sap_state_table[sap->state - 1]; /* * Search thru events for this state until list exhausted or until * its obvious the event is not valid for the current state */ for (next_trans = curr_state->transitions; next_trans[i]->ev; i++) if (!next_trans[i]->ev(sap, skb)) { rc = next_trans[i]; /* got event match; return it */ break; } return rc; } /** * llc_exec_sap_trans_actions - execute actions related to event * @sap: pointer to SAP * @trans: pointer to transition that it's actions must be performed * @skb: happened event. * * This function executes actions that is related to happened event. * Returns 0 for success and 1 for failure of at least one action. */ static int llc_exec_sap_trans_actions(struct llc_sap *sap, const struct llc_sap_state_trans *trans, struct sk_buff *skb) { int rc = 0; const llc_sap_action_t *next_action = trans->ev_actions; for (; next_action && *next_action; next_action++) if ((*next_action)(sap, skb)) rc = 1; return rc; } /** * llc_sap_next_state - finds transition, execs actions & change SAP state * @sap: pointer to SAP * @skb: happened event * * This function finds transition that matches with happened event, then * executes related actions and finally changes state of SAP. It returns * 0 on success and 1 for failure. 
*/ static int llc_sap_next_state(struct llc_sap *sap, struct sk_buff *skb) { const struct llc_sap_state_trans *trans; int rc = 1; if (sap->state > LLC_NR_SAP_STATES) goto out; trans = llc_find_sap_trans(sap, skb); if (!trans) goto out; /* * Got the state to which we next transition; perform the actions * associated with this transition before actually transitioning to the * next state */ rc = llc_exec_sap_trans_actions(sap, trans, skb); if (rc) goto out; /* * Transition SAP to next state if all actions execute successfully */ sap->state = trans->next_state; out: return rc; } /** * llc_sap_state_process - sends event to SAP state machine * @sap: sap to use * @skb: pointer to occurred event * * After executing actions of the event, upper layer will be indicated * if needed(on receiving an UI frame). sk can be null for the * datalink_proto case. * * This function always consumes a reference to the skb. */ static void llc_sap_state_process(struct llc_sap *sap, struct sk_buff *skb) { struct llc_sap_state_ev *ev = llc_sap_ev(skb); ev->ind_cfm_flag = 0; llc_sap_next_state(sap, skb); if (ev->ind_cfm_flag == LLC_IND && skb->sk->sk_state != TCP_LISTEN) { llc_save_primitive(skb->sk, skb, ev->prim); /* queue skb to the user. */ if (sock_queue_rcv_skb(skb->sk, skb) == 0) return; } kfree_skb(skb); } /** * llc_build_and_send_test_pkt - TEST interface for upper layers. * @sap: sap to use * @skb: packet to send * @dmac: destination mac address * @dsap: destination sap * * This function is called when upper layer wants to send a TEST pdu. * Returns 0 for success, 1 otherwise. */ void llc_build_and_send_test_pkt(struct llc_sap *sap, struct sk_buff *skb, u8 *dmac, u8 dsap) { struct llc_sap_state_ev *ev = llc_sap_ev(skb); ev->saddr.lsap = sap->laddr.lsap; ev->daddr.lsap = dsap; memcpy(ev->saddr.mac, skb->dev->dev_addr, IFHWADDRLEN); memcpy(ev->daddr.mac, dmac, IFHWADDRLEN); ev->type = LLC_SAP_EV_TYPE_PRIM; ev->prim = LLC_TEST_PRIM; ev->prim_type = LLC_PRIM_TYPE_REQ; llc_sap_state_process(sap, skb); } /** * llc_build_and_send_xid_pkt - XID interface for upper layers * @sap: sap to use * @skb: packet to send * @dmac: destination mac address * @dsap: destination sap * * This function is called when upper layer wants to send a XID pdu. * Returns 0 for success, 1 otherwise. */ void llc_build_and_send_xid_pkt(struct llc_sap *sap, struct sk_buff *skb, u8 *dmac, u8 dsap) { struct llc_sap_state_ev *ev = llc_sap_ev(skb); ev->saddr.lsap = sap->laddr.lsap; ev->daddr.lsap = dsap; memcpy(ev->saddr.mac, skb->dev->dev_addr, IFHWADDRLEN); memcpy(ev->daddr.mac, dmac, IFHWADDRLEN); ev->type = LLC_SAP_EV_TYPE_PRIM; ev->prim = LLC_XID_PRIM; ev->prim_type = LLC_PRIM_TYPE_REQ; llc_sap_state_process(sap, skb); } /** * llc_sap_rcv - sends received pdus to the sap state machine * @sap: current sap component structure. * @skb: received frame. * @sk: socket to associate to frame * * Sends received pdus to the sap state machine. 
*/ static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb, struct sock *sk) { struct llc_sap_state_ev *ev = llc_sap_ev(skb); ev->type = LLC_SAP_EV_TYPE_PDU; ev->reason = 0; skb_orphan(skb); sock_hold(sk); skb->sk = sk; skb->destructor = sock_efree; llc_sap_state_process(sap, skb); } static inline bool llc_dgram_match(const struct llc_sap *sap, const struct llc_addr *laddr, const struct sock *sk, const struct net *net) { struct llc_sock *llc = llc_sk(sk); return sk->sk_type == SOCK_DGRAM && net_eq(sock_net(sk), net) && llc->laddr.lsap == laddr->lsap && ether_addr_equal(llc->laddr.mac, laddr->mac); } /** * llc_lookup_dgram - Finds dgram socket for the local sap/mac * @sap: SAP * @laddr: address of local LLC (MAC + SAP) * @net: netns to look up a socket in * * Search socket list of the SAP and finds connection using the local * mac, and local sap. Returns pointer for socket found, %NULL otherwise. */ static struct sock *llc_lookup_dgram(struct llc_sap *sap, const struct llc_addr *laddr, const struct net *net) { struct sock *rc; struct hlist_nulls_node *node; int slot = llc_sk_laddr_hashfn(sap, laddr); struct hlist_nulls_head *laddr_hb = &sap->sk_laddr_hash[slot]; rcu_read_lock_bh(); again: sk_nulls_for_each_rcu(rc, node, laddr_hb) { if (llc_dgram_match(sap, laddr, rc, net)) { /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ if (unlikely(!refcount_inc_not_zero(&rc->sk_refcnt))) goto again; if (unlikely(llc_sk(rc)->sap != sap || !llc_dgram_match(sap, laddr, rc, net))) { sock_put(rc); continue; } goto found; } } rc = NULL; /* * if the nulls value we got at the end of this lookup is * not the expected one, we must restart lookup. * We probably met an item that was moved to another chain. */ if (unlikely(get_nulls_value(node) != slot)) goto again; found: rcu_read_unlock_bh(); return rc; } static inline bool llc_mcast_match(const struct llc_sap *sap, const struct llc_addr *laddr, const struct sk_buff *skb, const struct sock *sk) { struct llc_sock *llc = llc_sk(sk); return sk->sk_type == SOCK_DGRAM && llc->laddr.lsap == laddr->lsap && llc->dev == skb->dev; } static void llc_do_mcast(struct llc_sap *sap, struct sk_buff *skb, struct sock **stack, int count) { struct sk_buff *skb1; int i; for (i = 0; i < count; i++) { skb1 = skb_clone(skb, GFP_ATOMIC); if (!skb1) { sock_put(stack[i]); continue; } llc_sap_rcv(sap, skb1, stack[i]); sock_put(stack[i]); } } /** * llc_sap_mcast - Deliver multicast PDU's to all matching datagram sockets. * @sap: SAP * @laddr: address of local LLC (MAC + SAP) * @skb: PDU to deliver * * Search socket list of the SAP and finds connections with same sap. * Deliver clone to each. 
*/ static void llc_sap_mcast(struct llc_sap *sap, const struct llc_addr *laddr, struct sk_buff *skb) { int i = 0; struct sock *sk; struct sock *stack[256 / sizeof(struct sock *)]; struct llc_sock *llc; struct hlist_head *dev_hb = llc_sk_dev_hash(sap, skb->dev->ifindex); spin_lock_bh(&sap->sk_lock); hlist_for_each_entry(llc, dev_hb, dev_hash_node) { sk = &llc->sk; if (!llc_mcast_match(sap, laddr, skb, sk)) continue; sock_hold(sk); if (i < ARRAY_SIZE(stack)) stack[i++] = sk; else { llc_do_mcast(sap, skb, stack, i); i = 0; } } spin_unlock_bh(&sap->sk_lock); llc_do_mcast(sap, skb, stack, i); } void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb) { struct llc_addr laddr; llc_pdu_decode_da(skb, laddr.mac); llc_pdu_decode_dsap(skb, &laddr.lsap); if (is_multicast_ether_addr(laddr.mac)) { llc_sap_mcast(sap, &laddr, skb); kfree_skb(skb); } else { struct sock *sk = llc_lookup_dgram(sap, &laddr, dev_net(skb->dev)); if (sk) { llc_sap_rcv(sap, skb, sk); sock_put(sk); } else kfree_skb(skb); } } |
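llc_find_sap_trans(), llc_exec_sap_trans_actions() and llc_sap_next_state() above implement a classic table-driven state machine: scan the current state's transition table for the first entry whose event qualifier accepts the skb, run that entry's NULL-terminated action list, and only then move to next_state. Below is a minimal user-space sketch of the same pattern; every type and name in it (struct fsm, struct trans, ev_is_ping, ...) is invented for illustration and a real implementation, like llc_sap_state_table above, would keep one transition table per state.

/* Standalone sketch, not kernel code. */
#include <stdio.h>

struct fsm { int state; };

typedef int (*ev_fn)(struct fsm *f, int event);	/* 0 means "matches" */
typedef int (*act_fn)(struct fsm *f, int event);	/* 0 means "ok"     */

struct trans {
	ev_fn ev;		/* NULL entry terminates the table */
	const act_fn *actions;	/* NULL-terminated action list     */
	int next_state;
};

static int ev_is_ping(struct fsm *f, int event)
{
	return event != 1;	/* event 1 is "ping" in this toy model */
}

static int act_log(struct fsm *f, int event)
{
	printf("state %d handled event %d\n", f->state, event);
	return 0;
}

static const act_fn ping_actions[] = { act_log, NULL };
static const struct trans state0_trans[] = {
	{ ev_is_ping, ping_actions, 1 },
	{ NULL, NULL, 0 },
};

/* Mirrors llc_sap_next_state(): find a matching transition, run its
 * actions, and change state only if all of them succeeded.
 * Returns 0 on success, 1 on failure.
 */
static int fsm_next_state(struct fsm *f, int event)
{
	for (const struct trans *t = state0_trans; t->ev; t++) {
		if (t->ev(f, event))
			continue;	/* no match, try the next entry */
		for (const act_fn *a = t->actions; a && *a; a++)
			if ((*a)(f, event))
				return 1;
		f->state = t->next_state;
		return 0;
	}
	return 1;
}

int main(void)
{
	struct fsm f = { .state = 0 };

	return fsm_next_state(&f, 1);
}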
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
 * Copyright (c) 2019 Mellanox Technologies. All rights reserved.
*/ #include <rdma/ib_verbs.h> #include <rdma/rdma_counter.h> #include "core_priv.h" #include "restrack.h" #define ALL_AUTO_MODE_MASKS (RDMA_COUNTER_MASK_QP_TYPE | RDMA_COUNTER_MASK_PID) static int __counter_set_mode(struct rdma_port_counter *port_counter, enum rdma_nl_counter_mode new_mode, enum rdma_nl_counter_mask new_mask, bool bind_opcnt) { if (new_mode == RDMA_COUNTER_MODE_AUTO) { if (new_mask & (~ALL_AUTO_MODE_MASKS)) return -EINVAL; if (port_counter->num_counters) return -EBUSY; } port_counter->mode.mode = new_mode; port_counter->mode.mask = new_mask; port_counter->mode.bind_opcnt = bind_opcnt; return 0; } /* * rdma_counter_set_auto_mode() - Turn on/off per-port auto mode * * @dev: Device to operate * @port: Port to use * @mask: Mask to configure * @extack: Message to the user * * Return 0 on success. If counter mode wasn't changed then it is considered * as success as well. * Return -EBUSY when changing to auto mode while there are bounded counters. * */ int rdma_counter_set_auto_mode(struct ib_device *dev, u32 port, enum rdma_nl_counter_mask mask, bool bind_opcnt, struct netlink_ext_ack *extack) { struct rdma_port_counter *port_counter; enum rdma_nl_counter_mode mode; int ret; port_counter = &dev->port_data[port].port_counter; if (!port_counter->hstats) return -EOPNOTSUPP; mutex_lock(&port_counter->lock); if (mask) mode = RDMA_COUNTER_MODE_AUTO; else mode = (port_counter->num_counters) ? RDMA_COUNTER_MODE_MANUAL : RDMA_COUNTER_MODE_NONE; if (port_counter->mode.mode == mode && port_counter->mode.mask == mask && port_counter->mode.bind_opcnt == bind_opcnt) { ret = 0; goto out; } ret = __counter_set_mode(port_counter, mode, mask, bind_opcnt); out: mutex_unlock(&port_counter->lock); if (ret == -EBUSY) NL_SET_ERR_MSG( extack, "Modifying auto mode is not allowed when there is a bound QP"); return ret; } static void auto_mode_init_counter(struct rdma_counter *counter, const struct ib_qp *qp, enum rdma_nl_counter_mask new_mask) { struct auto_mode_param *param = &counter->mode.param; counter->mode.mode = RDMA_COUNTER_MODE_AUTO; counter->mode.mask = new_mask; if (new_mask & RDMA_COUNTER_MASK_QP_TYPE) param->qp_type = qp->qp_type; } static int __rdma_counter_bind_qp(struct rdma_counter *counter, struct ib_qp *qp, u32 port) { int ret; if (qp->counter) return -EINVAL; if (!qp->device->ops.counter_bind_qp) return -EOPNOTSUPP; mutex_lock(&counter->lock); ret = qp->device->ops.counter_bind_qp(counter, qp, port); mutex_unlock(&counter->lock); return ret; } int rdma_counter_modify(struct ib_device *dev, u32 port, unsigned int index, bool enable) { struct rdma_hw_stats *stats; int ret = 0; if (!dev->ops.modify_hw_stat) return -EOPNOTSUPP; stats = ib_get_hw_stats_port(dev, port); if (!stats || index >= stats->num_counters || !(stats->descs[index].flags & IB_STAT_FLAG_OPTIONAL)) return -EINVAL; mutex_lock(&stats->lock); if (enable != test_bit(index, stats->is_disabled)) goto out; ret = dev->ops.modify_hw_stat(dev, port, index, enable); if (ret) goto out; if (enable) clear_bit(index, stats->is_disabled); else set_bit(index, stats->is_disabled); out: mutex_unlock(&stats->lock); return ret; } static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port, struct ib_qp *qp, enum rdma_nl_counter_mode mode, bool bind_opcnt) { struct rdma_port_counter *port_counter; struct rdma_counter *counter; int ret; if (!dev->ops.counter_dealloc || !dev->ops.counter_alloc_stats) return NULL; counter = rdma_zalloc_drv_obj(dev, rdma_counter); if (!counter) return NULL; counter->device = dev; counter->port = 
port; dev->ops.counter_init(counter); rdma_restrack_new(&counter->res, RDMA_RESTRACK_COUNTER); counter->stats = dev->ops.counter_alloc_stats(counter); if (!counter->stats) goto err_stats; port_counter = &dev->port_data[port].port_counter; mutex_lock(&port_counter->lock); switch (mode) { case RDMA_COUNTER_MODE_MANUAL: ret = __counter_set_mode(port_counter, RDMA_COUNTER_MODE_MANUAL, 0, bind_opcnt); if (ret) { mutex_unlock(&port_counter->lock); goto err_mode; } break; case RDMA_COUNTER_MODE_AUTO: auto_mode_init_counter(counter, qp, port_counter->mode.mask); break; default: ret = -EOPNOTSUPP; mutex_unlock(&port_counter->lock); goto err_mode; } port_counter->num_counters++; mutex_unlock(&port_counter->lock); counter->mode.mode = mode; counter->mode.bind_opcnt = bind_opcnt; kref_init(&counter->kref); mutex_init(&counter->lock); ret = __rdma_counter_bind_qp(counter, qp, port); if (ret) goto err_mode; rdma_restrack_parent_name(&counter->res, &qp->res); rdma_restrack_add(&counter->res); return counter; err_mode: rdma_free_hw_stats_struct(counter->stats); err_stats: rdma_restrack_put(&counter->res); kfree(counter); return NULL; } static void rdma_counter_free(struct rdma_counter *counter) { struct rdma_port_counter *port_counter; port_counter = &counter->device->port_data[counter->port].port_counter; mutex_lock(&port_counter->lock); port_counter->num_counters--; if (!port_counter->num_counters && (port_counter->mode.mode == RDMA_COUNTER_MODE_MANUAL)) __counter_set_mode(port_counter, RDMA_COUNTER_MODE_NONE, 0, false); mutex_unlock(&port_counter->lock); rdma_restrack_del(&counter->res); rdma_free_hw_stats_struct(counter->stats); kfree(counter); } static bool auto_mode_match(struct ib_qp *qp, struct rdma_counter *counter, enum rdma_nl_counter_mask auto_mask) { struct auto_mode_param *param = &counter->mode.param; bool match = true; if (auto_mask & RDMA_COUNTER_MASK_QP_TYPE) match &= (param->qp_type == qp->qp_type); if (auto_mask & RDMA_COUNTER_MASK_PID) match &= (task_pid_nr(counter->res.task) == task_pid_nr(qp->res.task)); return match; } static int __rdma_counter_unbind_qp(struct ib_qp *qp, u32 port) { struct rdma_counter *counter = qp->counter; int ret; if (!qp->device->ops.counter_unbind_qp) return -EOPNOTSUPP; mutex_lock(&counter->lock); ret = qp->device->ops.counter_unbind_qp(qp, port); mutex_unlock(&counter->lock); return ret; } static void counter_history_stat_update(struct rdma_counter *counter) { struct ib_device *dev = counter->device; struct rdma_port_counter *port_counter; int i; port_counter = &dev->port_data[counter->port].port_counter; if (!port_counter->hstats) return; rdma_counter_query_stats(counter); for (i = 0; i < counter->stats->num_counters; i++) port_counter->hstats->value[i] += counter->stats->value[i]; } /* * rdma_get_counter_auto_mode - Find the counter that @qp should be bound * with in auto mode * * Return: The counter (with ref-count increased) if found */ static struct rdma_counter *rdma_get_counter_auto_mode(struct ib_qp *qp, u32 port) { struct rdma_port_counter *port_counter; struct rdma_counter *counter = NULL; struct ib_device *dev = qp->device; struct rdma_restrack_entry *res; struct rdma_restrack_root *rt; unsigned long id = 0; port_counter = &dev->port_data[port].port_counter; rt = &dev->res[RDMA_RESTRACK_COUNTER]; xa_lock(&rt->xa); xa_for_each(&rt->xa, id, res) { counter = container_of(res, struct rdma_counter, res); if ((counter->device != qp->device) || (counter->port != port)) goto next; if (auto_mode_match(qp, counter, port_counter->mode.mask)) break; next: 
counter = NULL; } if (counter && !kref_get_unless_zero(&counter->kref)) counter = NULL; xa_unlock(&rt->xa); return counter; } static void counter_release(struct kref *kref) { struct rdma_counter *counter; counter = container_of(kref, struct rdma_counter, kref); counter_history_stat_update(counter); counter->device->ops.counter_dealloc(counter); rdma_counter_free(counter); } /* * rdma_counter_bind_qp_auto - Check and bind the QP to a counter base on * the auto-mode rule */ int rdma_counter_bind_qp_auto(struct ib_qp *qp, u32 port) { struct rdma_port_counter *port_counter; struct ib_device *dev = qp->device; struct rdma_counter *counter; int ret; if (!rdma_restrack_is_tracked(&qp->res) || rdma_is_kernel_res(&qp->res)) return 0; if (!rdma_is_port_valid(dev, port)) return -EINVAL; port_counter = &dev->port_data[port].port_counter; if (port_counter->mode.mode != RDMA_COUNTER_MODE_AUTO) return 0; counter = rdma_get_counter_auto_mode(qp, port); if (counter) { ret = __rdma_counter_bind_qp(counter, qp, port); if (ret) { kref_put(&counter->kref, counter_release); return ret; } } else { counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_AUTO, port_counter->mode.bind_opcnt); if (!counter) return -ENOMEM; } return 0; } /* * rdma_counter_unbind_qp - Unbind a qp from a counter * @force: * true - Decrease the counter ref-count anyway (e.g., qp destroy) */ int rdma_counter_unbind_qp(struct ib_qp *qp, u32 port, bool force) { struct rdma_counter *counter = qp->counter; int ret; if (!counter) return -EINVAL; ret = __rdma_counter_unbind_qp(qp, port); if (ret && !force) return ret; kref_put(&counter->kref, counter_release); return 0; } int rdma_counter_query_stats(struct rdma_counter *counter) { struct ib_device *dev = counter->device; int ret; if (!dev->ops.counter_update_stats) return -EINVAL; mutex_lock(&counter->lock); ret = dev->ops.counter_update_stats(counter); mutex_unlock(&counter->lock); return ret; } static u64 get_running_counters_hwstat_sum(struct ib_device *dev, u32 port, u32 index) { struct rdma_restrack_entry *res; struct rdma_restrack_root *rt; struct rdma_counter *counter; unsigned long id = 0; u64 sum = 0; rt = &dev->res[RDMA_RESTRACK_COUNTER]; xa_lock(&rt->xa); xa_for_each(&rt->xa, id, res) { if (!rdma_restrack_get(res)) continue; xa_unlock(&rt->xa); counter = container_of(res, struct rdma_counter, res); if ((counter->device != dev) || (counter->port != port) || rdma_counter_query_stats(counter)) goto next; sum += counter->stats->value[index]; next: xa_lock(&rt->xa); rdma_restrack_put(res); } xa_unlock(&rt->xa); return sum; } /* * rdma_counter_get_hwstat_value() - Get the sum value of all counters on a * specific port, including the running ones and history data */ u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u32 port, u32 index) { struct rdma_port_counter *port_counter; u64 sum; port_counter = &dev->port_data[port].port_counter; if (!port_counter->hstats) return 0; sum = get_running_counters_hwstat_sum(dev, port, index); sum += port_counter->hstats->value[index]; return sum; } static struct ib_qp *rdma_counter_get_qp(struct ib_device *dev, u32 qp_num) { struct rdma_restrack_entry *res = NULL; struct ib_qp *qp = NULL; res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_QP, qp_num); if (IS_ERR(res)) return NULL; qp = container_of(res, struct ib_qp, res); if (qp->qp_type == IB_QPT_RAW_PACKET && !rdma_dev_has_raw_cap(dev)) goto err; return qp; err: rdma_restrack_put(res); return NULL; } static struct rdma_counter *rdma_get_counter_by_id(struct ib_device *dev, u32 counter_id) { 
struct rdma_restrack_entry *res; struct rdma_counter *counter; res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_COUNTER, counter_id); if (IS_ERR(res)) return NULL; counter = container_of(res, struct rdma_counter, res); kref_get(&counter->kref); rdma_restrack_put(res); return counter; } /* * rdma_counter_bind_qpn() - Bind QP @qp_num to counter @counter_id */ int rdma_counter_bind_qpn(struct ib_device *dev, u32 port, u32 qp_num, u32 counter_id) { struct rdma_port_counter *port_counter; struct rdma_counter *counter; struct ib_qp *qp; int ret; port_counter = &dev->port_data[port].port_counter; if (port_counter->mode.mode == RDMA_COUNTER_MODE_AUTO) return -EINVAL; qp = rdma_counter_get_qp(dev, qp_num); if (!qp) return -ENOENT; counter = rdma_get_counter_by_id(dev, counter_id); if (!counter) { ret = -ENOENT; goto err; } if (rdma_is_kernel_res(&counter->res) != rdma_is_kernel_res(&qp->res)) { ret = -EINVAL; goto err_task; } if ((counter->device != qp->device) || (counter->port != qp->port)) { ret = -EINVAL; goto err_task; } ret = __rdma_counter_bind_qp(counter, qp, port); if (ret) goto err_task; rdma_restrack_put(&qp->res); return 0; err_task: kref_put(&counter->kref, counter_release); err: rdma_restrack_put(&qp->res); return ret; } /* * rdma_counter_bind_qpn_alloc() - Alloc a counter and bind QP @qp_num to it * The id of new counter is returned in @counter_id */ int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u32 port, u32 qp_num, u32 *counter_id) { struct rdma_port_counter *port_counter; struct rdma_counter *counter; struct ib_qp *qp; int ret; if (!rdma_is_port_valid(dev, port)) return -EINVAL; port_counter = &dev->port_data[port].port_counter; if (!port_counter->hstats) return -EOPNOTSUPP; if (port_counter->mode.mode == RDMA_COUNTER_MODE_AUTO) return -EINVAL; qp = rdma_counter_get_qp(dev, qp_num); if (!qp) return -ENOENT; if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) { ret = -EINVAL; goto err; } counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_MANUAL, true); if (!counter) { ret = -ENOMEM; goto err; } if (counter_id) *counter_id = counter->id; rdma_restrack_put(&qp->res); return 0; err: rdma_restrack_put(&qp->res); return ret; } /* * rdma_counter_unbind_qpn() - Unbind QP @qp_num from a counter */ int rdma_counter_unbind_qpn(struct ib_device *dev, u32 port, u32 qp_num, u32 counter_id) { struct rdma_port_counter *port_counter; struct ib_qp *qp; int ret; if (!rdma_is_port_valid(dev, port)) return -EINVAL; qp = rdma_counter_get_qp(dev, qp_num); if (!qp) return -ENOENT; if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) { ret = -EINVAL; goto out; } port_counter = &dev->port_data[port].port_counter; if (!qp->counter || qp->counter->id != counter_id || port_counter->mode.mode != RDMA_COUNTER_MODE_MANUAL) { ret = -EINVAL; goto out; } ret = rdma_counter_unbind_qp(qp, port, false); out: rdma_restrack_put(&qp->res); return ret; } int rdma_counter_get_mode(struct ib_device *dev, u32 port, enum rdma_nl_counter_mode *mode, enum rdma_nl_counter_mask *mask, bool *opcnt) { struct rdma_port_counter *port_counter; port_counter = &dev->port_data[port].port_counter; *mode = port_counter->mode.mode; *mask = port_counter->mode.mask; *opcnt = port_counter->mode.bind_opcnt; return 0; } void rdma_counter_init(struct ib_device *dev) { struct rdma_port_counter *port_counter; u32 port, i; if (!dev->port_data) return; rdma_for_each_port(dev, port) { port_counter = &dev->port_data[port].port_counter; port_counter->mode.mode = RDMA_COUNTER_MODE_NONE; mutex_init(&port_counter->lock); if 
(!dev->ops.alloc_hw_port_stats) continue; port_counter->hstats = dev->ops.alloc_hw_port_stats(dev, port); if (!port_counter->hstats) goto fail; } return; fail: for (i = port; i >= rdma_start_port(dev); i--) { port_counter = &dev->port_data[port].port_counter; rdma_free_hw_stats_struct(port_counter->hstats); port_counter->hstats = NULL; mutex_destroy(&port_counter->lock); } } void rdma_counter_release(struct ib_device *dev) { struct rdma_port_counter *port_counter; u32 port; rdma_for_each_port(dev, port) { port_counter = &dev->port_data[port].port_counter; rdma_free_hw_stats_struct(port_counter->hstats); mutex_destroy(&port_counter->lock); } } |
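In auto mode, rdma_get_counter_auto_mode() reuses an existing counter only when auto_mode_match() accepts it, i.e. when every attribute selected by the configured mask (QP type and/or owning PID) is equal. The sketch below isolates that mask-driven comparison; the MASK_* constants and struct attrs are invented stand-ins for RDMA_COUNTER_MASK_* and the QP/counter restrack attributes.

/* Standalone sketch, not kernel code. */
#include <stdbool.h>
#include <stdio.h>

#define MASK_QP_TYPE	(1u << 0)
#define MASK_PID	(1u << 1)

struct attrs {
	int qp_type;
	int pid;
};

static bool auto_match(const struct attrs *qp, const struct attrs *counter,
		       unsigned int mask)
{
	bool match = true;

	/* only the attributes selected by the mask must agree */
	if (mask & MASK_QP_TYPE)
		match &= qp->qp_type == counter->qp_type;
	if (mask & MASK_PID)
		match &= qp->pid == counter->pid;
	return match;
}

int main(void)
{
	struct attrs qp = { .qp_type = 2, .pid = 100 };
	struct attrs ctr = { .qp_type = 2, .pid = 200 };

	/* same QP type, different PID: matches only while PID is unmasked */
	printf("%d %d\n",
	       auto_match(&qp, &ctr, MASK_QP_TYPE),
	       auto_match(&qp, &ctr, MASK_QP_TYPE | MASK_PID));
	return 0;
}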
// SPDX-License-Identifier: GPL-2.0+

#include <linux/dma-fence.h>

#include <drm/drm_atomic.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_managed.h>
#include <drm/drm_probe_helper.h>
#include <drm/drm_vblank.h>

#include "vkms_drv.h"

static enum hrtimer_restart vkms_vblank_simulate(struct hrtimer *timer)
{
	struct vkms_output *output = container_of(timer, struct vkms_output,
						  vblank_hrtimer);
	struct drm_crtc *crtc = &output->crtc;
	struct vkms_crtc_state *state;
	u64 ret_overrun;
	bool ret, fence_cookie;

	fence_cookie = dma_fence_begin_signalling();

	ret_overrun = hrtimer_forward_now(&output->vblank_hrtimer,
					  output->period_ns);
	if (ret_overrun != 1)
		pr_warn("%s: vblank timer overrun\n", __func__);

	spin_lock(&output->lock);
	ret = drm_crtc_handle_vblank(crtc);
	if (!ret)
		DRM_ERROR("vkms failure on handling vblank");

	state = output->composer_state;
	spin_unlock(&output->lock);

	if (state && output->composer_enabled) {
		u64 frame = drm_crtc_accurate_vblank_count(crtc);

		/* update frame_start only if a queued vkms_composer_worker()
		 * has read the data
		 */
		spin_lock(&output->composer_lock);
		if (!state->crc_pending)
			state->frame_start = frame;
		else
			DRM_DEBUG_DRIVER("crc worker falling behind, frame_start: %llu, frame_end: %llu\n",
					 state->frame_start, frame);
		state->frame_end = frame;
		state->crc_pending = true;
		spin_unlock(&output->composer_lock);

		ret = queue_work(output->composer_workq, &state->composer_work);
		if (!ret)
			DRM_DEBUG_DRIVER("Composer worker already queued\n");
	}

	dma_fence_end_signalling(fence_cookie);

	return HRTIMER_RESTART;
}

static int vkms_enable_vblank(struct drm_crtc *crtc)
{
	struct drm_vblank_crtc *vblank = drm_crtc_vblank_crtc(crtc);
	struct vkms_output *out = drm_crtc_to_vkms_output(crtc);

	hrtimer_setup(&out->vblank_hrtimer, &vkms_vblank_simulate,
		      CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	out->period_ns = ktime_set(0, vblank->framedur_ns);
	hrtimer_start(&out->vblank_hrtimer, out->period_ns, HRTIMER_MODE_REL);

	return 0;
}

static void vkms_disable_vblank(struct drm_crtc *crtc)
{
	struct vkms_output *out = drm_crtc_to_vkms_output(crtc);

	hrtimer_cancel(&out->vblank_hrtimer);
}

static bool vkms_get_vblank_timestamp(struct drm_crtc *crtc,
				      int *max_error, ktime_t *vblank_time,
				      bool in_vblank_irq)
{
	struct vkms_output *output = drm_crtc_to_vkms_output(crtc);
	struct drm_vblank_crtc *vblank =
drm_crtc_vblank_crtc(crtc); if (!READ_ONCE(vblank->enabled)) { *vblank_time = ktime_get(); return true; } *vblank_time = READ_ONCE(output->vblank_hrtimer.node.expires); if (WARN_ON(*vblank_time == vblank->time)) return true; /* * To prevent races we roll the hrtimer forward before we do any * interrupt processing - this is how real hw works (the interrupt is * only generated after all the vblank registers are updated) and what * the vblank core expects. Therefore we need to always correct the * timestampe by one frame. */ *vblank_time -= output->period_ns; return true; } static struct drm_crtc_state * vkms_atomic_crtc_duplicate_state(struct drm_crtc *crtc) { struct vkms_crtc_state *vkms_state; if (WARN_ON(!crtc->state)) return NULL; vkms_state = kzalloc(sizeof(*vkms_state), GFP_KERNEL); if (!vkms_state) return NULL; __drm_atomic_helper_crtc_duplicate_state(crtc, &vkms_state->base); INIT_WORK(&vkms_state->composer_work, vkms_composer_worker); return &vkms_state->base; } static void vkms_atomic_crtc_destroy_state(struct drm_crtc *crtc, struct drm_crtc_state *state) { struct vkms_crtc_state *vkms_state = to_vkms_crtc_state(state); __drm_atomic_helper_crtc_destroy_state(state); WARN_ON(work_pending(&vkms_state->composer_work)); kfree(vkms_state->active_planes); kfree(vkms_state); } static void vkms_atomic_crtc_reset(struct drm_crtc *crtc) { struct vkms_crtc_state *vkms_state = kzalloc(sizeof(*vkms_state), GFP_KERNEL); if (crtc->state) vkms_atomic_crtc_destroy_state(crtc, crtc->state); __drm_atomic_helper_crtc_reset(crtc, &vkms_state->base); if (vkms_state) INIT_WORK(&vkms_state->composer_work, vkms_composer_worker); } static const struct drm_crtc_funcs vkms_crtc_funcs = { .set_config = drm_atomic_helper_set_config, .page_flip = drm_atomic_helper_page_flip, .reset = vkms_atomic_crtc_reset, .atomic_duplicate_state = vkms_atomic_crtc_duplicate_state, .atomic_destroy_state = vkms_atomic_crtc_destroy_state, .enable_vblank = vkms_enable_vblank, .disable_vblank = vkms_disable_vblank, .get_vblank_timestamp = vkms_get_vblank_timestamp, .get_crc_sources = vkms_get_crc_sources, .set_crc_source = vkms_set_crc_source, .verify_crc_source = vkms_verify_crc_source, }; static int vkms_crtc_atomic_check(struct drm_crtc *crtc, struct drm_atomic_state *state) { struct drm_crtc_state *crtc_state = drm_atomic_get_new_crtc_state(state, crtc); struct vkms_crtc_state *vkms_state = to_vkms_crtc_state(crtc_state); struct drm_plane *plane; struct drm_plane_state *plane_state; int i = 0, ret; if (vkms_state->active_planes) return 0; ret = drm_atomic_add_affected_planes(crtc_state->state, crtc); if (ret < 0) return ret; drm_for_each_plane_mask(plane, crtc->dev, crtc_state->plane_mask) { plane_state = drm_atomic_get_existing_plane_state(crtc_state->state, plane); WARN_ON(!plane_state); if (!plane_state->visible) continue; i++; } vkms_state->active_planes = kcalloc(i, sizeof(*vkms_state->active_planes), GFP_KERNEL); if (!vkms_state->active_planes) return -ENOMEM; vkms_state->num_active_planes = i; i = 0; drm_for_each_plane_mask(plane, crtc->dev, crtc_state->plane_mask) { plane_state = drm_atomic_get_existing_plane_state(crtc_state->state, plane); if (!plane_state->visible) continue; vkms_state->active_planes[i++] = to_vkms_plane_state(plane_state); } return 0; } static void vkms_crtc_atomic_enable(struct drm_crtc *crtc, struct drm_atomic_state *state) { drm_crtc_vblank_on(crtc); } static void vkms_crtc_atomic_disable(struct drm_crtc *crtc, struct drm_atomic_state *state) { drm_crtc_vblank_off(crtc); } static void 
vkms_crtc_atomic_begin(struct drm_crtc *crtc, struct drm_atomic_state *state) __acquires(&vkms_output->lock) { struct vkms_output *vkms_output = drm_crtc_to_vkms_output(crtc); /* This lock is held across the atomic commit to block vblank timer * from scheduling vkms_composer_worker until the composer is updated */ spin_lock_irq(&vkms_output->lock); } static void vkms_crtc_atomic_flush(struct drm_crtc *crtc, struct drm_atomic_state *state) __releases(&vkms_output->lock) { struct vkms_output *vkms_output = drm_crtc_to_vkms_output(crtc); if (crtc->state->event) { spin_lock(&crtc->dev->event_lock); if (drm_crtc_vblank_get(crtc) != 0) drm_crtc_send_vblank_event(crtc, crtc->state->event); else drm_crtc_arm_vblank_event(crtc, crtc->state->event); spin_unlock(&crtc->dev->event_lock); crtc->state->event = NULL; } vkms_output->composer_state = to_vkms_crtc_state(crtc->state); spin_unlock_irq(&vkms_output->lock); } static const struct drm_crtc_helper_funcs vkms_crtc_helper_funcs = { .atomic_check = vkms_crtc_atomic_check, .atomic_begin = vkms_crtc_atomic_begin, .atomic_flush = vkms_crtc_atomic_flush, .atomic_enable = vkms_crtc_atomic_enable, .atomic_disable = vkms_crtc_atomic_disable, }; struct vkms_output *vkms_crtc_init(struct drm_device *dev, struct drm_plane *primary, struct drm_plane *cursor) { struct vkms_output *vkms_out; struct drm_crtc *crtc; int ret; vkms_out = drmm_crtc_alloc_with_planes(dev, struct vkms_output, crtc, primary, cursor, &vkms_crtc_funcs, NULL); if (IS_ERR(vkms_out)) { DRM_DEV_ERROR(dev->dev, "Failed to init CRTC\n"); return vkms_out; } crtc = &vkms_out->crtc; drm_crtc_helper_add(crtc, &vkms_crtc_helper_funcs); ret = drm_mode_crtc_set_gamma_size(crtc, VKMS_LUT_SIZE); if (ret) { DRM_ERROR("Failed to set gamma size\n"); return ERR_PTR(ret); } drm_crtc_enable_color_mgmt(crtc, 0, false, VKMS_LUT_SIZE); spin_lock_init(&vkms_out->lock); spin_lock_init(&vkms_out->composer_lock); vkms_out->composer_workq = drmm_alloc_ordered_workqueue(dev, "vkms_composer", 0); if (IS_ERR(vkms_out->composer_workq)) return ERR_CAST(vkms_out->composer_workq); return vkms_out; } |
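Because vkms_vblank_simulate() forwards the hrtimer before doing any vblank processing, the timer's stored expiry is always one frame in the future, which is why vkms_get_vblank_timestamp() reports expiry minus one period. A trivial sketch of that correction, using a plain 64-bit nanosecond count in place of ktime_t (the names here are illustrative only):

/* Standalone sketch, not kernel code. */
#include <stdint.h>
#include <stdio.h>

typedef int64_t ktime_ns;

/* The hrtimer has already been forwarded past "now", so the vblank that
 * just fired happened one full period before the next expiry.
 */
static ktime_ns vblank_timestamp(ktime_ns next_expiry, ktime_ns period_ns)
{
	return next_expiry - period_ns;
}

int main(void)
{
	ktime_ns period = 16666667;	/* ~60 Hz frame duration, in ns */
	ktime_ns expiry = 1000000000;	/* timer already rolled forward */

	printf("vblank at %lld ns\n",
	       (long long)vblank_timestamp(expiry, period));
	return 0;
}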
// SPDX-License-Identifier: GPL-2.0-or-later
/* drivers/net/ifb.c:

	The purpose of this driver is to provide a device that allows
	for sharing of resources:

	1) qdiscs/policies that are per device as opposed to system wide.
	ifb allows for a device which can be redirected to thus providing
	an impression of sharing.

	2) Allows for queueing incoming traffic for shaping instead of
	dropping.

	The original concept is based on what is known as the IMQ
	driver initially written by Martin Devera, later rewritten
	by Patrick McHardy and then maintained by Andre Correa.

	You need the tc action mirror or redirect to feed this device
	packets.

	Authors:	Jamal Hadi Salim (2005)
*/

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/ethtool.h>
#include <linux/etherdevice.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/moduleparam.h>
#include <linux/netfilter_netdev.h>
#include <net/pkt_sched.h>
#include <net/net_namespace.h>

#define TX_Q_LIMIT 32

struct ifb_q_stats {
	u64 packets;
	u64 bytes;
	struct u64_stats_sync sync;
};

struct ifb_q_private {
	struct net_device *dev;
	struct tasklet_struct ifb_tasklet;
	int tasklet_pending;
	int txqnum;
	struct sk_buff_head rq;
	struct sk_buff_head tq;
	struct ifb_q_stats rx_stats;
	struct ifb_q_stats tx_stats;
} ____cacheline_aligned_in_smp;

struct ifb_dev_private {
	struct ifb_q_private *tx_private;
};

/* For ethtool stats.
*/ struct ifb_q_stats_desc { char desc[ETH_GSTRING_LEN]; size_t offset; }; #define IFB_Q_STAT(m) offsetof(struct ifb_q_stats, m) static const struct ifb_q_stats_desc ifb_q_stats_desc[] = { { "packets", IFB_Q_STAT(packets) }, { "bytes", IFB_Q_STAT(bytes) }, }; #define IFB_Q_STATS_LEN ARRAY_SIZE(ifb_q_stats_desc) static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev); static int ifb_open(struct net_device *dev); static int ifb_close(struct net_device *dev); static void ifb_update_q_stats(struct ifb_q_stats *stats, int len) { u64_stats_update_begin(&stats->sync); stats->packets++; stats->bytes += len; u64_stats_update_end(&stats->sync); } static void ifb_ri_tasklet(struct tasklet_struct *t) { struct ifb_q_private *txp = from_tasklet(txp, t, ifb_tasklet); struct netdev_queue *txq; struct sk_buff *skb; txq = netdev_get_tx_queue(txp->dev, txp->txqnum); skb = skb_peek(&txp->tq); if (!skb) { if (!__netif_tx_trylock(txq)) goto resched; skb_queue_splice_tail_init(&txp->rq, &txp->tq); __netif_tx_unlock(txq); } while ((skb = __skb_dequeue(&txp->tq)) != NULL) { /* Skip tc and netfilter to prevent redirection loop. */ skb->redirected = 0; #ifdef CONFIG_NET_CLS_ACT skb->tc_skip_classify = 1; #endif nf_skip_egress(skb, true); ifb_update_q_stats(&txp->tx_stats, skb->len); rcu_read_lock(); skb->dev = dev_get_by_index_rcu(dev_net(txp->dev), skb->skb_iif); if (!skb->dev) { rcu_read_unlock(); dev_kfree_skb(skb); txp->dev->stats.tx_dropped++; if (skb_queue_len(&txp->tq) != 0) goto resched; break; } rcu_read_unlock(); skb->skb_iif = txp->dev->ifindex; if (!skb->from_ingress) { dev_queue_xmit(skb); } else { skb_pull_rcsum(skb, skb->mac_len); netif_receive_skb(skb); } } if (__netif_tx_trylock(txq)) { skb = skb_peek(&txp->rq); if (!skb) { txp->tasklet_pending = 0; if (netif_tx_queue_stopped(txq)) netif_tx_wake_queue(txq); } else { __netif_tx_unlock(txq); goto resched; } __netif_tx_unlock(txq); } else { resched: txp->tasklet_pending = 1; tasklet_schedule(&txp->ifb_tasklet); } } static void ifb_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct ifb_dev_private *dp = netdev_priv(dev); struct ifb_q_private *txp = dp->tx_private; unsigned int start; u64 packets, bytes; int i; for (i = 0; i < dev->num_tx_queues; i++,txp++) { do { start = u64_stats_fetch_begin(&txp->rx_stats.sync); packets = txp->rx_stats.packets; bytes = txp->rx_stats.bytes; } while (u64_stats_fetch_retry(&txp->rx_stats.sync, start)); stats->rx_packets += packets; stats->rx_bytes += bytes; do { start = u64_stats_fetch_begin(&txp->tx_stats.sync); packets = txp->tx_stats.packets; bytes = txp->tx_stats.bytes; } while (u64_stats_fetch_retry(&txp->tx_stats.sync, start)); stats->tx_packets += packets; stats->tx_bytes += bytes; } stats->rx_dropped = dev->stats.rx_dropped; stats->tx_dropped = dev->stats.tx_dropped; } static int ifb_dev_init(struct net_device *dev) { struct ifb_dev_private *dp = netdev_priv(dev); struct ifb_q_private *txp; int i; txp = kcalloc(dev->num_tx_queues, sizeof(*txp), GFP_KERNEL); if (!txp) return -ENOMEM; dp->tx_private = txp; for (i = 0; i < dev->num_tx_queues; i++,txp++) { txp->txqnum = i; txp->dev = dev; __skb_queue_head_init(&txp->rq); __skb_queue_head_init(&txp->tq); u64_stats_init(&txp->rx_stats.sync); u64_stats_init(&txp->tx_stats.sync); tasklet_setup(&txp->ifb_tasklet, ifb_ri_tasklet); netif_tx_start_queue(netdev_get_tx_queue(dev, i)); } return 0; } static void ifb_get_strings(struct net_device *dev, u32 stringset, u8 *buf) { u8 *p = buf; int i, j; switch (stringset) { case ETH_SS_STATS: 
for (i = 0; i < dev->real_num_rx_queues; i++) for (j = 0; j < IFB_Q_STATS_LEN; j++) ethtool_sprintf(&p, "rx_queue_%u_%.18s", i, ifb_q_stats_desc[j].desc); for (i = 0; i < dev->real_num_tx_queues; i++) for (j = 0; j < IFB_Q_STATS_LEN; j++) ethtool_sprintf(&p, "tx_queue_%u_%.18s", i, ifb_q_stats_desc[j].desc); break; } } static int ifb_get_sset_count(struct net_device *dev, int sset) { switch (sset) { case ETH_SS_STATS: return IFB_Q_STATS_LEN * (dev->real_num_rx_queues + dev->real_num_tx_queues); default: return -EOPNOTSUPP; } } static void ifb_fill_stats_data(u64 **data, struct ifb_q_stats *q_stats) { void *stats_base = (void *)q_stats; unsigned int start; size_t offset; int j; do { start = u64_stats_fetch_begin(&q_stats->sync); for (j = 0; j < IFB_Q_STATS_LEN; j++) { offset = ifb_q_stats_desc[j].offset; (*data)[j] = *(u64 *)(stats_base + offset); } } while (u64_stats_fetch_retry(&q_stats->sync, start)); *data += IFB_Q_STATS_LEN; } static void ifb_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { struct ifb_dev_private *dp = netdev_priv(dev); struct ifb_q_private *txp; int i; for (i = 0; i < dev->real_num_rx_queues; i++) { txp = dp->tx_private + i; ifb_fill_stats_data(&data, &txp->rx_stats); } for (i = 0; i < dev->real_num_tx_queues; i++) { txp = dp->tx_private + i; ifb_fill_stats_data(&data, &txp->tx_stats); } } static const struct net_device_ops ifb_netdev_ops = { .ndo_open = ifb_open, .ndo_stop = ifb_close, .ndo_get_stats64 = ifb_stats64, .ndo_start_xmit = ifb_xmit, .ndo_validate_addr = eth_validate_addr, .ndo_init = ifb_dev_init, }; static const struct ethtool_ops ifb_ethtool_ops = { .get_strings = ifb_get_strings, .get_sset_count = ifb_get_sset_count, .get_ethtool_stats = ifb_get_ethtool_stats, }; #define IFB_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | NETIF_F_FRAGLIST | \ NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \ NETIF_F_HIGHDMA | NETIF_F_HW_VLAN_CTAG_TX | \ NETIF_F_HW_VLAN_STAG_TX) static void ifb_dev_free(struct net_device *dev) { struct ifb_dev_private *dp = netdev_priv(dev); struct ifb_q_private *txp = dp->tx_private; int i; for (i = 0; i < dev->num_tx_queues; i++,txp++) { tasklet_kill(&txp->ifb_tasklet); __skb_queue_purge(&txp->rq); __skb_queue_purge(&txp->tq); } kfree(dp->tx_private); } static void ifb_setup(struct net_device *dev) { /* Initialize the device structure. */ dev->netdev_ops = &ifb_netdev_ops; dev->ethtool_ops = &ifb_ethtool_ops; /* Fill in device structure with ethernet-generic values. 
*/ ether_setup(dev); dev->tx_queue_len = TX_Q_LIMIT; dev->features |= IFB_FEATURES; dev->hw_features |= dev->features; dev->hw_enc_features |= dev->features; dev->vlan_features |= IFB_FEATURES & ~(NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); dev->flags |= IFF_NOARP; dev->flags &= ~IFF_MULTICAST; dev->priv_flags &= ~IFF_TX_SKB_SHARING; netif_keep_dst(dev); eth_hw_addr_random(dev); dev->needs_free_netdev = true; dev->priv_destructor = ifb_dev_free; dev->min_mtu = 0; dev->max_mtu = 0; netif_set_tso_max_size(dev, GSO_MAX_SIZE); } static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev) { struct ifb_dev_private *dp = netdev_priv(dev); struct ifb_q_private *txp = dp->tx_private + skb_get_queue_mapping(skb); ifb_update_q_stats(&txp->rx_stats, skb->len); if (!skb->redirected || !skb->skb_iif) { dev_kfree_skb(skb); dev->stats.rx_dropped++; return NETDEV_TX_OK; } if (skb_queue_len(&txp->rq) >= dev->tx_queue_len) netif_tx_stop_queue(netdev_get_tx_queue(dev, txp->txqnum)); __skb_queue_tail(&txp->rq, skb); if (!txp->tasklet_pending) { txp->tasklet_pending = 1; tasklet_schedule(&txp->ifb_tasklet); } return NETDEV_TX_OK; } static int ifb_close(struct net_device *dev) { netif_tx_stop_all_queues(dev); return 0; } static int ifb_open(struct net_device *dev) { netif_tx_start_all_queues(dev); return 0; } static int ifb_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) return -EADDRNOTAVAIL; } return 0; } static struct rtnl_link_ops ifb_link_ops __read_mostly = { .kind = "ifb", .priv_size = sizeof(struct ifb_dev_private), .setup = ifb_setup, .validate = ifb_validate, }; /* Number of ifb devices to be set up by this module. * Note that these legacy devices have one queue. * Prefer something like : ip link add ifb10 numtxqueues 8 type ifb */ static int numifbs = 2; module_param(numifbs, int, 0); MODULE_PARM_DESC(numifbs, "Number of ifb devices"); static int __init ifb_init_one(int index) { struct net_device *dev_ifb; int err; dev_ifb = alloc_netdev(sizeof(struct ifb_dev_private), "ifb%d", NET_NAME_UNKNOWN, ifb_setup); if (!dev_ifb) return -ENOMEM; dev_ifb->rtnl_link_ops = &ifb_link_ops; err = register_netdevice(dev_ifb); if (err < 0) goto err; return 0; err: free_netdev(dev_ifb); return err; } static int __init ifb_init_module(void) { int i, err; err = rtnl_link_register(&ifb_link_ops); if (err < 0) return err; rtnl_net_lock(&init_net); for (i = 0; i < numifbs && !err; i++) { err = ifb_init_one(i); cond_resched(); } rtnl_net_unlock(&init_net); if (err) rtnl_link_unregister(&ifb_link_ops); return err; } static void __exit ifb_cleanup_module(void) { rtnl_link_unregister(&ifb_link_ops); } module_init(ifb_init_module); module_exit(ifb_cleanup_module); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Intermediate Functional Block (ifb) netdevice driver for sharing of resources and ingress packet queuing"); MODULE_AUTHOR("Jamal Hadi Salim"); MODULE_ALIAS_RTNL_LINK("ifb"); |
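ifb keeps its per-queue packet/byte counters under u64_stats_sync, which on 32-bit SMP kernels degenerates to a seqcount: the writer bumps a sequence number around each update, and readers in ifb_stats64()/ifb_fill_stats_data() retry until they observe a stable, even sequence. The sketch below is a single-threaded user-space model of that protocol under the assumption that the reader and writer run concurrently in the real case; the memory barriers a real kernel implementation needs are deliberately omitted.

/* Standalone sketch, not kernel code. */
#include <stdint.h>
#include <stdio.h>

struct q_stats {
	uint64_t packets;
	uint64_t bytes;
	unsigned int seq;	/* odd while a write is in progress */
};

static void stats_update(struct q_stats *s, uint64_t len)
{
	s->seq++;		/* begin write: sequence becomes odd  */
	s->packets++;
	s->bytes += len;
	s->seq++;		/* end write: sequence is even again  */
}

static void stats_read(const struct q_stats *s,
		       uint64_t *packets, uint64_t *bytes)
{
	unsigned int start;

	do {
		start = s->seq;
		*packets = s->packets;
		*bytes = s->bytes;
		/* retry if a writer was active or completed meanwhile */
	} while ((start & 1) || start != s->seq);
}

int main(void)
{
	struct q_stats s = { 0, 0, 0 };
	uint64_t p, b;

	stats_update(&s, 1500);
	stats_update(&s, 60);
	stats_read(&s, &p, &b);
	printf("packets=%llu bytes=%llu\n",
	       (unsigned long long)p, (unsigned long long)b);
	return 0;
}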
// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2019, Tessares SA.
*/ #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <net/net_namespace.h> #include <net/netns/generic.h> #include "protocol.h" #include "mib.h" #define MPTCP_SYSCTL_PATH "net/mptcp" static int mptcp_pernet_id; #ifdef CONFIG_SYSCTL static int mptcp_pm_type_max = __MPTCP_PM_TYPE_MAX; #endif struct mptcp_pernet { #ifdef CONFIG_SYSCTL struct ctl_table_header *ctl_table_hdr; #endif unsigned int add_addr_timeout; unsigned int blackhole_timeout; unsigned int close_timeout; unsigned int stale_loss_cnt; atomic_t active_disable_times; u8 syn_retrans_before_tcp_fallback; unsigned long active_disable_stamp; u8 mptcp_enabled; u8 checksum_enabled; u8 allow_join_initial_addr_port; u8 pm_type; char scheduler[MPTCP_SCHED_NAME_MAX]; char path_manager[MPTCP_PM_NAME_MAX]; }; static struct mptcp_pernet *mptcp_get_pernet(const struct net *net) { return net_generic(net, mptcp_pernet_id); } int mptcp_is_enabled(const struct net *net) { return mptcp_get_pernet(net)->mptcp_enabled; } unsigned int mptcp_get_add_addr_timeout(const struct net *net) { return mptcp_get_pernet(net)->add_addr_timeout; } int mptcp_is_checksum_enabled(const struct net *net) { return mptcp_get_pernet(net)->checksum_enabled; } int mptcp_allow_join_id0(const struct net *net) { return mptcp_get_pernet(net)->allow_join_initial_addr_port; } unsigned int mptcp_stale_loss_cnt(const struct net *net) { return mptcp_get_pernet(net)->stale_loss_cnt; } unsigned int mptcp_close_timeout(const struct sock *sk) { if (sock_flag(sk, SOCK_DEAD)) return TCP_TIMEWAIT_LEN; return mptcp_get_pernet(sock_net(sk))->close_timeout; } int mptcp_get_pm_type(const struct net *net) { return mptcp_get_pernet(net)->pm_type; } const char *mptcp_get_path_manager(const struct net *net) { return mptcp_get_pernet(net)->path_manager; } const char *mptcp_get_scheduler(const struct net *net) { return mptcp_get_pernet(net)->scheduler; } static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) { pernet->mptcp_enabled = 1; pernet->add_addr_timeout = TCP_RTO_MAX; pernet->blackhole_timeout = 3600; pernet->syn_retrans_before_tcp_fallback = 2; atomic_set(&pernet->active_disable_times, 0); pernet->close_timeout = TCP_TIMEWAIT_LEN; pernet->checksum_enabled = 0; pernet->allow_join_initial_addr_port = 1; pernet->stale_loss_cnt = 4; pernet->pm_type = MPTCP_PM_TYPE_KERNEL; strscpy(pernet->scheduler, "default", sizeof(pernet->scheduler)); strscpy(pernet->path_manager, "kernel", sizeof(pernet->path_manager)); } #ifdef CONFIG_SYSCTL static int mptcp_set_scheduler(char *scheduler, const char *name) { struct mptcp_sched_ops *sched; int ret = 0; rcu_read_lock(); sched = mptcp_sched_find(name); if (sched) strscpy(scheduler, name, MPTCP_SCHED_NAME_MAX); else ret = -ENOENT; rcu_read_unlock(); return ret; } static int proc_scheduler(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { char (*scheduler)[MPTCP_SCHED_NAME_MAX] = ctl->data; char val[MPTCP_SCHED_NAME_MAX]; struct ctl_table tbl = { .data = val, .maxlen = MPTCP_SCHED_NAME_MAX, }; int ret; strscpy(val, *scheduler, MPTCP_SCHED_NAME_MAX); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) ret = mptcp_set_scheduler(*scheduler, val); return ret; } static int proc_available_schedulers(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = MPTCP_SCHED_BUF_MAX, }; int ret; tbl.data = kmalloc(tbl.maxlen, GFP_USER); if (!tbl.data) return -ENOMEM; mptcp_get_available_schedulers(tbl.data, MPTCP_SCHED_BUF_MAX); ret 
= proc_dostring(&tbl, write, buffer, lenp, ppos); kfree(tbl.data); return ret; } static int proc_blackhole_detect_timeout(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct mptcp_pernet *pernet = container_of(table->data, struct mptcp_pernet, blackhole_timeout); int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) atomic_set(&pernet->active_disable_times, 0); return ret; } static int mptcp_set_path_manager(char *path_manager, const char *name) { struct mptcp_pm_ops *pm_ops; int ret = 0; rcu_read_lock(); pm_ops = mptcp_pm_find(name); if (pm_ops) strscpy(path_manager, name, MPTCP_PM_NAME_MAX); else ret = -ENOENT; rcu_read_unlock(); return ret; } static int proc_path_manager(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct mptcp_pernet *pernet = container_of(ctl->data, struct mptcp_pernet, path_manager); char (*path_manager)[MPTCP_PM_NAME_MAX] = ctl->data; char pm_name[MPTCP_PM_NAME_MAX]; const struct ctl_table tbl = { .data = pm_name, .maxlen = MPTCP_PM_NAME_MAX, }; int ret; strscpy(pm_name, *path_manager, MPTCP_PM_NAME_MAX); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) { ret = mptcp_set_path_manager(*path_manager, pm_name); if (ret == 0) { u8 pm_type = __MPTCP_PM_TYPE_NR; if (strncmp(pm_name, "kernel", MPTCP_PM_NAME_MAX) == 0) pm_type = MPTCP_PM_TYPE_KERNEL; else if (strncmp(pm_name, "userspace", MPTCP_PM_NAME_MAX) == 0) pm_type = MPTCP_PM_TYPE_USERSPACE; pernet->pm_type = pm_type; } } return ret; } static int proc_pm_type(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct mptcp_pernet *pernet = container_of(ctl->data, struct mptcp_pernet, pm_type); int ret; ret = proc_dou8vec_minmax(ctl, write, buffer, lenp, ppos); if (write && ret == 0) { u8 pm_type = READ_ONCE(*(u8 *)ctl->data); char *pm_name = ""; if (pm_type == MPTCP_PM_TYPE_KERNEL) pm_name = "kernel"; else if (pm_type == MPTCP_PM_TYPE_USERSPACE) pm_name = "userspace"; mptcp_set_path_manager(pernet->path_manager, pm_name); } return ret; } static int proc_available_path_managers(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = MPTCP_PM_BUF_MAX, }; int ret; tbl.data = kmalloc(tbl.maxlen, GFP_USER); if (!tbl.data) return -ENOMEM; mptcp_pm_get_available(tbl.data, MPTCP_PM_BUF_MAX); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); kfree(tbl.data); return ret; } static struct ctl_table mptcp_sysctl_table[] = { { .procname = "enabled", .maxlen = sizeof(u8), .mode = 0644, /* users with CAP_NET_ADMIN or root (not and) can change this * value, same as other sysctl or the 'net' tree. 
*/ .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "add_addr_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "checksum_enabled", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "allow_join_initial_addr_port", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "stale_loss_cnt", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_douintvec_minmax, }, { .procname = "pm_type", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_pm_type, .extra1 = SYSCTL_ZERO, .extra2 = &mptcp_pm_type_max }, { .procname = "scheduler", .maxlen = MPTCP_SCHED_NAME_MAX, .mode = 0644, .proc_handler = proc_scheduler, }, { .procname = "available_schedulers", .maxlen = MPTCP_SCHED_BUF_MAX, .mode = 0444, .proc_handler = proc_available_schedulers, }, { .procname = "close_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "blackhole_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_blackhole_detect_timeout, .extra1 = SYSCTL_ZERO, }, { .procname = "syn_retrans_before_tcp_fallback", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "path_manager", .maxlen = MPTCP_PM_NAME_MAX, .mode = 0644, .proc_handler = proc_path_manager, }, { .procname = "available_path_managers", .maxlen = MPTCP_PM_BUF_MAX, .mode = 0444, .proc_handler = proc_available_path_managers, }, }; static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) { struct ctl_table_header *hdr; struct ctl_table *table; table = mptcp_sysctl_table; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(mptcp_sysctl_table), GFP_KERNEL); if (!table) goto err_alloc; } table[0].data = &pernet->mptcp_enabled; table[1].data = &pernet->add_addr_timeout; table[2].data = &pernet->checksum_enabled; table[3].data = &pernet->allow_join_initial_addr_port; table[4].data = &pernet->stale_loss_cnt; table[5].data = &pernet->pm_type; table[6].data = &pernet->scheduler; /* table[7] is for available_schedulers which is read-only info */ table[8].data = &pernet->close_timeout; table[9].data = &pernet->blackhole_timeout; table[10].data = &pernet->syn_retrans_before_tcp_fallback; table[11].data = &pernet->path_manager; /* table[12] is for available_path_managers which is read-only info */ hdr = register_net_sysctl_sz(net, MPTCP_SYSCTL_PATH, table, ARRAY_SIZE(mptcp_sysctl_table)); if (!hdr) goto err_reg; pernet->ctl_table_hdr = hdr; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) { const struct ctl_table *table = pernet->ctl_table_hdr->ctl_table_arg; unregister_net_sysctl_table(pernet->ctl_table_hdr); kfree(table); } #else static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) { return 0; } static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) {} #endif /* CONFIG_SYSCTL */ /* The following code block is to deal with middle box issues with MPTCP, * similar to what is done with TFO. * The proposed solution is to disable active MPTCP globally when SYN+MPC are * dropped, while SYN without MPC aren't. In this case, active side MPTCP is * disabled globally for 1hr at first. 
Then if it happens again, it is disabled * for 2h, then 4h, 8h, ... * The timeout is reset back to 1hr when a successful active MPTCP connection is * fully established. */ /* Disable active MPTCP and record current jiffies and active_disable_times */ void mptcp_active_disable(struct sock *sk) { struct net *net = sock_net(sk); struct mptcp_pernet *pernet; pernet = mptcp_get_pernet(net); if (!READ_ONCE(pernet->blackhole_timeout)) return; /* Paired with READ_ONCE() in mptcp_active_should_disable() */ WRITE_ONCE(pernet->active_disable_stamp, jiffies); /* Paired with smp_rmb() in mptcp_active_should_disable(). * We want pernet->active_disable_stamp to be updated first. */ smp_mb__before_atomic(); atomic_inc(&pernet->active_disable_times); MPTCP_INC_STATS(net, MPTCP_MIB_BLACKHOLE); } /* Calculate timeout for MPTCP active disable * Return true if we are still in the active MPTCP disable period * Return false if timeout already expired and we should use active MPTCP */ bool mptcp_active_should_disable(struct sock *ssk) { struct net *net = sock_net(ssk); unsigned int blackhole_timeout; struct mptcp_pernet *pernet; unsigned long timeout; int disable_times; int multiplier; pernet = mptcp_get_pernet(net); blackhole_timeout = READ_ONCE(pernet->blackhole_timeout); if (!blackhole_timeout) return false; disable_times = atomic_read(&pernet->active_disable_times); if (!disable_times) return false; /* Paired with smp_mb__before_atomic() in mptcp_active_disable() */ smp_rmb(); /* Limit timeout to max: 2^6 * initial timeout */ multiplier = 1 << min(disable_times - 1, 6); /* Paired with the WRITE_ONCE() in mptcp_active_disable(). */ timeout = READ_ONCE(pernet->active_disable_stamp) + multiplier * blackhole_timeout * HZ; return time_before(jiffies, timeout); } /* Enable active MPTCP and reset active_disable_times if needed */ void mptcp_active_enable(struct sock *sk) { struct mptcp_pernet *pernet = mptcp_get_pernet(sock_net(sk)); if (atomic_read(&pernet->active_disable_times)) { struct dst_entry *dst = sk_dst_get(sk); if (dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK)) atomic_set(&pernet->active_disable_times, 0); } } /* Check the number of retransmissions, and fallback to TCP if needed */ void mptcp_active_detect_blackhole(struct sock *ssk, bool expired) { struct mptcp_subflow_context *subflow; u8 timeouts, to_max; struct net *net; /* Only check MPTCP SYN ... */ if (likely(!sk_is_mptcp(ssk) || ssk->sk_state != TCP_SYN_SENT)) return; subflow = mptcp_subflow_ctx(ssk); /* ... 
	   + MP_CAPABLE */
	if (!subflow->request_mptcp) {
		/* Mark as blackhole iff the 1st non-MPTCP SYN is accepted */
		subflow->mpc_drop = 0;
		return;
	}

	net = sock_net(ssk);
	timeouts = inet_csk(ssk)->icsk_retransmits;
	to_max = mptcp_get_pernet(net)->syn_retrans_before_tcp_fallback;

	if (timeouts == to_max || (timeouts < to_max && expired)) {
		subflow->mpc_drop = 1;
		mptcp_early_fallback(mptcp_sk(subflow->conn), subflow,
				     MPTCP_MIB_MPCAPABLEACTIVEDROP);
	}
}

static int __net_init mptcp_net_init(struct net *net)
{
	struct mptcp_pernet *pernet = mptcp_get_pernet(net);

	mptcp_pernet_set_defaults(pernet);

	return mptcp_pernet_new_table(net, pernet);
}

/* Note: the callback will only be called for each extra netns */
static void __net_exit mptcp_net_exit(struct net *net)
{
	struct mptcp_pernet *pernet = mptcp_get_pernet(net);

	mptcp_pernet_del_table(pernet);
}

static struct pernet_operations mptcp_pernet_ops = {
	.init = mptcp_net_init,
	.exit = mptcp_net_exit,
	.id = &mptcp_pernet_id,
	.size = sizeof(struct mptcp_pernet),
};

void __init mptcp_init(void)
{
	mptcp_join_cookie_init();
	mptcp_proto_init();

	if (register_pernet_subsys(&mptcp_pernet_ops) < 0)
		panic("Failed to register MPTCP pernet subsystem.\n");
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
int __init mptcpv6_init(void)
{
	int err;

	err = mptcp_proto_v6_init();

	return err;
}
#endif
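/*
 * Editor's sketch (not part of the kernel sources): a minimal userspace
 * illustration of the exponential backoff schedule described in the
 * blackhole comment above. The name backoff_seconds() is hypothetical;
 * only the 'min(disable_times - 1, 6)' clamp mirrors
 * mptcp_active_should_disable(). With the default blackhole_timeout of
 * 3600s this yields 1h, 2h, 4h, ... capped at 64h.
 */
#include <stdio.h>

static unsigned long backoff_seconds(int disable_times,
				     unsigned int blackhole_timeout)
{
	int shift = disable_times - 1;

	if (shift > 6)			/* cap at 2^6 * initial timeout */
		shift = 6;

	return (1UL << shift) * (unsigned long)blackhole_timeout;
}

int main(void)
{
	for (int n = 1; n <= 8; n++)
		printf("disable #%d -> %lus\n", n, backoff_seconds(n, 3600));

	return 0;
}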
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * include/linux/backing-dev.h
 *
 * low-level device information and state which is propagated up through
 * to high-level code.
 */
#ifndef _LINUX_BACKING_DEV_H
#define _LINUX_BACKING_DEV_H

#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/device.h>
#include <linux/writeback.h>
#include <linux/backing-dev-defs.h>
#include <linux/slab.h>

static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi)
{
	kref_get(&bdi->refcnt);
	return bdi;
}

struct backing_dev_info *bdi_get_by_id(u64 id);
void bdi_put(struct backing_dev_info *bdi);

__printf(2, 3)
int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...);
__printf(2, 0)
int bdi_register_va(struct backing_dev_info *bdi, const char *fmt,
		    va_list args);
void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner);
void bdi_unregister(struct backing_dev_info *bdi);

struct backing_dev_info *bdi_alloc(int node_id);

void wb_start_background_writeback(struct bdi_writeback *wb);
void wb_workfn(struct work_struct *work);

void wb_wait_for_completion(struct wb_completion *done);

extern spinlock_t bdi_lock;
extern struct list_head bdi_list;

extern struct workqueue_struct *bdi_wq;

static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
{
	return test_bit(WB_has_dirty_io, &wb->state);
}

static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi)
{
	/*
	 * @bdi->tot_write_bandwidth is guaranteed to be > 0 if there are
	 * any dirty wbs. See wb_update_write_bandwidth().
*/ return atomic_long_read(&bdi->tot_write_bandwidth); } static inline void wb_stat_mod(struct bdi_writeback *wb, enum wb_stat_item item, s64 amount) { percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH); } static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { wb_stat_mod(wb, item, 1); } static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { wb_stat_mod(wb, item, -1); } static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { return percpu_counter_read_positive(&wb->stat[item]); } static inline s64 wb_stat_sum(struct bdi_writeback *wb, enum wb_stat_item item) { return percpu_counter_sum_positive(&wb->stat[item]); } extern void wb_writeout_inc(struct bdi_writeback *wb); /* * maximal error of a stat counter. */ static inline unsigned long wb_stat_error(void) { #ifdef CONFIG_SMP return nr_cpu_ids * WB_STAT_BATCH; #else return 1; #endif } /* BDI ratio is expressed as part per 1000000 for finer granularity. */ #define BDI_RATIO_SCALE 10000 u64 bdi_get_min_bytes(struct backing_dev_info *bdi); u64 bdi_get_max_bytes(struct backing_dev_info *bdi); int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio); int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes); int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes); int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); /* * Flags in backing_dev_info::capability * * BDI_CAP_WRITEBACK: Supports dirty page writeback, and dirty pages * should contribute to accounting * BDI_CAP_WRITEBACK_ACCT: Automatically account writeback pages * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold */ #define BDI_CAP_WRITEBACK (1 << 0) #define BDI_CAP_WRITEBACK_ACCT (1 << 1) #define BDI_CAP_STRICTLIMIT (1 << 2) extern struct backing_dev_info noop_backing_dev_info; int bdi_init(struct backing_dev_info *bdi); /** * writeback_in_progress - determine whether there is writeback in progress * @wb: bdi_writeback of interest * * Determine whether there is writeback waiting to be handled against a * bdi_writeback. */ static inline bool writeback_in_progress(struct bdi_writeback *wb) { return test_bit(WB_writeback_running, &wb->state); } struct backing_dev_info *inode_to_bdi(struct inode *inode); static inline bool mapping_can_writeback(struct address_space *mapping) { return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK; } #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css); struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp); void wb_memcg_offline(struct mem_cgroup *memcg); void wb_blkcg_offline(struct cgroup_subsys_state *css); /** * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode * @inode: inode of interest * * Cgroup writeback requires support from the filesystem. Also, both memcg and * iocg have to be on the default hierarchy. Test whether all conditions are * met. * * Note that the test result may change dynamically on the same inode * depending on how memcg and iocg are configured. 
 */
static inline bool inode_cgwb_enabled(struct inode *inode)
{
	struct backing_dev_info *bdi = inode_to_bdi(inode);

	return cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
		cgroup_subsys_on_dfl(io_cgrp_subsys) &&
		(bdi->capabilities & BDI_CAP_WRITEBACK) &&
		(inode->i_sb->s_iflags & SB_I_CGROUPWB);
}

/**
 * wb_find_current - find wb for %current on a bdi
 * @bdi: bdi of interest
 *
 * Find the wb of @bdi which matches both the memcg and blkcg of %current.
 * Must be called under rcu_read_lock() which protects the returned wb.
 * Returns NULL if not found.
 */
static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
{
	struct cgroup_subsys_state *memcg_css;
	struct bdi_writeback *wb;

	memcg_css = task_css(current, memory_cgrp_id);
	if (!memcg_css->parent)
		return &bdi->wb;

	wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);

	/*
	 * %current's blkcg equals the effective blkcg of its memcg. No
	 * need to use the relatively expensive cgroup_get_e_css().
	 */
	if (likely(wb && wb->blkcg_css == task_css(current, io_cgrp_id)))
		return wb;
	return NULL;
}

/**
 * wb_get_create_current - get or create wb for %current on a bdi
 * @bdi: bdi of interest
 * @gfp: allocation mask
 *
 * Equivalent to wb_get_create() on %current's memcg. This function is
 * called from a relatively hot path and optimizes the common cases using
 * wb_find_current().
 */
static inline struct bdi_writeback *
wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
{
	struct bdi_writeback *wb;

	rcu_read_lock();
	wb = wb_find_current(bdi);
	if (wb && unlikely(!wb_tryget(wb)))
		wb = NULL;
	rcu_read_unlock();

	if (unlikely(!wb)) {
		struct cgroup_subsys_state *memcg_css;

		memcg_css = task_get_css(current, memory_cgrp_id);
		wb = wb_get_create(bdi, memcg_css, gfp);
		css_put(memcg_css);
	}
	return wb;
}

/**
 * inode_to_wb - determine the wb of an inode
 * @inode: inode of interest
 *
 * Returns the wb @inode is currently associated with. The caller must be
 * holding either @inode->i_lock, the i_pages lock, or the
 * associated wb's list_lock.
 */
static inline struct bdi_writeback *inode_to_wb(const struct inode *inode)
{
#ifdef CONFIG_LOCKDEP
	WARN_ON_ONCE(debug_locks &&
		     (inode->i_sb->s_iflags & SB_I_CGROUPWB) &&
		     (!lockdep_is_held(&inode->i_lock) &&
		      !lockdep_is_held(&inode->i_mapping->i_pages.xa_lock) &&
		      !lockdep_is_held(&inode->i_wb->list_lock)));
#endif
	return inode->i_wb;
}

static inline struct bdi_writeback *inode_to_wb_wbc(
				struct inode *inode,
				struct writeback_control *wbc)
{
	/*
	 * If wbc does not have inode attached, it means cgroup writeback was
	 * disabled when wbc started. Just use the default wb in that case.
	 */
	return wbc->wb ? wbc->wb : &inode_to_bdi(inode)->wb;
}

/**
 * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction
 * @inode: target inode
 * @cookie: output param, to be passed to the end function
 *
 * The caller wants to access the wb associated with @inode but isn't
 * holding inode->i_lock, the i_pages lock or wb->list_lock. This
 * function determines the wb associated with @inode and ensures that the
 * association doesn't change until the transaction is finished with
 * unlocked_inode_to_wb_end().
 *
 * The caller must call unlocked_inode_to_wb_end() with *@cookie afterwards and
 * can't sleep during the transaction. IRQs may or may not be disabled on
 * return.
 */
static inline struct bdi_writeback *
unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
{
	rcu_read_lock();

	/*
	 * Paired with store_release in inode_switch_wbs_work_fn() and
	 * ensures that we see the new wb if we see cleared I_WB_SWITCH.
	 */
	cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;

	if (unlikely(cookie->locked))
		xa_lock_irqsave(&inode->i_mapping->i_pages, cookie->flags);

	/*
	 * Protected by either !I_WB_SWITCH + rcu_read_lock() or the i_pages
	 * lock. inode_to_wb() will bark. Deref directly.
	 */
	return inode->i_wb;
}

/**
 * unlocked_inode_to_wb_end - end inode wb access transaction
 * @inode: target inode
 * @cookie: @cookie from unlocked_inode_to_wb_begin()
 */
static inline void unlocked_inode_to_wb_end(struct inode *inode,
					    struct wb_lock_cookie *cookie)
{
	if (unlikely(cookie->locked))
		xa_unlock_irqrestore(&inode->i_mapping->i_pages, cookie->flags);

	rcu_read_unlock();
}

#else	/* CONFIG_CGROUP_WRITEBACK */

static inline bool inode_cgwb_enabled(struct inode *inode)
{
	return false;
}

static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
{
	return &bdi->wb;
}

static inline struct bdi_writeback *
wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
{
	return &bdi->wb;
}

static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
{
	return &inode_to_bdi(inode)->wb;
}

static inline struct bdi_writeback *inode_to_wb_wbc(
				struct inode *inode,
				struct writeback_control *wbc)
{
	return inode_to_wb(inode);
}

static inline struct bdi_writeback *
unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
{
	return inode_to_wb(inode);
}

static inline void unlocked_inode_to_wb_end(struct inode *inode,
					    struct wb_lock_cookie *cookie)
{
}

static inline void wb_memcg_offline(struct mem_cgroup *memcg)
{
}

static inline void wb_blkcg_offline(struct cgroup_subsys_state *css)
{
}

#endif	/* CONFIG_CGROUP_WRITEBACK */

const char *bdi_dev_name(struct backing_dev_info *bdi);

#endif	/* _LINUX_BACKING_DEV_H */
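/*
 * Editor's sketch (hypothetical caller, not from the kernel tree): the
 * unlocked_inode_to_wb_begin()/_end() transaction pattern documented
 * above. example_touch_wb_stat() is an invented name; the inode/wb
 * association is pinned between begin and end, and the caller must not
 * sleep inside the transaction.
 */
static void example_touch_wb_stat(struct inode *inode)
{
	struct wb_lock_cookie cookie = {};
	struct bdi_writeback *wb;

	wb = unlocked_inode_to_wb_begin(inode, &cookie);
	inc_wb_stat(wb, WB_RECLAIMABLE);	/* association cannot change here */
	unlocked_inode_to_wb_end(inode, &cookie);
}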
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2012 Red Hat
 *
 * based in parts on udlfb.c:
 * Copyright (C) 2009 Roberto De Ioris <roberto@unbit.it>
 * Copyright (C) 2009 Jaya Kumar <jayakumar.lkml@gmail.com>
 * Copyright (C) 2009 Bernie Thompson <bernie@plugable.com>
 */

#ifndef UDL_DRV_H
#define UDL_DRV_H

#include <linux/mm_types.h>
#include <linux/usb.h>

#include <drm/drm_connector.h>
#include <drm/drm_crtc.h>
#include <drm/drm_device.h>
#include <drm/drm_encoder.h>
#include <drm/drm_framebuffer.h>
#include <drm/drm_gem.h>
#include <drm/drm_plane.h>

struct drm_mode_create_dumb;

#define DRIVER_NAME		"udl"
#define DRIVER_DESC		"DisplayLink"

#define DRIVER_MAJOR		0
#define DRIVER_MINOR		0
#define DRIVER_PATCHLEVEL	1

struct udl_device;

struct urb_node {
	struct list_head entry;
	struct udl_device *dev;
	struct urb *urb;
};

struct urb_list {
	struct list_head list;
	spinlock_t lock;
	wait_queue_head_t sleep;
	int available;
	int count;
	size_t size;
};

struct udl_device {
	struct drm_device drm;

	unsigned long sku_pixel_limit;

	struct drm_plane primary_plane;
	struct drm_crtc crtc;
	struct drm_encoder encoder;
	struct drm_connector connector;

	struct urb_list urbs;
};

#define to_udl(x) container_of(x, struct udl_device, drm)

static inline struct usb_device *udl_to_usb_device(struct udl_device *udl)
{
	return interface_to_usbdev(to_usb_interface(udl->drm.dev));
}

/* modeset */
int udl_modeset_init(struct udl_device *udl);
struct drm_connector *udl_connector_init(struct drm_device *dev);

struct urb *udl_get_urb(struct udl_device *udl);

int udl_submit_urb(struct udl_device *udl, struct urb *urb, size_t len);
void udl_sync_pending_urbs(struct udl_device *udl);
void udl_urb_completion(struct urb *urb);

int udl_init(struct udl_device *udl);

int udl_render_hline(struct udl_device *udl, int log_bpp, struct urb **urb_ptr,
		     const char *front, char **urb_buf_ptr,
		     u32 byte_offset, u32 device_byte_offset, u32 byte_width);

int udl_drop_usb(struct udl_device *udl);
int udl_select_std_channel(struct udl_device *udl);

#endif
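/*
 * Editor's sketch (hypothetical, not part of the driver): the URB pool
 * round-trip implied by the declarations above - take a free URB, fill
 * its transfer buffer, then queue it; udl_urb_completion() returns it to
 * the pool when the transfer finishes. example_send_bytes() is an
 * invented name and assumes len fits within urbs.size.
 */
static int example_send_bytes(struct udl_device *udl, const void *data,
			      size_t len)
{
	struct urb *urb = udl_get_urb(udl);

	if (!urb)
		return -ENOMEM;

	memcpy(urb->transfer_buffer, data, len);	/* assumes len <= urbs.size */

	return udl_submit_urb(udl, urb, len);
}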
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * (C) 2010 Pablo Neira Ayuso <pablo@netfilter.org>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/netfilter.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/moduleparam.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_timestamp.h>

static bool nf_ct_tstamp __read_mostly;

module_param_named(tstamp, nf_ct_tstamp, bool, 0644);
MODULE_PARM_DESC(tstamp, "Enable connection tracking flow timestamping.");

void nf_conntrack_tstamp_pernet_init(struct net *net)
{
	net->ct.sysctl_tstamp = nf_ct_tstamp;
}
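/*
 * Editor's sketch (hypothetical consumer): the per-netns flag initialized
 * above is what gates the timestamp extension; nf_ct_tstamp_ext_add()
 * (from nf_conntrack_timestamp.h) returns NULL when it is off.
 * example_attach_tstamp() is an invented name.
 */
static struct nf_conn_tstamp *example_attach_tstamp(struct nf_conn *ct)
{
	/* NULL when net->ct.sysctl_tstamp (set in pernet_init above) is 0 */
	return nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
}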
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2005-2010 IBM Corporation
 *
 * Author:
 * Mimi Zohar <zohar@us.ibm.com>
 * Kylene Hall <kjhall@us.ibm.com>
 *
 * File: evm_main.c
 *	 implements evm_inode_setxattr, evm_inode_post_setxattr,
 *	 evm_inode_removexattr, evm_verifyxattr, and evm_inode_set_acl.
 */

#define pr_fmt(fmt) "EVM: "fmt

#include <linux/init.h>
#include <linux/audit.h>
#include <linux/xattr.h>
#include <linux/integrity.h>
#include <linux/evm.h>
#include <linux/magic.h>
#include <linux/posix_acl_xattr.h>
#include <linux/lsm_hooks.h>

#include <crypto/hash.h>
#include <crypto/hash_info.h>
#include <crypto/utils.h>
#include "evm.h"

int evm_initialized;

static const char * const integrity_status_msg[] = {
	"pass", "pass_immutable", "fail", "fail_immutable", "no_label",
	"no_xattrs", "unknown"
};
int evm_hmac_attrs;

static struct xattr_list evm_config_default_xattrnames[] = {
	{
	 .name = XATTR_NAME_SELINUX,
	 .enabled = IS_ENABLED(CONFIG_SECURITY_SELINUX)
	},
	{
	 .name = XATTR_NAME_SMACK,
	 .enabled = IS_ENABLED(CONFIG_SECURITY_SMACK)
	},
	{
	 .name = XATTR_NAME_SMACKEXEC,
	 .enabled = IS_ENABLED(CONFIG_EVM_EXTRA_SMACK_XATTRS)
	},
	{
	 .name = XATTR_NAME_SMACKTRANSMUTE,
	 .enabled = IS_ENABLED(CONFIG_EVM_EXTRA_SMACK_XATTRS)
	},
	{
	 .name = XATTR_NAME_SMACKMMAP,
	 .enabled = IS_ENABLED(CONFIG_EVM_EXTRA_SMACK_XATTRS)
	},
	{
	 .name = XATTR_NAME_APPARMOR,
	 .enabled = IS_ENABLED(CONFIG_SECURITY_APPARMOR)
	},
	{
	 .name = XATTR_NAME_IMA,
	 .enabled = IS_ENABLED(CONFIG_IMA_APPRAISE)
	},
	{
	 .name = XATTR_NAME_CAPS,
	 .enabled = true
	},
};
LIST_HEAD(evm_config_xattrnames);

static int evm_fixmode __ro_after_init;

static int __init evm_set_fixmode(char *str)
{
	if (strncmp(str, "fix", 3) == 0)
		evm_fixmode = 1;
	else
		pr_err("invalid \"%s\" mode", str);

	return 1;
}
__setup("evm=", evm_set_fixmode);

static void __init evm_init_config(void)
{
	int i, xattrs;

	xattrs = ARRAY_SIZE(evm_config_default_xattrnames);

	pr_info("Initialising EVM extended attributes:\n");
	for (i = 0; i < xattrs; i++) {
		pr_info("%s%s\n", evm_config_default_xattrnames[i].name,
			!evm_config_default_xattrnames[i].enabled ?
" (disabled)" : ""); list_add_tail(&evm_config_default_xattrnames[i].list, &evm_config_xattrnames); } #ifdef CONFIG_EVM_ATTR_FSUUID evm_hmac_attrs |= EVM_ATTR_FSUUID; #endif pr_info("HMAC attrs: 0x%x\n", evm_hmac_attrs); } static bool evm_key_loaded(void) { return (bool)(evm_initialized & EVM_KEY_MASK); } /* * This function determines whether or not it is safe to ignore verification * errors, based on the ability of EVM to calculate HMACs. If the HMAC key * is not loaded, and it cannot be loaded in the future due to the * EVM_SETUP_COMPLETE initialization flag, allowing an operation despite the * attrs/xattrs being found invalid will not make them valid. */ static bool evm_hmac_disabled(void) { if (evm_initialized & EVM_INIT_HMAC) return false; if (!(evm_initialized & EVM_SETUP_COMPLETE)) return false; return true; } static int evm_find_protected_xattrs(struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); struct xattr_list *xattr; int error; int count = 0; if (!(inode->i_opflags & IOP_XATTR)) return -EOPNOTSUPP; list_for_each_entry_lockless(xattr, &evm_config_xattrnames, list) { error = __vfs_getxattr(dentry, inode, xattr->name, NULL, 0); if (error < 0) { if (error == -ENODATA) continue; return error; } count++; } return count; } static int is_unsupported_hmac_fs(struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); if (inode->i_sb->s_iflags & SB_I_EVM_HMAC_UNSUPPORTED) { pr_info_once("%s not supported\n", inode->i_sb->s_type->name); return 1; } return 0; } /* * evm_verify_hmac - calculate and compare the HMAC with the EVM xattr * * Compute the HMAC on the dentry's protected set of extended attributes * and compare it against the stored security.evm xattr. * * For performance: * - use the previously retrieved xattr value and length to calculate the * HMAC.) * - cache the verification result in the iint, when available. * * Returns integrity status */ static enum integrity_status evm_verify_hmac(struct dentry *dentry, const char *xattr_name, char *xattr_value, size_t xattr_value_len) { struct evm_ima_xattr_data *xattr_data = NULL; struct signature_v2_hdr *hdr; enum integrity_status evm_status = INTEGRITY_PASS; struct evm_digest digest; struct inode *inode = d_backing_inode(dentry); struct evm_iint_cache *iint = evm_iint_inode(inode); int rc, xattr_len, evm_immutable = 0; if (iint && (iint->evm_status == INTEGRITY_PASS || iint->evm_status == INTEGRITY_PASS_IMMUTABLE)) return iint->evm_status; /* * On unsupported filesystems without EVM_INIT_X509 enabled, skip * signature verification. 
*/ if (!(evm_initialized & EVM_INIT_X509) && is_unsupported_hmac_fs(dentry)) return INTEGRITY_UNKNOWN; /* if status is not PASS, try to check again - against -ENOMEM */ /* first need to know the sig type */ rc = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, XATTR_NAME_EVM, (char **)&xattr_data, 0, GFP_NOFS); if (rc <= 0) { evm_status = INTEGRITY_FAIL; if (rc == -ENODATA) { rc = evm_find_protected_xattrs(dentry); if (rc > 0) evm_status = INTEGRITY_NOLABEL; else if (rc == 0) evm_status = INTEGRITY_NOXATTRS; /* new file */ } else if (rc == -EOPNOTSUPP) { evm_status = INTEGRITY_UNKNOWN; } goto out; } xattr_len = rc; /* check value type */ switch (xattr_data->type) { case EVM_XATTR_HMAC: if (xattr_len != sizeof(struct evm_xattr)) { evm_status = INTEGRITY_FAIL; goto out; } digest.hdr.algo = HASH_ALGO_SHA1; rc = evm_calc_hmac(dentry, xattr_name, xattr_value, xattr_value_len, &digest, iint); if (rc) break; rc = crypto_memneq(xattr_data->data, digest.digest, SHA1_DIGEST_SIZE); if (rc) rc = -EINVAL; break; case EVM_XATTR_PORTABLE_DIGSIG: evm_immutable = 1; fallthrough; case EVM_IMA_XATTR_DIGSIG: /* accept xattr with non-empty signature field */ if (xattr_len <= sizeof(struct signature_v2_hdr)) { evm_status = INTEGRITY_FAIL; goto out; } hdr = (struct signature_v2_hdr *)xattr_data; digest.hdr.algo = hdr->hash_algo; rc = evm_calc_hash(dentry, xattr_name, xattr_value, xattr_value_len, xattr_data->type, &digest, iint); if (rc) break; rc = integrity_digsig_verify(INTEGRITY_KEYRING_EVM, (const char *)xattr_data, xattr_len, digest.digest, digest.hdr.length); if (!rc) { if (xattr_data->type == EVM_XATTR_PORTABLE_DIGSIG) { if (iint) iint->flags |= EVM_IMMUTABLE_DIGSIG; evm_status = INTEGRITY_PASS_IMMUTABLE; } else if (!IS_RDONLY(inode) && !(inode->i_sb->s_readonly_remount) && !IS_IMMUTABLE(inode) && !is_unsupported_hmac_fs(dentry)) { evm_update_evmxattr(dentry, xattr_name, xattr_value, xattr_value_len); } } break; default: rc = -EINVAL; break; } if (rc) { if (rc == -ENODATA) evm_status = INTEGRITY_NOXATTRS; else if (evm_immutable) evm_status = INTEGRITY_FAIL_IMMUTABLE; else evm_status = INTEGRITY_FAIL; } pr_debug("digest: (%d) [%*phN]\n", digest.hdr.length, digest.hdr.length, digest.digest); out: if (iint) iint->evm_status = evm_status; kfree(xattr_data); return evm_status; } static int evm_protected_xattr_common(const char *req_xattr_name, bool all_xattrs) { int namelen; int found = 0; struct xattr_list *xattr; namelen = strlen(req_xattr_name); list_for_each_entry_lockless(xattr, &evm_config_xattrnames, list) { if (!all_xattrs && !xattr->enabled) continue; if ((strlen(xattr->name) == namelen) && (strncmp(req_xattr_name, xattr->name, namelen) == 0)) { found = 1; break; } if (strncmp(req_xattr_name, xattr->name + XATTR_SECURITY_PREFIX_LEN, strlen(req_xattr_name)) == 0) { found = 1; break; } } return found; } int evm_protected_xattr(const char *req_xattr_name) { return evm_protected_xattr_common(req_xattr_name, false); } int evm_protected_xattr_if_enabled(const char *req_xattr_name) { return evm_protected_xattr_common(req_xattr_name, true); } /** * evm_read_protected_xattrs - read EVM protected xattr names, lengths, values * @dentry: dentry of the read xattrs * @buffer: buffer xattr names, lengths or values are copied to * @buffer_size: size of buffer * @type: n: names, l: lengths, v: values * @canonical_fmt: data format (true: little endian, false: native format) * * Read protected xattr names (separated by |), lengths (u32) or values for a * given dentry and return the total size of copied data. 
If buffer is NULL, * just return the total size. * * Returns the total size on success, a negative value on error. */ int evm_read_protected_xattrs(struct dentry *dentry, u8 *buffer, int buffer_size, char type, bool canonical_fmt) { struct xattr_list *xattr; int rc, size, total_size = 0; list_for_each_entry_lockless(xattr, &evm_config_xattrnames, list) { rc = __vfs_getxattr(dentry, d_backing_inode(dentry), xattr->name, NULL, 0); if (rc < 0 && rc == -ENODATA) continue; else if (rc < 0) return rc; switch (type) { case 'n': size = strlen(xattr->name) + 1; if (buffer) { if (total_size) *(buffer + total_size - 1) = '|'; memcpy(buffer + total_size, xattr->name, size); } break; case 'l': size = sizeof(u32); if (buffer) { if (canonical_fmt) rc = (__force int)cpu_to_le32(rc); *(u32 *)(buffer + total_size) = rc; } break; case 'v': size = rc; if (buffer) { rc = __vfs_getxattr(dentry, d_backing_inode(dentry), xattr->name, buffer + total_size, buffer_size - total_size); if (rc < 0) return rc; } break; default: return -EINVAL; } total_size += size; } return total_size; } /** * evm_verifyxattr - verify the integrity of the requested xattr * @dentry: object of the verify xattr * @xattr_name: requested xattr * @xattr_value: requested xattr value * @xattr_value_len: requested xattr value length * * Calculate the HMAC for the given dentry and verify it against the stored * security.evm xattr. For performance, use the xattr value and length * previously retrieved to calculate the HMAC. * * Returns the xattr integrity status. * * This function requires the caller to lock the inode's i_mutex before it * is executed. */ enum integrity_status evm_verifyxattr(struct dentry *dentry, const char *xattr_name, void *xattr_value, size_t xattr_value_len) { if (!evm_key_loaded() || !evm_protected_xattr(xattr_name)) return INTEGRITY_UNKNOWN; return evm_verify_hmac(dentry, xattr_name, xattr_value, xattr_value_len); } EXPORT_SYMBOL_GPL(evm_verifyxattr); /* * evm_verify_current_integrity - verify the dentry's metadata integrity * @dentry: pointer to the affected dentry * * Verify and return the dentry's metadata integrity. The exceptions are * before EVM is initialized or in 'fix' mode. */ static enum integrity_status evm_verify_current_integrity(struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); if (!evm_key_loaded() || !S_ISREG(inode->i_mode) || evm_fixmode) return INTEGRITY_PASS; return evm_verify_hmac(dentry, NULL, NULL, 0); } /* * evm_xattr_change - check if passed xattr value differs from current value * @idmap: idmap of the mount * @dentry: pointer to the affected dentry * @xattr_name: requested xattr * @xattr_value: requested xattr value * @xattr_value_len: requested xattr value length * * Check if passed xattr value differs from current value. * * Returns 1 if passed xattr value differs from current value, 0 otherwise. */ static int evm_xattr_change(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len) { char *xattr_data = NULL; int rc = 0; rc = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, xattr_name, &xattr_data, 0, GFP_NOFS); if (rc < 0) { rc = 1; goto out; } if (rc == xattr_value_len) rc = !!memcmp(xattr_value, xattr_data, rc); else rc = 1; out: kfree(xattr_data); return rc; } /* * evm_protect_xattr - protect the EVM extended attribute * * Prevent security.evm from being modified or removed without the * necessary permissions or when the existing value is invalid. 
 *
 * The posix xattr acls are 'system' prefixed, which normally would not
 * affect security.evm. An interesting side effect of writing posix xattr
 * acls is their modifying of the i_mode, which is included in security.evm.
 * For posix xattr acls only, permit security.evm, even if it currently
 * doesn't exist, to be updated unless the EVM signature is immutable.
 */
static int evm_protect_xattr(struct mnt_idmap *idmap,
			     struct dentry *dentry, const char *xattr_name,
			     const void *xattr_value, size_t xattr_value_len)
{
	enum integrity_status evm_status;

	if (strcmp(xattr_name, XATTR_NAME_EVM) == 0) {
		if (!capable(CAP_SYS_ADMIN))
			return -EPERM;
		if (is_unsupported_hmac_fs(dentry))
			return -EPERM;
	} else if (!evm_protected_xattr(xattr_name)) {
		if (!posix_xattr_acl(xattr_name))
			return 0;
		if (is_unsupported_hmac_fs(dentry))
			return 0;

		evm_status = evm_verify_current_integrity(dentry);
		if ((evm_status == INTEGRITY_PASS) ||
		    (evm_status == INTEGRITY_NOXATTRS))
			return 0;
		goto out;
	} else if (is_unsupported_hmac_fs(dentry))
		return 0;

	evm_status = evm_verify_current_integrity(dentry);
	if (evm_status == INTEGRITY_NOXATTRS) {
		struct evm_iint_cache *iint;

		/* Exception if the HMAC is not going to be calculated. */
		if (evm_hmac_disabled())
			return 0;

		iint = evm_iint_inode(d_backing_inode(dentry));
		if (iint && (iint->flags & EVM_NEW_FILE))
			return 0;

		/* exception for pseudo filesystems */
		if (dentry->d_sb->s_magic == TMPFS_MAGIC
		    || dentry->d_sb->s_magic == SYSFS_MAGIC)
			return 0;

		integrity_audit_msg(AUDIT_INTEGRITY_METADATA,
				    dentry->d_inode, dentry->d_name.name,
				    "update_metadata",
				    integrity_status_msg[evm_status],
				    -EPERM, 0);
	}

out:
	/* Exception if the HMAC is not going to be calculated. */
	if (evm_hmac_disabled() && (evm_status == INTEGRITY_NOLABEL ||
				    evm_status == INTEGRITY_UNKNOWN))
		return 0;

	/*
	 * Writing other xattrs is safe for portable signatures, as portable
	 * signatures are immutable and can never be updated.
	 */
	if (evm_status == INTEGRITY_FAIL_IMMUTABLE)
		return 0;

	if (evm_status == INTEGRITY_PASS_IMMUTABLE &&
	    !evm_xattr_change(idmap, dentry, xattr_name, xattr_value,
			      xattr_value_len))
		return 0;

	if (evm_status != INTEGRITY_PASS &&
	    evm_status != INTEGRITY_PASS_IMMUTABLE)
		integrity_audit_msg(AUDIT_INTEGRITY_METADATA,
				    d_backing_inode(dentry),
				    dentry->d_name.name, "appraise_metadata",
				    integrity_status_msg[evm_status],
				    -EPERM, 0);
	return evm_status == INTEGRITY_PASS ? 0 : -EPERM;
}

/**
 * evm_inode_setxattr - protect the EVM extended attribute
 * @idmap: idmap of the mount
 * @dentry: pointer to the affected dentry
 * @xattr_name: pointer to the affected extended attribute name
 * @xattr_value: pointer to the new extended attribute value
 * @xattr_value_len: pointer to the new extended attribute value length
 * @flags: flags to pass into filesystem operations
 *
 * Before allowing the 'security.evm' protected xattr to be updated,
 * verify the existing value is valid. As only the kernel should have
 * access to the EVM encrypted key needed to calculate the HMAC, prevent
 * userspace from writing HMAC value. Writing 'security.evm' requires
 * CAP_SYS_ADMIN privileges.
*/ static int evm_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len, int flags) { const struct evm_ima_xattr_data *xattr_data = xattr_value; /* Policy permits modification of the protected xattrs even though * there's no HMAC key loaded */ if (evm_initialized & EVM_ALLOW_METADATA_WRITES) return 0; if (strcmp(xattr_name, XATTR_NAME_EVM) == 0) { if (!xattr_value_len) return -EINVAL; if (xattr_data->type != EVM_IMA_XATTR_DIGSIG && xattr_data->type != EVM_XATTR_PORTABLE_DIGSIG) return -EPERM; } return evm_protect_xattr(idmap, dentry, xattr_name, xattr_value, xattr_value_len); } /** * evm_inode_removexattr - protect the EVM extended attribute * @idmap: idmap of the mount * @dentry: pointer to the affected dentry * @xattr_name: pointer to the affected extended attribute name * * Removing 'security.evm' requires CAP_SYS_ADMIN privileges and that * the current value is valid. */ static int evm_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name) { /* Policy permits modification of the protected xattrs even though * there's no HMAC key loaded */ if (evm_initialized & EVM_ALLOW_METADATA_WRITES) return 0; return evm_protect_xattr(idmap, dentry, xattr_name, NULL, 0); } #ifdef CONFIG_FS_POSIX_ACL static int evm_inode_set_acl_change(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, struct posix_acl *kacl) { int rc; umode_t mode; struct inode *inode = d_backing_inode(dentry); if (!kacl) return 1; rc = posix_acl_update_mode(idmap, inode, &mode, &kacl); if (rc || (inode->i_mode != mode)) return 1; return 0; } #else static inline int evm_inode_set_acl_change(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, struct posix_acl *kacl) { return 0; } #endif /** * evm_inode_set_acl - protect the EVM extended attribute from posix acls * @idmap: idmap of the idmapped mount * @dentry: pointer to the affected dentry * @acl_name: name of the posix acl * @kacl: pointer to the posix acls * * Prevent modifying posix acls causing the EVM HMAC to be re-calculated * and 'security.evm' xattr updated, unless the existing 'security.evm' is * valid. * * Return: zero on success, -EPERM on failure. */ static int evm_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { enum integrity_status evm_status; /* Policy permits modification of the protected xattrs even though * there's no HMAC key loaded */ if (evm_initialized & EVM_ALLOW_METADATA_WRITES) return 0; evm_status = evm_verify_current_integrity(dentry); if ((evm_status == INTEGRITY_PASS) || (evm_status == INTEGRITY_NOXATTRS)) return 0; /* Exception if the HMAC is not going to be calculated. */ if (evm_hmac_disabled() && (evm_status == INTEGRITY_NOLABEL || evm_status == INTEGRITY_UNKNOWN)) return 0; /* * Writing other xattrs is safe for portable signatures, as portable * signatures are immutable and can never be updated. 
*/ if (evm_status == INTEGRITY_FAIL_IMMUTABLE) return 0; if (evm_status == INTEGRITY_PASS_IMMUTABLE && !evm_inode_set_acl_change(idmap, dentry, acl_name, kacl)) return 0; if (evm_status != INTEGRITY_PASS_IMMUTABLE) integrity_audit_msg(AUDIT_INTEGRITY_METADATA, d_backing_inode(dentry), dentry->d_name.name, "appraise_metadata", integrity_status_msg[evm_status], -EPERM, 0); return -EPERM; } /** * evm_inode_remove_acl - Protect the EVM extended attribute from posix acls * @idmap: idmap of the mount * @dentry: pointer to the affected dentry * @acl_name: name of the posix acl * * Prevent removing posix acls causing the EVM HMAC to be re-calculated * and 'security.evm' xattr updated, unless the existing 'security.evm' is * valid. * * Return: zero on success, -EPERM on failure. */ static int evm_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return evm_inode_set_acl(idmap, dentry, acl_name, NULL); } static void evm_reset_status(struct inode *inode) { struct evm_iint_cache *iint; iint = evm_iint_inode(inode); if (iint) iint->evm_status = INTEGRITY_UNKNOWN; } /** * evm_metadata_changed: Detect changes to the metadata * @inode: a file's inode * @metadata_inode: metadata inode * * On a stacked filesystem detect whether the metadata has changed. If this is * the case reset the evm_status associated with the inode that represents the * file. */ bool evm_metadata_changed(struct inode *inode, struct inode *metadata_inode) { struct evm_iint_cache *iint = evm_iint_inode(inode); bool ret = false; if (iint) { ret = (!IS_I_VERSION(metadata_inode) || integrity_inode_attrs_changed(&iint->metadata_inode, metadata_inode)); if (ret) iint->evm_status = INTEGRITY_UNKNOWN; } return ret; } /** * evm_revalidate_status - report whether EVM status re-validation is necessary * @xattr_name: pointer to the affected extended attribute name * * Report whether callers of evm_verifyxattr() should re-validate the * EVM status. * * Return true if re-validation is necessary, false otherwise. */ bool evm_revalidate_status(const char *xattr_name) { if (!evm_key_loaded()) return false; /* evm_inode_post_setattr() passes NULL */ if (!xattr_name) return true; if (!evm_protected_xattr(xattr_name) && !posix_xattr_acl(xattr_name) && strcmp(xattr_name, XATTR_NAME_EVM)) return false; return true; } /** * evm_inode_post_setxattr - update 'security.evm' to reflect the changes * @dentry: pointer to the affected dentry * @xattr_name: pointer to the affected extended attribute name * @xattr_value: pointer to the new extended attribute value * @xattr_value_len: pointer to the new extended attribute value length * @flags: flags to pass into filesystem operations * * Update the HMAC stored in 'security.evm' to reflect the change. * * No need to take the i_mutex lock here, as this function is called from * __vfs_setxattr_noperm(). The caller of which has taken the inode's * i_mutex lock. 
*/ static void evm_inode_post_setxattr(struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len, int flags) { if (!evm_revalidate_status(xattr_name)) return; evm_reset_status(dentry->d_inode); if (!strcmp(xattr_name, XATTR_NAME_EVM)) return; if (!(evm_initialized & EVM_INIT_HMAC)) return; if (is_unsupported_hmac_fs(dentry)) return; evm_update_evmxattr(dentry, xattr_name, xattr_value, xattr_value_len); } /** * evm_inode_post_set_acl - Update the EVM extended attribute from posix acls * @dentry: pointer to the affected dentry * @acl_name: name of the posix acl * @kacl: pointer to the posix acls * * Update the 'security.evm' xattr with the EVM HMAC re-calculated after setting * posix acls. */ static void evm_inode_post_set_acl(struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { return evm_inode_post_setxattr(dentry, acl_name, NULL, 0, 0); } /** * evm_inode_post_removexattr - update 'security.evm' after removing the xattr * @dentry: pointer to the affected dentry * @xattr_name: pointer to the affected extended attribute name * * Update the HMAC stored in 'security.evm' to reflect removal of the xattr. * * No need to take the i_mutex lock here, as this function is called from * vfs_removexattr() which takes the i_mutex. */ static void evm_inode_post_removexattr(struct dentry *dentry, const char *xattr_name) { if (!evm_revalidate_status(xattr_name)) return; evm_reset_status(dentry->d_inode); if (!strcmp(xattr_name, XATTR_NAME_EVM)) return; if (!(evm_initialized & EVM_INIT_HMAC)) return; evm_update_evmxattr(dentry, xattr_name, NULL, 0); } /** * evm_inode_post_remove_acl - Update the EVM extended attribute from posix acls * @idmap: idmap of the mount * @dentry: pointer to the affected dentry * @acl_name: name of the posix acl * * Update the 'security.evm' xattr with the EVM HMAC re-calculated after * removing posix acls. */ static inline void evm_inode_post_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { evm_inode_post_removexattr(dentry, acl_name); } static int evm_attr_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_backing_inode(dentry); unsigned int ia_valid = attr->ia_valid; if (!i_uid_needs_update(idmap, attr, inode) && !i_gid_needs_update(idmap, attr, inode) && (!(ia_valid & ATTR_MODE) || attr->ia_mode == inode->i_mode)) return 0; return 1; } /** * evm_inode_setattr - prevent updating an invalid EVM extended attribute * @idmap: idmap of the mount * @dentry: pointer to the affected dentry * @attr: iattr structure containing the new file attributes * * Permit update of file attributes when files have a valid EVM signature, * except in the case of them having an immutable portable signature. */ static int evm_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; enum integrity_status evm_status; /* Policy permits modification of the protected attrs even though * there's no HMAC key loaded */ if (evm_initialized & EVM_ALLOW_METADATA_WRITES) return 0; if (is_unsupported_hmac_fs(dentry)) return 0; if (!(ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID))) return 0; evm_status = evm_verify_current_integrity(dentry); /* * Writing attrs is safe for portable signatures, as portable signatures * are immutable and can never be updated. 
*/ if ((evm_status == INTEGRITY_PASS) || (evm_status == INTEGRITY_NOXATTRS) || (evm_status == INTEGRITY_FAIL_IMMUTABLE) || (evm_hmac_disabled() && (evm_status == INTEGRITY_NOLABEL || evm_status == INTEGRITY_UNKNOWN))) return 0; if (evm_status == INTEGRITY_PASS_IMMUTABLE && !evm_attr_change(idmap, dentry, attr)) return 0; integrity_audit_msg(AUDIT_INTEGRITY_METADATA, d_backing_inode(dentry), dentry->d_name.name, "appraise_metadata", integrity_status_msg[evm_status], -EPERM, 0); return -EPERM; } /** * evm_inode_post_setattr - update 'security.evm' after modifying metadata * @idmap: idmap of the idmapped mount * @dentry: pointer to the affected dentry * @ia_valid: for the UID and GID status * * For now, update the HMAC stored in 'security.evm' to reflect UID/GID * changes. * * This function is called from notify_change(), which expects the caller * to lock the inode's i_mutex. */ static void evm_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry, int ia_valid) { if (!evm_revalidate_status(NULL)) return; evm_reset_status(dentry->d_inode); if (!(evm_initialized & EVM_INIT_HMAC)) return; if (is_unsupported_hmac_fs(dentry)) return; if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) evm_update_evmxattr(dentry, NULL, NULL, 0); } static int evm_inode_copy_up_xattr(struct dentry *src, const char *name) { struct evm_ima_xattr_data *xattr_data = NULL; int rc; if (strcmp(name, XATTR_NAME_EVM) != 0) return -EOPNOTSUPP; /* first need to know the sig type */ rc = vfs_getxattr_alloc(&nop_mnt_idmap, src, XATTR_NAME_EVM, (char **)&xattr_data, 0, GFP_NOFS); if (rc <= 0) return -EPERM; if (rc < offsetof(struct evm_ima_xattr_data, type) + sizeof(xattr_data->type)) return -EPERM; switch (xattr_data->type) { case EVM_XATTR_PORTABLE_DIGSIG: rc = 0; /* allow copy-up */ break; case EVM_XATTR_HMAC: case EVM_IMA_XATTR_DIGSIG: default: rc = -ECANCELED; /* discard */ } kfree(xattr_data); return rc; } /* * evm_inode_init_security - initializes security.evm HMAC value */ int evm_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, struct xattr *xattrs, int *xattr_count) { struct evm_xattr *xattr_data; struct xattr *xattr, *evm_xattr; bool evm_protected_xattrs = false; int rc; if (!(evm_initialized & EVM_INIT_HMAC) || !xattrs) return 0; /* * security_inode_init_security() makes sure that the xattrs array is * contiguous, there is enough space for security.evm, and that there is * a terminator at the end of the array. */ for (xattr = xattrs; xattr->name; xattr++) { if (evm_protected_xattr(xattr->name)) evm_protected_xattrs = true; } /* EVM xattr not needed. */ if (!evm_protected_xattrs) return 0; evm_xattr = lsm_get_xattr_slot(xattrs, xattr_count); /* * Array terminator (xattr name = NULL) must be the first non-filled * xattr slot. */ WARN_ONCE(evm_xattr != xattr, "%s: xattrs terminator is not the first non-filled slot\n", __func__); xattr_data = kzalloc(sizeof(*xattr_data), GFP_NOFS); if (!xattr_data) return -ENOMEM; xattr_data->data.type = EVM_XATTR_HMAC; rc = evm_init_hmac(inode, xattrs, xattr_data->digest); if (rc < 0) goto out; evm_xattr->value = xattr_data; evm_xattr->value_len = sizeof(*xattr_data); evm_xattr->name = XATTR_EVM_SUFFIX; return 0; out: kfree(xattr_data); return rc; } EXPORT_SYMBOL_GPL(evm_inode_init_security); static int evm_inode_alloc_security(struct inode *inode) { struct evm_iint_cache *iint = evm_iint_inode(inode); /* Called by security_inode_alloc(), it cannot be NULL. 
	 */
	iint->flags = 0UL;
	iint->evm_status = INTEGRITY_UNKNOWN;

	return 0;
}

static void evm_file_release(struct file *file)
{
	struct inode *inode = file_inode(file);
	struct evm_iint_cache *iint = evm_iint_inode(inode);
	fmode_t mode = file->f_mode;

	if (!S_ISREG(inode->i_mode) || !(mode & FMODE_WRITE))
		return;

	if (iint && iint->flags & EVM_NEW_FILE &&
	    atomic_read(&inode->i_writecount) == 1)
		iint->flags &= ~EVM_NEW_FILE;
}

static void evm_post_path_mknod(struct mnt_idmap *idmap, struct dentry *dentry)
{
	struct inode *inode = d_backing_inode(dentry);
	struct evm_iint_cache *iint = evm_iint_inode(inode);

	if (!S_ISREG(inode->i_mode))
		return;

	if (iint)
		iint->flags |= EVM_NEW_FILE;
}

#ifdef CONFIG_EVM_LOAD_X509
void __init evm_load_x509(void)
{
	int rc;

	rc = integrity_load_x509(INTEGRITY_KEYRING_EVM, CONFIG_EVM_X509_PATH);
	if (!rc)
		evm_initialized |= EVM_INIT_X509;
}
#endif

static int __init init_evm(void)
{
	int error;
	struct list_head *pos, *q;

	evm_init_config();

	error = integrity_init_keyring(INTEGRITY_KEYRING_EVM);
	if (error)
		goto error;

	error = evm_init_secfs();
	if (error < 0) {
		pr_info("Error registering secfs\n");
		goto error;
	}

error:
	if (error != 0) {
		if (!list_empty(&evm_config_xattrnames)) {
			list_for_each_safe(pos, q, &evm_config_xattrnames)
				list_del(pos);
		}
	}

	return error;
}

static struct security_hook_list evm_hooks[] __ro_after_init = {
	LSM_HOOK_INIT(inode_setattr, evm_inode_setattr),
	LSM_HOOK_INIT(inode_post_setattr, evm_inode_post_setattr),
	LSM_HOOK_INIT(inode_copy_up_xattr, evm_inode_copy_up_xattr),
	LSM_HOOK_INIT(inode_setxattr, evm_inode_setxattr),
	LSM_HOOK_INIT(inode_post_setxattr, evm_inode_post_setxattr),
	LSM_HOOK_INIT(inode_set_acl, evm_inode_set_acl),
	LSM_HOOK_INIT(inode_post_set_acl, evm_inode_post_set_acl),
	LSM_HOOK_INIT(inode_remove_acl, evm_inode_remove_acl),
	LSM_HOOK_INIT(inode_post_remove_acl, evm_inode_post_remove_acl),
	LSM_HOOK_INIT(inode_removexattr, evm_inode_removexattr),
	LSM_HOOK_INIT(inode_post_removexattr, evm_inode_post_removexattr),
	LSM_HOOK_INIT(inode_init_security, evm_inode_init_security),
	LSM_HOOK_INIT(inode_alloc_security, evm_inode_alloc_security),
	LSM_HOOK_INIT(file_release, evm_file_release),
	LSM_HOOK_INIT(path_post_mknod, evm_post_path_mknod),
};

static const struct lsm_id evm_lsmid = {
	.name = "evm",
	.id = LSM_ID_EVM,
};

static int __init init_evm_lsm(void)
{
	security_add_hooks(evm_hooks, ARRAY_SIZE(evm_hooks), &evm_lsmid);
	return 0;
}

struct lsm_blob_sizes evm_blob_sizes __ro_after_init = {
	.lbs_inode = sizeof(struct evm_iint_cache),
	.lbs_xattr_count = 1,
};

DEFINE_LSM(evm) = {
	.name = "evm",
	.init = init_evm_lsm,
	.order = LSM_ORDER_LAST,
	.blobs = &evm_blob_sizes,
};

late_initcall(init_evm);
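/*
 * Editor's sketch (hypothetical caller, not part of EVM): how an
 * integrity subsystem might consume evm_verifyxattr() per its kernel-doc
 * above - the inode's i_mutex must be held. example_appraise_xattr() is
 * an invented name; the status handling is only indicative.
 */
static int example_appraise_xattr(struct dentry *dentry, const char *name,
				  void *value, size_t len)
{
	switch (evm_verifyxattr(dentry, name, value, len)) {
	case INTEGRITY_PASS:
	case INTEGRITY_PASS_IMMUTABLE:
		return 0;	/* metadata intact */
	case INTEGRITY_UNKNOWN:
		return 0;	/* EVM key not loaded; nothing to enforce */
	default:
		return -EPERM;	/* fail / no label / no xattrs */
	}
}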
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2013 Cisco Systems, Inc, 2013.
 *
 * Author: Vijay Subramanian <vijaynsu@cisco.com>
 * Author: Mythili Prabhu <mysuryan@cisco.com>
 *
 * ECN support is added by Naeem Khademi <naeemk@ifi.uio.no>
 * University of Oslo, Norway.
* * References: * RFC 8033: https://tools.ietf.org/html/rfc8033 */ #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <net/pkt_sched.h> #include <net/inet_ecn.h> #include <net/pie.h> /* private data for the Qdisc */ struct pie_sched_data { struct pie_vars vars; struct pie_params params; struct pie_stats stats; struct timer_list adapt_timer; struct Qdisc *sch; }; bool pie_drop_early(struct Qdisc *sch, struct pie_params *params, struct pie_vars *vars, u32 backlog, u32 packet_size) { u64 rnd; u64 local_prob = vars->prob; u32 mtu = psched_mtu(qdisc_dev(sch)); /* If there is still burst allowance left skip random early drop */ if (vars->burst_time > 0) return false; /* If current delay is less than half of target, and * if drop prob is low already, disable early_drop */ if ((vars->qdelay < params->target / 2) && (vars->prob < MAX_PROB / 5)) return false; /* If we have fewer than 2 mtu-sized packets, disable pie_drop_early, * similar to min_th in RED */ if (backlog < 2 * mtu) return false; /* If bytemode is turned on, use packet size to compute new * probablity. Smaller packets will have lower drop prob in this case */ if (params->bytemode && packet_size <= mtu) local_prob = (u64)packet_size * div_u64(local_prob, mtu); else local_prob = vars->prob; if (local_prob == 0) vars->accu_prob = 0; else vars->accu_prob += local_prob; if (vars->accu_prob < (MAX_PROB / 100) * 85) return false; if (vars->accu_prob >= (MAX_PROB / 2) * 17) return true; get_random_bytes(&rnd, 8); if ((rnd >> BITS_PER_BYTE) < local_prob) { vars->accu_prob = 0; return true; } return false; } EXPORT_SYMBOL_GPL(pie_drop_early); static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_OVERLIMIT; struct pie_sched_data *q = qdisc_priv(sch); bool enqueue = false; if (unlikely(qdisc_qlen(sch) >= sch->limit)) { q->stats.overlimit++; goto out; } reason = SKB_DROP_REASON_QDISC_CONGESTED; if (!pie_drop_early(sch, &q->params, &q->vars, sch->qstats.backlog, skb->len)) { enqueue = true; } else if (q->params.ecn && (q->vars.prob <= MAX_PROB / 10) && INET_ECN_set_ce(skb)) { /* If packet is ecn capable, mark it if drop probability * is lower than 10%, else drop it. */ q->stats.ecn_mark++; enqueue = true; } /* we can enqueue the packet */ if (enqueue) { /* Set enqueue time only when dq_rate_estimator is disabled. 
*/ if (!q->params.dq_rate_estimator) pie_set_enqueue_time(skb); q->stats.packets_in++; if (qdisc_qlen(sch) > q->stats.maxq) q->stats.maxq = qdisc_qlen(sch); return qdisc_enqueue_tail(skb, sch); } out: q->stats.dropped++; q->vars.accu_prob = 0; return qdisc_drop_reason(skb, sch, to_free, reason); } static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = { [TCA_PIE_TARGET] = {.type = NLA_U32}, [TCA_PIE_LIMIT] = {.type = NLA_U32}, [TCA_PIE_TUPDATE] = {.type = NLA_U32}, [TCA_PIE_ALPHA] = {.type = NLA_U32}, [TCA_PIE_BETA] = {.type = NLA_U32}, [TCA_PIE_ECN] = {.type = NLA_U32}, [TCA_PIE_BYTEMODE] = {.type = NLA_U32}, [TCA_PIE_DQ_RATE_ESTIMATOR] = {.type = NLA_U32}, }; static int pie_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct pie_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_PIE_MAX + 1]; unsigned int qlen, dropped = 0; int err; err = nla_parse_nested_deprecated(tb, TCA_PIE_MAX, opt, pie_policy, NULL); if (err < 0) return err; sch_tree_lock(sch); /* convert from microseconds to pschedtime */ if (tb[TCA_PIE_TARGET]) { /* target is in us */ u32 target = nla_get_u32(tb[TCA_PIE_TARGET]); /* convert to pschedtime */ WRITE_ONCE(q->params.target, PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC)); } /* tupdate is in jiffies */ if (tb[TCA_PIE_TUPDATE]) WRITE_ONCE(q->params.tupdate, usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE]))); if (tb[TCA_PIE_LIMIT]) { u32 limit = nla_get_u32(tb[TCA_PIE_LIMIT]); WRITE_ONCE(q->params.limit, limit); WRITE_ONCE(sch->limit, limit); } if (tb[TCA_PIE_ALPHA]) WRITE_ONCE(q->params.alpha, nla_get_u32(tb[TCA_PIE_ALPHA])); if (tb[TCA_PIE_BETA]) WRITE_ONCE(q->params.beta, nla_get_u32(tb[TCA_PIE_BETA])); if (tb[TCA_PIE_ECN]) WRITE_ONCE(q->params.ecn, nla_get_u32(tb[TCA_PIE_ECN])); if (tb[TCA_PIE_BYTEMODE]) WRITE_ONCE(q->params.bytemode, nla_get_u32(tb[TCA_PIE_BYTEMODE])); if (tb[TCA_PIE_DQ_RATE_ESTIMATOR]) WRITE_ONCE(q->params.dq_rate_estimator, nla_get_u32(tb[TCA_PIE_DQ_RATE_ESTIMATOR])); /* Drop excess packets if new limit is lower */ qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, true); dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); rtnl_qdisc_drop(skb, sch); } qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); sch_tree_unlock(sch); return 0; } void pie_process_dequeue(struct sk_buff *skb, struct pie_params *params, struct pie_vars *vars, u32 backlog) { psched_time_t now = psched_get_time(); u32 dtime = 0; /* If dq_rate_estimator is disabled, calculate qdelay using the * packet timestamp. */ if (!params->dq_rate_estimator) { vars->qdelay = now - pie_get_enqueue_time(skb); if (vars->dq_tstamp != DTIME_INVALID) dtime = now - vars->dq_tstamp; vars->dq_tstamp = now; if (backlog == 0) vars->qdelay = 0; if (dtime == 0) return; goto burst_allowance_reduction; } /* If current queue is about 10 packets or more and dq_count is unset * we have enough packets to calculate the drain rate. Save * current time as dq_tstamp and start measurement cycle. */ if (backlog >= QUEUE_THRESHOLD && vars->dq_count == DQCOUNT_INVALID) { vars->dq_tstamp = psched_get_time(); vars->dq_count = 0; } /* Calculate the average drain rate from this value. If queue length * has receded to a small value viz., <= QUEUE_THRESHOLD bytes, reset * the dq_count to -1 as we don't have enough packets to calculate the * drain rate anymore. 
The following if block is entered only when we * have a substantial queue built up (QUEUE_THRESHOLD bytes or more) * and we calculate the drain rate for the threshold here. dq_count is * in bytes, time difference in psched_time, hence rate is in * bytes/psched_time. */ if (vars->dq_count != DQCOUNT_INVALID) { vars->dq_count += skb->len; if (vars->dq_count >= QUEUE_THRESHOLD) { u32 count = vars->dq_count << PIE_SCALE; dtime = now - vars->dq_tstamp; if (dtime == 0) return; count = count / dtime; if (vars->avg_dq_rate == 0) vars->avg_dq_rate = count; else vars->avg_dq_rate = (vars->avg_dq_rate - (vars->avg_dq_rate >> 3)) + (count >> 3); /* If the queue has receded below the threshold, we hold * on to the last drain rate calculated, else we reset * dq_count to 0 to re-enter the if block when the next * packet is dequeued */ if (backlog < QUEUE_THRESHOLD) { vars->dq_count = DQCOUNT_INVALID; } else { vars->dq_count = 0; vars->dq_tstamp = psched_get_time(); } goto burst_allowance_reduction; } } return; burst_allowance_reduction: if (vars->burst_time > 0) { if (vars->burst_time > dtime) vars->burst_time -= dtime; else vars->burst_time = 0; } } EXPORT_SYMBOL_GPL(pie_process_dequeue); void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars, u32 backlog) { psched_time_t qdelay = 0; /* in pschedtime */ psched_time_t qdelay_old = 0; /* in pschedtime */ s64 delta = 0; /* determines the change in probability */ u64 oldprob; u64 alpha, beta; u32 power; bool update_prob = true; if (params->dq_rate_estimator) { qdelay_old = vars->qdelay; vars->qdelay_old = vars->qdelay; if (vars->avg_dq_rate > 0) qdelay = (backlog << PIE_SCALE) / vars->avg_dq_rate; else qdelay = 0; } else { qdelay = vars->qdelay; qdelay_old = vars->qdelay_old; } /* If qdelay is zero and backlog is not, it means backlog is very small, * so we do not update probability in this round. */ if (qdelay == 0 && backlog != 0) update_prob = false; /* In the algorithm, alpha and beta are between 0 and 2 with typical * value for alpha as 0.125. In this implementation, we use values 0-32 * passed from user space to represent this. Also, alpha and beta have * unit of HZ and need to be scaled before they can used to update * probability. alpha/beta are updated locally below by scaling down * by 16 to come to 0-2 range. */ alpha = ((u64)params->alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; beta = ((u64)params->beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; /* We scale alpha and beta differently depending on how heavy the * congestion is. Please see RFC 8033 for details. */ if (vars->prob < MAX_PROB / 10) { alpha >>= 1; beta >>= 1; power = 100; while (vars->prob < div_u64(MAX_PROB, power) && power <= 1000000) { alpha >>= 2; beta >>= 2; power *= 10; } } /* alpha and beta should be between 0 and 32, in multiples of 1/16 */ delta += alpha * (qdelay - params->target); delta += beta * (qdelay - qdelay_old); oldprob = vars->prob; /* to ensure we increase probability in steps of no more than 2% */ if (delta > (s64)(MAX_PROB / (100 / 2)) && vars->prob >= MAX_PROB / 10) delta = (MAX_PROB / 100) * 2; /* Non-linear drop: * Tune drop probability to increase quickly for high delays(>= 250ms) * 250ms is derived through experiments and provides error protection */ if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC))) delta += MAX_PROB / (100 / 2); vars->prob += delta; if (delta > 0) { /* prevent overflow */ if (vars->prob < oldprob) { vars->prob = MAX_PROB; /* Prevent normalization error. 
If probability is at * maximum value already, we normalize it here, and * skip the check to do a non-linear drop in the next * section. */ update_prob = false; } } else { /* prevent underflow */ if (vars->prob > oldprob) vars->prob = 0; } /* Non-linear drop in probability: Reduce drop probability quickly if * delay is 0 for 2 consecutive Tupdate periods. */ if (qdelay == 0 && qdelay_old == 0 && update_prob) /* Reduce drop probability to 98.4% */ vars->prob -= vars->prob / 64; vars->qdelay = qdelay; vars->backlog_old = backlog; /* We restart the measurement cycle if the following conditions are met * 1. If the delay has been low for 2 consecutive Tupdate periods * 2. Calculated drop probability is zero * 3. If average dq_rate_estimator is enabled, we have at least one * estimate for the avg_dq_rate ie., is a non-zero value */ if ((vars->qdelay < params->target / 2) && (vars->qdelay_old < params->target / 2) && vars->prob == 0 && (!params->dq_rate_estimator || vars->avg_dq_rate > 0)) { pie_vars_init(vars); } if (!params->dq_rate_estimator) vars->qdelay_old = qdelay; } EXPORT_SYMBOL_GPL(pie_calculate_probability); static void pie_timer(struct timer_list *t) { struct pie_sched_data *q = timer_container_of(q, t, adapt_timer); struct Qdisc *sch = q->sch; spinlock_t *root_lock; rcu_read_lock(); root_lock = qdisc_lock(qdisc_root_sleeping(sch)); spin_lock(root_lock); pie_calculate_probability(&q->params, &q->vars, sch->qstats.backlog); /* reset the timer to fire after 'tupdate'. tupdate is in jiffies. */ if (q->params.tupdate) mod_timer(&q->adapt_timer, jiffies + q->params.tupdate); spin_unlock(root_lock); rcu_read_unlock(); } static int pie_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct pie_sched_data *q = qdisc_priv(sch); pie_params_init(&q->params); pie_vars_init(&q->vars); sch->limit = q->params.limit; q->sch = sch; timer_setup(&q->adapt_timer, pie_timer, 0); if (opt) { int err = pie_change(sch, opt, extack); if (err) return err; } mod_timer(&q->adapt_timer, jiffies + HZ / 2); return 0; } static int pie_dump(struct Qdisc *sch, struct sk_buff *skb) { struct pie_sched_data *q = qdisc_priv(sch); struct nlattr *opts; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!opts) goto nla_put_failure; /* convert target from pschedtime to us */ if (nla_put_u32(skb, TCA_PIE_TARGET, ((u32)PSCHED_TICKS2NS(READ_ONCE(q->params.target))) / NSEC_PER_USEC) || nla_put_u32(skb, TCA_PIE_LIMIT, READ_ONCE(sch->limit)) || nla_put_u32(skb, TCA_PIE_TUPDATE, jiffies_to_usecs(READ_ONCE(q->params.tupdate))) || nla_put_u32(skb, TCA_PIE_ALPHA, READ_ONCE(q->params.alpha)) || nla_put_u32(skb, TCA_PIE_BETA, READ_ONCE(q->params.beta)) || nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) || nla_put_u32(skb, TCA_PIE_BYTEMODE, READ_ONCE(q->params.bytemode)) || nla_put_u32(skb, TCA_PIE_DQ_RATE_ESTIMATOR, READ_ONCE(q->params.dq_rate_estimator))) goto nla_put_failure; return nla_nest_end(skb, opts); nla_put_failure: nla_nest_cancel(skb, opts); return -1; } static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct pie_sched_data *q = qdisc_priv(sch); struct tc_pie_xstats st = { .prob = q->vars.prob << BITS_PER_BYTE, .delay = ((u32)PSCHED_TICKS2NS(q->vars.qdelay)) / NSEC_PER_USEC, .packets_in = q->stats.packets_in, .overlimit = q->stats.overlimit, .maxq = q->stats.maxq, .dropped = q->stats.dropped, .ecn_mark = q->stats.ecn_mark, }; /* avg_dq_rate is only valid if dq_rate_estimator is enabled */ st.dq_rate_estimating = q->params.dq_rate_estimator; /* unscale and return dq_rate in bytes 
per sec */ if (q->params.dq_rate_estimator) st.avg_dq_rate = q->vars.avg_dq_rate * (PSCHED_TICKS_PER_SEC) >> PIE_SCALE; return gnet_stats_copy_app(d, &st, sizeof(st)); } static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch) { struct pie_sched_data *q = qdisc_priv(sch); struct sk_buff *skb = qdisc_dequeue_head(sch); if (!skb) return NULL; pie_process_dequeue(skb, &q->params, &q->vars, sch->qstats.backlog); return skb; } static void pie_reset(struct Qdisc *sch) { struct pie_sched_data *q = qdisc_priv(sch); qdisc_reset_queue(sch); pie_vars_init(&q->vars); } static void pie_destroy(struct Qdisc *sch) { struct pie_sched_data *q = qdisc_priv(sch); q->params.tupdate = 0; timer_delete_sync(&q->adapt_timer); } static struct Qdisc_ops pie_qdisc_ops __read_mostly = { .id = "pie", .priv_size = sizeof(struct pie_sched_data), .enqueue = pie_qdisc_enqueue, .dequeue = pie_qdisc_dequeue, .peek = qdisc_peek_dequeued, .init = pie_init, .destroy = pie_destroy, .reset = pie_reset, .change = pie_change, .dump = pie_dump, .dump_stats = pie_dump_stats, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("pie"); static int __init pie_module_init(void) { return register_qdisc(&pie_qdisc_ops); } static void __exit pie_module_exit(void) { unregister_qdisc(&pie_qdisc_ops); } module_init(pie_module_init); module_exit(pie_module_exit); MODULE_DESCRIPTION("Proportional Integral controller Enhanced (PIE) scheduler"); MODULE_AUTHOR("Vijay Subramanian"); MODULE_AUTHOR("Mythili Prabhu"); MODULE_LICENSE("GPL"); |
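/*
 * Example (not part of the kernel source above): a toy floating-point
 * model of the proportional-integral update that
 * pie_calculate_probability() performs in fixed point. The kernel
 * scales by MAX_PROB and psched ticks and applies the non-linear
 * adjustments shown above; here alpha/beta/target are illustrative
 * values mirroring the RFC 8033 defaults (alpha 0.125, beta 1.25,
 * target 15 ms), and the delay trace is made up.
 */
#include <stdio.h>

int main(void)
{
	double prob = 0.0;	/* drop probability, 0.0 .. 1.0 */
	double target = 0.015;	/* 15 ms target queueing delay */
	double alpha = 0.125, beta = 1.25;
	double qdelay_old = 0.0;
	double qdelay[] = { 0.005, 0.020, 0.040, 0.030, 0.010, 0.0 };

	for (unsigned int i = 0; i < sizeof(qdelay) / sizeof(qdelay[0]); i++) {
		/* delta = alpha * (qdelay - target) + beta * (qdelay - qdelay_old) */
		double delta = alpha * (qdelay[i] - target) +
			       beta * (qdelay[i] - qdelay_old);

		prob += delta;
		if (prob < 0.0)		/* prevent underflow */
			prob = 0.0;
		if (prob > 1.0)		/* prevent overflow */
			prob = 1.0;
		qdelay_old = qdelay[i];
		printf("qdelay=%.3f -> drop prob=%.4f\n", qdelay[i], prob);
	}
	return 0;
}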
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2023 Isovalent */

#include <linux/bpf.h>
#include <linux/bpf_mprog.h>
#include <linux/netdevice.h>

#include <net/tcx.h>

int tcx_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
	bool created, ingress = attr->attach_type == BPF_TCX_INGRESS;
	struct net *net = current->nsproxy->net_ns;
	struct bpf_mprog_entry *entry, *entry_new;
	struct bpf_prog *replace_prog = NULL;
	struct net_device *dev;
	int ret;

	rtnl_lock();
	dev = __dev_get_by_index(net, attr->target_ifindex);
	if (!dev) {
		ret = -ENODEV;
		goto out;
	}
	if (attr->attach_flags & BPF_F_REPLACE) {
		replace_prog = bpf_prog_get_type(attr->replace_bpf_fd,
						 prog->type);
		if (IS_ERR(replace_prog)) {
			ret = PTR_ERR(replace_prog);
			replace_prog = NULL;
			goto out;
		}
	}
	entry = tcx_entry_fetch_or_create(dev, ingress, &created);
	if (!entry) {
		ret = -ENOMEM;
		goto out;
	}
	ret = bpf_mprog_attach(entry, &entry_new, prog, NULL, replace_prog,
			       attr->attach_flags, attr->relative_fd,
			       attr->expected_revision);
	if (!ret) {
		if (entry != entry_new) {
			tcx_entry_update(dev, entry_new, ingress);
			tcx_entry_sync();
			tcx_skeys_inc(ingress);
		}
		bpf_mprog_commit(entry);
	} else if (created) {
		tcx_entry_free(entry);
	}
out:
	if (replace_prog)
		bpf_prog_put(replace_prog);
	rtnl_unlock();
	return ret;
}

int tcx_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog)
{
	bool ingress = attr->attach_type == BPF_TCX_INGRESS;
	struct net *net = current->nsproxy->net_ns;
	struct bpf_mprog_entry *entry, *entry_new;
	struct net_device *dev;
	int ret;

	rtnl_lock();
	dev = __dev_get_by_index(net, attr->target_ifindex);
	if (!dev) {
		ret = -ENODEV;
		goto out;
	}
	entry = tcx_entry_fetch(dev, ingress);
	if (!entry) {
		ret = -ENOENT;
		goto out;
	}
	ret = bpf_mprog_detach(entry, &entry_new, prog, NULL,
			       attr->attach_flags, attr->relative_fd,
			       attr->expected_revision);
	if (!ret) {
		if (!tcx_entry_is_active(entry_new))
			entry_new = NULL;
		tcx_entry_update(dev, entry_new, ingress);
		tcx_entry_sync();
		tcx_skeys_dec(ingress);
		bpf_mprog_commit(entry);
		if (!entry_new)
			tcx_entry_free(entry);
	}
out:
	rtnl_unlock();
	return ret;
}

void tcx_uninstall(struct net_device *dev, bool ingress)
{
	struct bpf_mprog_entry *entry, *entry_new = NULL;
	struct
bpf_tuple tuple = {}; struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; bool active; entry = tcx_entry_fetch(dev, ingress); if (!entry) return; active = tcx_entry(entry)->miniq_active; if (active) bpf_mprog_clear_all(entry, &entry_new); tcx_entry_update(dev, entry_new, ingress); tcx_entry_sync(); bpf_mprog_foreach_tuple(entry, fp, cp, tuple) { if (tuple.link) tcx_link(tuple.link)->dev = NULL; else bpf_prog_put(tuple.prog); tcx_skeys_dec(ingress); } if (!active) tcx_entry_free(entry); } int tcx_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { bool ingress = attr->query.attach_type == BPF_TCX_INGRESS; struct net *net = current->nsproxy->net_ns; struct net_device *dev; int ret; rtnl_lock(); dev = __dev_get_by_index(net, attr->query.target_ifindex); if (!dev) { ret = -ENODEV; goto out; } ret = bpf_mprog_query(attr, uattr, tcx_entry_fetch(dev, ingress)); out: rtnl_unlock(); return ret; } static int tcx_link_prog_attach(struct bpf_link *link, u32 flags, u32 id_or_fd, u64 revision) { struct tcx_link *tcx = tcx_link(link); bool created, ingress = link->attach_type == BPF_TCX_INGRESS; struct bpf_mprog_entry *entry, *entry_new; struct net_device *dev = tcx->dev; int ret; ASSERT_RTNL(); entry = tcx_entry_fetch_or_create(dev, ingress, &created); if (!entry) return -ENOMEM; ret = bpf_mprog_attach(entry, &entry_new, link->prog, link, NULL, flags, id_or_fd, revision); if (!ret) { if (entry != entry_new) { tcx_entry_update(dev, entry_new, ingress); tcx_entry_sync(); tcx_skeys_inc(ingress); } bpf_mprog_commit(entry); } else if (created) { tcx_entry_free(entry); } return ret; } static void tcx_link_release(struct bpf_link *link) { struct tcx_link *tcx = tcx_link(link); bool ingress = link->attach_type == BPF_TCX_INGRESS; struct bpf_mprog_entry *entry, *entry_new; struct net_device *dev; int ret = 0; rtnl_lock(); dev = tcx->dev; if (!dev) goto out; entry = tcx_entry_fetch(dev, ingress); if (!entry) { ret = -ENOENT; goto out; } ret = bpf_mprog_detach(entry, &entry_new, link->prog, link, 0, 0, 0); if (!ret) { if (!tcx_entry_is_active(entry_new)) entry_new = NULL; tcx_entry_update(dev, entry_new, ingress); tcx_entry_sync(); tcx_skeys_dec(ingress); bpf_mprog_commit(entry); if (!entry_new) tcx_entry_free(entry); tcx->dev = NULL; } out: WARN_ON_ONCE(ret); rtnl_unlock(); } static int tcx_link_update(struct bpf_link *link, struct bpf_prog *nprog, struct bpf_prog *oprog) { struct tcx_link *tcx = tcx_link(link); bool ingress = link->attach_type == BPF_TCX_INGRESS; struct bpf_mprog_entry *entry, *entry_new; struct net_device *dev; int ret = 0; rtnl_lock(); dev = tcx->dev; if (!dev) { ret = -ENOLINK; goto out; } if (oprog && link->prog != oprog) { ret = -EPERM; goto out; } oprog = link->prog; if (oprog == nprog) { bpf_prog_put(nprog); goto out; } entry = tcx_entry_fetch(dev, ingress); if (!entry) { ret = -ENOENT; goto out; } ret = bpf_mprog_attach(entry, &entry_new, nprog, link, oprog, BPF_F_REPLACE | BPF_F_ID, link->prog->aux->id, 0); if (!ret) { WARN_ON_ONCE(entry != entry_new); oprog = xchg(&link->prog, nprog); bpf_prog_put(oprog); bpf_mprog_commit(entry); } out: rtnl_unlock(); return ret; } static void tcx_link_dealloc(struct bpf_link *link) { kfree(tcx_link(link)); } static void tcx_link_fdinfo(const struct bpf_link *link, struct seq_file *seq) { const struct tcx_link *tcx = tcx_link(link); u32 ifindex = 0; rtnl_lock(); if (tcx->dev) ifindex = tcx->dev->ifindex; rtnl_unlock(); seq_printf(seq, "ifindex:\t%u\n", ifindex); seq_printf(seq, "attach_type:\t%u (%s)\n", link->attach_type, 
link->attach_type == BPF_TCX_INGRESS ? "ingress" : "egress"); } static int tcx_link_fill_info(const struct bpf_link *link, struct bpf_link_info *info) { const struct tcx_link *tcx = tcx_link(link); u32 ifindex = 0; rtnl_lock(); if (tcx->dev) ifindex = tcx->dev->ifindex; rtnl_unlock(); info->tcx.ifindex = ifindex; info->tcx.attach_type = link->attach_type; return 0; } static int tcx_link_detach(struct bpf_link *link) { tcx_link_release(link); return 0; } static const struct bpf_link_ops tcx_link_lops = { .release = tcx_link_release, .detach = tcx_link_detach, .dealloc = tcx_link_dealloc, .update_prog = tcx_link_update, .show_fdinfo = tcx_link_fdinfo, .fill_link_info = tcx_link_fill_info, }; static int tcx_link_init(struct tcx_link *tcx, struct bpf_link_primer *link_primer, const union bpf_attr *attr, struct net_device *dev, struct bpf_prog *prog) { bpf_link_init(&tcx->link, BPF_LINK_TYPE_TCX, &tcx_link_lops, prog, attr->link_create.attach_type); tcx->dev = dev; return bpf_link_prime(&tcx->link, link_primer); } int tcx_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct net *net = current->nsproxy->net_ns; struct bpf_link_primer link_primer; struct net_device *dev; struct tcx_link *tcx; int ret; rtnl_lock(); dev = __dev_get_by_index(net, attr->link_create.target_ifindex); if (!dev) { ret = -ENODEV; goto out; } tcx = kzalloc(sizeof(*tcx), GFP_USER); if (!tcx) { ret = -ENOMEM; goto out; } ret = tcx_link_init(tcx, &link_primer, attr, dev, prog); if (ret) { kfree(tcx); goto out; } ret = tcx_link_prog_attach(&tcx->link, attr->link_create.flags, attr->link_create.tcx.relative_fd, attr->link_create.tcx.expected_revision); if (ret) { tcx->dev = NULL; bpf_link_cleanup(&link_primer); goto out; } ret = bpf_link_settle(&link_primer); out: rtnl_unlock(); return ret; } |
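/*
 * Example (not part of the kernel source above): a minimal userspace
 * sketch creating a tcx link with the raw bpf(2) syscall, filling the
 * attr->link_create fields that tcx_link_attach() consumes. prog_fd is
 * assumed to be the fd of an already-loaded SCHED_CLS BPF program;
 * error handling is reduced to the bare minimum. libbpf users would
 * typically call bpf_program__attach_tcx() instead.
 */
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <net/if.h>
#include <linux/bpf.h>

static int tcx_attach_ingress(int prog_fd, const char *ifname)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.link_create.prog_fd = prog_fd;
	attr.link_create.target_ifindex = if_nametoindex(ifname);
	attr.link_create.attach_type = BPF_TCX_INGRESS;
	/* tcx.relative_fd / tcx.expected_revision stay 0: plain append,
	 * no revision check (see bpf_mprog_attach() usage above). */

	/* Returns a link fd on success; closing it detaches the program. */
	return syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));
}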
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_HUGETLB_INLINE_H
#define _LINUX_HUGETLB_INLINE_H

#ifdef CONFIG_HUGETLB_PAGE

#include <linux/mm.h>

static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
{
	return !!(vma->vm_flags & VM_HUGETLB);
}

#else

static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
{
	return false;
}

#endif

#endif
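/*
 * Example (not part of the kernel header above): is_vm_hugetlb_page()
 * tests VM_HUGETLB, which the kernel sets on VMAs created with
 * MAP_HUGETLB. A minimal userspace sketch that creates such a mapping,
 * assuming 2 MiB huge pages are configured and available (e.g. via
 * /proc/sys/vm/nr_hugepages):
 */
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 2 * 1024 * 1024; /* one 2 MiB huge page */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");
		return 1;
	}
	/* For this VMA the kernel-side is_vm_hugetlb_page() is now true;
	 * compare the "ht" flag in the VmFlags line of /proc/self/smaps. */
	printf("huge page mapped at %p\n", p);
	munmap(p, len);
	return 0;
}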
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2008, Intel Corporation.
 *
 * Author: Alexander Duyck <alexander.h.duyck@intel.com>
 */

#ifndef __NET_TC_SKBEDIT_H
#define __NET_TC_SKBEDIT_H

#include <net/act_api.h>
#include <linux/tc_act/tc_skbedit.h>

struct tcf_skbedit_params {
	int action;
	u32 flags;
	u32 priority;
	u32 mark;
	u32 mask;
	u16 queue_mapping;
	u16 mapping_mod;
	u16 ptype;
	struct rcu_head rcu;
};

struct tcf_skbedit {
	struct tc_action common;
	struct tcf_skbedit_params __rcu *params;
};
#define to_skbedit(a) ((struct tcf_skbedit *)a)

/* Return true iff action is the one identified by FLAG. */
static inline bool is_tcf_skbedit_with_flag(const struct tc_action *a, u32 flag)
{
#ifdef CONFIG_NET_CLS_ACT
	u32 flags;

	if (a->ops && a->ops->id == TCA_ID_SKBEDIT) {
		rcu_read_lock();
		flags = rcu_dereference(to_skbedit(a)->params)->flags;
		rcu_read_unlock();
		return flags == flag;
	}
#endif
	return false;
}

/* Return true iff action is mark */
static inline bool is_tcf_skbedit_mark(const struct tc_action *a)
{
	return is_tcf_skbedit_with_flag(a, SKBEDIT_F_MARK);
}

static inline u32 tcf_skbedit_mark(const struct tc_action *a)
{
	u32 mark;

	rcu_read_lock();
	mark = rcu_dereference(to_skbedit(a)->params)->mark;
	rcu_read_unlock();

	return mark;
}

/* Return true iff action is ptype */
static inline bool is_tcf_skbedit_ptype(const struct tc_action *a)
{
	return is_tcf_skbedit_with_flag(a, SKBEDIT_F_PTYPE);
}

static inline u32 tcf_skbedit_ptype(const struct tc_action *a)
{
	u16 ptype;

	rcu_read_lock();
	ptype = rcu_dereference(to_skbedit(a)->params)->ptype;
	rcu_read_unlock();

	return ptype;
}

/* Return true iff action is priority */
static inline bool is_tcf_skbedit_priority(const struct tc_action *a)
{
	return is_tcf_skbedit_with_flag(a, SKBEDIT_F_PRIORITY);
}

static inline u32 tcf_skbedit_priority(const struct tc_action *a)
{
	u32 priority;

	rcu_read_lock();
	priority = rcu_dereference(to_skbedit(a)->params)->priority;
	rcu_read_unlock();

	return priority;
}

static inline u16 tcf_skbedit_rx_queue_mapping(const struct tc_action *a)
{
	u16 rx_queue;

	rcu_read_lock();
	rx_queue = rcu_dereference(to_skbedit(a)->params)->queue_mapping;
	rcu_read_unlock();

	return rx_queue;
}

/* Return true iff action is queue_mapping */
static inline bool is_tcf_skbedit_queue_mapping(const struct tc_action *a)
{
	return is_tcf_skbedit_with_flag(a, SKBEDIT_F_QUEUE_MAPPING);
}

/* Return true if action is on ingress traffic */
static inline bool is_tcf_skbedit_ingress(u32 flags)
{
	return flags & TCA_ACT_FLAGS_AT_INGRESS;
}

static inline bool is_tcf_skbedit_tx_queue_mapping(const struct tc_action *a)
{
	return is_tcf_skbedit_queue_mapping(a) &&
	       !is_tcf_skbedit_ingress(a->tcfa_flags);
}

static inline bool is_tcf_skbedit_rx_queue_mapping(const struct tc_action *a)
{
	return is_tcf_skbedit_queue_mapping(a) &&
	       is_tcf_skbedit_ingress(a->tcfa_flags);
}

/* Return true iff action is inheritdsfield */
static inline bool is_tcf_skbedit_inheritdsfield(const struct tc_action *a)
{
	return is_tcf_skbedit_with_flag(a, SKBEDIT_F_INHERITDSFIELD);
}

#endif /* __NET_TC_SKBEDIT_H */
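/*
 * Example (not part of the header above): a sketch of how a driver's
 * tc offload path might use these helpers to recognize a "skbedit
 * mark" action and extract its value. The function name is
 * hypothetical; real drivers do this kind of check when parsing the
 * actions handed to their ndo_setup_tc() callback, and only
 * SKBEDIT_F_MARK is handled in this sketch.
 */
#include <net/tc_act/tc_skbedit.h>

static int example_parse_skbedit(const struct tc_action *act, u32 *mark_out)
{
	if (is_tcf_skbedit_mark(act)) {
		/* tcf_skbedit_mark() takes the RCU read lock internally. */
		*mark_out = tcf_skbedit_mark(act);
		return 0;
	}
	return -EOPNOTSUPP;
}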
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021 Cong Wang <cong.wang@bytedance.com> */

#include <linux/bpf.h>
#include <linux/skmsg.h>
#include <net/af_unix.h>

#include "af_unix.h"

#define unix_sk_has_data(__sk, __psock)					\
		({	!skb_queue_empty(&__sk->sk_receive_queue) ||	\
			!skb_queue_empty(&__psock->ingress_skb) ||	\
			!list_empty(&__psock->ingress_msg);		\
		})

static int unix_msg_wait_data(struct sock *sk, struct sk_psock *psock,
			      long timeo)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	struct unix_sock *u = unix_sk(sk);
	int ret = 0;

	if (sk->sk_shutdown & RCV_SHUTDOWN)
		return 1;

	if (!timeo)
		return ret;

	add_wait_queue(sk_sleep(sk), &wait);
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	if (!unix_sk_has_data(sk, psock)) {
		mutex_unlock(&u->iolock);
		wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
		mutex_lock(&u->iolock);
		ret = unix_sk_has_data(sk, psock);
	}
	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);
	return ret;
}

static int __unix_recvmsg(struct sock *sk, struct msghdr *msg,
			  size_t len, int flags)
{
	if (sk->sk_type == SOCK_DGRAM)
		return __unix_dgram_recvmsg(sk, msg, len, flags);
	else
		return __unix_stream_recvmsg(sk, msg, len, flags);
}

static int unix_bpf_recvmsg(struct sock *sk, struct msghdr *msg,
			    size_t len, int flags, int *addr_len)
{
	struct unix_sock *u = unix_sk(sk);
	struct sk_psock *psock;
	int copied;

	if (flags & MSG_OOB)
		return -EOPNOTSUPP;

	if (!len)
		return 0;

	psock = sk_psock_get(sk);
	if (unlikely(!psock))
		return __unix_recvmsg(sk, msg, len, flags);

	mutex_lock(&u->iolock);
	if (!skb_queue_empty(&sk->sk_receive_queue) &&
	    sk_psock_queue_empty(psock)) {
		mutex_unlock(&u->iolock);
		sk_psock_put(sk, psock);
		return __unix_recvmsg(sk, msg, len, flags);
	}

msg_bytes_ready:
	copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
	if (!copied) {
		long timeo;
		int data;

		timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
		data = unix_msg_wait_data(sk, psock, timeo);
		if (data) {
			if (!sk_psock_queue_empty(psock))
				goto msg_bytes_ready;
			mutex_unlock(&u->iolock);
			sk_psock_put(sk, psock);
			return __unix_recvmsg(sk, msg, len, flags);
		}
		copied = -EAGAIN;
	}
	mutex_unlock(&u->iolock);
	sk_psock_put(sk, psock);
	return copied;
}

static struct proto *unix_dgram_prot_saved __read_mostly;
static DEFINE_SPINLOCK(unix_dgram_prot_lock);
static struct proto unix_dgram_bpf_prot;

static struct proto *unix_stream_prot_saved __read_mostly;
static DEFINE_SPINLOCK(unix_stream_prot_lock);
static struct proto unix_stream_bpf_prot;

static void unix_dgram_bpf_rebuild_protos(struct proto *prot,
					  const struct proto *base)
{
	*prot = *base;
	prot->close = sock_map_close;
	prot->recvmsg = unix_bpf_recvmsg;
	prot->sock_is_readable = sk_msg_is_readable;
}

static void unix_stream_bpf_rebuild_protos(struct proto *prot,
					   const struct proto *base)
{
	*prot = *base;
	prot->close = sock_map_close;
	prot->recvmsg = unix_bpf_recvmsg;
	prot->sock_is_readable = sk_msg_is_readable;
	prot->unhash = sock_map_unhash;
}

static void unix_dgram_bpf_check_needs_rebuild(struct proto *ops)
{
	if (unlikely(ops != smp_load_acquire(&unix_dgram_prot_saved))) {
		spin_lock_bh(&unix_dgram_prot_lock);
		if (likely(ops != unix_dgram_prot_saved)) {
			unix_dgram_bpf_rebuild_protos(&unix_dgram_bpf_prot, ops);
			smp_store_release(&unix_dgram_prot_saved, ops);
		}
		spin_unlock_bh(&unix_dgram_prot_lock);
	}
}

static void unix_stream_bpf_check_needs_rebuild(struct proto *ops)
{
	if (unlikely(ops != smp_load_acquire(&unix_stream_prot_saved))) {
		spin_lock_bh(&unix_stream_prot_lock);
		if (likely(ops != unix_stream_prot_saved)) {
			unix_stream_bpf_rebuild_protos(&unix_stream_bpf_prot, ops);
			smp_store_release(&unix_stream_prot_saved, ops);
		}
		spin_unlock_bh(&unix_stream_prot_lock);
	}
}

int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
{
	if (sk->sk_type != SOCK_DGRAM)
		return -EOPNOTSUPP;

	if (restore) {
		sk->sk_write_space = psock->saved_write_space;
		sock_replace_proto(sk, psock->sk_proto);
		return 0;
	}

	unix_dgram_bpf_check_needs_rebuild(psock->sk_proto);
	sock_replace_proto(sk, &unix_dgram_bpf_prot);
	return 0;
}

int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
{
	struct sock *sk_pair;

	/* Restore does not decrement the sk_pair reference yet because we must
	 * keep a reference to the socket until after an RCU grace period
	 * and any pending sends have completed.
	 */
	if (restore) {
		sk->sk_write_space = psock->saved_write_space;
		sock_replace_proto(sk, psock->sk_proto);
		return 0;
	}

	/* psock_update_sk_prot can be called multiple times if psock is
	 * added to multiple maps and/or slots in the same map. There is
	 * also an edge case where replacing a psock with itself can trigger
	 * an extra psock_update_sk_prot during the insert process. So it
	 * must be safe to do multiple calls. Here we need to ensure we don't
	 * increment the refcnt through sock_hold many times. There will only
	 * be a single matching destroy operation.
	 */
	if (!psock->sk_pair) {
		sk_pair = unix_peer(sk);
		sock_hold(sk_pair);
		psock->sk_pair = sk_pair;
	}

	unix_stream_bpf_check_needs_rebuild(psock->sk_proto);
	sock_replace_proto(sk, &unix_stream_bpf_prot);
	return 0;
}

void __init unix_bpf_build_proto(void)
{
	unix_dgram_bpf_rebuild_protos(&unix_dgram_bpf_prot, &unix_dgram_proto);
	unix_stream_bpf_rebuild_protos(&unix_stream_bpf_prot, &unix_stream_proto);
}
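/*
 * Example (not part of the kernel source above): inserting an AF_UNIX
 * socket into a BPF sockmap from userspace is what ultimately invokes
 * unix_dgram_bpf_update_proto()/unix_stream_bpf_update_proto() via the
 * psock proto update path. A minimal sketch using libbpf's
 * bpf_map_create() and bpf_map_update_elem() (link with -lbpf,
 * typically requires root or CAP_BPF/CAP_NET_ADMIN); the connected
 * socketpair stands in for a real workload.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <bpf/bpf.h>

int main(void)
{
	int socks[2], map_fd, key = 0, err;

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, socks)) {
		perror("socketpair");
		return 1;
	}
	map_fd = bpf_map_create(BPF_MAP_TYPE_SOCKMAP, "unix_map",
				sizeof(int), sizeof(int), 2, NULL);
	if (map_fd < 0) {
		perror("bpf_map_create");
		return 1;
	}
	/* This update swaps the socket's proto ops for the
	 * unix_stream_bpf_prot variant built by the code above. */
	err = bpf_map_update_elem(map_fd, &key, &socks[0], BPF_ANY);
	if (err)
		perror("bpf_map_update_elem");
	return err ? 1 : 0;
}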
// SPDX-License-Identifier: GPL-2.0
/*
 * USB Raw Gadget driver.
 * See Documentation/usb/raw-gadget.rst for more details.
 *
 * Copyright (c) 2020 Google, Inc.
* Author: Andrey Konovalov <andreyknvl@gmail.com> */ #include <linux/compiler.h> #include <linux/ctype.h> #include <linux/debugfs.h> #include <linux/delay.h> #include <linux/idr.h> #include <linux/kref.h> #include <linux/miscdevice.h> #include <linux/module.h> #include <linux/semaphore.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/wait.h> #include <linux/usb.h> #include <linux/usb/ch9.h> #include <linux/usb/ch11.h> #include <linux/usb/gadget.h> #include <linux/usb/composite.h> #include <uapi/linux/usb/raw_gadget.h> #define DRIVER_DESC "USB Raw Gadget" #define DRIVER_NAME "raw-gadget" MODULE_DESCRIPTION(DRIVER_DESC); MODULE_AUTHOR("Andrey Konovalov"); MODULE_LICENSE("GPL"); /*----------------------------------------------------------------------*/ static DEFINE_IDA(driver_id_numbers); #define DRIVER_DRIVER_NAME_LENGTH_MAX 32 #define RAW_EVENT_QUEUE_SIZE 16 struct raw_event_queue { /* See the comment in raw_event_queue_fetch() for locking details. */ spinlock_t lock; struct semaphore sema; struct usb_raw_event *events[RAW_EVENT_QUEUE_SIZE]; int size; }; static void raw_event_queue_init(struct raw_event_queue *queue) { spin_lock_init(&queue->lock); sema_init(&queue->sema, 0); queue->size = 0; } static int raw_event_queue_add(struct raw_event_queue *queue, enum usb_raw_event_type type, size_t length, const void *data) { unsigned long flags; struct usb_raw_event *event; spin_lock_irqsave(&queue->lock, flags); if (queue->size >= RAW_EVENT_QUEUE_SIZE) { spin_unlock_irqrestore(&queue->lock, flags); return -ENOMEM; } event = kmalloc(sizeof(*event) + length, GFP_ATOMIC); if (!event) { spin_unlock_irqrestore(&queue->lock, flags); return -ENOMEM; } event->type = type; event->length = length; if (event->length) memcpy(&event->data[0], data, length); queue->events[queue->size] = event; queue->size++; up(&queue->sema); spin_unlock_irqrestore(&queue->lock, flags); return 0; } static struct usb_raw_event *raw_event_queue_fetch( struct raw_event_queue *queue) { int ret; unsigned long flags; struct usb_raw_event *event; /* * This function can be called concurrently. We first check that * there's at least one event queued by decrementing the semaphore, * and then take the lock to protect queue struct fields. */ ret = down_interruptible(&queue->sema); if (ret) return ERR_PTR(ret); spin_lock_irqsave(&queue->lock, flags); /* * queue->size must have the same value as queue->sema counter (before * the down_interruptible() call above), so this check is a fail-safe. 
*/ if (WARN_ON(!queue->size)) { spin_unlock_irqrestore(&queue->lock, flags); return ERR_PTR(-ENODEV); } event = queue->events[0]; queue->size--; memmove(&queue->events[0], &queue->events[1], queue->size * sizeof(queue->events[0])); spin_unlock_irqrestore(&queue->lock, flags); return event; } static void raw_event_queue_destroy(struct raw_event_queue *queue) { int i; for (i = 0; i < queue->size; i++) kfree(queue->events[i]); queue->size = 0; } /*----------------------------------------------------------------------*/ struct raw_dev; enum ep_state { STATE_EP_DISABLED, STATE_EP_ENABLED, }; struct raw_ep { struct raw_dev *dev; enum ep_state state; struct usb_ep *ep; u8 addr; struct usb_request *req; bool urb_queued; bool disabling; ssize_t status; }; enum dev_state { STATE_DEV_INVALID = 0, STATE_DEV_OPENED, STATE_DEV_INITIALIZED, STATE_DEV_REGISTERING, STATE_DEV_RUNNING, STATE_DEV_CLOSED, STATE_DEV_FAILED }; struct raw_dev { struct kref count; spinlock_t lock; const char *udc_name; struct usb_gadget_driver driver; /* Reference to misc device: */ struct device *dev; /* Make driver names unique */ int driver_id_number; /* Protected by lock: */ enum dev_state state; bool gadget_registered; struct usb_gadget *gadget; struct usb_request *req; bool ep0_in_pending; bool ep0_out_pending; bool ep0_urb_queued; ssize_t ep0_status; struct raw_ep eps[USB_RAW_EPS_NUM_MAX]; int eps_num; struct completion ep0_done; struct raw_event_queue queue; }; static struct raw_dev *dev_new(void) { struct raw_dev *dev; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return NULL; /* Matches kref_put() in raw_release(). */ kref_init(&dev->count); spin_lock_init(&dev->lock); init_completion(&dev->ep0_done); raw_event_queue_init(&dev->queue); dev->driver_id_number = -1; return dev; } static void dev_free(struct kref *kref) { struct raw_dev *dev = container_of(kref, struct raw_dev, count); int i; kfree(dev->udc_name); kfree(dev->driver.udc_name); kfree(dev->driver.driver.name); if (dev->driver_id_number >= 0) ida_free(&driver_id_numbers, dev->driver_id_number); if (dev->req) { if (dev->ep0_urb_queued) usb_ep_dequeue(dev->gadget->ep0, dev->req); usb_ep_free_request(dev->gadget->ep0, dev->req); } raw_event_queue_destroy(&dev->queue); for (i = 0; i < dev->eps_num; i++) { if (dev->eps[i].state == STATE_EP_DISABLED) continue; usb_ep_disable(dev->eps[i].ep); usb_ep_free_request(dev->eps[i].ep, dev->eps[i].req); kfree(dev->eps[i].ep->desc); dev->eps[i].state = STATE_EP_DISABLED; } kfree(dev); } /*----------------------------------------------------------------------*/ static int raw_queue_event(struct raw_dev *dev, enum usb_raw_event_type type, size_t length, const void *data) { int ret = 0; unsigned long flags; ret = raw_event_queue_add(&dev->queue, type, length, data); if (ret < 0) { spin_lock_irqsave(&dev->lock, flags); dev->state = STATE_DEV_FAILED; spin_unlock_irqrestore(&dev->lock, flags); } return ret; } static void gadget_ep0_complete(struct usb_ep *ep, struct usb_request *req) { struct raw_dev *dev = req->context; unsigned long flags; spin_lock_irqsave(&dev->lock, flags); if (req->status) dev->ep0_status = req->status; else dev->ep0_status = req->actual; if (dev->ep0_in_pending) dev->ep0_in_pending = false; else dev->ep0_out_pending = false; spin_unlock_irqrestore(&dev->lock, flags); complete(&dev->ep0_done); } static u8 get_ep_addr(const char *name) { /* If the endpoint has fixed function (named as e.g. "ep12out-bulk"), * parse the endpoint address from its name. 
We deliberately use * deprecated simple_strtoul() function here, as the number isn't * followed by '\0' nor '\n'. */ if (isdigit(name[2])) return simple_strtoul(&name[2], NULL, 10); /* Otherwise the endpoint is configurable (named as e.g. "ep-a"). */ return USB_RAW_EP_ADDR_ANY; } static int gadget_bind(struct usb_gadget *gadget, struct usb_gadget_driver *driver) { int ret = 0, i = 0; struct raw_dev *dev = container_of(driver, struct raw_dev, driver); struct usb_request *req; struct usb_ep *ep; unsigned long flags; if (strcmp(gadget->name, dev->udc_name) != 0) return -ENODEV; set_gadget_data(gadget, dev); req = usb_ep_alloc_request(gadget->ep0, GFP_KERNEL); if (!req) { dev_err(&gadget->dev, "usb_ep_alloc_request failed\n"); set_gadget_data(gadget, NULL); return -ENOMEM; } spin_lock_irqsave(&dev->lock, flags); dev->req = req; dev->req->context = dev; dev->req->complete = gadget_ep0_complete; dev->gadget = gadget; gadget_for_each_ep(ep, dev->gadget) { dev->eps[i].ep = ep; dev->eps[i].addr = get_ep_addr(ep->name); dev->eps[i].state = STATE_EP_DISABLED; i++; } dev->eps_num = i; spin_unlock_irqrestore(&dev->lock, flags); dev_dbg(&gadget->dev, "gadget connected\n"); ret = raw_queue_event(dev, USB_RAW_EVENT_CONNECT, 0, NULL); if (ret < 0) { dev_err(&gadget->dev, "failed to queue connect event\n"); set_gadget_data(gadget, NULL); return ret; } /* Matches kref_put() in gadget_unbind(). */ kref_get(&dev->count); return ret; } static void gadget_unbind(struct usb_gadget *gadget) { struct raw_dev *dev = get_gadget_data(gadget); set_gadget_data(gadget, NULL); /* Matches kref_get() in gadget_bind(). */ kref_put(&dev->count, dev_free); } static int gadget_setup(struct usb_gadget *gadget, const struct usb_ctrlrequest *ctrl) { int ret = 0; struct raw_dev *dev = get_gadget_data(gadget); unsigned long flags; spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_err(&gadget->dev, "ignoring, device is not running\n"); ret = -ENODEV; goto out_unlock; } if (dev->ep0_in_pending || dev->ep0_out_pending) { dev_dbg(&gadget->dev, "stalling, request already pending\n"); ret = -EBUSY; goto out_unlock; } if ((ctrl->bRequestType & USB_DIR_IN) && ctrl->wLength) dev->ep0_in_pending = true; else dev->ep0_out_pending = true; spin_unlock_irqrestore(&dev->lock, flags); ret = raw_queue_event(dev, USB_RAW_EVENT_CONTROL, sizeof(*ctrl), ctrl); if (ret < 0) dev_err(&gadget->dev, "failed to queue control event\n"); goto out; out_unlock: spin_unlock_irqrestore(&dev->lock, flags); out: if (ret == 0 && ctrl->wLength == 0) { /* * Return USB_GADGET_DELAYED_STATUS as a workaround to stop * some UDC drivers (e.g. dwc3) from automatically proceeding * with the status stage for 0-length transfers. * Should be removed once all UDC drivers are fixed to always * delay the status stage until a response is queued to EP0. 
*/ return USB_GADGET_DELAYED_STATUS; } return ret; } static void gadget_disconnect(struct usb_gadget *gadget) { struct raw_dev *dev = get_gadget_data(gadget); int ret; dev_dbg(&gadget->dev, "gadget disconnected\n"); ret = raw_queue_event(dev, USB_RAW_EVENT_DISCONNECT, 0, NULL); if (ret < 0) dev_err(&gadget->dev, "failed to queue disconnect event\n"); } static void gadget_suspend(struct usb_gadget *gadget) { struct raw_dev *dev = get_gadget_data(gadget); int ret; dev_dbg(&gadget->dev, "gadget suspended\n"); ret = raw_queue_event(dev, USB_RAW_EVENT_SUSPEND, 0, NULL); if (ret < 0) dev_err(&gadget->dev, "failed to queue suspend event\n"); } static void gadget_resume(struct usb_gadget *gadget) { struct raw_dev *dev = get_gadget_data(gadget); int ret; dev_dbg(&gadget->dev, "gadget resumed\n"); ret = raw_queue_event(dev, USB_RAW_EVENT_RESUME, 0, NULL); if (ret < 0) dev_err(&gadget->dev, "failed to queue resume event\n"); } static void gadget_reset(struct usb_gadget *gadget) { struct raw_dev *dev = get_gadget_data(gadget); int ret; dev_dbg(&gadget->dev, "gadget reset\n"); ret = raw_queue_event(dev, USB_RAW_EVENT_RESET, 0, NULL); if (ret < 0) dev_err(&gadget->dev, "failed to queue reset event\n"); } /*----------------------------------------------------------------------*/ static struct miscdevice raw_misc_device; static int raw_open(struct inode *inode, struct file *fd) { struct raw_dev *dev; /* Nonblocking I/O is not supported yet. */ if (fd->f_flags & O_NONBLOCK) return -EINVAL; dev = dev_new(); if (!dev) return -ENOMEM; fd->private_data = dev; dev->state = STATE_DEV_OPENED; dev->dev = raw_misc_device.this_device; return 0; } static int raw_release(struct inode *inode, struct file *fd) { int ret = 0; struct raw_dev *dev = fd->private_data; unsigned long flags; bool unregister = false; spin_lock_irqsave(&dev->lock, flags); dev->state = STATE_DEV_CLOSED; if (!dev->gadget) { spin_unlock_irqrestore(&dev->lock, flags); goto out_put; } if (dev->gadget_registered) unregister = true; dev->gadget_registered = false; spin_unlock_irqrestore(&dev->lock, flags); if (unregister) { ret = usb_gadget_unregister_driver(&dev->driver); if (ret != 0) dev_err(dev->dev, "usb_gadget_unregister_driver() failed with %d\n", ret); /* Matches kref_get() in raw_ioctl_run(). */ kref_put(&dev->count, dev_free); } out_put: /* Matches dev_new() in raw_open(). 
*/ kref_put(&dev->count, dev_free); return ret; } /*----------------------------------------------------------------------*/ static int raw_ioctl_init(struct raw_dev *dev, unsigned long value) { int ret = 0; int driver_id_number; struct usb_raw_init arg; char *udc_driver_name; char *udc_device_name; char *driver_driver_name; unsigned long flags; if (copy_from_user(&arg, (void __user *)value, sizeof(arg))) return -EFAULT; switch (arg.speed) { case USB_SPEED_UNKNOWN: arg.speed = USB_SPEED_HIGH; break; case USB_SPEED_LOW: case USB_SPEED_FULL: case USB_SPEED_HIGH: case USB_SPEED_SUPER: break; default: return -EINVAL; } driver_id_number = ida_alloc(&driver_id_numbers, GFP_KERNEL); if (driver_id_number < 0) return driver_id_number; driver_driver_name = kmalloc(DRIVER_DRIVER_NAME_LENGTH_MAX, GFP_KERNEL); if (!driver_driver_name) { ret = -ENOMEM; goto out_free_driver_id_number; } snprintf(driver_driver_name, DRIVER_DRIVER_NAME_LENGTH_MAX, DRIVER_NAME ".%d", driver_id_number); udc_driver_name = kmalloc(UDC_NAME_LENGTH_MAX, GFP_KERNEL); if (!udc_driver_name) { ret = -ENOMEM; goto out_free_driver_driver_name; } ret = strscpy(udc_driver_name, &arg.driver_name[0], UDC_NAME_LENGTH_MAX); if (ret < 0) goto out_free_udc_driver_name; ret = 0; udc_device_name = kmalloc(UDC_NAME_LENGTH_MAX, GFP_KERNEL); if (!udc_device_name) { ret = -ENOMEM; goto out_free_udc_driver_name; } ret = strscpy(udc_device_name, &arg.device_name[0], UDC_NAME_LENGTH_MAX); if (ret < 0) goto out_free_udc_device_name; ret = 0; spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_OPENED) { dev_dbg(dev->dev, "fail, device is not opened\n"); ret = -EINVAL; goto out_unlock; } dev->udc_name = udc_driver_name; dev->driver.function = DRIVER_DESC; dev->driver.max_speed = arg.speed; dev->driver.setup = gadget_setup; dev->driver.disconnect = gadget_disconnect; dev->driver.bind = gadget_bind; dev->driver.unbind = gadget_unbind; dev->driver.suspend = gadget_suspend; dev->driver.resume = gadget_resume; dev->driver.reset = gadget_reset; dev->driver.driver.name = driver_driver_name; dev->driver.udc_name = udc_device_name; dev->driver.match_existing_only = 1; dev->driver_id_number = driver_id_number; dev->state = STATE_DEV_INITIALIZED; spin_unlock_irqrestore(&dev->lock, flags); return ret; out_unlock: spin_unlock_irqrestore(&dev->lock, flags); out_free_udc_device_name: kfree(udc_device_name); out_free_udc_driver_name: kfree(udc_driver_name); out_free_driver_driver_name: kfree(driver_driver_name); out_free_driver_id_number: ida_free(&driver_id_numbers, driver_id_number); return ret; } static int raw_ioctl_run(struct raw_dev *dev, unsigned long value) { int ret = 0; unsigned long flags; if (value) return -EINVAL; spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_INITIALIZED) { dev_dbg(dev->dev, "fail, device is not initialized\n"); ret = -EINVAL; goto out_unlock; } dev->state = STATE_DEV_REGISTERING; spin_unlock_irqrestore(&dev->lock, flags); ret = usb_gadget_register_driver(&dev->driver); spin_lock_irqsave(&dev->lock, flags); if (ret) { dev_err(dev->dev, "fail, usb_gadget_register_driver returned %d\n", ret); dev->state = STATE_DEV_FAILED; goto out_unlock; } dev->gadget_registered = true; dev->state = STATE_DEV_RUNNING; /* Matches kref_put() in raw_release(). 
*/ kref_get(&dev->count); out_unlock: spin_unlock_irqrestore(&dev->lock, flags); return ret; } static int raw_ioctl_event_fetch(struct raw_dev *dev, unsigned long value) { struct usb_raw_event arg; unsigned long flags; struct usb_raw_event *event; uint32_t length; if (copy_from_user(&arg, (void __user *)value, sizeof(arg))) return -EFAULT; spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_dbg(dev->dev, "fail, device is not running\n"); spin_unlock_irqrestore(&dev->lock, flags); return -EINVAL; } if (!dev->gadget) { dev_dbg(dev->dev, "fail, gadget is not bound\n"); spin_unlock_irqrestore(&dev->lock, flags); return -EBUSY; } spin_unlock_irqrestore(&dev->lock, flags); event = raw_event_queue_fetch(&dev->queue); if (PTR_ERR(event) == -EINTR) { dev_dbg(&dev->gadget->dev, "event fetching interrupted\n"); return -EINTR; } if (IS_ERR(event)) { dev_err(&dev->gadget->dev, "failed to fetch event\n"); spin_lock_irqsave(&dev->lock, flags); dev->state = STATE_DEV_FAILED; spin_unlock_irqrestore(&dev->lock, flags); return -ENODEV; } length = min(arg.length, event->length); if (copy_to_user((void __user *)value, event, sizeof(*event) + length)) { kfree(event); return -EFAULT; } kfree(event); return 0; } static void *raw_alloc_io_data(struct usb_raw_ep_io *io, void __user *ptr, bool get_from_user) { void *data; if (copy_from_user(io, ptr, sizeof(*io))) return ERR_PTR(-EFAULT); if (io->ep >= USB_RAW_EPS_NUM_MAX) return ERR_PTR(-EINVAL); if (!usb_raw_io_flags_valid(io->flags)) return ERR_PTR(-EINVAL); if (io->length > PAGE_SIZE) return ERR_PTR(-EINVAL); if (get_from_user) data = memdup_user(ptr + sizeof(*io), io->length); else { data = kmalloc(io->length, GFP_KERNEL); if (!data) data = ERR_PTR(-ENOMEM); } return data; } static int raw_process_ep0_io(struct raw_dev *dev, struct usb_raw_ep_io *io, void *data, bool in) { int ret = 0; unsigned long flags; spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_dbg(dev->dev, "fail, device is not running\n"); ret = -EINVAL; goto out_unlock; } if (!dev->gadget) { dev_dbg(dev->dev, "fail, gadget is not bound\n"); ret = -EBUSY; goto out_unlock; } if (dev->ep0_urb_queued) { dev_dbg(&dev->gadget->dev, "fail, urb already queued\n"); ret = -EBUSY; goto out_unlock; } if ((in && !dev->ep0_in_pending) || (!in && !dev->ep0_out_pending)) { dev_dbg(&dev->gadget->dev, "fail, wrong direction\n"); ret = -EBUSY; goto out_unlock; } if (WARN_ON(in && dev->ep0_out_pending)) { ret = -ENODEV; dev->state = STATE_DEV_FAILED; goto out_unlock; } if (WARN_ON(!in && dev->ep0_in_pending)) { ret = -ENODEV; dev->state = STATE_DEV_FAILED; goto out_unlock; } dev->req->buf = data; dev->req->length = io->length; dev->req->zero = usb_raw_io_flags_zero(io->flags); dev->ep0_urb_queued = true; spin_unlock_irqrestore(&dev->lock, flags); ret = usb_ep_queue(dev->gadget->ep0, dev->req, GFP_KERNEL); if (ret) { dev_err(&dev->gadget->dev, "fail, usb_ep_queue returned %d\n", ret); spin_lock_irqsave(&dev->lock, flags); goto out_queue_failed; } ret = wait_for_completion_interruptible(&dev->ep0_done); if (ret) { dev_dbg(&dev->gadget->dev, "wait interrupted\n"); usb_ep_dequeue(dev->gadget->ep0, dev->req); wait_for_completion(&dev->ep0_done); spin_lock_irqsave(&dev->lock, flags); if (dev->ep0_status == -ECONNRESET) dev->ep0_status = -EINTR; goto out_interrupted; } spin_lock_irqsave(&dev->lock, flags); out_interrupted: ret = dev->ep0_status; out_queue_failed: dev->ep0_urb_queued = false; out_unlock: spin_unlock_irqrestore(&dev->lock, flags); return ret; } static 
int raw_ioctl_ep0_write(struct raw_dev *dev, unsigned long value) { int ret = 0; void *data; struct usb_raw_ep_io io; data = raw_alloc_io_data(&io, (void __user *)value, true); if (IS_ERR(data)) return PTR_ERR(data); ret = raw_process_ep0_io(dev, &io, data, true); kfree(data); return ret; } static int raw_ioctl_ep0_read(struct raw_dev *dev, unsigned long value) { int ret = 0; void *data; struct usb_raw_ep_io io; unsigned int length; data = raw_alloc_io_data(&io, (void __user *)value, false); if (IS_ERR(data)) return PTR_ERR(data); ret = raw_process_ep0_io(dev, &io, data, false); if (ret < 0) goto free; length = min_t(unsigned int, io.length, ret); if (copy_to_user((void __user *)(value + sizeof(io)), data, length)) ret = -EFAULT; else ret = length; free: kfree(data); return ret; } static int raw_ioctl_ep0_stall(struct raw_dev *dev, unsigned long value) { int ret = 0; unsigned long flags; if (value) return -EINVAL; spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_dbg(dev->dev, "fail, device is not running\n"); ret = -EINVAL; goto out_unlock; } if (!dev->gadget) { dev_dbg(dev->dev, "fail, gadget is not bound\n"); ret = -EBUSY; goto out_unlock; } if (dev->ep0_urb_queued) { dev_dbg(&dev->gadget->dev, "fail, urb already queued\n"); ret = -EBUSY; goto out_unlock; } if (!dev->ep0_in_pending && !dev->ep0_out_pending) { dev_dbg(&dev->gadget->dev, "fail, no request pending\n"); ret = -EBUSY; goto out_unlock; } ret = usb_ep_set_halt(dev->gadget->ep0); if (ret < 0) dev_err(&dev->gadget->dev, "fail, usb_ep_set_halt returned %d\n", ret); if (dev->ep0_in_pending) dev->ep0_in_pending = false; else dev->ep0_out_pending = false; out_unlock: spin_unlock_irqrestore(&dev->lock, flags); return ret; } static int raw_ioctl_ep_enable(struct raw_dev *dev, unsigned long value) { int ret = 0, i; unsigned long flags; struct usb_endpoint_descriptor *desc; struct raw_ep *ep; bool ep_props_matched = false; desc = memdup_user((void __user *)value, sizeof(*desc)); if (IS_ERR(desc)) return PTR_ERR(desc); /* * Endpoints with a maxpacket length of 0 can cause crashes in UDC * drivers. 
*/ if (usb_endpoint_maxp(desc) == 0) { dev_dbg(dev->dev, "fail, bad endpoint maxpacket\n"); kfree(desc); return -EINVAL; } spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_dbg(dev->dev, "fail, device is not running\n"); ret = -EINVAL; goto out_free; } if (!dev->gadget) { dev_dbg(dev->dev, "fail, gadget is not bound\n"); ret = -EBUSY; goto out_free; } for (i = 0; i < dev->eps_num; i++) { ep = &dev->eps[i]; if (ep->addr != usb_endpoint_num(desc) && ep->addr != USB_RAW_EP_ADDR_ANY) continue; if (!usb_gadget_ep_match_desc(dev->gadget, ep->ep, desc, NULL)) continue; ep_props_matched = true; if (ep->state != STATE_EP_DISABLED) continue; ep->ep->desc = desc; ret = usb_ep_enable(ep->ep); if (ret < 0) { dev_err(&dev->gadget->dev, "fail, usb_ep_enable returned %d\n", ret); goto out_free; } ep->req = usb_ep_alloc_request(ep->ep, GFP_ATOMIC); if (!ep->req) { dev_err(&dev->gadget->dev, "fail, usb_ep_alloc_request failed\n"); usb_ep_disable(ep->ep); ret = -ENOMEM; goto out_free; } ep->state = STATE_EP_ENABLED; ep->ep->driver_data = ep; ret = i; goto out_unlock; } if (!ep_props_matched) { dev_dbg(&dev->gadget->dev, "fail, bad endpoint descriptor\n"); ret = -EINVAL; } else { dev_dbg(&dev->gadget->dev, "fail, no endpoints available\n"); ret = -EBUSY; } out_free: kfree(desc); out_unlock: spin_unlock_irqrestore(&dev->lock, flags); return ret; } static int raw_ioctl_ep_disable(struct raw_dev *dev, unsigned long value) { int ret = 0, i = value; unsigned long flags; spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_dbg(dev->dev, "fail, device is not running\n"); ret = -EINVAL; goto out_unlock; } if (!dev->gadget) { dev_dbg(dev->dev, "fail, gadget is not bound\n"); ret = -EBUSY; goto out_unlock; } if (i < 0 || i >= dev->eps_num) { dev_dbg(dev->dev, "fail, invalid endpoint\n"); ret = -EBUSY; goto out_unlock; } if (dev->eps[i].state == STATE_EP_DISABLED) { dev_dbg(&dev->gadget->dev, "fail, endpoint is not enabled\n"); ret = -EINVAL; goto out_unlock; } if (dev->eps[i].disabling) { dev_dbg(&dev->gadget->dev, "fail, disable already in progress\n"); ret = -EINVAL; goto out_unlock; } if (dev->eps[i].urb_queued) { dev_dbg(&dev->gadget->dev, "fail, waiting for urb completion\n"); ret = -EINVAL; goto out_unlock; } dev->eps[i].disabling = true; spin_unlock_irqrestore(&dev->lock, flags); usb_ep_disable(dev->eps[i].ep); spin_lock_irqsave(&dev->lock, flags); usb_ep_free_request(dev->eps[i].ep, dev->eps[i].req); kfree(dev->eps[i].ep->desc); dev->eps[i].state = STATE_EP_DISABLED; dev->eps[i].disabling = false; out_unlock: spin_unlock_irqrestore(&dev->lock, flags); return ret; } static int raw_ioctl_ep_set_clear_halt_wedge(struct raw_dev *dev, unsigned long value, bool set, bool halt) { int ret = 0, i = value; unsigned long flags; spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_dbg(dev->dev, "fail, device is not running\n"); ret = -EINVAL; goto out_unlock; } if (!dev->gadget) { dev_dbg(dev->dev, "fail, gadget is not bound\n"); ret = -EBUSY; goto out_unlock; } if (i < 0 || i >= dev->eps_num) { dev_dbg(dev->dev, "fail, invalid endpoint\n"); ret = -EBUSY; goto out_unlock; } if (dev->eps[i].state == STATE_EP_DISABLED) { dev_dbg(&dev->gadget->dev, "fail, endpoint is not enabled\n"); ret = -EINVAL; goto out_unlock; } if (dev->eps[i].disabling) { dev_dbg(&dev->gadget->dev, "fail, disable is in progress\n"); ret = -EINVAL; goto out_unlock; } if (dev->eps[i].urb_queued) { dev_dbg(&dev->gadget->dev, "fail, waiting for urb completion\n"); ret = 
-EINVAL; goto out_unlock; } if (usb_endpoint_xfer_isoc(dev->eps[i].ep->desc)) { dev_dbg(&dev->gadget->dev, "fail, can't halt/wedge ISO endpoint\n"); ret = -EINVAL; goto out_unlock; } if (set && halt) { ret = usb_ep_set_halt(dev->eps[i].ep); if (ret < 0) dev_err(&dev->gadget->dev, "fail, usb_ep_set_halt returned %d\n", ret); } else if (!set && halt) { ret = usb_ep_clear_halt(dev->eps[i].ep); if (ret < 0) dev_err(&dev->gadget->dev, "fail, usb_ep_clear_halt returned %d\n", ret); } else if (set && !halt) { ret = usb_ep_set_wedge(dev->eps[i].ep); if (ret < 0) dev_err(&dev->gadget->dev, "fail, usb_ep_set_wedge returned %d\n", ret); } out_unlock: spin_unlock_irqrestore(&dev->lock, flags); return ret; } static void gadget_ep_complete(struct usb_ep *ep, struct usb_request *req) { struct raw_ep *r_ep = (struct raw_ep *)ep->driver_data; struct raw_dev *dev = r_ep->dev; unsigned long flags; spin_lock_irqsave(&dev->lock, flags); if (req->status) r_ep->status = req->status; else r_ep->status = req->actual; spin_unlock_irqrestore(&dev->lock, flags); complete((struct completion *)req->context); } static int raw_process_ep_io(struct raw_dev *dev, struct usb_raw_ep_io *io, void *data, bool in) { int ret = 0; unsigned long flags; struct raw_ep *ep; DECLARE_COMPLETION_ONSTACK(done); spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_dbg(dev->dev, "fail, device is not running\n"); ret = -EINVAL; goto out_unlock; } if (!dev->gadget) { dev_dbg(dev->dev, "fail, gadget is not bound\n"); ret = -EBUSY; goto out_unlock; } if (io->ep >= dev->eps_num) { dev_dbg(&dev->gadget->dev, "fail, invalid endpoint\n"); ret = -EINVAL; goto out_unlock; } ep = &dev->eps[io->ep]; if (ep->state != STATE_EP_ENABLED) { dev_dbg(&dev->gadget->dev, "fail, endpoint is not enabled\n"); ret = -EBUSY; goto out_unlock; } if (ep->disabling) { dev_dbg(&dev->gadget->dev, "fail, endpoint is already being disabled\n"); ret = -EBUSY; goto out_unlock; } if (ep->urb_queued) { dev_dbg(&dev->gadget->dev, "fail, urb already queued\n"); ret = -EBUSY; goto out_unlock; } if (in != usb_endpoint_dir_in(ep->ep->desc)) { dev_dbg(&dev->gadget->dev, "fail, wrong direction\n"); ret = -EINVAL; goto out_unlock; } ep->dev = dev; ep->req->context = &done; ep->req->complete = gadget_ep_complete; ep->req->buf = data; ep->req->length = io->length; ep->req->zero = usb_raw_io_flags_zero(io->flags); ep->urb_queued = true; spin_unlock_irqrestore(&dev->lock, flags); ret = usb_ep_queue(ep->ep, ep->req, GFP_KERNEL); if (ret) { dev_err(&dev->gadget->dev, "fail, usb_ep_queue returned %d\n", ret); spin_lock_irqsave(&dev->lock, flags); goto out_queue_failed; } ret = wait_for_completion_interruptible(&done); if (ret) { dev_dbg(&dev->gadget->dev, "wait interrupted\n"); usb_ep_dequeue(ep->ep, ep->req); wait_for_completion(&done); spin_lock_irqsave(&dev->lock, flags); if (ep->status == -ECONNRESET) ep->status = -EINTR; goto out_interrupted; } spin_lock_irqsave(&dev->lock, flags); out_interrupted: ret = ep->status; out_queue_failed: ep->urb_queued = false; out_unlock: spin_unlock_irqrestore(&dev->lock, flags); return ret; } static int raw_ioctl_ep_write(struct raw_dev *dev, unsigned long value) { int ret = 0; char *data; struct usb_raw_ep_io io; data = raw_alloc_io_data(&io, (void __user *)value, true); if (IS_ERR(data)) return PTR_ERR(data); ret = raw_process_ep_io(dev, &io, data, true); kfree(data); return ret; } static int raw_ioctl_ep_read(struct raw_dev *dev, unsigned long value) { int ret = 0; char *data; struct usb_raw_ep_io io; unsigned int length; 
data = raw_alloc_io_data(&io, (void __user *)value, false); if (IS_ERR(data)) return PTR_ERR(data); ret = raw_process_ep_io(dev, &io, data, false); if (ret < 0) goto free; length = min_t(unsigned int, io.length, ret); if (copy_to_user((void __user *)(value + sizeof(io)), data, length)) ret = -EFAULT; else ret = length; free: kfree(data); return ret; } static int raw_ioctl_configure(struct raw_dev *dev, unsigned long value) { int ret = 0; unsigned long flags; if (value) return -EINVAL; spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_dbg(dev->dev, "fail, device is not running\n"); ret = -EINVAL; goto out_unlock; } if (!dev->gadget) { dev_dbg(dev->dev, "fail, gadget is not bound\n"); ret = -EBUSY; goto out_unlock; } usb_gadget_set_state(dev->gadget, USB_STATE_CONFIGURED); out_unlock: spin_unlock_irqrestore(&dev->lock, flags); return ret; } static int raw_ioctl_vbus_draw(struct raw_dev *dev, unsigned long value) { int ret = 0; unsigned long flags; spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_dbg(dev->dev, "fail, device is not running\n"); ret = -EINVAL; goto out_unlock; } if (!dev->gadget) { dev_dbg(dev->dev, "fail, gadget is not bound\n"); ret = -EBUSY; goto out_unlock; } usb_gadget_vbus_draw(dev->gadget, 2 * value); out_unlock: spin_unlock_irqrestore(&dev->lock, flags); return ret; } static void fill_ep_caps(struct usb_ep_caps *caps, struct usb_raw_ep_caps *raw_caps) { raw_caps->type_control = caps->type_control; raw_caps->type_iso = caps->type_iso; raw_caps->type_bulk = caps->type_bulk; raw_caps->type_int = caps->type_int; raw_caps->dir_in = caps->dir_in; raw_caps->dir_out = caps->dir_out; } static void fill_ep_limits(struct usb_ep *ep, struct usb_raw_ep_limits *limits) { limits->maxpacket_limit = ep->maxpacket_limit; limits->max_streams = ep->max_streams; } static int raw_ioctl_eps_info(struct raw_dev *dev, unsigned long value) { int ret = 0, i; unsigned long flags; struct usb_raw_eps_info *info; struct raw_ep *ep; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) { ret = -ENOMEM; goto out; } spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_dbg(dev->dev, "fail, device is not running\n"); ret = -EINVAL; spin_unlock_irqrestore(&dev->lock, flags); goto out_free; } if (!dev->gadget) { dev_dbg(dev->dev, "fail, gadget is not bound\n"); ret = -EBUSY; spin_unlock_irqrestore(&dev->lock, flags); goto out_free; } for (i = 0; i < dev->eps_num; i++) { ep = &dev->eps[i]; strscpy(&info->eps[i].name[0], ep->ep->name, USB_RAW_EP_NAME_MAX); info->eps[i].addr = ep->addr; fill_ep_caps(&ep->ep->caps, &info->eps[i].caps); fill_ep_limits(ep->ep, &info->eps[i].limits); } ret = dev->eps_num; spin_unlock_irqrestore(&dev->lock, flags); if (copy_to_user((void __user *)value, info, sizeof(*info))) ret = -EFAULT; out_free: kfree(info); out: return ret; } static long raw_ioctl(struct file *fd, unsigned int cmd, unsigned long value) { struct raw_dev *dev = fd->private_data; int ret = 0; if (!dev) return -EBUSY; switch (cmd) { case USB_RAW_IOCTL_INIT: ret = raw_ioctl_init(dev, value); break; case USB_RAW_IOCTL_RUN: ret = raw_ioctl_run(dev, value); break; case USB_RAW_IOCTL_EVENT_FETCH: ret = raw_ioctl_event_fetch(dev, value); break; case USB_RAW_IOCTL_EP0_WRITE: ret = raw_ioctl_ep0_write(dev, value); break; case USB_RAW_IOCTL_EP0_READ: ret = raw_ioctl_ep0_read(dev, value); break; case USB_RAW_IOCTL_EP_ENABLE: ret = raw_ioctl_ep_enable(dev, value); break; case USB_RAW_IOCTL_EP_DISABLE: ret = raw_ioctl_ep_disable(dev, 
value); break; case USB_RAW_IOCTL_EP_WRITE: ret = raw_ioctl_ep_write(dev, value); break; case USB_RAW_IOCTL_EP_READ: ret = raw_ioctl_ep_read(dev, value); break; case USB_RAW_IOCTL_CONFIGURE: ret = raw_ioctl_configure(dev, value); break; case USB_RAW_IOCTL_VBUS_DRAW: ret = raw_ioctl_vbus_draw(dev, value); break; case USB_RAW_IOCTL_EPS_INFO: ret = raw_ioctl_eps_info(dev, value); break; case USB_RAW_IOCTL_EP0_STALL: ret = raw_ioctl_ep0_stall(dev, value); break; case USB_RAW_IOCTL_EP_SET_HALT: ret = raw_ioctl_ep_set_clear_halt_wedge( dev, value, true, true); break; case USB_RAW_IOCTL_EP_CLEAR_HALT: ret = raw_ioctl_ep_set_clear_halt_wedge( dev, value, false, true); break; case USB_RAW_IOCTL_EP_SET_WEDGE: ret = raw_ioctl_ep_set_clear_halt_wedge( dev, value, true, false); break; default: ret = -EINVAL; } return ret; } /*----------------------------------------------------------------------*/ static const struct file_operations raw_fops = { .open = raw_open, .unlocked_ioctl = raw_ioctl, .compat_ioctl = raw_ioctl, .release = raw_release, }; static struct miscdevice raw_misc_device = { .minor = MISC_DYNAMIC_MINOR, .name = DRIVER_NAME, .fops = &raw_fops, }; module_misc_device(raw_misc_device);
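/*
 * Illustrative userspace sketch (an addition for clarity, not part of the
 * driver above): the ioctl handlers implement a two-step bring-up, so a
 * client is expected to issue USB_RAW_IOCTL_INIT and then USB_RAW_IOCTL_RUN
 * before fetching events, matching raw_ioctl_init()/raw_ioctl_run() above.
 * The UAPI structures are assumed to come from <linux/usb/raw_gadget.h>;
 * the device node name matches the miscdevice above, but the UDC names
 * ("dummy_udc"/"dummy_udc.0") are assumptions for a dummy_hcd test setup
 * and must match the UDC present on the target machine.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/usb/ch9.h>
#include <linux/usb/raw_gadget.h>

static int raw_gadget_start(void)
{
	struct usb_raw_init init;
	struct {
		struct usb_raw_event hdr;
		char data[4096];
	} ev;
	int fd = open("/dev/raw-gadget", O_RDWR);

	if (fd < 0)
		return -1;

	memset(&init, 0, sizeof(init));
	strcpy((char *)init.driver_name, "dummy_udc");	/* hypothetical UDC */
	strcpy((char *)init.device_name, "dummy_udc.0");
	init.speed = USB_SPEED_HIGH;

	/* INIT moves the device to STATE_DEV_INITIALIZED, RUN registers the
	 * gadget driver with the UDC; see raw_ioctl_init()/raw_ioctl_run().
	 */
	if (ioctl(fd, USB_RAW_IOCTL_INIT, &init) ||
	    ioctl(fd, USB_RAW_IOCTL_RUN, 0)) {
		close(fd);
		return -1;
	}

	/* hdr.length is the capacity of data[]; raw_ioctl_event_fetch()
	 * copies back min(capacity, event length) bytes of event data.
	 */
	ev.hdr.type = 0;
	ev.hdr.length = sizeof(ev.data);
	if (ioctl(fd, USB_RAW_IOCTL_EVENT_FETCH, &ev) == 0)
		printf("event type %u, length %u\n", ev.hdr.type, ev.hdr.length);
	return fd;
}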
/* CoreChip-sz SR9800 one chip USB 2.0 Ethernet Devices * * Author : Liu Junliang <liujunliang_ljl@163.com> * * Based on asix_common.c,
asix_devices.c * * This file is licensed under the terms of the GNU General Public License * version 2. This program is licensed "as is" without any warranty of any * kind, whether express or implied.* */ #include <linux/module.h> #include <linux/kmod.h> #include <linux/init.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/workqueue.h> #include <linux/mii.h> #include <linux/usb.h> #include <linux/crc32.h> #include <linux/usb/usbnet.h> #include <linux/slab.h> #include <linux/if_vlan.h> #include "sr9800.h" static int sr_read_cmd(struct usbnet *dev, u8 cmd, u16 value, u16 index, u16 size, void *data) { int err; err = usbnet_read_cmd(dev, cmd, SR_REQ_RD_REG, value, index, data, size); if ((err != size) && (err >= 0)) err = -EINVAL; return err; } static int sr_write_cmd(struct usbnet *dev, u8 cmd, u16 value, u16 index, u16 size, void *data) { int err; err = usbnet_write_cmd(dev, cmd, SR_REQ_WR_REG, value, index, data, size); if ((err != size) && (err >= 0)) err = -EINVAL; return err; } static void sr_write_cmd_async(struct usbnet *dev, u8 cmd, u16 value, u16 index, u16 size, void *data) { usbnet_write_cmd_async(dev, cmd, SR_REQ_WR_REG, value, index, data, size); } static int sr_rx_fixup(struct usbnet *dev, struct sk_buff *skb) { int offset = 0; /* This check is no longer done by usbnet */ if (skb->len < dev->net->hard_header_len) return 0; while (offset + sizeof(u32) < skb->len) { struct sk_buff *sr_skb; u16 size; u32 header = get_unaligned_le32(skb->data + offset); offset += sizeof(u32); /* get the packet length */ size = (u16) (header & 0x7ff); if (size != ((~header >> 16) & 0x07ff)) { netdev_err(dev->net, "%s : Bad Header Length\n", __func__); return 0; } if ((size > dev->net->mtu + ETH_HLEN + VLAN_HLEN) || (size + offset > skb->len)) { netdev_err(dev->net, "%s : Bad RX Length %d\n", __func__, size); return 0; } sr_skb = netdev_alloc_skb_ip_align(dev->net, size); if (!sr_skb) return 0; skb_put(sr_skb, size); memcpy(sr_skb->data, skb->data + offset, size); usbnet_skb_return(dev, sr_skb); offset += (size + 1) & 0xfffe; } if (skb->len != offset) { netdev_err(dev->net, "%s : Bad SKB Length %d\n", __func__, skb->len); return 0; } return 1; } static struct sk_buff *sr_tx_fixup(struct usbnet *dev, struct sk_buff *skb, gfp_t flags) { int headroom = skb_headroom(skb); int tailroom = skb_tailroom(skb); u32 padbytes = 0xffff0000; u32 packet_len; int padlen; void *ptr; padlen = ((skb->len + 4) % (dev->maxpacket - 1)) ? 
0 : 4; if ((!skb_cloned(skb)) && ((headroom + tailroom) >= (4 + padlen))) { if ((headroom < 4) || (tailroom < padlen)) { skb->data = memmove(skb->head + 4, skb->data, skb->len); skb_set_tail_pointer(skb, skb->len); } } else { struct sk_buff *skb2; skb2 = skb_copy_expand(skb, 4, padlen, flags); dev_kfree_skb_any(skb); skb = skb2; if (!skb) return NULL; } ptr = skb_push(skb, 4); packet_len = (((skb->len - 4) ^ 0x0000ffff) << 16) + (skb->len - 4); put_unaligned_le32(packet_len, ptr); if (padlen) { put_unaligned_le32(padbytes, skb_tail_pointer(skb)); skb_put(skb, sizeof(padbytes)); } usbnet_set_skb_tx_stats(skb, 1, 0); return skb; } static void sr_status(struct usbnet *dev, struct urb *urb) { struct sr9800_int_data *event; int link; if (urb->actual_length < 8) return; event = urb->transfer_buffer; link = event->link & 0x01; if (netif_carrier_ok(dev->net) != link) { usbnet_link_change(dev, link, 1); netdev_dbg(dev->net, "Link Status is: %d\n", link); } return; } static inline int sr_set_sw_mii(struct usbnet *dev) { int ret; ret = sr_write_cmd(dev, SR_CMD_SET_SW_MII, 0x0000, 0, 0, NULL); if (ret < 0) netdev_err(dev->net, "Failed to enable software MII access\n"); return ret; } static inline int sr_set_hw_mii(struct usbnet *dev) { int ret; ret = sr_write_cmd(dev, SR_CMD_SET_HW_MII, 0x0000, 0, 0, NULL); if (ret < 0) netdev_err(dev->net, "Failed to enable hardware MII access\n"); return ret; } static inline int sr_get_phy_addr(struct usbnet *dev) { u8 buf[2]; int ret; ret = sr_read_cmd(dev, SR_CMD_READ_PHY_ID, 0, 0, 2, buf); if (ret < 0) { netdev_err(dev->net, "%s : Error reading PHYID register:%02x\n", __func__, ret); goto out; } netdev_dbg(dev->net, "%s : returning 0x%04x\n", __func__, *((__le16 *)buf)); ret = buf[1]; out: return ret; } static int sr_sw_reset(struct usbnet *dev, u8 flags) { int ret; ret = sr_write_cmd(dev, SR_CMD_SW_RESET, flags, 0, 0, NULL); if (ret < 0) netdev_err(dev->net, "Failed to send software reset:%02x\n", ret); return ret; } static u16 sr_read_rx_ctl(struct usbnet *dev) { __le16 v; int ret; ret = sr_read_cmd(dev, SR_CMD_READ_RX_CTL, 0, 0, 2, &v); if (ret < 0) { netdev_err(dev->net, "Error reading RX_CTL register:%02x\n", ret); goto out; } ret = le16_to_cpu(v); out: return ret; } static int sr_write_rx_ctl(struct usbnet *dev, u16 mode) { int ret; netdev_dbg(dev->net, "%s : mode = 0x%04x\n", __func__, mode); ret = sr_write_cmd(dev, SR_CMD_WRITE_RX_CTL, mode, 0, 0, NULL); if (ret < 0) netdev_err(dev->net, "Failed to write RX_CTL mode to 0x%04x:%02x\n", mode, ret); return ret; } static u16 sr_read_medium_status(struct usbnet *dev) { __le16 v; int ret; ret = sr_read_cmd(dev, SR_CMD_READ_MEDIUM_STATUS, 0, 0, 2, &v); if (ret < 0) { netdev_err(dev->net, "Error reading Medium Status register:%02x\n", ret); return ret; /* TODO: callers not checking for error ret */ } return le16_to_cpu(v); } static int sr_write_medium_mode(struct usbnet *dev, u16 mode) { int ret; netdev_dbg(dev->net, "%s : mode = 0x%04x\n", __func__, mode); ret = sr_write_cmd(dev, SR_CMD_WRITE_MEDIUM_MODE, mode, 0, 0, NULL); if (ret < 0) netdev_err(dev->net, "Failed to write Medium Mode mode to 0x%04x:%02x\n", mode, ret); return ret; } static int sr_write_gpio(struct usbnet *dev, u16 value, int sleep) { int ret; netdev_dbg(dev->net, "%s : value = 0x%04x\n", __func__, value); ret = sr_write_cmd(dev, SR_CMD_WRITE_GPIOS, value, 0, 0, NULL); if (ret < 0) netdev_err(dev->net, "Failed to write GPIO value 0x%04x:%02x\n", value, ret); if (sleep) msleep(sleep); return ret; } /* SR9800 have a 16-bit RX_CTL value */ static 
void sr_set_multicast(struct net_device *net) { struct usbnet *dev = netdev_priv(net); struct sr_data *data = (struct sr_data *)&dev->data; u16 rx_ctl = SR_DEFAULT_RX_CTL; if (net->flags & IFF_PROMISC) { rx_ctl |= SR_RX_CTL_PRO; } else if (net->flags & IFF_ALLMULTI || netdev_mc_count(net) > SR_MAX_MCAST) { rx_ctl |= SR_RX_CTL_AMALL; } else if (netdev_mc_empty(net)) { /* just broadcast and directed */ } else { /* We use the 20 byte dev->data * for our 8 byte filter buffer * to avoid allocating memory that * is tricky to free later */ struct netdev_hw_addr *ha; u32 crc_bits; memset(data->multi_filter, 0, SR_MCAST_FILTER_SIZE); /* Build the multicast hash filter. */ netdev_for_each_mc_addr(ha, net) { crc_bits = ether_crc(ETH_ALEN, ha->addr) >> 26; data->multi_filter[crc_bits >> 3] |= 1 << (crc_bits & 7); } sr_write_cmd_async(dev, SR_CMD_WRITE_MULTI_FILTER, 0, 0, SR_MCAST_FILTER_SIZE, data->multi_filter); rx_ctl |= SR_RX_CTL_AM; } sr_write_cmd_async(dev, SR_CMD_WRITE_RX_CTL, rx_ctl, 0, 0, NULL); } static int sr_mdio_read(struct net_device *net, int phy_id, int loc) { struct usbnet *dev = netdev_priv(net); __le16 res = 0; mutex_lock(&dev->phy_mutex); sr_set_sw_mii(dev); sr_read_cmd(dev, SR_CMD_READ_MII_REG, phy_id, (__u16)loc, 2, &res); sr_set_hw_mii(dev); mutex_unlock(&dev->phy_mutex); netdev_dbg(dev->net, "%s : phy_id=0x%02x, loc=0x%02x, returns=0x%04x\n", __func__, phy_id, loc, le16_to_cpu(res)); return le16_to_cpu(res); } static void sr_mdio_write(struct net_device *net, int phy_id, int loc, int val) { struct usbnet *dev = netdev_priv(net); __le16 res = cpu_to_le16(val); netdev_dbg(dev->net, "%s : phy_id=0x%02x, loc=0x%02x, val=0x%04x\n", __func__, phy_id, loc, val); mutex_lock(&dev->phy_mutex); sr_set_sw_mii(dev); sr_write_cmd(dev, SR_CMD_WRITE_MII_REG, phy_id, (__u16)loc, 2, &res); sr_set_hw_mii(dev); mutex_unlock(&dev->phy_mutex); } /* Get the PHY Identifier from the PHYSID1 & PHYSID2 MII registers */ static u32 sr_get_phyid(struct usbnet *dev) { int phy_reg; u32 phy_id; int i; /* Poll for the rare case the FW or phy isn't ready yet. 
*/ for (i = 0; i < 100; i++) { phy_reg = sr_mdio_read(dev->net, dev->mii.phy_id, MII_PHYSID1); if (phy_reg != 0 && phy_reg != 0xFFFF) break; mdelay(1); } if (phy_reg <= 0 || phy_reg == 0xFFFF) return 0; phy_id = (phy_reg & 0xffff) << 16; phy_reg = sr_mdio_read(dev->net, dev->mii.phy_id, MII_PHYSID2); if (phy_reg < 0) return 0; phy_id |= (phy_reg & 0xffff); return phy_id; } static void sr_get_wol(struct net_device *net, struct ethtool_wolinfo *wolinfo) { struct usbnet *dev = netdev_priv(net); u8 opt; if (sr_read_cmd(dev, SR_CMD_READ_MONITOR_MODE, 0, 0, 1, &opt) < 0) { wolinfo->supported = 0; wolinfo->wolopts = 0; return; } wolinfo->supported = WAKE_PHY | WAKE_MAGIC; wolinfo->wolopts = 0; if (opt & SR_MONITOR_LINK) wolinfo->wolopts |= WAKE_PHY; if (opt & SR_MONITOR_MAGIC) wolinfo->wolopts |= WAKE_MAGIC; } static int sr_set_wol(struct net_device *net, struct ethtool_wolinfo *wolinfo) { struct usbnet *dev = netdev_priv(net); u8 opt = 0; if (wolinfo->wolopts & ~(WAKE_PHY | WAKE_MAGIC)) return -EINVAL; if (wolinfo->wolopts & WAKE_PHY) opt |= SR_MONITOR_LINK; if (wolinfo->wolopts & WAKE_MAGIC) opt |= SR_MONITOR_MAGIC; if (sr_write_cmd(dev, SR_CMD_WRITE_MONITOR_MODE, opt, 0, 0, NULL) < 0) return -EINVAL; return 0; } static int sr_get_eeprom_len(struct net_device *net) { struct usbnet *dev = netdev_priv(net); struct sr_data *data = (struct sr_data *)&dev->data; return data->eeprom_len; } static int sr_get_eeprom(struct net_device *net, struct ethtool_eeprom *eeprom, u8 *data) { struct usbnet *dev = netdev_priv(net); __le16 *ebuf = (__le16 *)data; int ret; int i; /* Crude hack to ensure that we don't overwrite memory * if an odd length is supplied */ if (eeprom->len % 2) return -EINVAL; eeprom->magic = SR_EEPROM_MAGIC; /* sr9800 returns 2 bytes from eeprom on read */ for (i = 0; i < eeprom->len / 2; i++) { ret = sr_read_cmd(dev, SR_CMD_READ_EEPROM, eeprom->offset + i, 0, 2, &ebuf[i]); if (ret < 0) return -EINVAL; } return 0; } static void sr_get_drvinfo(struct net_device *net, struct ethtool_drvinfo *info) { /* Inherit standard device info */ usbnet_get_drvinfo(net, info); strscpy(info->driver, DRIVER_NAME, sizeof(info->driver)); strscpy(info->version, DRIVER_VERSION, sizeof(info->version)); } static u32 sr_get_link(struct net_device *net) { struct usbnet *dev = netdev_priv(net); return mii_link_ok(&dev->mii); } static int sr_ioctl(struct net_device *net, struct ifreq *rq, int cmd) { struct usbnet *dev = netdev_priv(net); return generic_mii_ioctl(&dev->mii, if_mii(rq), cmd, NULL); } static int sr_set_mac_address(struct net_device *net, void *p) { struct usbnet *dev = netdev_priv(net); struct sr_data *data = (struct sr_data *)&dev->data; struct sockaddr *addr = p; if (netif_running(net)) return -EBUSY; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; eth_hw_addr_set(net, addr->sa_data); /* We use the 20 byte dev->data * for our 6 byte mac buffer * to avoid allocating memory that * is tricky to free later */ memcpy(data->mac_addr, addr->sa_data, ETH_ALEN); sr_write_cmd_async(dev, SR_CMD_WRITE_NODE_ID, 0, 0, ETH_ALEN, data->mac_addr); return 0; } static const struct ethtool_ops sr9800_ethtool_ops = { .get_drvinfo = sr_get_drvinfo, .get_link = sr_get_link, .get_msglevel = usbnet_get_msglevel, .set_msglevel = usbnet_set_msglevel, .get_wol = sr_get_wol, .set_wol = sr_set_wol, .get_eeprom_len = sr_get_eeprom_len, .get_eeprom = sr_get_eeprom, .nway_reset = usbnet_nway_reset, .get_link_ksettings = usbnet_get_link_ksettings_mii, .set_link_ksettings = usbnet_set_link_ksettings_mii, }; static int 
sr9800_link_reset(struct usbnet *dev) { struct ethtool_cmd ecmd = { .cmd = ETHTOOL_GSET }; u16 mode; mii_check_media(&dev->mii, 1, 1); mii_ethtool_gset(&dev->mii, &ecmd); mode = SR9800_MEDIUM_DEFAULT; if (ethtool_cmd_speed(&ecmd) != SPEED_100) mode &= ~SR_MEDIUM_PS; if (ecmd.duplex != DUPLEX_FULL) mode &= ~SR_MEDIUM_FD; netdev_dbg(dev->net, "%s : speed: %u duplex: %d mode: 0x%04x\n", __func__, ethtool_cmd_speed(&ecmd), ecmd.duplex, mode); sr_write_medium_mode(dev, mode); return 0; } static int sr9800_set_default_mode(struct usbnet *dev) { u16 rx_ctl; int ret; sr_mdio_write(dev->net, dev->mii.phy_id, MII_BMCR, BMCR_RESET); sr_mdio_write(dev->net, dev->mii.phy_id, MII_ADVERTISE, ADVERTISE_ALL | ADVERTISE_CSMA); mii_nway_restart(&dev->mii); ret = sr_write_medium_mode(dev, SR9800_MEDIUM_DEFAULT); if (ret < 0) goto out; ret = sr_write_cmd(dev, SR_CMD_WRITE_IPG012, SR9800_IPG0_DEFAULT | SR9800_IPG1_DEFAULT, SR9800_IPG2_DEFAULT, 0, NULL); if (ret < 0) { netdev_dbg(dev->net, "Write IPG,IPG1,IPG2 failed: %d\n", ret); goto out; } /* Set RX_CTL to default values with 2k buffer, and enable cactus */ ret = sr_write_rx_ctl(dev, SR_DEFAULT_RX_CTL); if (ret < 0) goto out; rx_ctl = sr_read_rx_ctl(dev); netdev_dbg(dev->net, "RX_CTL is 0x%04x after all initializations\n", rx_ctl); rx_ctl = sr_read_medium_status(dev); netdev_dbg(dev->net, "Medium Status:0x%04x after all initializations\n", rx_ctl); return 0; out: return ret; } static int sr9800_reset(struct usbnet *dev) { struct sr_data *data = (struct sr_data *)&dev->data; int ret, embd_phy; u16 rx_ctl; ret = sr_write_gpio(dev, SR_GPIO_RSE | SR_GPIO_GPO_2 | SR_GPIO_GPO2EN, 5); if (ret < 0) goto out; embd_phy = ((sr_get_phy_addr(dev) & 0x1f) == 0x10 ? 1 : 0); ret = sr_write_cmd(dev, SR_CMD_SW_PHY_SELECT, embd_phy, 0, 0, NULL); if (ret < 0) { netdev_dbg(dev->net, "Select PHY #1 failed: %d\n", ret); goto out; } ret = sr_sw_reset(dev, SR_SWRESET_IPPD | SR_SWRESET_PRL); if (ret < 0) goto out; msleep(150); ret = sr_sw_reset(dev, SR_SWRESET_CLEAR); if (ret < 0) goto out; msleep(150); if (embd_phy) { ret = sr_sw_reset(dev, SR_SWRESET_IPRL); if (ret < 0) goto out; } else { ret = sr_sw_reset(dev, SR_SWRESET_PRTE); if (ret < 0) goto out; } msleep(150); rx_ctl = sr_read_rx_ctl(dev); netdev_dbg(dev->net, "RX_CTL is 0x%04x after software reset\n", rx_ctl); ret = sr_write_rx_ctl(dev, 0x0000); if (ret < 0) goto out; rx_ctl = sr_read_rx_ctl(dev); netdev_dbg(dev->net, "RX_CTL is 0x%04x setting to 0x0000\n", rx_ctl); ret = sr_sw_reset(dev, SR_SWRESET_PRL); if (ret < 0) goto out; msleep(150); ret = sr_sw_reset(dev, SR_SWRESET_IPRL | SR_SWRESET_PRL); if (ret < 0) goto out; msleep(150); ret = sr9800_set_default_mode(dev); if (ret < 0) goto out; /* Rewrite MAC address */ memcpy(data->mac_addr, dev->net->dev_addr, ETH_ALEN); ret = sr_write_cmd(dev, SR_CMD_WRITE_NODE_ID, 0, 0, ETH_ALEN, data->mac_addr); if (ret < 0) goto out; return 0; out: return ret; } static const struct net_device_ops sr9800_netdev_ops = { .ndo_open = usbnet_open, .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_set_mac_address = sr_set_mac_address, .ndo_validate_addr = eth_validate_addr, .ndo_eth_ioctl = sr_ioctl, .ndo_set_rx_mode = sr_set_multicast, }; static int sr9800_phy_powerup(struct usbnet *dev) { int ret; /* set the embedded Ethernet PHY in power-down state */ ret = sr_sw_reset(dev, SR_SWRESET_IPPD | SR_SWRESET_IPRL); if (ret < 0) { netdev_err(dev->net, "Failed to power down 
PHY : %d\n", ret); return ret; } msleep(20); /* set the embedded Ethernet PHY in power-up state */ ret = sr_sw_reset(dev, SR_SWRESET_IPRL); if (ret < 0) { netdev_err(dev->net, "Failed to reset PHY: %d\n", ret); return ret; } msleep(600); /* set the embedded Ethernet PHY in reset state */ ret = sr_sw_reset(dev, SR_SWRESET_CLEAR); if (ret < 0) { netdev_err(dev->net, "Failed to power up PHY: %d\n", ret); return ret; } msleep(20); /* set the embedded Ethernet PHY in power-up state */ ret = sr_sw_reset(dev, SR_SWRESET_IPRL); if (ret < 0) { netdev_err(dev->net, "Failed to reset PHY: %d\n", ret); return ret; } return 0; } static int sr9800_bind(struct usbnet *dev, struct usb_interface *intf) { struct sr_data *data = (struct sr_data *)&dev->data; u16 led01_mux, led23_mux; int ret, embd_phy; u8 addr[ETH_ALEN]; u32 phyid; u16 rx_ctl; data->eeprom_len = SR9800_EEPROM_LEN; ret = usbnet_get_endpoints(dev, intf); if (ret) goto out; /* LED Setting Rule : * AABB:CCDD * AA : MFA0(LED0) * BB : MFA1(LED1) * CC : MFA2(LED2), Reserved for SR9800 * DD : MFA3(LED3), Reserved for SR9800 */ led01_mux = (SR_LED_MUX_LINK_ACTIVE << 8) | SR_LED_MUX_LINK; led23_mux = (SR_LED_MUX_LINK_ACTIVE << 8) | SR_LED_MUX_TX_ACTIVE; ret = sr_write_cmd(dev, SR_CMD_LED_MUX, led01_mux, led23_mux, 0, NULL); if (ret < 0) { netdev_err(dev->net, "set LINK LED failed : %d\n", ret); goto out; } /* Get the MAC address */ ret = sr_read_cmd(dev, SR_CMD_READ_NODE_ID, 0, 0, ETH_ALEN, addr); if (ret < 0) { netdev_dbg(dev->net, "Failed to read MAC address: %d\n", ret); return ret; } eth_hw_addr_set(dev->net, addr); netdev_dbg(dev->net, "mac addr : %pM\n", dev->net->dev_addr); /* Initialize MII structure */ dev->mii.dev = dev->net; dev->mii.mdio_read = sr_mdio_read; dev->mii.mdio_write = sr_mdio_write; dev->mii.phy_id_mask = 0x1f; dev->mii.reg_num_mask = 0x1f; dev->mii.phy_id = sr_get_phy_addr(dev); dev->net->netdev_ops = &sr9800_netdev_ops; dev->net->ethtool_ops = &sr9800_ethtool_ops; embd_phy = ((dev->mii.phy_id & 0x1f) == 0x10 ? 
1 : 0); /* Reset the PHY to normal operation mode */ ret = sr_write_cmd(dev, SR_CMD_SW_PHY_SELECT, embd_phy, 0, 0, NULL); if (ret < 0) { netdev_dbg(dev->net, "Select PHY #1 failed: %d\n", ret); return ret; } /* Init PHY routine */ ret = sr9800_phy_powerup(dev); if (ret < 0) goto out; rx_ctl = sr_read_rx_ctl(dev); netdev_dbg(dev->net, "RX_CTL is 0x%04x after software reset\n", rx_ctl); ret = sr_write_rx_ctl(dev, 0x0000); if (ret < 0) goto out; rx_ctl = sr_read_rx_ctl(dev); netdev_dbg(dev->net, "RX_CTL is 0x%04x setting to 0x0000\n", rx_ctl); /* Read PHYID register *AFTER* the PHY was reset properly */ phyid = sr_get_phyid(dev); netdev_dbg(dev->net, "PHYID=0x%08x\n", phyid); /* medium mode setting */ ret = sr9800_set_default_mode(dev); if (ret < 0) goto out; if (dev->udev->speed == USB_SPEED_HIGH) { ret = sr_write_cmd(dev, SR_CMD_BULKIN_SIZE, SR9800_BULKIN_SIZE[SR9800_MAX_BULKIN_4K].byte_cnt, SR9800_BULKIN_SIZE[SR9800_MAX_BULKIN_4K].threshold, 0, NULL); if (ret < 0) { netdev_err(dev->net, "Reset RX_CTL failed: %d\n", ret); goto out; } dev->rx_urb_size = SR9800_BULKIN_SIZE[SR9800_MAX_BULKIN_4K].size; } else { ret = sr_write_cmd(dev, SR_CMD_BULKIN_SIZE, SR9800_BULKIN_SIZE[SR9800_MAX_BULKIN_2K].byte_cnt, SR9800_BULKIN_SIZE[SR9800_MAX_BULKIN_2K].threshold, 0, NULL); if (ret < 0) { netdev_err(dev->net, "Reset RX_CTL failed: %d\n", ret); goto out; } dev->rx_urb_size = SR9800_BULKIN_SIZE[SR9800_MAX_BULKIN_2K].size; } netdev_dbg(dev->net, "%s : setting rx_urb_size with : %zu\n", __func__, dev->rx_urb_size); return 0; out: return ret; } static const struct driver_info sr9800_driver_info = { .description = "CoreChip SR9800 USB 2.0 Ethernet", .bind = sr9800_bind, .status = sr_status, .link_reset = sr9800_link_reset, .reset = sr9800_reset, .flags = DRIVER_FLAG, .rx_fixup = sr_rx_fixup, .tx_fixup = sr_tx_fixup, }; static const struct usb_device_id products[] = { { USB_DEVICE(0x0fe6, 0x9800), /* SR9800 Device */ .driver_info = (unsigned long) &sr9800_driver_info, }, {}, /* END */ }; MODULE_DEVICE_TABLE(usb, products); static struct usb_driver sr_driver = { .name = DRIVER_NAME, .id_table = products, .probe = usbnet_probe, .suspend = usbnet_suspend, .resume = usbnet_resume, .disconnect = usbnet_disconnect, .supports_autosuspend = 1, }; module_usb_driver(sr_driver); MODULE_AUTHOR("Liu Junliang <liujunliang_ljl@163.com>"); MODULE_VERSION(DRIVER_VERSION); MODULE_DESCRIPTION("SR9800 USB 2.0 USB2NET Dev : http://www.corechip-sz.com"); MODULE_LICENSE("GPL");
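/*
 * Illustrative helpers (an addition for clarity, not part of the driver
 * above): sr_tx_fixup() prefixes each packet with a 32-bit little-endian
 * header whose low 16 bits hold the payload length and whose high 16 bits
 * hold the bitwise complement of that length; sr_rx_fixup() validates the
 * low 11 bits of that complement before accepting a packet.  A minimal
 * standalone sketch of both directions:
 */
#include <stdbool.h>
#include <stdint.h>

static uint32_t sr9800_build_tx_header(uint16_t len)
{
	/* Same arithmetic as sr_tx_fixup(): len in the low half, ~len above. */
	return (((uint32_t)len ^ 0x0000ffff) << 16) + len;
}

static bool sr9800_rx_header_ok(uint32_t header, uint16_t *size)
{
	*size = (uint16_t)(header & 0x7ff);
	/* Same consistency check as sr_rx_fixup(). */
	return *size == ((~header >> 16) & 0x07ff);
}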
/* * An async IO implementation for Linux * Written by Benjamin LaHaise <bcrl@kvack.org> * * Implements an efficient asynchronous io interface. * * Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved. * Copyright 2018 Christoph Hellwig. * * See ../COPYING for licensing terms. */ #define pr_fmt(fmt) "%s: " fmt, __func__ #include <linux/kernel.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/time.h> #include <linux/aio_abi.h> #include <linux/export.h> #include <linux/syscalls.h> #include <linux/backing-dev.h> #include <linux/refcount.h> #include <linux/uio.h> #include <linux/sched/signal.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/percpu.h> #include <linux/slab.h> #include <linux/timer.h> #include <linux/aio.h> #include <linux/highmem.h> #include <linux/workqueue.h> #include <linux/security.h> #include <linux/eventfd.h> #include <linux/blkdev.h> #include <linux/compat.h> #include <linux/migrate.h> #include <linux/ramfs.h> #include <linux/percpu-refcount.h> #include <linux/mount.h> #include <linux/pseudo_fs.h> #include <linux/uaccess.h> #include <linux/nospec.h> #include "internal.h" #define KIOCB_KEY 0 #define AIO_RING_MAGIC 0xa10a10a1 #define AIO_RING_COMPAT_FEATURES 1 #define AIO_RING_INCOMPAT_FEATURES 0 struct aio_ring { unsigned id; /* kernel internal index number */ unsigned nr; /* number of io_events */ unsigned head; /* Written to by userland or under ring_lock * mutex by aio_read_events_ring(). */ unsigned tail; unsigned magic; unsigned compat_features; unsigned incompat_features; unsigned header_length; /* size of aio_ring */ struct io_event io_events[]; }; /* 128 bytes + ring size */ /* * Plugging is meant to work with larger batches of IOs. If we don't * have more than the below, then don't bother setting up a plug.
*/ #define AIO_PLUG_THRESHOLD 2 #define AIO_RING_PAGES 8 struct kioctx_table { struct rcu_head rcu; unsigned nr; struct kioctx __rcu *table[] __counted_by(nr); }; struct kioctx_cpu { unsigned reqs_available; }; struct ctx_rq_wait { struct completion comp; atomic_t count; }; struct kioctx { struct percpu_ref users; atomic_t dead; struct percpu_ref reqs; unsigned long user_id; struct kioctx_cpu __percpu *cpu; /* * For percpu reqs_available, number of slots we move to/from global * counter at a time: */ unsigned req_batch; /* * This is what userspace passed to io_setup(), it's not used for * anything but counting against the global max_reqs quota. * * The real limit is nr_events - 1, which will be larger (see * aio_setup_ring()) */ unsigned max_reqs; /* Size of ringbuffer, in units of struct io_event */ unsigned nr_events; unsigned long mmap_base; unsigned long mmap_size; struct folio **ring_folios; long nr_pages; struct rcu_work free_rwork; /* see free_ioctx() */ /* * signals when all in-flight requests are done */ struct ctx_rq_wait *rq_wait; struct { /* * This counts the number of available slots in the ringbuffer, * so we avoid overflowing it: it's decremented (if positive) * when allocating a kiocb and incremented when the resulting * io_event is pulled off the ringbuffer. * * We batch accesses to it with a percpu version. */ atomic_t reqs_available; } ____cacheline_aligned_in_smp; struct { spinlock_t ctx_lock; struct list_head active_reqs; /* used for cancellation */ } ____cacheline_aligned_in_smp; struct { struct mutex ring_lock; wait_queue_head_t wait; } ____cacheline_aligned_in_smp; struct { unsigned tail; unsigned completed_events; spinlock_t completion_lock; } ____cacheline_aligned_in_smp; struct folio *internal_folios[AIO_RING_PAGES]; struct file *aio_ring_file; unsigned id; }; /* * First field must be the file pointer in all the * iocb unions! See also 'struct kiocb' in <linux/fs.h> */ struct fsync_iocb { struct file *file; struct work_struct work; bool datasync; struct cred *creds; }; struct poll_iocb { struct file *file; struct wait_queue_head *head; __poll_t events; bool cancelled; bool work_scheduled; bool work_need_resched; struct wait_queue_entry wait; struct work_struct work; }; /* * NOTE! Each of the iocb union members has the file pointer * as the first entry in their struct definition. So you can * access the file pointer through any of the sub-structs, * or directly as just 'ki_filp' in this struct. */ struct aio_kiocb { union { struct file *ki_filp; struct kiocb rw; struct fsync_iocb fsync; struct poll_iocb poll; }; struct kioctx *ki_ctx; kiocb_cancel_fn *ki_cancel; struct io_event ki_res; struct list_head ki_list; /* the aio core uses this * for cancellation */ refcount_t ki_refcnt; /* * If the aio_resfd field of the userspace iocb is not zero, * this is the underlying eventfd context to deliver events to. 
*/ struct eventfd_ctx *ki_eventfd; }; /*------ sysctl variables----*/ static DEFINE_SPINLOCK(aio_nr_lock); static unsigned long aio_nr; /* current system wide number of aio requests */ static unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ /*----end sysctl variables---*/ #ifdef CONFIG_SYSCTL static const struct ctl_table aio_sysctls[] = { { .procname = "aio-nr", .data = &aio_nr, .maxlen = sizeof(aio_nr), .mode = 0444, .proc_handler = proc_doulongvec_minmax, }, { .procname = "aio-max-nr", .data = &aio_max_nr, .maxlen = sizeof(aio_max_nr), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, }; static void __init aio_sysctl_init(void) { register_sysctl_init("fs", aio_sysctls); } #else #define aio_sysctl_init() do { } while (0) #endif static struct kmem_cache *kiocb_cachep; static struct kmem_cache *kioctx_cachep; static struct vfsmount *aio_mnt; static const struct file_operations aio_ring_fops; static const struct address_space_operations aio_ctx_aops; static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) { struct file *file; struct inode *inode = alloc_anon_inode(aio_mnt->mnt_sb); if (IS_ERR(inode)) return ERR_CAST(inode); inode->i_mapping->a_ops = &aio_ctx_aops; inode->i_mapping->i_private_data = ctx; inode->i_size = PAGE_SIZE * nr_pages; file = alloc_file_pseudo(inode, aio_mnt, "[aio]", O_RDWR, &aio_ring_fops); if (IS_ERR(file)) iput(inode); return file; } static int aio_init_fs_context(struct fs_context *fc) { if (!init_pseudo(fc, AIO_RING_MAGIC)) return -ENOMEM; fc->s_iflags |= SB_I_NOEXEC; return 0; } /* aio_setup * Creates the slab caches used by the aio routines, panicking on * failure as this is done early during the boot sequence. */ static int __init aio_setup(void) { static struct file_system_type aio_fs = { .name = "aio", .init_fs_context = aio_init_fs_context, .kill_sb = kill_anon_super, }; aio_mnt = kern_mount(&aio_fs); if (IS_ERR(aio_mnt)) panic("Failed to create aio fs mount."); kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); kioctx_cachep = KMEM_CACHE(kioctx, SLAB_HWCACHE_ALIGN|SLAB_PANIC); aio_sysctl_init(); return 0; } __initcall(aio_setup); static void put_aio_ring_file(struct kioctx *ctx) { struct file *aio_ring_file = ctx->aio_ring_file; struct address_space *i_mapping; if (aio_ring_file) { truncate_setsize(file_inode(aio_ring_file), 0); /* Prevent further access to the kioctx from migratepages */ i_mapping = aio_ring_file->f_mapping; spin_lock(&i_mapping->i_private_lock); i_mapping->i_private_data = NULL; ctx->aio_ring_file = NULL; spin_unlock(&i_mapping->i_private_lock); fput(aio_ring_file); } } static void aio_free_ring(struct kioctx *ctx) { int i; /* Disconnect the kioctx from the ring file. This prevents future * accesses to the kioctx from page migration.
*/ put_aio_ring_file(ctx); for (i = 0; i < ctx->nr_pages; i++) { struct folio *folio = ctx->ring_folios[i]; if (!folio) continue; pr_debug("pid(%d) [%d] folio->count=%d\n", current->pid, i, folio_ref_count(folio)); ctx->ring_folios[i] = NULL; folio_put(folio); } if (ctx->ring_folios && ctx->ring_folios != ctx->internal_folios) { kfree(ctx->ring_folios); ctx->ring_folios = NULL; } } static int aio_ring_mremap(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; struct kioctx_table *table; int i, res = -EINVAL; spin_lock(&mm->ioctx_lock); rcu_read_lock(); table = rcu_dereference(mm->ioctx_table); if (!table) goto out_unlock; for (i = 0; i < table->nr; i++) { struct kioctx *ctx; ctx = rcu_dereference(table->table[i]); if (ctx && ctx->aio_ring_file == file) { if (!atomic_read(&ctx->dead)) { ctx->user_id = ctx->mmap_base = vma->vm_start; res = 0; } break; } } out_unlock: rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); return res; } static const struct vm_operations_struct aio_ring_vm_ops = { .mremap = aio_ring_mremap, #if IS_ENABLED(CONFIG_MMU) .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = filemap_page_mkwrite, #endif }; static int aio_ring_mmap_prepare(struct vm_area_desc *desc) { desc->vm_flags |= VM_DONTEXPAND; desc->vm_ops = &aio_ring_vm_ops; return 0; } static const struct file_operations aio_ring_fops = { .mmap_prepare = aio_ring_mmap_prepare, }; #if IS_ENABLED(CONFIG_MIGRATION) static int aio_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { struct kioctx *ctx; unsigned long flags; pgoff_t idx; int rc = 0; /* mapping->i_private_lock here protects against the kioctx teardown. */ spin_lock(&mapping->i_private_lock); ctx = mapping->i_private_data; if (!ctx) { rc = -EINVAL; goto out; } /* The ring_lock mutex prevents aio_read_events() from writing * to the ring's head, and prevents page migration from mucking with * a partially initialized kioctx. */ if (!mutex_trylock(&ctx->ring_lock)) { rc = -EAGAIN; goto out; } idx = src->index; if (idx < (pgoff_t)ctx->nr_pages) { /* Make sure the old folio hasn't already been changed */ if (ctx->ring_folios[idx] != src) rc = -EAGAIN; } else rc = -EINVAL; if (rc != 0) goto out_unlock; /* Writeback must be complete */ BUG_ON(folio_test_writeback(src)); folio_get(dst); rc = folio_migrate_mapping(mapping, dst, src, 1); if (rc != MIGRATEPAGE_SUCCESS) { folio_put(dst); goto out_unlock; } /* Take completion_lock to prevent other writes to the ring buffer * while the old folio is copied to the new. This prevents new * events from being lost. */ spin_lock_irqsave(&ctx->completion_lock, flags); folio_copy(dst, src); folio_migrate_flags(dst, src); BUG_ON(ctx->ring_folios[idx] != src); ctx->ring_folios[idx] = dst; spin_unlock_irqrestore(&ctx->completion_lock, flags); /* The old folio is no longer accessible.
*/ folio_put(src); out_unlock: mutex_unlock(&ctx->ring_lock); out: spin_unlock(&mapping->i_private_lock); return rc; } #else #define aio_migrate_folio NULL #endif static const struct address_space_operations aio_ctx_aops = { .dirty_folio = noop_dirty_folio, .migrate_folio = aio_migrate_folio, }; static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) { struct aio_ring *ring; struct mm_struct *mm = current->mm; unsigned long size, unused; int nr_pages; int i; struct file *file; /* Compensate for the ring buffer's head/tail overlap entry */ nr_events += 2; /* 1 is required, 2 for good luck */ size = sizeof(struct aio_ring); size += sizeof(struct io_event) * nr_events; nr_pages = PFN_UP(size); if (nr_pages < 0) return -EINVAL; file = aio_private_file(ctx, nr_pages); if (IS_ERR(file)) { ctx->aio_ring_file = NULL; return -ENOMEM; } ctx->aio_ring_file = file; nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); ctx->ring_folios = ctx->internal_folios; if (nr_pages > AIO_RING_PAGES) { ctx->ring_folios = kcalloc(nr_pages, sizeof(struct folio *), GFP_KERNEL); if (!ctx->ring_folios) { put_aio_ring_file(ctx); return -ENOMEM; } } for (i = 0; i < nr_pages; i++) { struct folio *folio; folio = __filemap_get_folio(file->f_mapping, i, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_USER | __GFP_ZERO); if (IS_ERR(folio)) break; pr_debug("pid(%d) [%d] folio->count=%d\n", current->pid, i, folio_ref_count(folio)); folio_end_read(folio, true); ctx->ring_folios[i] = folio; } ctx->nr_pages = i; if (unlikely(i != nr_pages)) { aio_free_ring(ctx); return -ENOMEM; } ctx->mmap_size = nr_pages * PAGE_SIZE; pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size); if (mmap_write_lock_killable(mm)) { ctx->mmap_size = 0; aio_free_ring(ctx); return -EINTR; } ctx->mmap_base = do_mmap(ctx->aio_ring_file, 0, ctx->mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, 0, 0, &unused, NULL); mmap_write_unlock(mm); if (IS_ERR((void *)ctx->mmap_base)) { ctx->mmap_size = 0; aio_free_ring(ctx); return -ENOMEM; } pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); ctx->user_id = ctx->mmap_base; ctx->nr_events = nr_events; /* trusted copy */ ring = folio_address(ctx->ring_folios[0]); ring->nr = nr_events; /* user copy */ ring->id = ~0U; ring->head = ring->tail = 0; ring->magic = AIO_RING_MAGIC; ring->compat_features = AIO_RING_COMPAT_FEATURES; ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; ring->header_length = sizeof(struct aio_ring); flush_dcache_folio(ctx->ring_folios[0]); return 0; } #define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel) { struct aio_kiocb *req; struct kioctx *ctx; unsigned long flags; /* * kiocb didn't come from aio or is neither a read nor a write, hence * ignore it. */ if (!(iocb->ki_flags & IOCB_AIO_RW)) return; req = container_of(iocb, struct aio_kiocb, rw); if (WARN_ON_ONCE(!list_empty(&req->ki_list))) return; ctx = req->ki_ctx; spin_lock_irqsave(&ctx->ctx_lock, flags); list_add_tail(&req->ki_list, &ctx->active_reqs); req->ki_cancel = cancel; spin_unlock_irqrestore(&ctx->ctx_lock, flags); } EXPORT_SYMBOL(kiocb_set_cancel_fn); /* * free_ioctx() should be RCU delayed to synchronize against the RCU * protected lookup_ioctx() and also needs process context to call * aio_free_ring(). Use rcu_work. 
*/ static void free_ioctx(struct work_struct *work) { struct kioctx *ctx = container_of(to_rcu_work(work), struct kioctx, free_rwork); pr_debug("freeing %p\n", ctx); aio_free_ring(ctx); free_percpu(ctx->cpu); percpu_ref_exit(&ctx->reqs); percpu_ref_exit(&ctx->users); kmem_cache_free(kioctx_cachep, ctx); } static void free_ioctx_reqs(struct percpu_ref *ref) { struct kioctx *ctx = container_of(ref, struct kioctx, reqs); /* At this point we know that there are no in-flight requests */ if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count)) complete(&ctx->rq_wait->comp); /* Synchronize against RCU protected table->table[] dereferences */ INIT_RCU_WORK(&ctx->free_rwork, free_ioctx); queue_rcu_work(system_wq, &ctx->free_rwork); } /* * When this function runs, the kioctx has been removed from the "hash table" * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted - * now it's safe to cancel any that need to be. */ static void free_ioctx_users(struct percpu_ref *ref) { struct kioctx *ctx = container_of(ref, struct kioctx, users); struct aio_kiocb *req; spin_lock_irq(&ctx->ctx_lock); while (!list_empty(&ctx->active_reqs)) { req = list_first_entry(&ctx->active_reqs, struct aio_kiocb, ki_list); req->ki_cancel(&req->rw); list_del_init(&req->ki_list); } spin_unlock_irq(&ctx->ctx_lock); percpu_ref_kill(&ctx->reqs); percpu_ref_put(&ctx->reqs); } static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) { unsigned i, new_nr; struct kioctx_table *table, *old; struct aio_ring *ring; spin_lock(&mm->ioctx_lock); table = rcu_dereference_raw(mm->ioctx_table); while (1) { if (table) for (i = 0; i < table->nr; i++) if (!rcu_access_pointer(table->table[i])) { ctx->id = i; rcu_assign_pointer(table->table[i], ctx); spin_unlock(&mm->ioctx_lock); /* While kioctx setup is in progress, * we are protected from page migration * changing ring_folios by ->ring_lock. */ ring = folio_address(ctx->ring_folios[0]); ring->id = ctx->id; return 0; } new_nr = (table ? table->nr : 1) * 4; spin_unlock(&mm->ioctx_lock); table = kzalloc(struct_size(table, table, new_nr), GFP_KERNEL); if (!table) return -ENOMEM; table->nr = new_nr; spin_lock(&mm->ioctx_lock); old = rcu_dereference_raw(mm->ioctx_table); if (!old) { rcu_assign_pointer(mm->ioctx_table, table); } else if (table->nr > old->nr) { memcpy(table->table, old->table, old->nr * sizeof(struct kioctx *)); rcu_assign_pointer(mm->ioctx_table, table); kfree_rcu(old, rcu); } else { kfree(table); table = old; } } } static void aio_nr_sub(unsigned nr) { spin_lock(&aio_nr_lock); if (WARN_ON(aio_nr - nr > aio_nr)) aio_nr = 0; else aio_nr -= nr; spin_unlock(&aio_nr_lock); } /* ioctx_alloc * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. */ static struct kioctx *ioctx_alloc(unsigned nr_events) { struct mm_struct *mm = current->mm; struct kioctx *ctx; int err = -ENOMEM; /* * Store the original nr_events -- what userspace passed to io_setup(), * for counting against the global limit -- before it changes. */ unsigned int max_reqs = nr_events; /* * We keep track of the number of available ringbuffer slots, to prevent * overflow (reqs_available), and we also use percpu counters for this.
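* * (Worked example of the sizing below, assuming 4 possible CPUs: a userspace request of nr_events = 100 is first clamped to max(100, 4 * 4) and then doubled, yielding 200 ring slots.)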
* * So since up to half the slots might be on other CPUs' percpu counters * and unavailable, double nr_events so userspace sees what it * expected: additionally, we move req_batch slots to/from percpu * counters at a time, so make sure that isn't 0: */ nr_events = max(nr_events, num_possible_cpus() * 4); nr_events *= 2; /* Prevent overflows */ if (nr_events > (0x10000000U / sizeof(struct io_event))) { pr_debug("EINVAL: nr_events too high\n"); return ERR_PTR(-EINVAL); } if (!nr_events || (unsigned long)max_reqs > aio_max_nr) return ERR_PTR(-EAGAIN); ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL); if (!ctx) return ERR_PTR(-ENOMEM); ctx->max_reqs = max_reqs; spin_lock_init(&ctx->ctx_lock); spin_lock_init(&ctx->completion_lock); mutex_init(&ctx->ring_lock); /* Protect against page migration throughout kioctx setup by keeping * the ring_lock mutex held until setup is complete. */ mutex_lock(&ctx->ring_lock); init_waitqueue_head(&ctx->wait); INIT_LIST_HEAD(&ctx->active_reqs); if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL)) goto err; if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL)) goto err; ctx->cpu = alloc_percpu(struct kioctx_cpu); if (!ctx->cpu) goto err; err = aio_setup_ring(ctx, nr_events); if (err < 0) goto err; atomic_set(&ctx->reqs_available, ctx->nr_events - 1); ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4); if (ctx->req_batch < 1) ctx->req_batch = 1; /* limit the number of system wide aios */ spin_lock(&aio_nr_lock); if (aio_nr + ctx->max_reqs > aio_max_nr || aio_nr + ctx->max_reqs < aio_nr) { spin_unlock(&aio_nr_lock); err = -EAGAIN; goto err_ctx; } aio_nr += ctx->max_reqs; spin_unlock(&aio_nr_lock); percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */ percpu_ref_get(&ctx->reqs); /* free_ioctx_users() will drop this */ err = ioctx_add_table(ctx, mm); if (err) goto err_cleanup; /* Release the ring_lock mutex now that all setup is complete. */ mutex_unlock(&ctx->ring_lock); pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", ctx, ctx->user_id, mm, ctx->nr_events); return ctx; err_cleanup: aio_nr_sub(ctx->max_reqs); err_ctx: atomic_set(&ctx->dead, 1); if (ctx->mmap_size) vm_munmap(ctx->mmap_base, ctx->mmap_size); aio_free_ring(ctx); err: mutex_unlock(&ctx->ring_lock); free_percpu(ctx->cpu); percpu_ref_exit(&ctx->reqs); percpu_ref_exit(&ctx->users); kmem_cache_free(kioctx_cachep, ctx); pr_debug("error allocating ioctx %d\n", err); return ERR_PTR(err); } /* kill_ioctx * Cancels all outstanding aio requests on an aio context. Used * when the processes owning a context have all exited to encourage * the rapid destruction of the kioctx. */ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, struct ctx_rq_wait *wait) { struct kioctx_table *table; spin_lock(&mm->ioctx_lock); if (atomic_xchg(&ctx->dead, 1)) { spin_unlock(&mm->ioctx_lock); return -EINVAL; } table = rcu_dereference_raw(mm->ioctx_table); WARN_ON(ctx != rcu_access_pointer(table->table[ctx->id])); RCU_INIT_POINTER(table->table[ctx->id], NULL); spin_unlock(&mm->ioctx_lock); /* free_ioctx_reqs() will do the necessary RCU synchronization */ wake_up_all(&ctx->wait); /* * It'd be more correct to do this in free_ioctx(), after all * the outstanding kiocbs have finished - but by then io_destroy * has already returned, so io_setup() could potentially return * -EAGAIN with no ioctxs actually in use (as far as userspace * could tell).
*/ aio_nr_sub(ctx->max_reqs); if (ctx->mmap_size) vm_munmap(ctx->mmap_base, ctx->mmap_size); ctx->rq_wait = wait; percpu_ref_kill(&ctx->users); return 0; } /* * exit_aio: called when the last user of mm goes away. At this point, there is * no way for any new requests to be submitted or any of the io_* syscalls to be * called on the context. * * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on * them. */ void exit_aio(struct mm_struct *mm) { struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table); struct ctx_rq_wait wait; int i, skipped; if (!table) return; atomic_set(&wait.count, table->nr); init_completion(&wait.comp); skipped = 0; for (i = 0; i < table->nr; ++i) { struct kioctx *ctx = rcu_dereference_protected(table->table[i], true); if (!ctx) { skipped++; continue; } /* * We don't need to bother with munmap() here - exit_mmap(mm) * is coming and it'll unmap everything. And we simply can't, * this is not necessarily our ->mm. * Since kill_ioctx() uses non-zero ->mmap_size as indicator * that it needs to unmap the area, just set it to 0. */ ctx->mmap_size = 0; kill_ioctx(mm, ctx, &wait); } if (!atomic_sub_and_test(skipped, &wait.count)) { /* Wait until all IO for the context is done. */ wait_for_completion(&wait.comp); } RCU_INIT_POINTER(mm->ioctx_table, NULL); kfree(table); } static void put_reqs_available(struct kioctx *ctx, unsigned nr) { struct kioctx_cpu *kcpu; unsigned long flags; local_irq_save(flags); kcpu = this_cpu_ptr(ctx->cpu); kcpu->reqs_available += nr; while (kcpu->reqs_available >= ctx->req_batch * 2) { kcpu->reqs_available -= ctx->req_batch; atomic_add(ctx->req_batch, &ctx->reqs_available); } local_irq_restore(flags); } static bool __get_reqs_available(struct kioctx *ctx) { struct kioctx_cpu *kcpu; bool ret = false; unsigned long flags; local_irq_save(flags); kcpu = this_cpu_ptr(ctx->cpu); if (!kcpu->reqs_available) { int avail = atomic_read(&ctx->reqs_available); do { if (avail < ctx->req_batch) goto out; } while (!atomic_try_cmpxchg(&ctx->reqs_available, &avail, avail - ctx->req_batch)); kcpu->reqs_available += ctx->req_batch; } ret = true; kcpu->reqs_available--; out: local_irq_restore(flags); return ret; } /* refill_reqs_available * Updates the reqs_available reference counts used for tracking the * number of free slots in the completion ring. This can be called * from aio_complete() (to optimistically update reqs_available) or * from aio_get_req() (the we're-out-of-events case). It must be * called holding ctx->completion_lock. */ static void refill_reqs_available(struct kioctx *ctx, unsigned head, unsigned tail) { unsigned events_in_ring, completed; /* Clamp head since userland can write to it. */ head %= ctx->nr_events; if (head <= tail) events_in_ring = tail - head; else events_in_ring = ctx->nr_events - (head - tail); completed = ctx->completed_events; if (events_in_ring < completed) completed -= events_in_ring; else completed = 0; if (!completed) return; ctx->completed_events -= completed; put_reqs_available(ctx, completed); } /* user_refill_reqs_available * Called to refill reqs_available when aio_get_req() runs out of * space in the completion ring. */ static void user_refill_reqs_available(struct kioctx *ctx) { spin_lock_irq(&ctx->completion_lock); if (ctx->completed_events) { struct aio_ring *ring; unsigned head; /* Access of ring->head may race with aio_read_events_ring() * here, but that's okay: whether we read the old version * or the new version, either will be valid.
The important * part is that head cannot pass tail since we prevent * aio_complete() from updating tail by holding * ctx->completion_lock. Even if head is invalid, the check * against ctx->completed_events below will make sure we do the * safe/right thing. */ ring = folio_address(ctx->ring_folios[0]); head = ring->head; refill_reqs_available(ctx, head, ctx->tail); } spin_unlock_irq(&ctx->completion_lock); } static bool get_reqs_available(struct kioctx *ctx) { if (__get_reqs_available(ctx)) return true; user_refill_reqs_available(ctx); return __get_reqs_available(ctx); } /* aio_get_req * Allocate a slot for an aio request. * Returns NULL if no requests are free. * * The refcount is initialized to 2 - one for the async op completion, * one for the synchronous code that does this. */ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) { struct aio_kiocb *req; req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); if (unlikely(!req)) return NULL; if (unlikely(!get_reqs_available(ctx))) { kmem_cache_free(kiocb_cachep, req); return NULL; } percpu_ref_get(&ctx->reqs); req->ki_ctx = ctx; INIT_LIST_HEAD(&req->ki_list); refcount_set(&req->ki_refcnt, 2); req->ki_eventfd = NULL; return req; } static struct kioctx *lookup_ioctx(unsigned long ctx_id) { struct aio_ring __user *ring = (void __user *)ctx_id; struct mm_struct *mm = current->mm; struct kioctx *ctx, *ret = NULL; struct kioctx_table *table; unsigned id; if (get_user(id, &ring->id)) return NULL; rcu_read_lock(); table = rcu_dereference(mm->ioctx_table); if (!table || id >= table->nr) goto out; id = array_index_nospec(id, table->nr); ctx = rcu_dereference(table->table[id]); if (ctx && ctx->user_id == ctx_id) { if (percpu_ref_tryget_live(&ctx->users)) ret = ctx; } out: rcu_read_unlock(); return ret; } static inline void iocb_destroy(struct aio_kiocb *iocb) { if (iocb->ki_eventfd) eventfd_ctx_put(iocb->ki_eventfd); if (iocb->ki_filp) fput(iocb->ki_filp); percpu_ref_put(&iocb->ki_ctx->reqs); kmem_cache_free(kiocb_cachep, iocb); } struct aio_waiter { struct wait_queue_entry w; size_t min_nr; }; /* aio_complete * Called when the io request on the given iocb is complete. */ static void aio_complete(struct aio_kiocb *iocb) { struct kioctx *ctx = iocb->ki_ctx; struct aio_ring *ring; struct io_event *ev_page, *event; unsigned tail, pos, head, avail; unsigned long flags; /* * Add a completion event to the ring buffer. Must be done holding * ctx->completion_lock to prevent other code from messing with the tail * pointer since we might be called from irq context. */ spin_lock_irqsave(&ctx->completion_lock, flags); tail = ctx->tail; pos = tail + AIO_EVENTS_OFFSET; if (++tail >= ctx->nr_events) tail = 0; ev_page = folio_address(ctx->ring_folios[pos / AIO_EVENTS_PER_PAGE]); event = ev_page + pos % AIO_EVENTS_PER_PAGE; *event = iocb->ki_res; flush_dcache_folio(ctx->ring_folios[pos / AIO_EVENTS_PER_PAGE]); pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb, (void __user *)(unsigned long)iocb->ki_res.obj, iocb->ki_res.data, iocb->ki_res.res, iocb->ki_res.res2); /* after flagging the request as done, we * must never even look at it again */ smp_wmb(); /* make event visible before updating tail */ ctx->tail = tail; ring = folio_address(ctx->ring_folios[0]); head = ring->head; ring->tail = tail; flush_dcache_folio(ctx->ring_folios[0]); ctx->completed_events++; if (ctx->completed_events > 1) refill_reqs_available(ctx, head, tail); avail = tail > head ? 
tail - head : tail + ctx->nr_events - head; spin_unlock_irqrestore(&ctx->completion_lock, flags); pr_debug("added to ring %p at [%u]\n", iocb, tail); /* * Check if the user asked us to deliver the result through an * eventfd. The eventfd_signal() function is safe to be called * from IRQ context. */ if (iocb->ki_eventfd) eventfd_signal(iocb->ki_eventfd); /* * We have to order our ring_info tail store above and test * of the wait list below outside the wait lock. This is * like in wake_up_bit() where clearing a bit has to be * ordered with the unlocked test. */ smp_mb(); if (waitqueue_active(&ctx->wait)) { struct aio_waiter *curr, *next; unsigned long flags; spin_lock_irqsave(&ctx->wait.lock, flags); list_for_each_entry_safe(curr, next, &ctx->wait.head, w.entry) if (avail >= curr->min_nr) { wake_up_process(curr->w.private); list_del_init_careful(&curr->w.entry); } spin_unlock_irqrestore(&ctx->wait.lock, flags); } } static inline void iocb_put(struct aio_kiocb *iocb) { if (refcount_dec_and_test(&iocb->ki_refcnt)) { aio_complete(iocb); iocb_destroy(iocb); } } /* aio_read_events_ring * Pull events off the ioctx's event ring. Returns the number of * events fetched. */ static long aio_read_events_ring(struct kioctx *ctx, struct io_event __user *event, long nr) { struct aio_ring *ring; unsigned head, tail, pos; long ret = 0; int copy_ret; /* * The mutex can block and wake us up and that will cause * wait_event_interruptible_hrtimeout() to schedule without sleeping * and repeat. This should be rare enough that it doesn't cause * performance issues. See the comment in read_events() for more detail. */ sched_annotate_sleep(); mutex_lock(&ctx->ring_lock); /* Access to ->ring_folios here is protected by ctx->ring_lock. */ ring = folio_address(ctx->ring_folios[0]); head = ring->head; tail = ring->tail; /* * Ensure that once we've read the current tail pointer, we * also see the events that were stored up to the tail. */ smp_rmb(); pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events); if (head == tail) goto out; head %= ctx->nr_events; tail %= ctx->nr_events; while (ret < nr) { long avail; struct io_event *ev; struct folio *folio; avail = (head <= tail ? tail : ctx->nr_events) - head; if (head == tail) break; pos = head + AIO_EVENTS_OFFSET; folio = ctx->ring_folios[pos / AIO_EVENTS_PER_PAGE]; pos %= AIO_EVENTS_PER_PAGE; avail = min(avail, nr - ret); avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos); ev = folio_address(folio); copy_ret = copy_to_user(event + ret, ev + pos, sizeof(*ev) * avail); if (unlikely(copy_ret)) { ret = -EFAULT; goto out; } ret += avail; head += avail; head %= ctx->nr_events; } ring = folio_address(ctx->ring_folios[0]); ring->head = head; flush_dcache_folio(ctx->ring_folios[0]); pr_debug("%li h%u t%u\n", ret, head, tail); out: mutex_unlock(&ctx->ring_lock); return ret; } static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr, struct io_event __user *event, long *i) { long ret = aio_read_events_ring(ctx, event + *i, nr - *i); if (ret > 0) *i += ret; if (unlikely(atomic_read(&ctx->dead))) ret = -EINVAL; if (!*i) *i = ret; return ret < 0 || *i >= min_nr; } static long read_events(struct kioctx *ctx, long min_nr, long nr, struct io_event __user *event, ktime_t until) { struct hrtimer_sleeper t; struct aio_waiter w; long ret = 0, ret2 = 0; /* * Note that aio_read_events() is being called as the conditional - i.e. * we're calling it after prepare_to_wait() has set task state to * TASK_INTERRUPTIBLE.
* * But aio_read_events() can block, and if it blocks it's going to flip * the task state back to TASK_RUNNING. * * This should be ok, provided it doesn't flip the state back to * TASK_RUNNING and return 0 too much - that causes us to spin. That * will only happen if the mutex_lock() call blocks, and we then find * the ringbuffer empty. So in practice we should be ok, but it's * something to be aware of when touching this code. */ aio_read_events(ctx, min_nr, nr, event, &ret); if (until == 0 || ret < 0 || ret >= min_nr) return ret; hrtimer_setup_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); if (until != KTIME_MAX) { hrtimer_set_expires_range_ns(&t.timer, until, current->timer_slack_ns); hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); } init_wait(&w.w); while (1) { unsigned long nr_got = ret; w.min_nr = min_nr - ret; ret2 = prepare_to_wait_event(&ctx->wait, &w.w, TASK_INTERRUPTIBLE); if (!ret2 && !t.task) ret2 = -ETIME; if (aio_read_events(ctx, min_nr, nr, event, &ret) || ret2) break; if (nr_got == ret) schedule(); } finish_wait(&ctx->wait, &w.w); hrtimer_cancel(&t.timer); destroy_hrtimer_on_stack(&t.timer); return ret; } /* sys_io_setup: * Create an aio_context capable of receiving at least nr_events. * ctxp must not point to an aio_context that already exists, and * must be initialized to 0 prior to the call. On successful * creation of the aio_context, *ctxp is filled in with the resulting * handle. May fail with -EINVAL if *ctxp is not initialized, * if the specified nr_events exceeds internal limits. May fail * with -EAGAIN if the specified nr_events exceeds the user's limit * of available events. May fail with -ENOMEM if insufficient kernel * resources are available. May fail with -EFAULT if an invalid * pointer is passed for ctxp. Will fail with -ENOSYS if not * implemented. */ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) { struct kioctx *ioctx = NULL; unsigned long ctx; long ret; ret = get_user(ctx, ctxp); if (unlikely(ret)) goto out; ret = -EINVAL; if (unlikely(ctx || nr_events == 0)) { pr_debug("EINVAL: ctx %lu nr_events %u\n", ctx, nr_events); goto out; } ioctx = ioctx_alloc(nr_events); ret = PTR_ERR(ioctx); if (!IS_ERR(ioctx)) { ret = put_user(ioctx->user_id, ctxp); if (ret) kill_ioctx(current->mm, ioctx, NULL); percpu_ref_put(&ioctx->users); } out: return ret; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_events, u32 __user *, ctx32p) { struct kioctx *ioctx = NULL; unsigned long ctx; long ret; ret = get_user(ctx, ctx32p); if (unlikely(ret)) goto out; ret = -EINVAL; if (unlikely(ctx || nr_events == 0)) { pr_debug("EINVAL: ctx %lu nr_events %u\n", ctx, nr_events); goto out; } ioctx = ioctx_alloc(nr_events); ret = PTR_ERR(ioctx); if (!IS_ERR(ioctx)) { /* truncating is ok because it's a user address */ ret = put_user((u32)ioctx->user_id, ctx32p); if (ret) kill_ioctx(current->mm, ioctx, NULL); percpu_ref_put(&ioctx->users); } out: return ret; } #endif /* sys_io_destroy: * Destroy the aio_context specified. May cancel any outstanding * AIOs and block on completion. Will fail with -ENOSYS if not * implemented. May fail with -EINVAL if the context pointed to * is invalid. */ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) { struct kioctx *ioctx = lookup_ioctx(ctx); if (likely(NULL != ioctx)) { struct ctx_rq_wait wait; int ret; init_completion(&wait.comp); atomic_set(&wait.count, 1); /* Pass requests_done to kill_ioctx() where it can be set * in a thread-safe way. 
If we try to set it here then we have * a race condition if two io_destroy() calls run simultaneously. */ ret = kill_ioctx(current->mm, ioctx, &wait); percpu_ref_put(&ioctx->users); /* Wait until all IO for the context is done. Otherwise the kernel * keeps using user-space buffers even though the user thinks the * context has been destroyed. */ if (!ret) wait_for_completion(&wait.comp); return ret; } pr_debug("EINVAL: invalid context id\n"); return -EINVAL; } static void aio_remove_iocb(struct aio_kiocb *iocb) { struct kioctx *ctx = iocb->ki_ctx; unsigned long flags; spin_lock_irqsave(&ctx->ctx_lock, flags); list_del(&iocb->ki_list); spin_unlock_irqrestore(&ctx->ctx_lock, flags); } static void aio_complete_rw(struct kiocb *kiocb, long res) { struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw); if (!list_empty_careful(&iocb->ki_list)) aio_remove_iocb(iocb); if (kiocb->ki_flags & IOCB_WRITE) { struct inode *inode = file_inode(kiocb->ki_filp); if (S_ISREG(inode->i_mode)) kiocb_end_write(kiocb); } iocb->ki_res.res = res; iocb->ki_res.res2 = 0; iocb_put(iocb); } static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb, int rw_type) { int ret; req->ki_write_stream = 0; req->ki_complete = aio_complete_rw; req->private = NULL; req->ki_pos = iocb->aio_offset; req->ki_flags = req->ki_filp->f_iocb_flags | IOCB_AIO_RW; if (iocb->aio_flags & IOCB_FLAG_RESFD) req->ki_flags |= IOCB_EVENTFD; if (iocb->aio_flags & IOCB_FLAG_IOPRIO) { /* * If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then * aio_reqprio is interpreted as an I/O scheduling * class and priority. */ ret = ioprio_check_cap(iocb->aio_reqprio); if (ret) { pr_debug("aio ioprio check cap error: %d\n", ret); return ret; } req->ki_ioprio = iocb->aio_reqprio; } else req->ki_ioprio = get_current_ioprio(); ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags, rw_type); if (unlikely(ret)) return ret; req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */ return 0; } static ssize_t aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec, bool vectored, bool compat, struct iov_iter *iter) { void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf; size_t len = iocb->aio_nbytes; if (!vectored) { ssize_t ret = import_ubuf(rw, buf, len, iter); *iovec = NULL; return ret; } return __import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter, compat); } static inline void aio_rw_done(struct kiocb *req, ssize_t ret) { switch (ret) { case -EIOCBQUEUED: break; case -ERESTARTSYS: case -ERESTARTNOINTR: case -ERESTARTNOHAND: case -ERESTART_RESTARTBLOCK: /* * There's no easy way to restart the syscall since other AIOs * may already be running. Just fail this IO with EINTR.
*/ ret = -EINTR; fallthrough; default: req->ki_complete(req, ret); } } static int aio_read(struct kiocb *req, const struct iocb *iocb, bool vectored, bool compat) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; struct file *file; int ret; ret = aio_prep_rw(req, iocb, READ); if (ret) return ret; file = req->ki_filp; if (unlikely(!(file->f_mode & FMODE_READ))) return -EBADF; if (unlikely(!file->f_op->read_iter)) return -EINVAL; ret = aio_setup_rw(ITER_DEST, iocb, &iovec, vectored, compat, &iter); if (ret < 0) return ret; ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) aio_rw_done(req, file->f_op->read_iter(req, &iter)); kfree(iovec); return ret; } static int aio_write(struct kiocb *req, const struct iocb *iocb, bool vectored, bool compat) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; struct file *file; int ret; ret = aio_prep_rw(req, iocb, WRITE); if (ret) return ret; file = req->ki_filp; if (unlikely(!(file->f_mode & FMODE_WRITE))) return -EBADF; if (unlikely(!file->f_op->write_iter)) return -EINVAL; ret = aio_setup_rw(ITER_SOURCE, iocb, &iovec, vectored, compat, &iter); if (ret < 0) return ret; ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) { if (S_ISREG(file_inode(file)->i_mode)) kiocb_start_write(req); req->ki_flags |= IOCB_WRITE; aio_rw_done(req, file->f_op->write_iter(req, &iter)); } kfree(iovec); return ret; } static void aio_fsync_work(struct work_struct *work) { struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work); const struct cred *old_cred = override_creds(iocb->fsync.creds); iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync); revert_creds(old_cred); put_cred(iocb->fsync.creds); iocb_put(iocb); } static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, bool datasync) { if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)) return -EINVAL; if (unlikely(!req->file->f_op->fsync)) return -EINVAL; req->creds = prepare_creds(); if (!req->creds) return -ENOMEM; req->datasync = datasync; INIT_WORK(&req->work, aio_fsync_work); schedule_work(&req->work); return 0; } static void aio_poll_put_work(struct work_struct *work) { struct poll_iocb *req = container_of(work, struct poll_iocb, work); struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll); iocb_put(iocb); } /* * Safely lock the waitqueue which the request is on, synchronizing with the * case where the ->poll() provider decides to free its waitqueue early. * * Returns true on success, meaning that req->head->lock was locked, req->wait * is on req->head, and an RCU read lock was taken. Returns false if the * request was already removed from its waitqueue (which might no longer exist). */ static bool poll_iocb_lock_wq(struct poll_iocb *req) { wait_queue_head_t *head; /* * While we hold the waitqueue lock and the waitqueue is nonempty, * wake_up_pollfree() will wait for us. However, taking the waitqueue * lock in the first place can race with the waitqueue being freed. * * We solve this as eventpoll does: by taking advantage of the fact that * all users of wake_up_pollfree() will RCU-delay the actual free. If * we enter rcu_read_lock() and see that the pointer to the queue is * non-NULL, we can then lock it without the memory being freed out from * under us, then check whether the request is still on the queue. 
* * Keep holding rcu_read_lock() as long as we hold the queue lock, in * case the caller deletes the entry from the queue, leaving it empty. * In that case, only RCU prevents the queue memory from being freed. */ rcu_read_lock(); head = smp_load_acquire(&req->head); if (head) { spin_lock(&head->lock); if (!list_empty(&req->wait.entry)) return true; spin_unlock(&head->lock); } rcu_read_unlock(); return false; } static void poll_iocb_unlock_wq(struct poll_iocb *req) { spin_unlock(&req->head->lock); rcu_read_unlock(); } static void aio_poll_complete_work(struct work_struct *work) { struct poll_iocb *req = container_of(work, struct poll_iocb, work); struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll); struct poll_table_struct pt = { ._key = req->events }; struct kioctx *ctx = iocb->ki_ctx; __poll_t mask = 0; if (!READ_ONCE(req->cancelled)) mask = vfs_poll(req->file, &pt) & req->events; /* * Note that ->ki_cancel callers also delete iocb from active_reqs after * calling ->ki_cancel. We need the ctx_lock roundtrip here to * synchronize with them. In the cancellation case the list_del_init * itself is not actually needed, but harmless so we keep it in to * avoid further branches in the fast path. */ spin_lock_irq(&ctx->ctx_lock); if (poll_iocb_lock_wq(req)) { if (!mask && !READ_ONCE(req->cancelled)) { /* * The request isn't actually ready to be completed yet. * Reschedule completion if another wakeup came in. */ if (req->work_need_resched) { schedule_work(&req->work); req->work_need_resched = false; } else { req->work_scheduled = false; } poll_iocb_unlock_wq(req); spin_unlock_irq(&ctx->ctx_lock); return; } list_del_init(&req->wait.entry); poll_iocb_unlock_wq(req); } /* else, POLLFREE has freed the waitqueue, so we must complete */ list_del_init(&iocb->ki_list); iocb->ki_res.res = mangle_poll(mask); spin_unlock_irq(&ctx->ctx_lock); iocb_put(iocb); } /* assumes we are called with irqs disabled */ static int aio_poll_cancel(struct kiocb *iocb) { struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw); struct poll_iocb *req = &aiocb->poll; if (poll_iocb_lock_wq(req)) { WRITE_ONCE(req->cancelled, true); if (!req->work_scheduled) { schedule_work(&aiocb->poll.work); req->work_scheduled = true; } poll_iocb_unlock_wq(req); } /* else, the request was force-cancelled by POLLFREE already */ return 0; } static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { struct poll_iocb *req = container_of(wait, struct poll_iocb, wait); struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll); __poll_t mask = key_to_poll(key); unsigned long flags; /* for instances that support it check for an event match first: */ if (mask && !(mask & req->events)) return 0; /* * Complete the request inline if possible. This requires that three * conditions be met: * 1. An event mask must have been passed. If a plain wakeup was done * instead, then mask == 0 and we have to call vfs_poll() to get * the events, so inline completion isn't possible. * 2. The completion work must not have already been scheduled. * 3. ctx_lock must not be busy. We have to use trylock because we * already hold the waitqueue lock, so this inverts the normal * locking order. Use irqsave/irqrestore because not all * filesystems (e.g. fuse) call this function with IRQs disabled, * yet IRQs have to be disabled before ctx_lock is obtained. 
*/ if (mask && !req->work_scheduled && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { struct kioctx *ctx = iocb->ki_ctx; list_del_init(&req->wait.entry); list_del(&iocb->ki_list); iocb->ki_res.res = mangle_poll(mask); if (iocb->ki_eventfd && !eventfd_signal_allowed()) { iocb = NULL; INIT_WORK(&req->work, aio_poll_put_work); schedule_work(&req->work); } spin_unlock_irqrestore(&ctx->ctx_lock, flags); if (iocb) iocb_put(iocb); } else { /* * Schedule the completion work if needed. If it was already * scheduled, record that another wakeup came in. * * Don't remove the request from the waitqueue here, as it might * not actually be complete yet (we won't know until vfs_poll() * is called), and we must not miss any wakeups. POLLFREE is an * exception to this; see below. */ if (req->work_scheduled) { req->work_need_resched = true; } else { schedule_work(&req->work); req->work_scheduled = true; } /* * If the waitqueue is being freed early but we can't complete * the request inline, we have to tear down the request as best * we can. That means immediately removing the request from its * waitqueue and preventing all further accesses to the * waitqueue via the request. We also need to schedule the * completion work (done above). Also mark the request as * cancelled, to potentially skip an unneeded call to ->poll(). */ if (mask & POLLFREE) { WRITE_ONCE(req->cancelled, true); list_del_init(&req->wait.entry); /* * Careful: this *must* be the last step, since as soon * as req->head is NULL'ed out, the request can be * completed and freed, since aio_poll_complete_work() * will no longer need to take the waitqueue lock. */ smp_store_release(&req->head, NULL); } } return 1; } struct aio_poll_table { struct poll_table_struct pt; struct aio_kiocb *iocb; bool queued; int error; }; static void aio_poll_queue_proc(struct file *file, struct wait_queue_head *head, struct poll_table_struct *p) { struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt); /* multiple wait queues per file are not supported */ if (unlikely(pt->queued)) { pt->error = -EINVAL; return; } pt->queued = true; pt->error = 0; pt->iocb->poll.head = head; add_wait_queue(head, &pt->iocb->poll.wait); } static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) { struct kioctx *ctx = aiocb->ki_ctx; struct poll_iocb *req = &aiocb->poll; struct aio_poll_table apt; bool cancel = false; __poll_t mask; /* reject any unknown events outside the normal event mask. */ if ((u16)iocb->aio_buf != iocb->aio_buf) return -EINVAL; /* reject fields that are not defined for poll */ if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags) return -EINVAL; INIT_WORK(&req->work, aio_poll_complete_work); req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP; req->head = NULL; req->cancelled = false; req->work_scheduled = false; req->work_need_resched = false; apt.pt._qproc = aio_poll_queue_proc; apt.pt._key = req->events; apt.iocb = aiocb; apt.queued = false; apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */ /* initialize the list so that we can do list_empty checks */ INIT_LIST_HEAD(&req->wait.entry); init_waitqueue_func_entry(&req->wait, aio_poll_wake); mask = vfs_poll(req->file, &apt.pt) & req->events; spin_lock_irq(&ctx->ctx_lock); if (likely(apt.queued)) { bool on_queue = poll_iocb_lock_wq(req); if (!on_queue || req->work_scheduled) { /* * aio_poll_wake() already either scheduled the async * completion work, or completed the request inline.
*/ if (apt.error) /* unsupported case: multiple queues */ cancel = true; apt.error = 0; mask = 0; } if (mask || apt.error) { /* Steal to complete synchronously. */ list_del_init(&req->wait.entry); } else if (cancel) { /* Cancel if possible (may be too late though). */ WRITE_ONCE(req->cancelled, true); } else if (on_queue) { /* * Actually waiting for an event, so add the request to * active_reqs so that it can be cancelled if needed. */ list_add_tail(&aiocb->ki_list, &ctx->active_reqs); aiocb->ki_cancel = aio_poll_cancel; } if (on_queue) poll_iocb_unlock_wq(req); } if (mask) { /* no async, we'd stolen it */ aiocb->ki_res.res = mangle_poll(mask); apt.error = 0; } spin_unlock_irq(&ctx->ctx_lock); if (mask) iocb_put(aiocb); return apt.error; } static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb, struct iocb __user *user_iocb, struct aio_kiocb *req, bool compat) { req->ki_filp = fget(iocb->aio_fildes); if (unlikely(!req->ki_filp)) return -EBADF; if (iocb->aio_flags & IOCB_FLAG_RESFD) { struct eventfd_ctx *eventfd; /* * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an * instance of the file* now. The file descriptor must be * an eventfd() fd, and will be signaled for each completed * event using the eventfd_signal() function. */ eventfd = eventfd_ctx_fdget(iocb->aio_resfd); if (IS_ERR(eventfd)) return PTR_ERR(eventfd); req->ki_eventfd = eventfd; } if (unlikely(put_user(KIOCB_KEY, &user_iocb->aio_key))) { pr_debug("EFAULT: aio_key\n"); return -EFAULT; } req->ki_res.obj = (u64)(unsigned long)user_iocb; req->ki_res.data = iocb->aio_data; req->ki_res.res = 0; req->ki_res.res2 = 0; switch (iocb->aio_lio_opcode) { case IOCB_CMD_PREAD: return aio_read(&req->rw, iocb, false, compat); case IOCB_CMD_PWRITE: return aio_write(&req->rw, iocb, false, compat); case IOCB_CMD_PREADV: return aio_read(&req->rw, iocb, true, compat); case IOCB_CMD_PWRITEV: return aio_write(&req->rw, iocb, true, compat); case IOCB_CMD_FSYNC: return aio_fsync(&req->fsync, iocb, false); case IOCB_CMD_FDSYNC: return aio_fsync(&req->fsync, iocb, true); case IOCB_CMD_POLL: return aio_poll(req, iocb); default: pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode); return -EINVAL; } } static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, bool compat) { struct aio_kiocb *req; struct iocb iocb; int err; if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb)))) return -EFAULT; /* enforce forwards compatibility on users */ if (unlikely(iocb.aio_reserved2)) { pr_debug("EINVAL: reserve field set\n"); return -EINVAL; } /* prevent overflows */ if (unlikely( (iocb.aio_buf != (unsigned long)iocb.aio_buf) || (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) || ((ssize_t)iocb.aio_nbytes < 0) )) { pr_debug("EINVAL: overflow check\n"); return -EINVAL; } req = aio_get_req(ctx); if (unlikely(!req)) return -EAGAIN; err = __io_submit_one(ctx, &iocb, user_iocb, req, compat); /* Done with the synchronous reference */ iocb_put(req); /* * If err is 0, we've either done aio_complete() ourselves or have * arranged for that to be done asynchronously. Anything non-zero * means that we need to destroy req ourselves. */ if (unlikely(err)) { iocb_destroy(req); put_reqs_available(ctx, 1); } return err; } /* sys_io_submit: * Queue the nr iocbs pointed to by iocbpp for processing. Returns * the number of iocbs queued.
May return -EINVAL if the aio_context * specified by ctx_id is invalid, if nr is < 0, if the iocb at * *iocbpp[0] is not properly initialized, if the operation specified * is invalid for the file descriptor in the iocb. May fail with * -EFAULT if any of the data structures point to invalid data. May * fail with -EBADF if the file descriptor specified in the first * iocb is invalid. May fail with -EAGAIN if insufficient resources * are available to queue any iocbs. Will return 0 if nr is 0. Will * fail with -ENOSYS if not implemented. */ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, struct iocb __user * __user *, iocbpp) { struct kioctx *ctx; long ret = 0; int i = 0; struct blk_plug plug; if (unlikely(nr < 0)) return -EINVAL; ctx = lookup_ioctx(ctx_id); if (unlikely(!ctx)) { pr_debug("EINVAL: invalid context id\n"); return -EINVAL; } if (nr > ctx->nr_events) nr = ctx->nr_events; if (nr > AIO_PLUG_THRESHOLD) blk_start_plug(&plug); for (i = 0; i < nr; i++) { struct iocb __user *user_iocb; if (unlikely(get_user(user_iocb, iocbpp + i))) { ret = -EFAULT; break; } ret = io_submit_one(ctx, user_iocb, false); if (ret) break; } if (nr > AIO_PLUG_THRESHOLD) blk_finish_plug(&plug); percpu_ref_put(&ctx->users); return i ? i : ret; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, int, nr, compat_uptr_t __user *, iocbpp) { struct kioctx *ctx; long ret = 0; int i = 0; struct blk_plug plug; if (unlikely(nr < 0)) return -EINVAL; ctx = lookup_ioctx(ctx_id); if (unlikely(!ctx)) { pr_debug("EINVAL: invalid context id\n"); return -EINVAL; } if (nr > ctx->nr_events) nr = ctx->nr_events; if (nr > AIO_PLUG_THRESHOLD) blk_start_plug(&plug); for (i = 0; i < nr; i++) { compat_uptr_t user_iocb; if (unlikely(get_user(user_iocb, iocbpp + i))) { ret = -EFAULT; break; } ret = io_submit_one(ctx, compat_ptr(user_iocb), true); if (ret) break; } if (nr > AIO_PLUG_THRESHOLD) blk_finish_plug(&plug); percpu_ref_put(&ctx->users); return i ? i : ret; } #endif /* sys_io_cancel: * Attempts to cancel an iocb previously passed to io_submit. If * the operation is successfully cancelled, the resulting event is * copied into the memory pointed to by result without being placed * into the completion queue and 0 is returned. May fail with * -EFAULT if any of the data structures pointed to are invalid. * May fail with -EINVAL if aio_context specified by ctx_id is * invalid. May fail with -EAGAIN if the iocb specified was not * cancelled. Will fail with -ENOSYS if not implemented. */ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, struct io_event __user *, result) { struct kioctx *ctx; struct aio_kiocb *kiocb; int ret = -EINVAL; u32 key; u64 obj = (u64)(unsigned long)iocb; if (unlikely(get_user(key, &iocb->aio_key))) return -EFAULT; if (unlikely(key != KIOCB_KEY)) return -EINVAL; ctx = lookup_ioctx(ctx_id); if (unlikely(!ctx)) return -EINVAL; spin_lock_irq(&ctx->ctx_lock); list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { if (kiocb->ki_res.obj == obj) { ret = kiocb->ki_cancel(&kiocb->rw); list_del_init(&kiocb->ki_list); break; } } spin_unlock_irq(&ctx->ctx_lock); if (!ret) { /* * The result argument is no longer used - the io_event is * always delivered via the ring buffer. -EINPROGRESS indicates * that cancellation is in progress: */ ret = -EINPROGRESS; } percpu_ref_put(&ctx->users); return ret; } static long do_io_getevents(aio_context_t ctx_id, long min_nr, long nr, struct io_event __user *events, struct timespec64 *ts) { ktime_t until = ts ?
timespec64_to_ktime(*ts) : KTIME_MAX; struct kioctx *ioctx = lookup_ioctx(ctx_id); long ret = -EINVAL; if (likely(ioctx)) { if (likely(min_nr <= nr && min_nr >= 0)) ret = read_events(ioctx, min_nr, nr, events, until); percpu_ref_put(&ioctx->users); } return ret; } /* io_getevents: * Attempts to read at least min_nr events and up to nr events from * the completion queue for the aio_context specified by ctx_id. If * it succeeds, the number of read events is returned. May fail with * -EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is * out of range, if timeout is out of range. May fail with -EFAULT * if any of the memory specified is invalid. May return 0 or * < min_nr if the timeout specified by timeout has elapsed * before sufficient events are available, where timeout == NULL * specifies an infinite timeout. Note that the timeout pointed to by * timeout is relative. Will fail with -ENOSYS if not implemented. */ #ifdef CONFIG_64BIT SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, long, min_nr, long, nr, struct io_event __user *, events, struct __kernel_timespec __user *, timeout) { struct timespec64 ts; int ret; if (timeout && unlikely(get_timespec64(&ts, timeout))) return -EFAULT; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); if (!ret && signal_pending(current)) ret = -EINTR; return ret; } #endif struct __aio_sigset { const sigset_t __user *sigmask; size_t sigsetsize; }; SYSCALL_DEFINE6(io_pgetevents, aio_context_t, ctx_id, long, min_nr, long, nr, struct io_event __user *, events, struct __kernel_timespec __user *, timeout, const struct __aio_sigset __user *, usig) { struct __aio_sigset ksig = { NULL, }; struct timespec64 ts; bool interrupted; int ret; if (timeout && unlikely(get_timespec64(&ts, timeout))) return -EFAULT; if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) return -EFAULT; ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize); if (ret) return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); interrupted = signal_pending(current); restore_saved_sigmask_unless(interrupted); if (interrupted && !ret) ret = -ERESTARTNOHAND; return ret; } #if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT) SYSCALL_DEFINE6(io_pgetevents_time32, aio_context_t, ctx_id, long, min_nr, long, nr, struct io_event __user *, events, struct old_timespec32 __user *, timeout, const struct __aio_sigset __user *, usig) { struct __aio_sigset ksig = { NULL, }; struct timespec64 ts; bool interrupted; int ret; if (timeout && unlikely(get_old_timespec32(&ts, timeout))) return -EFAULT; if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) return -EFAULT; ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize); if (ret) return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); interrupted = signal_pending(current); restore_saved_sigmask_unless(interrupted); if (interrupted && !ret) ret = -ERESTARTNOHAND; return ret; } #endif #if defined(CONFIG_COMPAT_32BIT_TIME) SYSCALL_DEFINE5(io_getevents_time32, __u32, ctx_id, __s32, min_nr, __s32, nr, struct io_event __user *, events, struct old_timespec32 __user *, timeout) { struct timespec64 t; int ret; if (timeout && get_old_timespec32(&t, timeout)) return -EFAULT; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? 
&t : NULL); if (!ret && signal_pending(current)) ret = -EINTR; return ret; } #endif #ifdef CONFIG_COMPAT struct __compat_aio_sigset { compat_uptr_t sigmask; compat_size_t sigsetsize; }; #if defined(CONFIG_COMPAT_32BIT_TIME) COMPAT_SYSCALL_DEFINE6(io_pgetevents, compat_aio_context_t, ctx_id, compat_long_t, min_nr, compat_long_t, nr, struct io_event __user *, events, struct old_timespec32 __user *, timeout, const struct __compat_aio_sigset __user *, usig) { struct __compat_aio_sigset ksig = { 0, }; struct timespec64 t; bool interrupted; int ret; if (timeout && get_old_timespec32(&t, timeout)) return -EFAULT; if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) return -EFAULT; ret = set_compat_user_sigmask(compat_ptr(ksig.sigmask), ksig.sigsetsize); if (ret) return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); interrupted = signal_pending(current); restore_saved_sigmask_unless(interrupted); if (interrupted && !ret) ret = -ERESTARTNOHAND; return ret; } #endif COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64, compat_aio_context_t, ctx_id, compat_long_t, min_nr, compat_long_t, nr, struct io_event __user *, events, struct __kernel_timespec __user *, timeout, const struct __compat_aio_sigset __user *, usig) { struct __compat_aio_sigset ksig = { 0, }; struct timespec64 t; bool interrupted; int ret; if (timeout && get_timespec64(&t, timeout)) return -EFAULT; if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) return -EFAULT; ret = set_compat_user_sigmask(compat_ptr(ksig.sigmask), ksig.sigsetsize); if (ret) return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); interrupted = signal_pending(current); restore_saved_sigmask_unless(interrupted); if (interrupted && !ret) ret = -ERESTARTNOHAND; return ret; } #endif
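/*
 * A minimal, hypothetical userspace sketch (not part of the kernel sources)
 * of the syscall flow implemented above: io_setup() creates the completion
 * ring, io_submit() queues a single IOCB_CMD_PREAD, and io_getevents() reaps
 * the result. Raw syscall(2) is used because glibc does not wrap these
 * calls; error handling is reduced to early returns.
 */
#if 0	/* userspace example, not kernel code */
#include <linux/aio_abi.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

static long demo_aio_read(int fd, void *buf, size_t len)
{
	aio_context_t ctx = 0;		/* must be zero before io_setup() */
	struct iocb cb;
	struct iocb *cbs[1] = { &cb };
	struct io_event ev;

	if (syscall(SYS_io_setup, 128, &ctx) < 0)
		return -1;

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_lio_opcode = IOCB_CMD_PREAD;
	cb.aio_buf = (unsigned long)buf;	/* user buffer, passed as a __u64 */
	cb.aio_nbytes = len;
	cb.aio_offset = 0;

	/* one iocb in, then block for the single completion event */
	if (syscall(SYS_io_submit, ctx, 1, cbs) != 1 ||
	    syscall(SYS_io_getevents, ctx, 1, 1, &ev, NULL) != 1) {
		syscall(SYS_io_destroy, ctx);
		return -1;
	}
	syscall(SYS_io_destroy, ctx);
	return ev.res;	/* bytes read, or a negative errno */
}
#endif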
// SPDX-License-Identifier: GPL-2.0-or-later #include <linux/plist.h> #include <linux/sched/task.h> #include <linux/sched/signal.h> #include <linux/freezer.h> #include "futex.h" /* * READ this before attempting to hack on futexes!
 *
 * Basic futex operation and ordering guarantees
 * =============================================
 *
 * The waiter reads the futex value in user space and calls
 * futex_wait(). This function computes the hash bucket and acquires
 * the hash bucket lock. After that it reads the futex user space value
 * again and verifies that the data has not changed. If it has not changed
 * it enqueues itself into the hash bucket, releases the hash bucket lock
 * and schedules.
 *
 * The waker side modifies the user space value of the futex and calls
 * futex_wake(). This function computes the hash bucket and acquires the
 * hash bucket lock. Then it looks for waiters on that futex in the hash
 * bucket and wakes them.
 *
 * In futex wake up scenarios where no tasks are blocked on a futex, taking
 * the hb spinlock can be avoided and the syscall can simply return. For this
 * optimization to work, ordering guarantees must exist so that the waiter
 * being added to the list is acknowledged when the list is concurrently being
 * checked by the waker, avoiding scenarios like the following:
 *
 * CPU 0                               CPU 1
 * val = *futex;
 * sys_futex(WAIT, futex, val);
 *   futex_wait(futex, val);
 *   uval = *futex;
 *                                     *futex = newval;
 *                                     sys_futex(WAKE, futex);
 *                                       futex_wake(futex);
 *                                       if (queue_empty())
 *                                         return;
 *   if (uval == val)
 *     lock(hash_bucket(futex));
 *     queue();
 *     unlock(hash_bucket(futex));
 *     schedule();
 *
 * This would cause the waiter on CPU 0 to wait forever because it
 * missed the transition of the user space value from val to newval
 * and the waker did not find the waiter in the hash bucket queue.
 *
 * The correct serialization ensures that a waiter either observes
 * the changed user space value before blocking or is woken by a
 * concurrent waker:
 *
 * CPU 0                                 CPU 1
 * val = *futex;
 * sys_futex(WAIT, futex, val);
 *   futex_wait(futex, val);
 *
 *   waiters++; (a)
 *   smp_mb(); (A) <-- paired with -.
 *                                  |
 *   lock(hash_bucket(futex));      |
 *                                  |
 *   uval = *futex;                 |
 *                                  |        *futex = newval;
 *                                  |        sys_futex(WAKE, futex);
 *                                  |          futex_wake(futex);
 *                                  |
 *                                  `--------> smp_mb(); (B)
 *   if (uval == val)
 *     queue();
 *     unlock(hash_bucket(futex));
 *     schedule();                         if (waiters)
 *                                           lock(hash_bucket(futex));
 *   else                                     wake_waiters(futex);
 *     waiters--; (b)                         unlock(hash_bucket(futex));
 *
 * Where (A) orders the waiters increment and the futex value read through
 * atomic operations (see futex_hb_waiters_inc) and where (B) orders the write
 * to futex and the waiters read (see futex_hb_waiters_pending()).
 *
 * This yields the following case (where X:=waiters, Y:=futex):
 *
 *	X = Y = 0
 *
 *	w[X]=1		w[Y]=1
 *	MB		MB
 *	r[Y]=y		r[X]=x
 *
 * Which guarantees that x==0 && y==0 is impossible; which translates back into
 * the guarantee that we cannot both miss the futex variable change and the
 * enqueue.
 *
 * Note that a new waiter is accounted for in (a) even when it is possible that
 * the wait call can return error, in which case we backtrack from it in (b).
 * Refer to the comment in futex_q_lock().
 *
 * Similarly, in order to account for waiters being requeued on another
 * address we always increment the waiters for the destination bucket before
 * acquiring the lock and decrement them again after releasing it - the code
 * that actually moves the futex(es) between hash buckets (requeue_futex)
 * will do the additional required waiter count housekeeping. This is done
 * in double_lock_hb() and double_unlock_hb(), respectively.
*/ bool __futex_wake_mark(struct futex_q *q) { if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) return false; __futex_unqueue(q); /* * The waiting task can free the futex_q as soon as q->lock_ptr = NULL * is written, without taking any locks. This is possible in the event * of a spurious wakeup, for example. A memory barrier is required here * to prevent the following store to lock_ptr from getting ahead of the * plist_del in __futex_unqueue(). */ smp_store_release(&q->lock_ptr, NULL); return true; } /* * The hash bucket lock must be held when this is called. * Afterwards, the futex_q must not be accessed. Callers * must ensure to later call wake_up_q() for the actual * wakeups to occur. */ void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q) { struct task_struct *p = q->task; get_task_struct(p); if (!__futex_wake_mark(q)) { put_task_struct(p); return; } /* * Queue the task for later wakeup for after we've released * the hb->lock. */ wake_q_add_safe(wake_q, p); } /* * Wake up waiters matching bitset queued on this futex (uaddr). */ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) { struct futex_q *this, *next; union futex_key key = FUTEX_KEY_INIT; DEFINE_WAKE_Q(wake_q); int ret; if (!bitset) return -EINVAL; ret = get_futex_key(uaddr, flags, &key, FUTEX_READ); if (unlikely(ret != 0)) return ret; if ((flags & FLAGS_STRICT) && !nr_wake) return 0; CLASS(hb, hb)(&key); /* Make sure we really have tasks to wakeup */ if (!futex_hb_waiters_pending(hb)) return ret; spin_lock(&hb->lock); plist_for_each_entry_safe(this, next, &hb->chain, list) { if (futex_match (&this->key, &key)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; break; } /* Check if one of the bits is set in both bitsets */ if (!(this->bitset & bitset)) continue; this->wake(&wake_q, this); if (++ret >= nr_wake) break; } } spin_unlock(&hb->lock); wake_up_q(&wake_q); return ret; } static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) { unsigned int op = (encoded_op & 0x70000000) >> 28; unsigned int cmp = (encoded_op & 0x0f000000) >> 24; int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11); int cmparg = sign_extend32(encoded_op & 0x00000fff, 11); int oldval, ret; if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { if (oparg < 0 || oparg > 31) { /* * kill this print and return -EINVAL when userspace * is sane again */ pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n", current->comm, oparg); oparg &= 31; } oparg = 1 << oparg; } pagefault_disable(); ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); pagefault_enable(); if (ret) return ret; switch (cmp) { case FUTEX_OP_CMP_EQ: return oldval == cmparg; case FUTEX_OP_CMP_NE: return oldval != cmparg; case FUTEX_OP_CMP_LT: return oldval < cmparg; case FUTEX_OP_CMP_GE: return oldval >= cmparg; case FUTEX_OP_CMP_LE: return oldval <= cmparg; case FUTEX_OP_CMP_GT: return oldval > cmparg; default: return -ENOSYS; } } /* * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: */ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, int nr_wake, int nr_wake2, int op) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; struct futex_q *this, *next; int ret, op_ret; DEFINE_WAKE_Q(wake_q); retry: ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ); if (unlikely(ret != 0)) return ret; ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) return ret; 
retry_private: if (1) { CLASS(hb, hb1)(&key1); CLASS(hb, hb2)(&key2); double_lock_hb(hb1, hb2); op_ret = futex_atomic_op_inuser(op, uaddr2); if (unlikely(op_ret < 0)) { double_unlock_hb(hb1, hb2); if (!IS_ENABLED(CONFIG_MMU) || unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { /* * we don't get EFAULT from MMU faults if we don't have * an MMU, but we might get them from range checking */ ret = op_ret; return ret; } if (op_ret == -EFAULT) { ret = fault_in_user_writeable(uaddr2); if (ret) return ret; } cond_resched(); if (!(flags & FLAGS_SHARED)) goto retry_private; goto retry; } plist_for_each_entry_safe(this, next, &hb1->chain, list) { if (futex_match(&this->key, &key1)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; goto out_unlock; } this->wake(&wake_q, this); if (++ret >= nr_wake) break; } } if (op_ret > 0) { op_ret = 0; plist_for_each_entry_safe(this, next, &hb2->chain, list) { if (futex_match(&this->key, &key2)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; goto out_unlock; } this->wake(&wake_q, this); if (++op_ret >= nr_wake2) break; } } ret += op_ret; } out_unlock: double_unlock_hb(hb1, hb2); } wake_up_q(&wake_q); return ret; } static long futex_wait_restart(struct restart_block *restart); /** * futex_do_wait() - wait for wakeup, timeout, or signal * @q: the futex_q to queue up on * @timeout: the prepared hrtimer_sleeper, or null for no timeout */ void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout) { /* Arm the timer */ if (timeout) hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); /* * If we have been removed from the hash list, then another task * has tried to wake us, and we can skip the call to schedule(). */ if (likely(!plist_node_empty(&q->list))) { /* * If the timer has already expired, current will already be * flagged for rescheduling. Only call schedule if there * is no timeout, or if it has yet to expire. */ if (!timeout || timeout->task) schedule(); } __set_current_state(TASK_RUNNING); } /** * futex_unqueue_multiple - Remove various futexes from their hash bucket * @v: The list of futexes to unqueue * @count: Number of futexes in the list * * Helper to unqueue a list of futexes. This can't fail. * * Return: * - >=0 - Index of the last futex that was awoken; * - -1 - No futex was awoken */ int futex_unqueue_multiple(struct futex_vector *v, int count) { int ret = -1, i; for (i = 0; i < count; i++) { if (!futex_unqueue(&v[i].q)) ret = i; } return ret; } /** * futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes * @vs: The futex list to wait on * @count: The size of the list * @woken: Index of the last woken futex, if any. Used to notify the * caller that it can return this index to userspace (return parameter) * * Prepare multiple futexes in a single step and enqueue them. This may fail if * the futex list is invalid or if any futex was already awoken. On success the * task is ready to interruptible sleep. * * Return: * - 1 - One of the futexes was woken by another thread * - 0 - Success * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL */ int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken) { bool retry = false; int ret, i; u32 uval; /* * Make sure to have a reference on the private_hash such that we * don't block on rehash after changing the task state below. */ guard(private_hash)(); /* * Enqueuing multiple futexes is tricky, because we need to enqueue * each futex on the list before dealing with the next one to avoid * deadlocking on the hash bucket. 
	 * But, before enqueuing, we need to make sure that current->state is
	 * TASK_INTERRUPTIBLE, so we don't lose any wake events, which cannot
	 * be done before the get_futex_key of the next key, because it calls
	 * get_user_pages, which can sleep. Thus, we fetch the list of futex
	 * keys in two steps, by first pinning all the memory keys in the
	 * futex key, and only then we read each key and queue the
	 * corresponding futex.
	 *
	 * Private futexes don't need to recalculate the hash on retry, so
	 * skip get_futex_key() when retrying.
	 */
retry:
	for (i = 0; i < count; i++) {
		if (!(vs[i].w.flags & FLAGS_SHARED) && retry)
			continue;

		ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr),
				    vs[i].w.flags,
				    &vs[i].q.key, FUTEX_READ);

		if (unlikely(ret))
			return ret;
	}

	set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);

	for (i = 0; i < count; i++) {
		u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr;
		struct futex_q *q = &vs[i].q;
		u32 val = vs[i].w.val;

		if (1) {
			CLASS(hb, hb)(&q->key);

			futex_q_lock(q, hb);
			ret = futex_get_value_locked(&uval, uaddr);

			if (!ret && uval == val) {
				/*
				 * The bucket lock can't be held while dealing
				 * with the next futex. Queue each futex at
				 * this moment so hb can be unlocked.
				 */
				futex_queue(q, hb, current);
				continue;
			}

			futex_q_unlock(hb);
		}

		__set_current_state(TASK_RUNNING);

		/*
		 * Even if something went wrong, if we find out that a futex
		 * was woken, we don't return an error and instead return
		 * this index to userspace.
		 */
		*woken = futex_unqueue_multiple(vs, i);
		if (*woken >= 0)
			return 1;

		if (ret) {
			/*
			 * If we need to handle a page fault, we need to do so
			 * without any lock and any enqueued futex (otherwise
			 * we could lose some wakeup). So we do it here, after
			 * undoing all the work done so far. On success, we
			 * retry all the work.
			 */
			if (get_user(uval, uaddr))
				return -EFAULT;

			retry = true;
			goto retry;
		}

		if (uval != val)
			return -EWOULDBLOCK;
	}

	return 0;
}

/**
 * futex_sleep_multiple - Check sleeping conditions and sleep
 * @vs: List of futexes to wait for
 * @count: Length of vs
 * @to: Timeout
 *
 * Sleep if and only if the timeout hasn't expired and no futex on the list
 * has been woken up.
 */
static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count,
				 struct hrtimer_sleeper *to)
{
	if (to && !to->task)
		return;

	for (; count; count--, vs++) {
		if (!READ_ONCE(vs->q.lock_ptr))
			return;
	}

	schedule();
}

/**
 * futex_wait_multiple - Prepare to wait on and enqueue several futexes
 * @vs: The list of futexes to wait on
 * @count: The number of objects
 * @to: Timeout before giving up and returning to userspace
 *
 * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function
 * sleeps on a group of futexes and returns on the first futex that was
 * woken, or after the timeout has elapsed.
 *
 * Return:
 *  - >=0 - Hint to the futex that was awoken
 *  - <0  - On error
 */
int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
			struct hrtimer_sleeper *to)
{
	int ret, hint = 0;

	if (to)
		hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);

	while (1) {
		ret = futex_wait_multiple_setup(vs, count, &hint);
		if (ret) {
			if (ret > 0) {
				/* A futex was woken during setup */
				ret = hint;
			}
			return ret;
		}

		futex_sleep_multiple(vs, count, to);

		__set_current_state(TASK_RUNNING);

		ret = futex_unqueue_multiple(vs, count);
		if (ret >= 0)
			return ret;

		if (to && !to->task)
			return -ETIMEDOUT;
		else if (signal_pending(current))
			return -ERESTARTSYS;
		/*
		 * The final case is a spurious wakeup, in
		 * which case we just retry.
*/ } } /** * futex_wait_setup() - Prepare to wait on a futex * @uaddr: the futex userspace address * @val: the expected value * @flags: futex flags (FLAGS_SHARED, etc.) * @q: the associated futex_q * @key2: the second futex_key if used for requeue PI * @task: Task queueing this futex * * Setup the futex_q and locate the hash_bucket. Get the futex value and * compare it with the expected value. Handle atomic faults internally. * Return with the hb lock held on success, and unlocked on failure. * * Return: * - 0 - uaddr contains val and hb has been locked; * - <0 - On error and the hb is unlocked. A possible reason: the uaddr can not * be read, does not contain the expected value or is not properly aligned. */ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, struct futex_q *q, union futex_key *key2, struct task_struct *task) { u32 uval; int ret; /* * Access the page AFTER the hash-bucket is locked. * Order is important: * * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } * * The basic logical guarantee of a futex is that it blocks ONLY * if cond(var) is known to be true at the time of blocking, for * any cond. If we locked the hash-bucket after testing *uaddr, that * would open a race condition where we could block indefinitely with * cond(var) false, which would violate the guarantee. * * On the other hand, we insert q and release the hash-bucket only * after testing *uaddr. This guarantees that futex_wait() will NOT * absorb a wakeup if *uaddr does not match the desired values * while the syscall executes. */ retry: ret = get_futex_key(uaddr, flags, &q->key, FUTEX_READ); if (unlikely(ret != 0)) return ret; retry_private: if (1) { CLASS(hb, hb)(&q->key); futex_q_lock(q, hb); ret = futex_get_value_locked(&uval, uaddr); if (ret) { futex_q_unlock(hb); ret = get_user(uval, uaddr); if (ret) return ret; if (!(flags & FLAGS_SHARED)) goto retry_private; goto retry; } if (uval != val) { futex_q_unlock(hb); return -EWOULDBLOCK; } if (key2 && futex_match(&q->key, key2)) { futex_q_unlock(hb); return -EINVAL; } /* * The task state is guaranteed to be set before another task can * wake it. set_current_state() is implemented using smp_store_mb() and * futex_queue() calls spin_unlock() upon completion, both serializing * access to the hash list and forcing another memory barrier. */ if (task == current) set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); futex_queue(q, hb, task); } return ret; } int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, struct hrtimer_sleeper *to, u32 bitset) { struct futex_q q = futex_q_init; int ret; if (!bitset) return -EINVAL; q.bitset = bitset; retry: /* * Prepare to wait on uaddr. On success, it holds hb->lock and q * is initialized. */ ret = futex_wait_setup(uaddr, val, flags, &q, NULL, current); if (ret) return ret; /* futex_queue and wait for wakeup, timeout, or a signal. */ futex_do_wait(&q, to); /* If we were woken (and unqueued), we succeeded, whatever. */ if (!futex_unqueue(&q)) return 0; if (to && !to->task) return -ETIMEDOUT; /* * We expect signal_pending(current), but we might be the * victim of a spurious wakeup as well. 
	 */
	if (!signal_pending(current))
		goto retry;

	return -ERESTARTSYS;
}

int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
	       ktime_t *abs_time, u32 bitset)
{
	struct hrtimer_sleeper timeout, *to;
	struct restart_block *restart;
	int ret;

	to = futex_setup_timer(abs_time, &timeout, flags,
			       current->timer_slack_ns);

	ret = __futex_wait(uaddr, flags, val, to, bitset);

	/* No timeout, nothing to clean up. */
	if (!to)
		return ret;

	hrtimer_cancel(&to->timer);
	destroy_hrtimer_on_stack(&to->timer);

	if (ret == -ERESTARTSYS) {
		restart = &current->restart_block;

		restart->futex.uaddr = uaddr;
		restart->futex.val = val;
		restart->futex.time = *abs_time;
		restart->futex.bitset = bitset;
		restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;

		return set_restart_fn(restart, futex_wait_restart);
	}

	return ret;
}

static long futex_wait_restart(struct restart_block *restart)
{
	u32 __user *uaddr = restart->futex.uaddr;
	ktime_t t, *tp = NULL;

	if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
		t = restart->futex.time;
		tp = &t;
	}
	restart->fn = do_no_restart_syscall;

	return (long)futex_wait(uaddr, restart->futex.flags,
				restart->futex.val, tp, restart->futex.bitset);
}
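/*
 * Editor's illustration (not part of the kernel source): a minimal
 * userspace sketch of the wait/wake protocol that the ordering comment
 * at the top of this file describes; all names here are hypothetical.
 * The waiter passes the value it last read, and FUTEX_WAIT blocks only
 * if the word still holds that value (cf. futex_wait_setup() above).
 * The waker stores the new value *before* FUTEX_WAKE, so the waiter
 * either sees the change and returns immediately or is found in the
 * hash bucket and woken. Build with: cc -pthread futex_demo.c
 */
#include <linux/futex.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static atomic_int futex_word;	/* the 32-bit futex word in user memory */

static long futex(atomic_int *uaddr, int op, int val)
{
	/* Unused timeout/uaddr2/val3 arguments are passed as NULL/0. */
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

static void *waiter(void *arg)
{
	int val = atomic_load(&futex_word);

	/*
	 * Sleeps only while futex_word == val; wakeups may be spurious,
	 * so a real program would re-check its condition in a loop.
	 */
	futex(&futex_word, FUTEX_WAIT, val);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waiter, NULL);
	usleep(10000);			/* crude: let the waiter block first */

	atomic_store(&futex_word, 1);	/* modify the user space value... */
	futex(&futex_word, FUTEX_WAKE, 1); /* ...then wake (cf. futex_wake()) */

	pthread_join(t, NULL);
	puts("waiter woken");
	return 0;
}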
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/drivers/char/misc.c
 *
 * Generic misc open routine by Johan Myreen
 *
 * Based on code from Linus
 *
 * Teemu Rantanen's Microsoft Busmouse support and Derrick Cole's
 *     changes incorporated into 0.97pl4
 *     by Peter Cervasio (pete%q106fm.uucp@wupost.wustl.edu) (08SEP92)
 *     See busmouse.c for particulars.
 *
 * Made things a lot more modular - easy to compile in just one or two
 * of the misc drivers, as they are now completely independent. Linus.
 *
 * Support for loadable modules. 8-Sep-95 Philip Blundell <pjb27@cam.ac.uk>
 *
 * Fixed a failing symbol register to free the device registration
 *		Alan Cox <alan@lxorguk.ukuu.org.uk> 21-Jan-96
 *
 * Dynamic minors and /proc/mice by Alessandro Rubini. 26-Mar-96
 *
 * Renamed to misc and miscdevice to be more accurate. Alan Cox 26-Mar-96
 *
 * Handling of mouse minor numbers for kerneld:
 *  Idea by Jacques Gelinas <jack@solucorp.qc.ca>,
 *  adapted by Bjorn Ekwall <bj0rn@blox.se>
 *  corrected by Alan Cox <alan@lxorguk.ukuu.org.uk>
 *
 * Changes for kmod (from kerneld):
 *	Cyrus Durgin <cider@speakeasy.org>
 *
 * Added devfs support. Richard Gooch <rgooch@atnf.csiro.au>  10-Jan-1998
 */

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/errno.h>
#include <linux/miscdevice.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/mutex.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stat.h>
#include <linux/init.h>
#include <linux/device.h>
#include <linux/tty.h>
#include <linux/kmod.h>
#include <linux/gfp.h>

/*
 * Head entry for the doubly linked miscdevice list
 */
static LIST_HEAD(misc_list);
static DEFINE_MUTEX(misc_mtx);

/*
 * Assigned numbers.
*/ static DEFINE_IDA(misc_minors_ida); static int misc_minor_alloc(int minor) { int ret = 0; if (minor == MISC_DYNAMIC_MINOR) { /* allocate free id */ ret = ida_alloc_range(&misc_minors_ida, MISC_DYNAMIC_MINOR + 1, MINORMASK, GFP_KERNEL); } else { ret = ida_alloc_range(&misc_minors_ida, minor, minor, GFP_KERNEL); } return ret; } static void misc_minor_free(int minor) { ida_free(&misc_minors_ida, minor); } #ifdef CONFIG_PROC_FS static void *misc_seq_start(struct seq_file *seq, loff_t *pos) { mutex_lock(&misc_mtx); return seq_list_start(&misc_list, *pos); } static void *misc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { return seq_list_next(v, &misc_list, pos); } static void misc_seq_stop(struct seq_file *seq, void *v) { mutex_unlock(&misc_mtx); } static int misc_seq_show(struct seq_file *seq, void *v) { const struct miscdevice *p = list_entry(v, struct miscdevice, list); seq_printf(seq, "%3i %s\n", p->minor, p->name ? p->name : ""); return 0; } static const struct seq_operations misc_seq_ops = { .start = misc_seq_start, .next = misc_seq_next, .stop = misc_seq_stop, .show = misc_seq_show, }; #endif static int misc_open(struct inode *inode, struct file *file) { int minor = iminor(inode); struct miscdevice *c = NULL, *iter; int err = -ENODEV; const struct file_operations *new_fops = NULL; mutex_lock(&misc_mtx); list_for_each_entry(iter, &misc_list, list) { if (iter->minor != minor) continue; c = iter; new_fops = fops_get(iter->fops); break; } if (!new_fops) { mutex_unlock(&misc_mtx); request_module("char-major-%d-%d", MISC_MAJOR, minor); mutex_lock(&misc_mtx); list_for_each_entry(iter, &misc_list, list) { if (iter->minor != minor) continue; c = iter; new_fops = fops_get(iter->fops); break; } if (!new_fops) goto fail; } /* * Place the miscdevice in the file's * private_data so it can be used by the * file operations, including f_op->open below */ file->private_data = c; err = 0; replace_fops(file, new_fops); if (file->f_op->open) err = file->f_op->open(inode, file); fail: mutex_unlock(&misc_mtx); return err; } static char *misc_devnode(const struct device *dev, umode_t *mode) { const struct miscdevice *c = dev_get_drvdata(dev); if (mode && c->mode) *mode = c->mode; if (c->nodename) return kstrdup(c->nodename, GFP_KERNEL); return NULL; } static const struct class misc_class = { .name = "misc", .devnode = misc_devnode, }; static const struct file_operations misc_fops = { .owner = THIS_MODULE, .open = misc_open, .llseek = noop_llseek, }; /** * misc_register - register a miscellaneous device * @misc: device structure * * Register a miscellaneous device with the kernel. If the minor * number is set to %MISC_DYNAMIC_MINOR a minor number is assigned * and placed in the minor field of the structure. For other cases * the minor number requested is used. * * The structure passed is linked into the kernel and may not be * destroyed until it has been unregistered. By default, an open() * syscall to the device sets file->private_data to point to the * structure. Drivers don't need open in fops for this. * * A zero is returned on success and a negative errno code for * failure. 
 */
int misc_register(struct miscdevice *misc)
{
	dev_t dev;
	int err = 0;
	bool is_dynamic = (misc->minor == MISC_DYNAMIC_MINOR);

	INIT_LIST_HEAD(&misc->list);

	mutex_lock(&misc_mtx);

	if (is_dynamic) {
		int i = misc_minor_alloc(misc->minor);

		if (i < 0) {
			err = -EBUSY;
			goto out;
		}
		misc->minor = i;
	} else {
		struct miscdevice *c;
		int i;

		list_for_each_entry(c, &misc_list, list) {
			if (c->minor == misc->minor) {
				err = -EBUSY;
				goto out;
			}
		}

		i = misc_minor_alloc(misc->minor);
		if (i < 0) {
			err = -EBUSY;
			goto out;
		}
	}

	dev = MKDEV(MISC_MAJOR, misc->minor);

	misc->this_device =
		device_create_with_groups(&misc_class, misc->parent, dev,
					  misc, misc->groups, "%s", misc->name);
	if (IS_ERR(misc->this_device)) {
		misc_minor_free(misc->minor);
		if (is_dynamic)
			misc->minor = MISC_DYNAMIC_MINOR;
		err = PTR_ERR(misc->this_device);
		goto out;
	}

	/*
	 * Add it to the front, so that later devices can "override"
	 * earlier defaults
	 */
	list_add(&misc->list, &misc_list);
out:
	mutex_unlock(&misc_mtx);
	return err;
}
EXPORT_SYMBOL(misc_register);

/**
 * misc_deregister - unregister a miscellaneous device
 * @misc: device to unregister
 *
 * Unregister a miscellaneous device that was previously
 * successfully registered with misc_register().
 */
void misc_deregister(struct miscdevice *misc)
{
	if (WARN_ON(list_empty(&misc->list)))
		return;

	mutex_lock(&misc_mtx);
	list_del(&misc->list);
	device_destroy(&misc_class, MKDEV(MISC_MAJOR, misc->minor));
	misc_minor_free(misc->minor);
	mutex_unlock(&misc_mtx);
}
EXPORT_SYMBOL(misc_deregister);

static int __init misc_init(void)
{
	int err;
	struct proc_dir_entry *misc_proc_file;

	misc_proc_file = proc_create_seq("misc", 0, NULL, &misc_seq_ops);
	err = class_register(&misc_class);
	if (err)
		goto fail_remove;

	err = __register_chrdev(MISC_MAJOR, 0, MINORMASK + 1, "misc",
				&misc_fops);
	if (err < 0)
		goto fail_printk;
	return 0;

fail_printk:
	pr_err("unable to get major %d for misc devices\n", MISC_MAJOR);
	class_unregister(&misc_class);
fail_remove:
	if (misc_proc_file)
		remove_proc_entry("misc", NULL);
	return err;
}
subsys_initcall(misc_init);
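/*
 * Editor's illustration (not part of the kernel source): a minimal
 * sketch of a driver using the misc_register() API documented above.
 * The device name and fops here are hypothetical. With
 * MISC_DYNAMIC_MINOR the assigned minor lands in hello_misc.minor,
 * and misc_open() hands the driver back its miscdevice through
 * file->private_data, so no open() method is needed for that.
 */
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/module.h>

static ssize_t hello_read(struct file *file, char __user *buf,
			  size_t count, loff_t *ppos)
{
	static const char msg[] = "hello\n";

	/* file->private_data points at hello_misc, set by misc_open(). */
	return simple_read_from_buffer(buf, count, ppos, msg, sizeof(msg) - 1);
}

static const struct file_operations hello_fops = {
	.owner	= THIS_MODULE,
	.read	= hello_read,
};

static struct miscdevice hello_misc = {
	.minor	= MISC_DYNAMIC_MINOR,	/* ask for a free minor */
	.name	= "hello",		/* appears as /dev/hello */
	.fops	= &hello_fops,
};

static int __init hello_init(void)
{
	return misc_register(&hello_misc);	/* 0 on success */
}

static void __exit hello_exit(void)
{
	misc_deregister(&hello_misc);
}

module_init(hello_init);
module_exit(hello_exit);
MODULE_LICENSE("GPL");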
// SPDX-License-Identifier: GPL-2.0-only
/*
 * GENEVE: Generic Network Virtualization Encapsulation
 *
 * Copyright (c) 2015 Red Hat, Inc.
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/ethtool.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/etherdevice.h> #include <linux/hash.h> #include <net/ipv6_stubs.h> #include <net/dst_metadata.h> #include <net/gro_cells.h> #include <net/rtnetlink.h> #include <net/geneve.h> #include <net/gro.h> #include <net/netdev_lock.h> #include <net/protocol.h> #define GENEVE_NETDEV_VER "0.6" #define GENEVE_N_VID (1u << 24) #define GENEVE_VID_MASK (GENEVE_N_VID - 1) #define VNI_HASH_BITS 10 #define VNI_HASH_SIZE (1<<VNI_HASH_BITS) static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); #define GENEVE_VER 0 #define GENEVE_BASE_HLEN (sizeof(struct udphdr) + sizeof(struct genevehdr)) #define GENEVE_IPV4_HLEN (ETH_HLEN + sizeof(struct iphdr) + GENEVE_BASE_HLEN) #define GENEVE_IPV6_HLEN (ETH_HLEN + sizeof(struct ipv6hdr) + GENEVE_BASE_HLEN) /* per-network namespace private data for this module */ struct geneve_net { struct list_head geneve_list; /* sock_list is protected by rtnl lock */ struct list_head sock_list; }; static unsigned int geneve_net_id; struct geneve_dev_node { struct hlist_node hlist; struct geneve_dev *geneve; }; struct geneve_config { struct ip_tunnel_info info; bool collect_md; bool use_udp6_rx_checksums; bool ttl_inherit; enum ifla_geneve_df df; bool inner_proto_inherit; u16 port_min; u16 port_max; }; /* Pseudo network device */ struct geneve_dev { struct geneve_dev_node hlist4; /* vni hash table for IPv4 socket */ #if IS_ENABLED(CONFIG_IPV6) struct geneve_dev_node hlist6; /* vni hash table for IPv6 socket */ #endif struct net *net; /* netns for packet i/o */ struct net_device *dev; /* netdev for geneve tunnel */ struct geneve_sock __rcu *sock4; /* IPv4 socket used for geneve tunnel */ #if IS_ENABLED(CONFIG_IPV6) struct geneve_sock __rcu *sock6; /* IPv6 socket used for geneve tunnel */ #endif struct list_head next; /* geneve's per namespace list */ struct gro_cells gro_cells; struct geneve_config cfg; }; struct geneve_sock { bool collect_md; struct list_head list; struct socket *sock; struct rcu_head rcu; int refcnt; struct hlist_head vni_list[VNI_HASH_SIZE]; }; static inline __u32 geneve_net_vni_hash(u8 vni[3]) { __u32 vnid; vnid = (vni[0] << 16) | (vni[1] << 8) | vni[2]; return hash_32(vnid, VNI_HASH_BITS); } static __be64 vni_to_tunnel_id(const __u8 *vni) { #ifdef __BIG_ENDIAN return (vni[0] << 16) | (vni[1] << 8) | vni[2]; #else return (__force __be64)(((__force u64)vni[0] << 40) | ((__force u64)vni[1] << 48) | ((__force u64)vni[2] << 56)); #endif } /* Convert 64 bit tunnel ID to 24 bit VNI. 
*/ static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni) { #ifdef __BIG_ENDIAN vni[0] = (__force __u8)(tun_id >> 16); vni[1] = (__force __u8)(tun_id >> 8); vni[2] = (__force __u8)tun_id; #else vni[0] = (__force __u8)((__force u64)tun_id >> 40); vni[1] = (__force __u8)((__force u64)tun_id >> 48); vni[2] = (__force __u8)((__force u64)tun_id >> 56); #endif } static bool eq_tun_id_and_vni(u8 *tun_id, u8 *vni) { return !memcmp(vni, &tun_id[5], 3); } static sa_family_t geneve_get_sk_family(struct geneve_sock *gs) { return gs->sock->sk->sk_family; } static struct geneve_dev *geneve_lookup(struct geneve_sock *gs, __be32 addr, u8 vni[]) { struct hlist_head *vni_list_head; struct geneve_dev_node *node; __u32 hash; /* Find the device for this VNI */ hash = geneve_net_vni_hash(vni); vni_list_head = &gs->vni_list[hash]; hlist_for_each_entry_rcu(node, vni_list_head, hlist) { if (eq_tun_id_and_vni((u8 *)&node->geneve->cfg.info.key.tun_id, vni) && addr == node->geneve->cfg.info.key.u.ipv4.dst) return node->geneve; } return NULL; } #if IS_ENABLED(CONFIG_IPV6) static struct geneve_dev *geneve6_lookup(struct geneve_sock *gs, struct in6_addr addr6, u8 vni[]) { struct hlist_head *vni_list_head; struct geneve_dev_node *node; __u32 hash; /* Find the device for this VNI */ hash = geneve_net_vni_hash(vni); vni_list_head = &gs->vni_list[hash]; hlist_for_each_entry_rcu(node, vni_list_head, hlist) { if (eq_tun_id_and_vni((u8 *)&node->geneve->cfg.info.key.tun_id, vni) && ipv6_addr_equal(&addr6, &node->geneve->cfg.info.key.u.ipv6.dst)) return node->geneve; } return NULL; } #endif static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) { return (struct genevehdr *)(udp_hdr(skb) + 1); } static struct geneve_dev *geneve_lookup_skb(struct geneve_sock *gs, struct sk_buff *skb) { static u8 zero_vni[3]; u8 *vni; if (geneve_get_sk_family(gs) == AF_INET) { struct iphdr *iph; __be32 addr; iph = ip_hdr(skb); /* outer IP header... */ if (gs->collect_md) { vni = zero_vni; addr = 0; } else { vni = geneve_hdr(skb)->vni; addr = iph->saddr; } return geneve_lookup(gs, addr, vni); #if IS_ENABLED(CONFIG_IPV6) } else if (geneve_get_sk_family(gs) == AF_INET6) { static struct in6_addr zero_addr6; struct ipv6hdr *ip6h; struct in6_addr addr6; ip6h = ipv6_hdr(skb); /* outer IPv6 header... */ if (gs->collect_md) { vni = zero_vni; addr6 = zero_addr6; } else { vni = geneve_hdr(skb)->vni; addr6 = ip6h->saddr; } return geneve6_lookup(gs, addr6, vni); #endif } return NULL; } /* geneve receive/decap routine */ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, struct sk_buff *skb) { struct genevehdr *gnvh = geneve_hdr(skb); struct metadata_dst *tun_dst = NULL; unsigned int len; int nh, err = 0; void *oiph; if (ip_tunnel_collect_metadata() || gs->collect_md) { IP_TUNNEL_DECLARE_FLAGS(flags) = { }; __set_bit(IP_TUNNEL_KEY_BIT, flags); __assign_bit(IP_TUNNEL_OAM_BIT, flags, gnvh->oam); __assign_bit(IP_TUNNEL_CRIT_OPT_BIT, flags, gnvh->critical); tun_dst = udp_tun_rx_dst(skb, geneve_get_sk_family(gs), flags, vni_to_tunnel_id(gnvh->vni), gnvh->opt_len * 4); if (!tun_dst) { dev_dstats_rx_dropped(geneve->dev); goto drop; } /* Update tunnel dst according to Geneve options. */ ip_tunnel_flags_zero(flags); __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, flags); ip_tunnel_info_opts_set(&tun_dst->u.tun_info, gnvh->options, gnvh->opt_len * 4, flags); } else { /* Drop packets w/ critical options, * since we don't support any... 
*/ if (gnvh->critical) { DEV_STATS_INC(geneve->dev, rx_frame_errors); DEV_STATS_INC(geneve->dev, rx_errors); goto drop; } } if (tun_dst) skb_dst_set(skb, &tun_dst->dst); if (gnvh->proto_type == htons(ETH_P_TEB)) { skb_reset_mac_header(skb); skb->protocol = eth_type_trans(skb, geneve->dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); /* Ignore packet loops (and multicast echo) */ if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr)) { DEV_STATS_INC(geneve->dev, rx_errors); goto drop; } } else { skb_reset_mac_header(skb); skb->dev = geneve->dev; skb->pkt_type = PACKET_HOST; } /* Save offset of outer header relative to skb->head, * because we are going to reset the network header to the inner header * and might change skb->head. */ nh = skb_network_header(skb) - skb->head; skb_reset_network_header(skb); if (!pskb_inet_may_pull(skb)) { DEV_STATS_INC(geneve->dev, rx_length_errors); DEV_STATS_INC(geneve->dev, rx_errors); goto drop; } /* Get the outer header. */ oiph = skb->head + nh; if (geneve_get_sk_family(gs) == AF_INET) err = IP_ECN_decapsulate(oiph, skb); #if IS_ENABLED(CONFIG_IPV6) else err = IP6_ECN_decapsulate(oiph, skb); #endif if (unlikely(err)) { if (log_ecn_error) { if (geneve_get_sk_family(gs) == AF_INET) net_info_ratelimited("non-ECT from %pI4 " "with TOS=%#x\n", &((struct iphdr *)oiph)->saddr, ((struct iphdr *)oiph)->tos); #if IS_ENABLED(CONFIG_IPV6) else net_info_ratelimited("non-ECT from %pI6\n", &((struct ipv6hdr *)oiph)->saddr); #endif } if (err > 1) { DEV_STATS_INC(geneve->dev, rx_frame_errors); DEV_STATS_INC(geneve->dev, rx_errors); goto drop; } } len = skb->len; err = gro_cells_receive(&geneve->gro_cells, skb); if (likely(err == NET_RX_SUCCESS)) dev_dstats_rx_add(geneve->dev, len); return; drop: /* Consume bad packet */ kfree_skb(skb); } /* Setup stats when device is created */ static int geneve_init(struct net_device *dev) { struct geneve_dev *geneve = netdev_priv(dev); int err; err = gro_cells_init(&geneve->gro_cells, dev); if (err) return err; err = dst_cache_init(&geneve->cfg.info.dst_cache, GFP_KERNEL); if (err) { gro_cells_destroy(&geneve->gro_cells); return err; } netdev_lockdep_set_classes(dev); return 0; } static void geneve_uninit(struct net_device *dev) { struct geneve_dev *geneve = netdev_priv(dev); dst_cache_destroy(&geneve->cfg.info.dst_cache); gro_cells_destroy(&geneve->gro_cells); } /* Callback from net/ipv4/udp.c to receive packets */ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { struct genevehdr *geneveh; struct geneve_dev *geneve; struct geneve_sock *gs; __be16 inner_proto; int opts_len; /* Need UDP and Geneve header to be present */ if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN))) goto drop; /* Return packets with reserved bits set */ geneveh = geneve_hdr(skb); if (unlikely(geneveh->ver != GENEVE_VER)) goto drop; gs = rcu_dereference_sk_user_data(sk); if (!gs) goto drop; geneve = geneve_lookup_skb(gs, skb); if (!geneve) goto drop; inner_proto = geneveh->proto_type; if (unlikely((!geneve->cfg.inner_proto_inherit && inner_proto != htons(ETH_P_TEB)))) { dev_dstats_rx_dropped(geneve->dev); goto drop; } opts_len = geneveh->opt_len * 4; if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, inner_proto, !net_eq(geneve->net, dev_net(geneve->dev)))) { dev_dstats_rx_dropped(geneve->dev); goto drop; } geneve_rx(geneve, gs, skb); return 0; drop: /* Consume bad packet */ kfree_skb(skb); return 0; } /* Callback from net/ipv{4,6}/udp.c to check that we have a tunnel for errors */ static int 
geneve_udp_encap_err_lookup(struct sock *sk, struct sk_buff *skb) { struct genevehdr *geneveh; struct geneve_sock *gs; u8 zero_vni[3] = { 0 }; u8 *vni = zero_vni; if (!pskb_may_pull(skb, skb_transport_offset(skb) + GENEVE_BASE_HLEN)) return -EINVAL; geneveh = geneve_hdr(skb); if (geneveh->ver != GENEVE_VER) return -EINVAL; if (geneveh->proto_type != htons(ETH_P_TEB)) return -EINVAL; gs = rcu_dereference_sk_user_data(sk); if (!gs) return -ENOENT; if (geneve_get_sk_family(gs) == AF_INET) { struct iphdr *iph = ip_hdr(skb); __be32 addr4 = 0; if (!gs->collect_md) { vni = geneve_hdr(skb)->vni; addr4 = iph->daddr; } return geneve_lookup(gs, addr4, vni) ? 0 : -ENOENT; } #if IS_ENABLED(CONFIG_IPV6) if (geneve_get_sk_family(gs) == AF_INET6) { struct ipv6hdr *ip6h = ipv6_hdr(skb); struct in6_addr addr6; memset(&addr6, 0, sizeof(struct in6_addr)); if (!gs->collect_md) { vni = geneve_hdr(skb)->vni; addr6 = ip6h->daddr; } return geneve6_lookup(gs, addr6, vni) ? 0 : -ENOENT; } #endif return -EPFNOSUPPORT; } static struct socket *geneve_create_sock(struct net *net, bool ipv6, __be16 port, bool ipv6_rx_csum) { struct socket *sock; struct udp_port_cfg udp_conf; int err; memset(&udp_conf, 0, sizeof(udp_conf)); if (ipv6) { udp_conf.family = AF_INET6; udp_conf.ipv6_v6only = 1; udp_conf.use_udp6_rx_checksums = ipv6_rx_csum; } else { udp_conf.family = AF_INET; udp_conf.local_ip.s_addr = htonl(INADDR_ANY); } udp_conf.local_udp_port = port; /* Open UDP socket */ err = udp_sock_create(net, &udp_conf, &sock); if (err < 0) return ERR_PTR(err); udp_allow_gso(sock->sk); return sock; } static int geneve_hlen(struct genevehdr *gh) { return sizeof(*gh) + gh->opt_len * 4; } static struct sk_buff *geneve_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { struct sk_buff *pp = NULL; struct sk_buff *p; struct genevehdr *gh, *gh2; unsigned int hlen, gh_len, off_gnv; const struct packet_offload *ptype; __be16 type; int flush = 1; off_gnv = skb_gro_offset(skb); hlen = off_gnv + sizeof(*gh); gh = skb_gro_header(skb, hlen, off_gnv); if (unlikely(!gh)) goto out; if (gh->ver != GENEVE_VER || gh->oam) goto out; gh_len = geneve_hlen(gh); hlen = off_gnv + gh_len; if (!skb_gro_may_pull(skb, hlen)) { gh = skb_gro_header_slow(skb, hlen, off_gnv); if (unlikely(!gh)) goto out; } list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; gh2 = (struct genevehdr *)(p->data + off_gnv); if (gh->opt_len != gh2->opt_len || memcmp(gh, gh2, gh_len)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } } skb_gro_pull(skb, gh_len); skb_gro_postpull_rcsum(skb, gh, gh_len); type = gh->proto_type; if (likely(type == htons(ETH_P_TEB))) return call_gro_receive(eth_gro_receive, head, skb); ptype = gro_find_receive_by_type(type); if (!ptype) goto out; pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); flush = 0; out: skb_gro_flush_final(skb, pp, flush); return pp; } static int geneve_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { struct genevehdr *gh; struct packet_offload *ptype; __be16 type; int gh_len; int err = -ENOSYS; gh = (struct genevehdr *)(skb->data + nhoff); gh_len = geneve_hlen(gh); type = gh->proto_type; /* since skb->encapsulation is set, eth_gro_complete() sets the inner mac header */ if (likely(type == htons(ETH_P_TEB))) return eth_gro_complete(skb, nhoff + gh_len); ptype = gro_find_complete_by_type(type); if (ptype) err = ptype->callbacks.gro_complete(skb, nhoff + gh_len); skb_set_inner_mac_header(skb, nhoff + gh_len); return err; } /* Create new listen socket if needed */ 
static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, bool ipv6, bool ipv6_rx_csum) { struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_sock *gs; struct socket *sock; struct udp_tunnel_sock_cfg tunnel_cfg; int h; gs = kzalloc(sizeof(*gs), GFP_KERNEL); if (!gs) return ERR_PTR(-ENOMEM); sock = geneve_create_sock(net, ipv6, port, ipv6_rx_csum); if (IS_ERR(sock)) { kfree(gs); return ERR_CAST(sock); } gs->sock = sock; gs->refcnt = 1; for (h = 0; h < VNI_HASH_SIZE; ++h) INIT_HLIST_HEAD(&gs->vni_list[h]); /* Initialize the geneve udp offloads structure */ udp_tunnel_notify_add_rx_port(gs->sock, UDP_TUNNEL_TYPE_GENEVE); /* Mark socket as an encapsulation socket */ memset(&tunnel_cfg, 0, sizeof(tunnel_cfg)); tunnel_cfg.sk_user_data = gs; tunnel_cfg.encap_type = 1; tunnel_cfg.gro_receive = geneve_gro_receive; tunnel_cfg.gro_complete = geneve_gro_complete; tunnel_cfg.encap_rcv = geneve_udp_encap_recv; tunnel_cfg.encap_err_lookup = geneve_udp_encap_err_lookup; tunnel_cfg.encap_destroy = NULL; setup_udp_tunnel_sock(net, sock, &tunnel_cfg); list_add(&gs->list, &gn->sock_list); return gs; } static void __geneve_sock_release(struct geneve_sock *gs) { if (!gs || --gs->refcnt) return; list_del(&gs->list); udp_tunnel_notify_del_rx_port(gs->sock, UDP_TUNNEL_TYPE_GENEVE); udp_tunnel_sock_release(gs->sock); kfree_rcu(gs, rcu); } static void geneve_sock_release(struct geneve_dev *geneve) { struct geneve_sock *gs4 = rtnl_dereference(geneve->sock4); #if IS_ENABLED(CONFIG_IPV6) struct geneve_sock *gs6 = rtnl_dereference(geneve->sock6); rcu_assign_pointer(geneve->sock6, NULL); #endif rcu_assign_pointer(geneve->sock4, NULL); synchronize_net(); __geneve_sock_release(gs4); #if IS_ENABLED(CONFIG_IPV6) __geneve_sock_release(gs6); #endif } static struct geneve_sock *geneve_find_sock(struct geneve_net *gn, sa_family_t family, __be16 dst_port) { struct geneve_sock *gs; list_for_each_entry(gs, &gn->sock_list, list) { if (inet_sk(gs->sock->sk)->inet_sport == dst_port && geneve_get_sk_family(gs) == family) { return gs; } } return NULL; } static int geneve_sock_add(struct geneve_dev *geneve, bool ipv6) { struct net *net = geneve->net; struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_dev_node *node; struct geneve_sock *gs; __u8 vni[3]; __u32 hash; gs = geneve_find_sock(gn, ipv6 ? 
AF_INET6 : AF_INET, geneve->cfg.info.key.tp_dst); if (gs) { gs->refcnt++; goto out; } gs = geneve_socket_create(net, geneve->cfg.info.key.tp_dst, ipv6, geneve->cfg.use_udp6_rx_checksums); if (IS_ERR(gs)) return PTR_ERR(gs); out: gs->collect_md = geneve->cfg.collect_md; #if IS_ENABLED(CONFIG_IPV6) if (ipv6) { rcu_assign_pointer(geneve->sock6, gs); node = &geneve->hlist6; } else #endif { rcu_assign_pointer(geneve->sock4, gs); node = &geneve->hlist4; } node->geneve = geneve; tunnel_id_to_vni(geneve->cfg.info.key.tun_id, vni); hash = geneve_net_vni_hash(vni); hlist_add_head_rcu(&node->hlist, &gs->vni_list[hash]); return 0; } static int geneve_open(struct net_device *dev) { struct geneve_dev *geneve = netdev_priv(dev); bool metadata = geneve->cfg.collect_md; bool ipv4, ipv6; int ret = 0; ipv6 = geneve->cfg.info.mode & IP_TUNNEL_INFO_IPV6 || metadata; ipv4 = !ipv6 || metadata; #if IS_ENABLED(CONFIG_IPV6) if (ipv6) { ret = geneve_sock_add(geneve, true); if (ret < 0 && ret != -EAFNOSUPPORT) ipv4 = false; } #endif if (ipv4) ret = geneve_sock_add(geneve, false); if (ret < 0) geneve_sock_release(geneve); return ret; } static int geneve_stop(struct net_device *dev) { struct geneve_dev *geneve = netdev_priv(dev); hlist_del_init_rcu(&geneve->hlist4.hlist); #if IS_ENABLED(CONFIG_IPV6) hlist_del_init_rcu(&geneve->hlist6.hlist); #endif geneve_sock_release(geneve); return 0; } static void geneve_build_header(struct genevehdr *geneveh, const struct ip_tunnel_info *info, __be16 inner_proto) { geneveh->ver = GENEVE_VER; geneveh->opt_len = info->options_len / 4; geneveh->oam = test_bit(IP_TUNNEL_OAM_BIT, info->key.tun_flags); geneveh->critical = test_bit(IP_TUNNEL_CRIT_OPT_BIT, info->key.tun_flags); geneveh->rsvd1 = 0; tunnel_id_to_vni(info->key.tun_id, geneveh->vni); geneveh->proto_type = inner_proto; geneveh->rsvd2 = 0; if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags)) ip_tunnel_info_opts_get(geneveh->options, info); } static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb, const struct ip_tunnel_info *info, bool xnet, int ip_hdr_len, bool inner_proto_inherit) { bool udp_sum = test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags); struct genevehdr *gnvh; __be16 inner_proto; int min_headroom; int err; skb_reset_mac_header(skb); skb_scrub_packet(skb, xnet); min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len + GENEVE_BASE_HLEN + info->options_len + ip_hdr_len; err = skb_cow_head(skb, min_headroom); if (unlikely(err)) goto free_dst; err = udp_tunnel_handle_offloads(skb, udp_sum); if (err) goto free_dst; gnvh = __skb_push(skb, sizeof(*gnvh) + info->options_len); inner_proto = inner_proto_inherit ? 
skb->protocol : htons(ETH_P_TEB); geneve_build_header(gnvh, info, inner_proto); skb_set_inner_protocol(skb, inner_proto); return 0; free_dst: dst_release(dst); return err; } static u8 geneve_get_dsfield(struct sk_buff *skb, struct net_device *dev, const struct ip_tunnel_info *info, bool *use_cache) { struct geneve_dev *geneve = netdev_priv(dev); u8 dsfield; dsfield = info->key.tos; if (dsfield == 1 && !geneve->cfg.collect_md) { dsfield = ip_tunnel_get_dsfield(ip_hdr(skb), skb); *use_cache = false; } return dsfield; } static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, struct geneve_dev *geneve, const struct ip_tunnel_info *info) { bool inner_proto_inherit = geneve->cfg.inner_proto_inherit; bool xnet = !net_eq(geneve->net, dev_net(geneve->dev)); struct geneve_sock *gs4 = rcu_dereference(geneve->sock4); const struct ip_tunnel_key *key = &info->key; struct rtable *rt; bool use_cache; __u8 tos, ttl; __be16 df = 0; __be32 saddr; __be16 sport; int err; if (skb_vlan_inet_prepare(skb, inner_proto_inherit)) return -EINVAL; if (!gs4) return -EIO; use_cache = ip_tunnel_dst_cache_usable(skb, info); tos = geneve_get_dsfield(skb, dev, info, &use_cache); sport = udp_flow_src_port(geneve->net, skb, geneve->cfg.port_min, geneve->cfg.port_max, true); rt = udp_tunnel_dst_lookup(skb, dev, geneve->net, 0, &saddr, &info->key, sport, geneve->cfg.info.key.tp_dst, tos, use_cache ? (struct dst_cache *)&info->dst_cache : NULL); if (IS_ERR(rt)) return PTR_ERR(rt); err = skb_tunnel_check_pmtu(skb, &rt->dst, GENEVE_IPV4_HLEN + info->options_len, netif_is_any_bridge_port(dev)); if (err < 0) { dst_release(&rt->dst); return err; } else if (err) { struct ip_tunnel_info *info; info = skb_tunnel_info(skb); if (info) { struct ip_tunnel_info *unclone; unclone = skb_tunnel_info_unclone(skb); if (unlikely(!unclone)) { dst_release(&rt->dst); return -ENOMEM; } unclone->key.u.ipv4.dst = saddr; unclone->key.u.ipv4.src = info->key.u.ipv4.dst; } if (!pskb_may_pull(skb, ETH_HLEN)) { dst_release(&rt->dst); return -EINVAL; } skb->protocol = eth_type_trans(skb, geneve->dev); __netif_rx(skb); dst_release(&rt->dst); return -EMSGSIZE; } tos = ip_tunnel_ecn_encap(tos, ip_hdr(skb), skb); if (geneve->cfg.collect_md) { ttl = key->ttl; df = test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags) ? htons(IP_DF) : 0; } else { if (geneve->cfg.ttl_inherit) ttl = ip_tunnel_get_ttl(ip_hdr(skb), skb); else ttl = key->ttl; ttl = ttl ? 
: ip4_dst_hoplimit(&rt->dst); if (geneve->cfg.df == GENEVE_DF_SET) { df = htons(IP_DF); } else if (geneve->cfg.df == GENEVE_DF_INHERIT) { struct ethhdr *eth = skb_eth_hdr(skb); if (ntohs(eth->h_proto) == ETH_P_IPV6) { df = htons(IP_DF); } else if (ntohs(eth->h_proto) == ETH_P_IP) { struct iphdr *iph = ip_hdr(skb); if (iph->frag_off & htons(IP_DF)) df = htons(IP_DF); } } } err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr), inner_proto_inherit); if (unlikely(err)) return err; udp_tunnel_xmit_skb(rt, gs4->sock->sk, skb, saddr, info->key.u.ipv4.dst, tos, ttl, df, sport, geneve->cfg.info.key.tp_dst, !net_eq(geneve->net, dev_net(geneve->dev)), !test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags), 0); return 0; } #if IS_ENABLED(CONFIG_IPV6) static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, struct geneve_dev *geneve, const struct ip_tunnel_info *info) { bool inner_proto_inherit = geneve->cfg.inner_proto_inherit; bool xnet = !net_eq(geneve->net, dev_net(geneve->dev)); struct geneve_sock *gs6 = rcu_dereference(geneve->sock6); const struct ip_tunnel_key *key = &info->key; struct dst_entry *dst = NULL; struct in6_addr saddr; bool use_cache; __u8 prio, ttl; __be16 sport; int err; if (skb_vlan_inet_prepare(skb, inner_proto_inherit)) return -EINVAL; if (!gs6) return -EIO; use_cache = ip_tunnel_dst_cache_usable(skb, info); prio = geneve_get_dsfield(skb, dev, info, &use_cache); sport = udp_flow_src_port(geneve->net, skb, geneve->cfg.port_min, geneve->cfg.port_max, true); dst = udp_tunnel6_dst_lookup(skb, dev, geneve->net, gs6->sock, 0, &saddr, key, sport, geneve->cfg.info.key.tp_dst, prio, use_cache ? (struct dst_cache *)&info->dst_cache : NULL); if (IS_ERR(dst)) return PTR_ERR(dst); err = skb_tunnel_check_pmtu(skb, dst, GENEVE_IPV6_HLEN + info->options_len, netif_is_any_bridge_port(dev)); if (err < 0) { dst_release(dst); return err; } else if (err) { struct ip_tunnel_info *info = skb_tunnel_info(skb); if (info) { struct ip_tunnel_info *unclone; unclone = skb_tunnel_info_unclone(skb); if (unlikely(!unclone)) { dst_release(dst); return -ENOMEM; } unclone->key.u.ipv6.dst = saddr; unclone->key.u.ipv6.src = info->key.u.ipv6.dst; } if (!pskb_may_pull(skb, ETH_HLEN)) { dst_release(dst); return -EINVAL; } skb->protocol = eth_type_trans(skb, geneve->dev); __netif_rx(skb); dst_release(dst); return -EMSGSIZE; } prio = ip_tunnel_ecn_encap(prio, ip_hdr(skb), skb); if (geneve->cfg.collect_md) { ttl = key->ttl; } else { if (geneve->cfg.ttl_inherit) ttl = ip_tunnel_get_ttl(ip_hdr(skb), skb); else ttl = key->ttl; ttl = ttl ? 
: ip6_dst_hoplimit(dst); } err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr), inner_proto_inherit); if (unlikely(err)) return err; udp_tunnel6_xmit_skb(dst, gs6->sock->sk, skb, dev, &saddr, &key->u.ipv6.dst, prio, ttl, info->key.label, sport, geneve->cfg.info.key.tp_dst, !test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags), 0); return 0; } #endif static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev) { struct geneve_dev *geneve = netdev_priv(dev); struct ip_tunnel_info *info = NULL; int err; if (geneve->cfg.collect_md) { info = skb_tunnel_info(skb); if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX))) { netdev_dbg(dev, "no tunnel metadata\n"); dev_kfree_skb(skb); dev_dstats_tx_dropped(dev); return NETDEV_TX_OK; } } else { info = &geneve->cfg.info; } rcu_read_lock(); #if IS_ENABLED(CONFIG_IPV6) if (info->mode & IP_TUNNEL_INFO_IPV6) err = geneve6_xmit_skb(skb, dev, geneve, info); else #endif err = geneve_xmit_skb(skb, dev, geneve, info); rcu_read_unlock(); if (likely(!err)) return NETDEV_TX_OK; if (err != -EMSGSIZE) dev_kfree_skb(skb); if (err == -ELOOP) DEV_STATS_INC(dev, collisions); else if (err == -ENETUNREACH) DEV_STATS_INC(dev, tx_carrier_errors); DEV_STATS_INC(dev, tx_errors); return NETDEV_TX_OK; } static int geneve_change_mtu(struct net_device *dev, int new_mtu) { if (new_mtu > dev->max_mtu) new_mtu = dev->max_mtu; else if (new_mtu < dev->min_mtu) new_mtu = dev->min_mtu; WRITE_ONCE(dev->mtu, new_mtu); return 0; } static int geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) { struct ip_tunnel_info *info = skb_tunnel_info(skb); struct geneve_dev *geneve = netdev_priv(dev); __be16 sport; if (ip_tunnel_info_af(info) == AF_INET) { struct rtable *rt; struct geneve_sock *gs4 = rcu_dereference(geneve->sock4); bool use_cache; __be32 saddr; u8 tos; if (!gs4) return -EIO; use_cache = ip_tunnel_dst_cache_usable(skb, info); tos = geneve_get_dsfield(skb, dev, info, &use_cache); sport = udp_flow_src_port(geneve->net, skb, geneve->cfg.port_min, geneve->cfg.port_max, true); rt = udp_tunnel_dst_lookup(skb, dev, geneve->net, 0, &saddr, &info->key, sport, geneve->cfg.info.key.tp_dst, tos, use_cache ? &info->dst_cache : NULL); if (IS_ERR(rt)) return PTR_ERR(rt); ip_rt_put(rt); info->key.u.ipv4.src = saddr; #if IS_ENABLED(CONFIG_IPV6) } else if (ip_tunnel_info_af(info) == AF_INET6) { struct dst_entry *dst; struct geneve_sock *gs6 = rcu_dereference(geneve->sock6); struct in6_addr saddr; bool use_cache; u8 prio; if (!gs6) return -EIO; use_cache = ip_tunnel_dst_cache_usable(skb, info); prio = geneve_get_dsfield(skb, dev, info, &use_cache); sport = udp_flow_src_port(geneve->net, skb, geneve->cfg.port_min, geneve->cfg.port_max, true); dst = udp_tunnel6_dst_lookup(skb, dev, geneve->net, gs6->sock, 0, &saddr, &info->key, sport, geneve->cfg.info.key.tp_dst, prio, use_cache ? 
&info->dst_cache : NULL); if (IS_ERR(dst)) return PTR_ERR(dst); dst_release(dst); info->key.u.ipv6.src = saddr; #endif } else { return -EINVAL; } info->key.tp_src = sport; info->key.tp_dst = geneve->cfg.info.key.tp_dst; return 0; } static const struct net_device_ops geneve_netdev_ops = { .ndo_init = geneve_init, .ndo_uninit = geneve_uninit, .ndo_open = geneve_open, .ndo_stop = geneve_stop, .ndo_start_xmit = geneve_xmit, .ndo_change_mtu = geneve_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, .ndo_fill_metadata_dst = geneve_fill_metadata_dst, }; static void geneve_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) { strscpy(drvinfo->version, GENEVE_NETDEV_VER, sizeof(drvinfo->version)); strscpy(drvinfo->driver, "geneve", sizeof(drvinfo->driver)); } static const struct ethtool_ops geneve_ethtool_ops = { .get_drvinfo = geneve_get_drvinfo, .get_link = ethtool_op_get_link, }; /* Info for udev, that this is a virtual tunnel endpoint */ static const struct device_type geneve_type = { .name = "geneve", }; /* Calls the ndo_udp_tunnel_add of the caller in order to * supply the listening GENEVE udp ports. Callers are expected * to implement the ndo_udp_tunnel_add. */ static void geneve_offload_rx_ports(struct net_device *dev, bool push) { struct net *net = dev_net(dev); struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_sock *gs; ASSERT_RTNL(); list_for_each_entry(gs, &gn->sock_list, list) { if (push) { udp_tunnel_push_rx_port(dev, gs->sock, UDP_TUNNEL_TYPE_GENEVE); } else { udp_tunnel_drop_rx_port(dev, gs->sock, UDP_TUNNEL_TYPE_GENEVE); } } } /* Initialize the device structure. */ static void geneve_setup(struct net_device *dev) { ether_setup(dev); dev->netdev_ops = &geneve_netdev_ops; dev->ethtool_ops = &geneve_ethtool_ops; dev->needs_free_netdev = true; SET_NETDEV_DEVTYPE(dev, &geneve_type); dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST; dev->features |= NETIF_F_RXCSUM; dev->features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST; dev->hw_features |= NETIF_F_RXCSUM; dev->hw_features |= NETIF_F_GSO_SOFTWARE; dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; /* MTU range: 68 - (something less than 65535) */ dev->min_mtu = ETH_MIN_MTU; /* The max_mtu calculation does not take account of GENEVE * options, to avoid excluding potentially valid * configurations. This will be further reduced by IPvX hdr size. 
*/ dev->max_mtu = IP_MAX_MTU - GENEVE_BASE_HLEN - dev->hard_header_len; netif_keep_dst(dev); dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE; dev->lltx = true; eth_hw_addr_random(dev); } static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = { [IFLA_GENEVE_UNSPEC] = { .strict_start_type = IFLA_GENEVE_INNER_PROTO_INHERIT }, [IFLA_GENEVE_ID] = { .type = NLA_U32 }, [IFLA_GENEVE_REMOTE] = { .len = sizeof_field(struct iphdr, daddr) }, [IFLA_GENEVE_REMOTE6] = { .len = sizeof(struct in6_addr) }, [IFLA_GENEVE_TTL] = { .type = NLA_U8 }, [IFLA_GENEVE_TOS] = { .type = NLA_U8 }, [IFLA_GENEVE_LABEL] = { .type = NLA_U32 }, [IFLA_GENEVE_PORT] = { .type = NLA_U16 }, [IFLA_GENEVE_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_GENEVE_UDP_CSUM] = { .type = NLA_U8 }, [IFLA_GENEVE_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 }, [IFLA_GENEVE_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 }, [IFLA_GENEVE_TTL_INHERIT] = { .type = NLA_U8 }, [IFLA_GENEVE_DF] = { .type = NLA_U8 }, [IFLA_GENEVE_INNER_PROTO_INHERIT] = { .type = NLA_FLAG }, [IFLA_GENEVE_PORT_RANGE] = NLA_POLICY_EXACT_LEN(sizeof(struct ifla_geneve_port_range)), }; static int geneve_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS], "Provided link layer address is not Ethernet"); return -EINVAL; } if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS], "Provided Ethernet address is not unicast"); return -EADDRNOTAVAIL; } } if (!data) { NL_SET_ERR_MSG(extack, "Not enough attributes provided to perform the operation"); return -EINVAL; } if (data[IFLA_GENEVE_ID]) { __u32 vni = nla_get_u32(data[IFLA_GENEVE_ID]); if (vni >= GENEVE_N_VID) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_ID], "Geneve ID must be lower than 16777216"); return -ERANGE; } } if (data[IFLA_GENEVE_DF]) { enum ifla_geneve_df df = nla_get_u8(data[IFLA_GENEVE_DF]); if (df < 0 || df > GENEVE_DF_MAX) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_DF], "Invalid DF attribute"); return -EINVAL; } } if (data[IFLA_GENEVE_PORT_RANGE]) { const struct ifla_geneve_port_range *p; p = nla_data(data[IFLA_GENEVE_PORT_RANGE]); if (ntohs(p->high) < ntohs(p->low)) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_PORT_RANGE], "Invalid source port range"); return -EINVAL; } } return 0; } static struct geneve_dev *geneve_find_dev(struct geneve_net *gn, const struct ip_tunnel_info *info, bool *tun_on_same_port, bool *tun_collect_md) { struct geneve_dev *geneve, *t = NULL; *tun_on_same_port = false; *tun_collect_md = false; list_for_each_entry(geneve, &gn->geneve_list, next) { if (info->key.tp_dst == geneve->cfg.info.key.tp_dst) { *tun_collect_md = geneve->cfg.collect_md; *tun_on_same_port = true; } if (info->key.tun_id == geneve->cfg.info.key.tun_id && info->key.tp_dst == geneve->cfg.info.key.tp_dst && !memcmp(&info->key.u, &geneve->cfg.info.key.u, sizeof(info->key.u))) t = geneve; } return t; } static bool is_tnl_info_zero(const struct ip_tunnel_info *info) { return !(info->key.tun_id || info->key.tos || !ip_tunnel_flags_empty(info->key.tun_flags) || info->key.ttl || info->key.label || info->key.tp_src || memchr_inv(&info->key.u, 0, sizeof(info->key.u))); } static bool geneve_dst_addr_equal(struct ip_tunnel_info *a, struct ip_tunnel_info *b) { if (ip_tunnel_info_af(a) == AF_INET) return a->key.u.ipv4.dst == b->key.u.ipv4.dst; else return ipv6_addr_equal(&a->key.u.ipv6.dst, 
&b->key.u.ipv6.dst); } static int geneve_configure(struct net *net, struct net_device *dev, struct netlink_ext_ack *extack, const struct geneve_config *cfg) { struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_dev *t, *geneve = netdev_priv(dev); const struct ip_tunnel_info *info = &cfg->info; bool tun_collect_md, tun_on_same_port; int err, encap_len; if (cfg->collect_md && !is_tnl_info_zero(info)) { NL_SET_ERR_MSG(extack, "Device is externally controlled, so attributes (VNI, Port, and so on) must not be specified"); return -EINVAL; } geneve->net = net; geneve->dev = dev; t = geneve_find_dev(gn, info, &tun_on_same_port, &tun_collect_md); if (t) return -EBUSY; /* make enough headroom for basic scenario */ encap_len = GENEVE_BASE_HLEN + ETH_HLEN; if (!cfg->collect_md && ip_tunnel_info_af(info) == AF_INET) { encap_len += sizeof(struct iphdr); dev->max_mtu -= sizeof(struct iphdr); } else { encap_len += sizeof(struct ipv6hdr); dev->max_mtu -= sizeof(struct ipv6hdr); } dev->needed_headroom = encap_len + ETH_HLEN; if (cfg->collect_md) { if (tun_on_same_port) { NL_SET_ERR_MSG(extack, "There can be only one externally controlled device on a destination port"); return -EPERM; } } else { if (tun_collect_md) { NL_SET_ERR_MSG(extack, "There already exists an externally controlled device on this destination port"); return -EPERM; } } dst_cache_reset(&geneve->cfg.info.dst_cache); memcpy(&geneve->cfg, cfg, sizeof(*cfg)); if (geneve->cfg.inner_proto_inherit) { dev->header_ops = NULL; dev->type = ARPHRD_NONE; dev->hard_header_len = 0; dev->addr_len = 0; dev->flags = IFF_POINTOPOINT | IFF_NOARP; } err = register_netdevice(dev); if (err) return err; list_add(&geneve->next, &gn->geneve_list); return 0; } static void init_tnl_info(struct ip_tunnel_info *info, __u16 dst_port) { memset(info, 0, sizeof(*info)); info->key.tp_dst = htons(dst_port); } static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack, struct geneve_config *cfg, bool changelink) { struct ip_tunnel_info *info = &cfg->info; int attrtype; if (data[IFLA_GENEVE_REMOTE] && data[IFLA_GENEVE_REMOTE6]) { NL_SET_ERR_MSG(extack, "Cannot specify both IPv4 and IPv6 Remote addresses"); return -EINVAL; } if (data[IFLA_GENEVE_REMOTE]) { if (changelink && (ip_tunnel_info_af(info) == AF_INET6)) { attrtype = IFLA_GENEVE_REMOTE; goto change_notsup; } info->key.u.ipv4.dst = nla_get_in_addr(data[IFLA_GENEVE_REMOTE]); if (ipv4_is_multicast(info->key.u.ipv4.dst)) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_REMOTE], "Remote IPv4 address cannot be Multicast"); return -EINVAL; } } if (data[IFLA_GENEVE_REMOTE6]) { #if IS_ENABLED(CONFIG_IPV6) if (changelink && (ip_tunnel_info_af(info) == AF_INET)) { attrtype = IFLA_GENEVE_REMOTE6; goto change_notsup; } info->mode = IP_TUNNEL_INFO_IPV6; info->key.u.ipv6.dst = nla_get_in6_addr(data[IFLA_GENEVE_REMOTE6]); if (ipv6_addr_type(&info->key.u.ipv6.dst) & IPV6_ADDR_LINKLOCAL) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_REMOTE6], "Remote IPv6 address cannot be link-local"); return -EINVAL; } if (ipv6_addr_is_multicast(&info->key.u.ipv6.dst)) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_REMOTE6], "Remote IPv6 address cannot be Multicast"); return -EINVAL; } __set_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags); cfg->use_udp6_rx_checksums = true; #else NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_REMOTE6], "IPv6 support not enabled in the kernel"); return -EPFNOSUPPORT; #endif } if (data[IFLA_GENEVE_ID]) { __u32 vni; __u8 tvni[3]; __be64 tunid; vni = 
nla_get_u32(data[IFLA_GENEVE_ID]); tvni[0] = (vni & 0x00ff0000) >> 16; tvni[1] = (vni & 0x0000ff00) >> 8; tvni[2] = vni & 0x000000ff; tunid = vni_to_tunnel_id(tvni); if (changelink && (tunid != info->key.tun_id)) { attrtype = IFLA_GENEVE_ID; goto change_notsup; } info->key.tun_id = tunid; } if (data[IFLA_GENEVE_TTL_INHERIT]) { if (nla_get_u8(data[IFLA_GENEVE_TTL_INHERIT])) cfg->ttl_inherit = true; else cfg->ttl_inherit = false; } else if (data[IFLA_GENEVE_TTL]) { info->key.ttl = nla_get_u8(data[IFLA_GENEVE_TTL]); cfg->ttl_inherit = false; } if (data[IFLA_GENEVE_TOS]) info->key.tos = nla_get_u8(data[IFLA_GENEVE_TOS]); if (data[IFLA_GENEVE_DF]) cfg->df = nla_get_u8(data[IFLA_GENEVE_DF]); if (data[IFLA_GENEVE_LABEL]) { info->key.label = nla_get_be32(data[IFLA_GENEVE_LABEL]) & IPV6_FLOWLABEL_MASK; if (info->key.label && (!(info->mode & IP_TUNNEL_INFO_IPV6))) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_LABEL], "Label attribute only applies for IPv6 Geneve devices"); return -EINVAL; } } if (data[IFLA_GENEVE_PORT]) { if (changelink) { attrtype = IFLA_GENEVE_PORT; goto change_notsup; } info->key.tp_dst = nla_get_be16(data[IFLA_GENEVE_PORT]); } if (data[IFLA_GENEVE_PORT_RANGE]) { const struct ifla_geneve_port_range *p; if (changelink) { attrtype = IFLA_GENEVE_PORT_RANGE; goto change_notsup; } p = nla_data(data[IFLA_GENEVE_PORT_RANGE]); cfg->port_min = ntohs(p->low); cfg->port_max = ntohs(p->high); } if (data[IFLA_GENEVE_COLLECT_METADATA]) { if (changelink) { attrtype = IFLA_GENEVE_COLLECT_METADATA; goto change_notsup; } cfg->collect_md = true; } if (data[IFLA_GENEVE_UDP_CSUM]) { if (changelink) { attrtype = IFLA_GENEVE_UDP_CSUM; goto change_notsup; } if (nla_get_u8(data[IFLA_GENEVE_UDP_CSUM])) __set_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags); } if (data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX]) { #if IS_ENABLED(CONFIG_IPV6) if (changelink) { attrtype = IFLA_GENEVE_UDP_ZERO_CSUM6_TX; goto change_notsup; } if (nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX])) __clear_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags); #else NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX], "IPv6 support not enabled in the kernel"); return -EPFNOSUPPORT; #endif } if (data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX]) { #if IS_ENABLED(CONFIG_IPV6) if (changelink) { attrtype = IFLA_GENEVE_UDP_ZERO_CSUM6_RX; goto change_notsup; } if (nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX])) cfg->use_udp6_rx_checksums = false; #else NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX], "IPv6 support not enabled in the kernel"); return -EPFNOSUPPORT; #endif } if (data[IFLA_GENEVE_INNER_PROTO_INHERIT]) { if (changelink) { attrtype = IFLA_GENEVE_INNER_PROTO_INHERIT; goto change_notsup; } cfg->inner_proto_inherit = true; } return 0; change_notsup: NL_SET_ERR_MSG_ATTR(extack, data[attrtype], "Changing VNI, Port, endpoint IP address family, external, inner_proto_inherit, and UDP checksum attributes are not supported"); return -EOPNOTSUPP; } static void geneve_link_config(struct net_device *dev, struct ip_tunnel_info *info, struct nlattr *tb[]) { struct geneve_dev *geneve = netdev_priv(dev); int ldev_mtu = 0; if (tb[IFLA_MTU]) { geneve_change_mtu(dev, nla_get_u32(tb[IFLA_MTU])); return; } switch (ip_tunnel_info_af(info)) { case AF_INET: { struct flowi4 fl4 = { .daddr = info->key.u.ipv4.dst }; struct rtable *rt = ip_route_output_key(geneve->net, &fl4); if (!IS_ERR(rt) && rt->dst.dev) { ldev_mtu = rt->dst.dev->mtu - GENEVE_IPV4_HLEN; ip_rt_put(rt); } break; } #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: { struct rt6_info *rt; if 
(!__in6_dev_get(dev))
			break;
		rt = rt6_lookup(geneve->net, &info->key.u.ipv6.dst, NULL, 0,
				NULL, 0);
		if (rt && rt->dst.dev)
			ldev_mtu = rt->dst.dev->mtu - GENEVE_IPV6_HLEN;
		ip6_rt_put(rt);
		break;
	}
#endif
	}

	if (ldev_mtu <= 0)
		return;

	geneve_change_mtu(dev, ldev_mtu - info->options_len);
}

static int geneve_newlink(struct net_device *dev,
			  struct rtnl_newlink_params *params,
			  struct netlink_ext_ack *extack)
{
	struct net *link_net = rtnl_newlink_link_net(params);
	struct nlattr **data = params->data;
	struct nlattr **tb = params->tb;
	struct geneve_config cfg = {
		.df = GENEVE_DF_UNSET,
		.use_udp6_rx_checksums = false,
		.ttl_inherit = false,
		.collect_md = false,
		.port_min = 1,
		.port_max = USHRT_MAX,
	};
	int err;

	init_tnl_info(&cfg.info, GENEVE_UDP_PORT);
	err = geneve_nl2info(tb, data, extack, &cfg, false);
	if (err)
		return err;

	err = geneve_configure(link_net, dev, extack, &cfg);
	if (err)
		return err;

	geneve_link_config(dev, &cfg.info, tb);

	return 0;
}

/* Quiesces the geneve device data path for both TX and RX.
 *
 * On transmit geneve checks for non-NULL geneve_sock before it proceeds.
 * So, if we set that socket to NULL under RCU and wait for synchronize_net()
 * to complete for the existing set of in-flight packets to be transmitted,
 * then we would have quiesced the transmit data path. All the future packets
 * will get dropped until we unquiesce the data path.
 *
 * On receive, geneve dereferences the geneve_sock stashed in the socket. So,
 * if we set that to NULL under RCU and wait for synchronize_net() to
 * complete, then we would have quiesced the receive data path.
 */
static void geneve_quiesce(struct geneve_dev *geneve, struct geneve_sock **gs4,
			   struct geneve_sock **gs6)
{
	*gs4 = rtnl_dereference(geneve->sock4);
	rcu_assign_pointer(geneve->sock4, NULL);
	if (*gs4)
		rcu_assign_sk_user_data((*gs4)->sock->sk, NULL);
#if IS_ENABLED(CONFIG_IPV6)
	*gs6 = rtnl_dereference(geneve->sock6);
	rcu_assign_pointer(geneve->sock6, NULL);
	if (*gs6)
		rcu_assign_sk_user_data((*gs6)->sock->sk, NULL);
#else
	*gs6 = NULL;
#endif
	synchronize_net();
}

/* Resumes the geneve device data path for both TX and RX. */
static void geneve_unquiesce(struct geneve_dev *geneve, struct geneve_sock *gs4,
			     struct geneve_sock __maybe_unused *gs6)
{
	rcu_assign_pointer(geneve->sock4, gs4);
	if (gs4)
		rcu_assign_sk_user_data(gs4->sock->sk, gs4);
#if IS_ENABLED(CONFIG_IPV6)
	rcu_assign_pointer(geneve->sock6, gs6);
	if (gs6)
		rcu_assign_sk_user_data(gs6->sock->sk, gs6);
#endif
	synchronize_net();
}

static int geneve_changelink(struct net_device *dev, struct nlattr *tb[],
			     struct nlattr *data[],
			     struct netlink_ext_ack *extack)
{
	struct geneve_dev *geneve = netdev_priv(dev);
	struct geneve_sock *gs4, *gs6;
	struct geneve_config cfg;
	int err;

	/* If the geneve device is configured for metadata (or externally
	 * controlled, for example, OVS), then nothing can be changed.
	 */
	if (geneve->cfg.collect_md)
		return -EOPNOTSUPP;

	/* Start with the existing info.
*/ memcpy(&cfg, &geneve->cfg, sizeof(cfg)); err = geneve_nl2info(tb, data, extack, &cfg, true); if (err) return err; if (!geneve_dst_addr_equal(&geneve->cfg.info, &cfg.info)) { dst_cache_reset(&cfg.info.dst_cache); geneve_link_config(dev, &cfg.info, tb); } geneve_quiesce(geneve, &gs4, &gs6); memcpy(&geneve->cfg, &cfg, sizeof(cfg)); geneve_unquiesce(geneve, gs4, gs6); return 0; } static void geneve_dellink(struct net_device *dev, struct list_head *head) { struct geneve_dev *geneve = netdev_priv(dev); list_del(&geneve->next); unregister_netdevice_queue(dev, head); } static size_t geneve_get_size(const struct net_device *dev) { return nla_total_size(sizeof(__u32)) + /* IFLA_GENEVE_ID */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_GENEVE_REMOTE{6} */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TTL */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TOS */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_DF */ nla_total_size(sizeof(__be32)) + /* IFLA_GENEVE_LABEL */ nla_total_size(sizeof(__be16)) + /* IFLA_GENEVE_PORT */ nla_total_size(0) + /* IFLA_GENEVE_COLLECT_METADATA */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_CSUM */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_TX */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_RX */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TTL_INHERIT */ nla_total_size(0) + /* IFLA_GENEVE_INNER_PROTO_INHERIT */ nla_total_size(sizeof(struct ifla_geneve_port_range)) + /* IFLA_GENEVE_PORT_RANGE */ 0; } static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct geneve_dev *geneve = netdev_priv(dev); struct ip_tunnel_info *info = &geneve->cfg.info; bool ttl_inherit = geneve->cfg.ttl_inherit; bool metadata = geneve->cfg.collect_md; struct ifla_geneve_port_range ports = { .low = htons(geneve->cfg.port_min), .high = htons(geneve->cfg.port_max), }; __u8 tmp_vni[3]; __u32 vni; tunnel_id_to_vni(info->key.tun_id, tmp_vni); vni = (tmp_vni[0] << 16) | (tmp_vni[1] << 8) | tmp_vni[2]; if (nla_put_u32(skb, IFLA_GENEVE_ID, vni)) goto nla_put_failure; if (!metadata && ip_tunnel_info_af(info) == AF_INET) { if (nla_put_in_addr(skb, IFLA_GENEVE_REMOTE, info->key.u.ipv4.dst)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_GENEVE_UDP_CSUM, test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags))) goto nla_put_failure; #if IS_ENABLED(CONFIG_IPV6) } else if (!metadata) { if (nla_put_in6_addr(skb, IFLA_GENEVE_REMOTE6, &info->key.u.ipv6.dst)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_TX, !test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags))) goto nla_put_failure; #endif } if (nla_put_u8(skb, IFLA_GENEVE_TTL, info->key.ttl) || nla_put_u8(skb, IFLA_GENEVE_TOS, info->key.tos) || nla_put_be32(skb, IFLA_GENEVE_LABEL, info->key.label)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_GENEVE_DF, geneve->cfg.df)) goto nla_put_failure; if (nla_put_be16(skb, IFLA_GENEVE_PORT, info->key.tp_dst)) goto nla_put_failure; if (metadata && nla_put_flag(skb, IFLA_GENEVE_COLLECT_METADATA)) goto nla_put_failure; #if IS_ENABLED(CONFIG_IPV6) if (nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_RX, !geneve->cfg.use_udp6_rx_checksums)) goto nla_put_failure; #endif if (nla_put_u8(skb, IFLA_GENEVE_TTL_INHERIT, ttl_inherit)) goto nla_put_failure; if (geneve->cfg.inner_proto_inherit && nla_put_flag(skb, IFLA_GENEVE_INNER_PROTO_INHERIT)) goto nla_put_failure; if (nla_put(skb, IFLA_GENEVE_PORT_RANGE, sizeof(ports), &ports)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static struct rtnl_link_ops geneve_link_ops 
__read_mostly = {
	.kind		= "geneve",
	.maxtype	= IFLA_GENEVE_MAX,
	.policy		= geneve_policy,
	.priv_size	= sizeof(struct geneve_dev),
	.setup		= geneve_setup,
	.validate	= geneve_validate,
	.newlink	= geneve_newlink,
	.changelink	= geneve_changelink,
	.dellink	= geneve_dellink,
	.get_size	= geneve_get_size,
	.fill_info	= geneve_fill_info,
};

struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
					u8 name_assign_type, u16 dst_port)
{
	struct nlattr *tb[IFLA_MAX + 1];
	struct net_device *dev;
	LIST_HEAD(list_kill);
	int err;
	struct geneve_config cfg = {
		.df = GENEVE_DF_UNSET,
		.use_udp6_rx_checksums = true,
		.ttl_inherit = false,
		.collect_md = true,
		.port_min = 1,
		.port_max = USHRT_MAX,
	};

	memset(tb, 0, sizeof(tb));
	dev = rtnl_create_link(net, name, name_assign_type,
			       &geneve_link_ops, tb, NULL);
	if (IS_ERR(dev))
		return dev;

	init_tnl_info(&cfg.info, dst_port);
	err = geneve_configure(net, dev, NULL, &cfg);
	if (err) {
		free_netdev(dev);
		return ERR_PTR(err);
	}

	/* openvswitch users expect packet sizes to be unrestricted,
	 * so set the largest MTU we can.
	 */
	err = geneve_change_mtu(dev, IP_MAX_MTU);
	if (err)
		goto err;

	err = rtnl_configure_link(dev, NULL, 0, NULL);
	if (err < 0)
		goto err;

	return dev;
err:
	geneve_dellink(dev, &list_kill);
	unregister_netdevice_many(&list_kill);
	return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(geneve_dev_create_fb);

static int geneve_netdevice_event(struct notifier_block *unused,
				  unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (event == NETDEV_UDP_TUNNEL_PUSH_INFO)
		geneve_offload_rx_ports(dev, true);
	else if (event == NETDEV_UDP_TUNNEL_DROP_INFO)
		geneve_offload_rx_ports(dev, false);

	return NOTIFY_DONE;
}

static struct notifier_block geneve_notifier_block __read_mostly = {
	.notifier_call = geneve_netdevice_event,
};

static __net_init int geneve_init_net(struct net *net)
{
	struct geneve_net *gn = net_generic(net, geneve_net_id);

	INIT_LIST_HEAD(&gn->geneve_list);
	INIT_LIST_HEAD(&gn->sock_list);
	return 0;
}

static void __net_exit geneve_exit_rtnl_net(struct net *net,
					    struct list_head *dev_to_kill)
{
	struct geneve_net *gn = net_generic(net, geneve_net_id);
	struct geneve_dev *geneve, *next;

	list_for_each_entry_safe(geneve, next, &gn->geneve_list, next)
		geneve_dellink(geneve->dev, dev_to_kill);
}

static void __net_exit geneve_exit_net(struct net *net)
{
	const struct geneve_net *gn = net_generic(net, geneve_net_id);

	WARN_ON_ONCE(!list_empty(&gn->sock_list));
}

static struct pernet_operations geneve_net_ops = {
	.init = geneve_init_net,
	.exit_rtnl = geneve_exit_rtnl_net,
	.exit = geneve_exit_net,
	.id  = &geneve_net_id,
	.size = sizeof(struct geneve_net),
};

static int __init geneve_init_module(void)
{
	int rc;

	rc = register_pernet_subsys(&geneve_net_ops);
	if (rc)
		goto out1;

	rc = register_netdevice_notifier(&geneve_notifier_block);
	if (rc)
		goto out2;

	rc = rtnl_link_register(&geneve_link_ops);
	if (rc)
		goto out3;

	return 0;
out3:
	unregister_netdevice_notifier(&geneve_notifier_block);
out2:
	unregister_pernet_subsys(&geneve_net_ops);
out1:
	return rc;
}
late_initcall(geneve_init_module);

static void __exit geneve_cleanup_module(void)
{
	rtnl_link_unregister(&geneve_link_ops);
	unregister_netdevice_notifier(&geneve_notifier_block);
	unregister_pernet_subsys(&geneve_net_ops);
}
module_exit(geneve_cleanup_module);

MODULE_LICENSE("GPL");
MODULE_VERSION(GENEVE_NETDEV_VER);
MODULE_AUTHOR("John W. Linville <linville@tuxdriver.com>");
MODULE_DESCRIPTION("Interface driver for GENEVE encapsulated traffic");
MODULE_ALIAS_RTNL_LINK("geneve");
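
/*
 * Editor's note -- an illustrative sketch, not part of the driver above.
 * geneve_nl2info() packs the 24-bit GENEVE VNI into three bytes before
 * handing it to vni_to_tunnel_id(), and geneve_fill_info() reverses the
 * operation via tunnel_id_to_vni(). The standalone helpers below
 * reproduce just that byte shuffling with plain shifts so the round
 * trip is easy to see; the demo_* names are made up for this sketch.
 */
#include <stdint.h>
#include <assert.h>

static void demo_vni_to_bytes(uint32_t vni, uint8_t tvni[3])
{
	/* same masking/shifting as geneve_nl2info() */
	tvni[0] = (vni & 0x00ff0000) >> 16;
	tvni[1] = (vni & 0x0000ff00) >> 8;
	tvni[2] = vni & 0x000000ff;
}

static uint32_t demo_bytes_to_vni(const uint8_t tvni[3])
{
	/* same reassembly as geneve_fill_info() */
	return (tvni[0] << 16) | (tvni[1] << 8) | tvni[2];
}

static void demo_vni_round_trip(void)
{
	uint8_t tvni[3];

	demo_vni_to_bytes(0x123456, tvni);
	assert(demo_bytes_to_vni(tvni) == 0x123456);
}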
// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 *
 * This is an implementation of the BLAKE2s hash and PRF functions.
 *
 * Information: https://blake2.net/
 *
 */

#include <crypto/internal/blake2s.h>

#include <linux/bug.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/types.h>

static inline void blake2s_set_lastblock(struct blake2s_state *state)
{
	state->f[0] = -1;
}

void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen)
{
	const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen;

	if (unlikely(!inlen))
		return;
	if (inlen > fill) {
		memcpy(state->buf + state->buflen, in, fill);
		blake2s_compress(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
		state->buflen = 0;
		in += fill;
		inlen -= fill;
	}
	if (inlen > BLAKE2S_BLOCK_SIZE) {
		const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
		blake2s_compress(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
		in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
		inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
	}
	memcpy(state->buf + state->buflen, in, inlen);
	state->buflen += inlen;
}
EXPORT_SYMBOL(blake2s_update);

void blake2s_final(struct blake2s_state *state, u8 *out)
{
	WARN_ON(IS_ENABLED(DEBUG) && !out);
	blake2s_set_lastblock(state);
	memset(state->buf + state->buflen, 0,
	       BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
	blake2s_compress(state, state->buf, 1, state->buflen);
	cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
	memcpy(out, state->h, state->outlen);
	memzero_explicit(state, sizeof(*state));
}
EXPORT_SYMBOL(blake2s_final);

static int __init blake2s_mod_init(void)
{
	if (IS_ENABLED(CONFIG_CRYPTO_SELFTESTS) &&
	    WARN_ON(!blake2s_selftest()))
		return -ENODEV;
	return 0;
}

module_init(blake2s_mod_init);
MODULE_DESCRIPTION("BLAKE2s hash function");
MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
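
/*
 * Editor's note -- a minimal usage sketch for the library interface
 * exported above, assuming the init/update/final helpers declared in
 * <crypto/blake2s.h> (the same interface WireGuard consumes). It
 * computes an unkeyed 32-byte digest of a buffer; demo_blake2s_digest
 * is a made-up name for illustration.
 */
#include <crypto/blake2s.h>

static void demo_blake2s_digest(const u8 *data, size_t len,
				u8 out[BLAKE2S_HASH_SIZE])
{
	struct blake2s_state state;

	blake2s_init(&state, BLAKE2S_HASH_SIZE);	/* unkeyed, full-size output */
	blake2s_update(&state, data, len);		/* absorb the message */
	blake2s_final(&state, out);			/* emit digest, wipe state */
}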
/*
 *  Linear conversion Plug-In
 *  Copyright (c) 1999 by Jaroslav Kysela <perex@perex.cz>,
 *			  Abramo Bagnara <abramo@alsa-project.org>
 *
 *   This library is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU Library General Public License as
 *   published by the Free Software Foundation; either version 2 of
 *   the License, or (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU Library General Public License for more details.
 *
 *   You should have received a copy of the GNU Library General Public
 *   License along with this library; if not, write to the Free Software
 *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 *
 */

#include <linux/time.h>
#include <sound/core.h>
#include <sound/pcm.h>
#include "pcm_plugin.h"

/*
 *  Basic linear conversion plugin
 */

struct linear_priv {
	int cvt_endian;			/* need endian conversion? */
	unsigned int src_ofs;		/* byte offset in source format */
	unsigned int dst_ofs;		/* byte offset in destination format */
	unsigned int copy_ofs;		/* byte offset in temporary u32 data */
	unsigned int dst_bytes;		/* byte size of destination format */
	unsigned int copy_bytes;	/* bytes to copy per conversion */
	unsigned int flip;		/* MSB flip for signedness, done after endian conversion */
};

static inline void do_convert(struct linear_priv *data,
			      unsigned char *dst, unsigned char *src)
{
	unsigned int tmp = 0;
	unsigned char *p = (unsigned char *)&tmp;

	memcpy(p + data->copy_ofs, src + data->src_ofs, data->copy_bytes);
	if (data->cvt_endian)
		tmp = swab32(tmp);
	tmp ^= data->flip;
	memcpy(dst, p + data->dst_ofs, data->dst_bytes);
}

static void convert(struct snd_pcm_plugin *plugin,
		    const struct snd_pcm_plugin_channel *src_channels,
		    struct snd_pcm_plugin_channel *dst_channels,
		    snd_pcm_uframes_t frames)
{
	struct linear_priv *data = (struct linear_priv *)plugin->extra_data;
	int channel;
	int nchannels = plugin->src_format.channels;

	for (channel = 0; channel < nchannels; ++channel) {
		char *src;
		char *dst;
		int src_step, dst_step;
		snd_pcm_uframes_t frames1;

		if (!src_channels[channel].enabled) {
			if (dst_channels[channel].wanted)
				snd_pcm_area_silence(&dst_channels[channel].area,
						     0, frames,
						     plugin->dst_format.format);
			dst_channels[channel].enabled = 0;
			continue;
		}
		dst_channels[channel].enabled = 1;
		src = src_channels[channel].area.addr +
			src_channels[channel].area.first / 8;
		dst = dst_channels[channel].area.addr +
			dst_channels[channel].area.first / 8;
		src_step = src_channels[channel].area.step / 8;
		dst_step = dst_channels[channel].area.step / 8;
		frames1 = frames;
		while (frames1-- > 0) {
			do_convert(data, dst, src);
			src += src_step;
			dst += dst_step;
		}
	}
}

static snd_pcm_sframes_t linear_transfer(struct
snd_pcm_plugin *plugin,
					 const struct snd_pcm_plugin_channel *src_channels,
					 struct snd_pcm_plugin_channel *dst_channels,
					 snd_pcm_uframes_t frames)
{
	if (snd_BUG_ON(!plugin || !src_channels || !dst_channels))
		return -ENXIO;
	if (frames == 0)
		return 0;
#ifdef CONFIG_SND_DEBUG
	{
		unsigned int channel;
		for (channel = 0; channel < plugin->src_format.channels;
		     channel++) {
			if (snd_BUG_ON(src_channels[channel].area.first % 8 ||
				       src_channels[channel].area.step % 8))
				return -ENXIO;
			if (snd_BUG_ON(dst_channels[channel].area.first % 8 ||
				       dst_channels[channel].area.step % 8))
				return -ENXIO;
		}
	}
#endif
	if (frames > dst_channels[0].frames)
		frames = dst_channels[0].frames;
	convert(plugin, src_channels, dst_channels, frames);
	return frames;
}

static void init_data(struct linear_priv *data,
		      snd_pcm_format_t src_format, snd_pcm_format_t dst_format)
{
	int src_le, dst_le, src_bytes, dst_bytes;

	src_bytes = snd_pcm_format_width(src_format) / 8;
	dst_bytes = snd_pcm_format_width(dst_format) / 8;
	src_le = snd_pcm_format_little_endian(src_format) > 0;
	dst_le = snd_pcm_format_little_endian(dst_format) > 0;

	data->dst_bytes = dst_bytes;
	data->cvt_endian = src_le != dst_le;
	data->copy_bytes = src_bytes < dst_bytes ? src_bytes : dst_bytes;
	if (src_le) {
		data->copy_ofs = 4 - data->copy_bytes;
		data->src_ofs = src_bytes - data->copy_bytes;
	} else
		data->src_ofs = snd_pcm_format_physical_width(src_format) / 8 -
			src_bytes;
	if (dst_le)
		data->dst_ofs = 4 - data->dst_bytes;
	else
		data->dst_ofs = snd_pcm_format_physical_width(dst_format) / 8 -
			dst_bytes;
	if (snd_pcm_format_signed(src_format) !=
	    snd_pcm_format_signed(dst_format)) {
		if (dst_le)
			data->flip = (__force u32)cpu_to_le32(0x80000000);
		else
			data->flip = (__force u32)cpu_to_be32(0x80000000);
	}
}

int snd_pcm_plugin_build_linear(struct snd_pcm_substream *plug,
				struct snd_pcm_plugin_format *src_format,
				struct snd_pcm_plugin_format *dst_format,
				struct snd_pcm_plugin **r_plugin)
{
	int err;
	struct linear_priv *data;
	struct snd_pcm_plugin *plugin;

	if (snd_BUG_ON(!r_plugin))
		return -ENXIO;
	*r_plugin = NULL;

	if (snd_BUG_ON(src_format->rate != dst_format->rate))
		return -ENXIO;
	if (snd_BUG_ON(src_format->channels != dst_format->channels))
		return -ENXIO;
	if (snd_BUG_ON(!snd_pcm_format_linear(src_format->format) ||
		       !snd_pcm_format_linear(dst_format->format)))
		return -ENXIO;

	err = snd_pcm_plugin_build(plug, "linear format conversion",
				   src_format, dst_format,
				   sizeof(struct linear_priv), &plugin);
	if (err < 0)
		return err;
	data = (struct linear_priv *)plugin->extra_data;
	init_data(data, src_format->format, dst_format->format);
	plugin->transfer = linear_transfer;
	*r_plugin = plugin;
	return 0;
}
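
/*
 * Editor's note -- a hedged illustration of the conversion scheme above,
 * not part of the plugin. do_convert() stages each sample in a 32-bit
 * scratch word, optionally byte-swaps it, then XORs the sign bit when
 * source and destination signedness differ. The sketch below walks one
 * S16_LE sample through the equivalent arithmetic to produce U8
 * (offset-binary) output; demo_s16le_to_u8 is a made-up name.
 */
#include <stdint.h>

static uint8_t demo_s16le_to_u8(int16_t s)
{
	/* keep only the most significant byte (copy_bytes == 1) ... */
	uint8_t msb = ((uint16_t)s) >> 8;

	/* ... and flip the MSB to go signed -> unsigned (data->flip) */
	return msb ^ 0x80;
}
/* e.g. demo_s16le_to_u8(0) == 0x80, demo_s16le_to_u8(-32768) == 0x00 */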
// SPDX-License-Identifier: GPL-2.0-only
/*
 * RDMA resource limiting controller for cgroups.
 *
 * Used to allow a cgroup hierarchy to stop processes from consuming
 * additional RDMA resources after a certain limit is reached.
 *
 * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
 */

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/cgroup.h>
#include <linux/parser.h>
#include <linux/cgroup_rdma.h>

#define RDMACG_MAX_STR "max"

/*
 * Protects list of resource pools maintained on per cgroup basis
 * and rdma device list.
 */
static DEFINE_MUTEX(rdmacg_mutex);
static LIST_HEAD(rdmacg_devices);

enum rdmacg_file_type {
	RDMACG_RESOURCE_TYPE_MAX,
	RDMACG_RESOURCE_TYPE_STAT,
};

/*
 * resource table definition as to be seen by the user.
 * Need to add entries to it when more resources are
 * added/defined at IB verb/core layer.
 */
static char const *rdmacg_resource_names[] = {
	[RDMACG_RESOURCE_HCA_HANDLE]	= "hca_handle",
	[RDMACG_RESOURCE_HCA_OBJECT]	= "hca_object",
};

/* resource tracker for each resource of rdma cgroup */
struct rdmacg_resource {
	int max;
	int usage;
};

/*
 * resource pool object which represents per cgroup, per device
 * resources.
 * There are multiple instances of this object per cgroup,
 * therefore it cannot be embedded within the rdma_cgroup structure.
 * It is maintained as a list.
 */
struct rdmacg_resource_pool {
	struct rdmacg_device	*device;
	struct rdmacg_resource	resources[RDMACG_RESOURCE_MAX];

	struct list_head	cg_node;
	struct list_head	dev_node;

	/* count active user tasks of this pool */
	u64			usage_sum;
	/* total number counts which are set to max */
	int			num_max_cnt;
};

static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
{
	return container_of(css, struct rdma_cgroup, css);
}

static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
{
	return css_rdmacg(cg->css.parent);
}

static inline struct rdma_cgroup *get_current_rdmacg(void)
{
	return css_rdmacg(task_get_css(current, rdma_cgrp_id));
}

static void set_resource_limit(struct rdmacg_resource_pool *rpool,
			       int index, int new_max)
{
	if (new_max == S32_MAX) {
		if (rpool->resources[index].max != S32_MAX)
			rpool->num_max_cnt++;
	} else {
		if (rpool->resources[index].max == S32_MAX)
			rpool->num_max_cnt--;
	}
	rpool->resources[index].max = new_max;
}

static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
{
	int i;

	for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
		set_resource_limit(rpool, i, S32_MAX);
}

static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
{
	lockdep_assert_held(&rdmacg_mutex);

	list_del(&rpool->cg_node);
	list_del(&rpool->dev_node);
	kfree(rpool);
}

static struct rdmacg_resource_pool *
find_cg_rpool_locked(struct rdma_cgroup *cg,
		     struct rdmacg_device *device)
{
	struct rdmacg_resource_pool *pool;

	lockdep_assert_held(&rdmacg_mutex);

	list_for_each_entry(pool, &cg->rpools, cg_node)
		if (pool->device == device)
			return pool;

	return NULL;
}

static struct rdmacg_resource_pool *
get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
{
	struct rdmacg_resource_pool *rpool;

	rpool = find_cg_rpool_locked(cg, device);
	if (rpool)
		return rpool;

	rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
	if (!rpool)
		return ERR_PTR(-ENOMEM);

	rpool->device = device;
	set_all_resource_max_limit(rpool);

	INIT_LIST_HEAD(&rpool->cg_node);
	INIT_LIST_HEAD(&rpool->dev_node);
	list_add_tail(&rpool->cg_node, &cg->rpools);
	list_add_tail(&rpool->dev_node, &device->rpools);
	return rpool;
}

/**
 * uncharge_cg_locked - uncharge resource for rdma cgroup
 * @cg: pointer to cg to uncharge and all parents in hierarchy
 * @device: pointer to rdmacg device
 * @index: index of the resource to uncharge in cg (resource pool)
 *
 * It also frees the resource pool which was created as part of the
 * charging operation when there are no resources attached to the
 * resource pool.
 */
static void
uncharge_cg_locked(struct rdma_cgroup *cg,
		   struct rdmacg_device *device,
		   enum rdmacg_resource_type index)
{
	struct rdmacg_resource_pool *rpool;

	rpool = find_cg_rpool_locked(cg, device);

	/*
	 * rpool cannot be null at this stage. Let the kernel operate in case
	 * there is a bug in the IB stack or rdma controller, instead of
	 * crashing the system.
	 */
	if (unlikely(!rpool)) {
		pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device);
		return;
	}

	rpool->resources[index].usage--;

	/*
	 * A negative count (or overflow) is invalid,
	 * it indicates a bug in the rdma controller.
	 */
	WARN_ON_ONCE(rpool->resources[index].usage < 0);
	rpool->usage_sum--;
	if (rpool->usage_sum == 0 &&
	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
		/*
		 * No user of the rpool and all entries are set to max, so
		 * safe to delete this rpool.
		 */
		free_cg_rpool_locked(rpool);
	}
}

/**
 * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
 * @cg: pointer to cg to uncharge and all parents in hierarchy
 * @device: pointer to rdmacg device
 * @stop_cg: while traversing the hierarchy, stop uncharging when the
 *           stop_cg cgroup is met
 * @index: index of the resource to uncharge in cg in given resource pool
 */
static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
				      struct rdmacg_device *device,
				      struct rdma_cgroup *stop_cg,
				      enum rdmacg_resource_type index)
{
	struct rdma_cgroup *p;

	mutex_lock(&rdmacg_mutex);

	for (p = cg; p != stop_cg; p = parent_rdmacg(p))
		uncharge_cg_locked(p, device, index);

	mutex_unlock(&rdmacg_mutex);

	css_put(&cg->css);
}

/**
 * rdmacg_uncharge - hierarchically uncharge rdma resource count
 * @cg: pointer to cg to uncharge and all parents in hierarchy
 * @device: pointer to rdmacg device
 * @index: index of the resource to uncharge in cgroup in given resource pool
 */
void rdmacg_uncharge(struct rdma_cgroup *cg,
		     struct rdmacg_device *device,
		     enum rdmacg_resource_type index)
{
	if (index >= RDMACG_RESOURCE_MAX)
		return;

	rdmacg_uncharge_hierarchy(cg, device, NULL, index);
}
EXPORT_SYMBOL(rdmacg_uncharge);

/**
 * rdmacg_try_charge - hierarchically try to charge the rdma resource
 * @rdmacg: pointer to rdma cgroup which will own this resource
 * @device: pointer to rdmacg device
 * @index: index of the resource to charge in cgroup (resource pool)
 *
 * This function follows charging resources in a hierarchical way.
 * It will fail if the charge would cause the new value to exceed the
 * hierarchical limit.
 * Returns 0 if the charge succeeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
 * Returns pointer to rdmacg for this resource when charging is successful.
 *
 * Charger needs to account resources on two criteria:
 * (a) per cgroup & (b) per device resource usage.
 * Per cgroup resource usage ensures that tasks of the cgroup don't cross
 * the configured limits. Per device provides granular configuration
 * in multi device usage. It allocates a resource pool in the hierarchy
 * for each parent it comes across for the first resource. Later on, the
 * resource pool will be available; therefore it will be much faster
 * thereafter to charge/uncharge.
 */
int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
		      struct rdmacg_device *device,
		      enum rdmacg_resource_type index)
{
	struct rdma_cgroup *cg, *p;
	struct rdmacg_resource_pool *rpool;
	s64 new;
	int ret = 0;

	if (index >= RDMACG_RESOURCE_MAX)
		return -EINVAL;

	/*
	 * hold on to css, as cgroup can be removed but resource
	 * accounting happens on css.
	 */
	cg = get_current_rdmacg();

	mutex_lock(&rdmacg_mutex);
	for (p = cg; p; p = parent_rdmacg(p)) {
		rpool = get_cg_rpool_locked(p, device);
		if (IS_ERR(rpool)) {
			ret = PTR_ERR(rpool);
			goto err;
		} else {
			new = rpool->resources[index].usage + 1;
			if (new > rpool->resources[index].max) {
				ret = -EAGAIN;
				goto err;
			} else {
				rpool->resources[index].usage = new;
				rpool->usage_sum++;
			}
		}
	}
	mutex_unlock(&rdmacg_mutex);

	*rdmacg = cg;
	return 0;

err:
	mutex_unlock(&rdmacg_mutex);
	rdmacg_uncharge_hierarchy(cg, device, p, index);
	return ret;
}
EXPORT_SYMBOL(rdmacg_try_charge);

/**
 * rdmacg_register_device - register rdmacg device to rdma controller.
 * @device: pointer to rdmacg device whose resources need to be accounted.
 *
 * If the IB stack wishes a device to participate in rdma cgroup resource
 * tracking, it must invoke this API to register with the rdma cgroup before
 * any user space application can start using the RDMA resources.
 */
void rdmacg_register_device(struct rdmacg_device *device)
{
	INIT_LIST_HEAD(&device->dev_node);
	INIT_LIST_HEAD(&device->rpools);

	mutex_lock(&rdmacg_mutex);
	list_add_tail(&device->dev_node, &rdmacg_devices);
	mutex_unlock(&rdmacg_mutex);
}
EXPORT_SYMBOL(rdmacg_register_device);

/**
 * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
 * @device: pointer to rdmacg device which was previously registered with rdma
 *          controller using rdmacg_register_device().
 *
 * IB stack must invoke this after all the resources of the IB device
 * are destroyed and after ensuring that no more resources will be created
 * when this API is invoked.
 */
void rdmacg_unregister_device(struct rdmacg_device *device)
{
	struct rdmacg_resource_pool *rpool, *tmp;

	/*
	 * Synchronize with any active resource settings,
	 * usage query happening via configfs.
	 */
	mutex_lock(&rdmacg_mutex);
	list_del_init(&device->dev_node);

	/*
	 * Now that this device is off the cgroup list, it's safe to free
	 * all the rpool resources.
	 */
	list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
		free_cg_rpool_locked(rpool);

	mutex_unlock(&rdmacg_mutex);
}
EXPORT_SYMBOL(rdmacg_unregister_device);

static int parse_resource(char *c, int *intval)
{
	substring_t argstr;
	char *name, *value = c;
	size_t len;
	int ret, i;

	name = strsep(&value, "=");
	if (!name || !value)
		return -EINVAL;

	i = match_string(rdmacg_resource_names, RDMACG_RESOURCE_MAX, name);
	if (i < 0)
		return i;

	len = strlen(value);

	argstr.from = value;
	argstr.to = value + len;

	ret = match_int(&argstr, intval);
	if (ret >= 0) {
		if (*intval < 0)
			return -EINVAL;
		return i;
	}
	if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
		*intval = S32_MAX;
		return i;
	}
	return -EINVAL;
}

static int rdmacg_parse_limits(char *options,
			       int *new_limits, unsigned long *enables)
{
	char *c;
	int err = -EINVAL;

	/* parse resource options */
	while ((c = strsep(&options, " ")) != NULL) {
		int index, intval;

		index = parse_resource(c, &intval);
		if (index < 0)
			goto err;

		new_limits[index] = intval;
		*enables |= BIT(index);
	}
	return 0;

err:
	return err;
}

static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
{
	struct rdmacg_device *device;

	lockdep_assert_held(&rdmacg_mutex);

	list_for_each_entry(device, &rdmacg_devices, dev_node)
		if (!strcmp(name, device->name))
			return device;

	return NULL;
}

static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
				       char *buf, size_t nbytes, loff_t off)
{
	struct rdma_cgroup *cg = css_rdmacg(of_css(of));
	const char *dev_name;
	struct rdmacg_resource_pool *rpool;
	struct rdmacg_device *device;
	char *options = strstrip(buf);
	int *new_limits;
	unsigned long enables = 0;
	int i = 0, ret = 0;

	/* extract the device name first */
	dev_name = strsep(&options, " ");
	if (!dev_name) {
		ret = -EINVAL;
		goto err;
	}

	new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
	if (!new_limits) {
		ret = -ENOMEM;
		goto err;
	}

	ret = rdmacg_parse_limits(options, new_limits, &enables);
	if (ret)
		goto parse_err;

	/* acquire lock to synchronize with hot plug devices */
	mutex_lock(&rdmacg_mutex);

	device = rdmacg_get_device_locked(dev_name);
	if (!device) {
		ret = -ENODEV;
		goto dev_err;
	}

	rpool = get_cg_rpool_locked(cg, device);
	if (IS_ERR(rpool)) {
		ret = PTR_ERR(rpool);
		goto dev_err;
	}

	/* now set the new limits of the rpool */
	for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
		set_resource_limit(rpool, i, new_limits[i]);

	if (rpool->usage_sum == 0 &&
	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
		/*
		 * No user of the rpool and all entries are set to max, so
		 * safe to delete this rpool.
		 */
		free_cg_rpool_locked(rpool);
	}

dev_err:
	mutex_unlock(&rdmacg_mutex);

parse_err:
	kfree(new_limits);

err:
	return ret ?: nbytes;
}

static void print_rpool_values(struct seq_file *sf,
			       struct rdmacg_resource_pool *rpool)
{
	enum rdmacg_file_type sf_type;
	int i;
	u32 value;

	sf_type = seq_cft(sf)->private;

	for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
		seq_puts(sf, rdmacg_resource_names[i]);
		seq_putc(sf, '=');
		if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
			if (rpool)
				value = rpool->resources[i].max;
			else
				value = S32_MAX;
		} else {
			if (rpool)
				value = rpool->resources[i].usage;
			else
				value = 0;
		}

		if (value == S32_MAX)
			seq_puts(sf, RDMACG_MAX_STR);
		else
			seq_printf(sf, "%d", value);
		seq_putc(sf, ' ');
	}
}

static int rdmacg_resource_read(struct seq_file *sf, void *v)
{
	struct rdmacg_device *device;
	struct rdmacg_resource_pool *rpool;
	struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));

	mutex_lock(&rdmacg_mutex);

	list_for_each_entry(device, &rdmacg_devices, dev_node) {
		seq_printf(sf, "%s ", device->name);

		rpool = find_cg_rpool_locked(cg, device);
		print_rpool_values(sf, rpool);

		seq_putc(sf, '\n');
	}

	mutex_unlock(&rdmacg_mutex);
	return 0;
}

static struct cftype rdmacg_files[] = {
	{
		.name = "max",
		.write = rdmacg_resource_set_max,
		.seq_show = rdmacg_resource_read,
		.private = RDMACG_RESOURCE_TYPE_MAX,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "current",
		.seq_show = rdmacg_resource_read,
		.private = RDMACG_RESOURCE_TYPE_STAT,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{ }	/* terminate */
};

static struct cgroup_subsys_state *
rdmacg_css_alloc(struct cgroup_subsys_state *parent)
{
	struct rdma_cgroup *cg;

	cg = kzalloc(sizeof(*cg), GFP_KERNEL);
	if (!cg)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&cg->rpools);
	return &cg->css;
}

static void rdmacg_css_free(struct cgroup_subsys_state *css)
{
	struct rdma_cgroup *cg = css_rdmacg(css);

	kfree(cg);
}

/**
 * rdmacg_css_offline - cgroup css_offline callback
 * @css: css of interest
 *
 * This function is called when @css is about to go away and responsible
 * for shooting down all rdmacg associated with @css. As part of that it
 * marks all the resource pool entries to max value, so that when resources are
 * uncharged, associated resource pool can be freed as well.
 */
static void rdmacg_css_offline(struct cgroup_subsys_state *css)
{
	struct rdma_cgroup *cg = css_rdmacg(css);
	struct rdmacg_resource_pool *rpool;

	mutex_lock(&rdmacg_mutex);

	list_for_each_entry(rpool, &cg->rpools, cg_node)
		set_all_resource_max_limit(rpool);

	mutex_unlock(&rdmacg_mutex);
}

struct cgroup_subsys rdma_cgrp_subsys = {
	.css_alloc	= rdmacg_css_alloc,
	.css_free	= rdmacg_css_free,
	.css_offline	= rdmacg_css_offline,
	.legacy_cftypes	= rdmacg_files,
	.dfl_cftypes	= rdmacg_files,
};
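
/*
 * Editor's note -- a minimal sketch (not from the kernel tree) of how an
 * IB driver would consume the interface above: register the device once,
 * then bracket each resource allocation with try_charge/uncharge. Note
 * that, per rdmacg_resource_set_max() above, userspace configures limits
 * by writing "<dev_name> <resource>=<value> ..." to the cgroup's rdma.max
 * file. The demo_* names below are assumptions for illustration only.
 */
static int demo_create_object(struct rdmacg_device *dev)
{
	return 0;	/* hypothetical HCA object allocation */
}

static int demo_alloc_hca_object(struct rdmacg_device *dev)
{
	struct rdma_cgroup *cg;
	int ret;

	/* charge the current task's rdma cgroup (and all its ancestors) */
	ret = rdmacg_try_charge(&cg, dev, RDMACG_RESOURCE_HCA_OBJECT);
	if (ret)
		return ret;	/* e.g. -EAGAIN when a hierarchical limit is hit */

	ret = demo_create_object(dev);
	if (ret)
		rdmacg_uncharge(cg, dev, RDMACG_RESOURCE_HCA_OBJECT);
	/* on success, keep cg and uncharge when the object is destroyed */
	return ret;
}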
1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 | // SPDX-License-Identifier: GPL-2.0 
/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commit routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"
#include <linux/lockdep.h>

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl.) Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record deltas in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
 * - EXT4_FC_TAG_LINK		- records directory entry link
 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE		- records the inode that should be replayed
 *				  during recovery. Note that the iblocks field
 *				  is not replayed but instead derived during
 *				  replay.
 *
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in-memory queue
 * of inodes. During the commit operation, we commit in the following order:
 *
 * [1] Prepare all the inodes to write out their data by setting
 *     "EXT4_STATE_FC_FLUSHING_DATA". This ensures that an inode cannot be
 *     deleted while it is being flushed.
 * [2] Flush data buffers to disk and clear the "EXT4_STATE_FC_FLUSHING_DATA"
 *     state.
 * [3] Lock the journal by calling jbd2_journal_lock_updates. This ensures that
 *     all the existing handles finish and no new handles can start.
 * [4] Mark all the fast commit eligible inodes as undergoing fast commit
 *     by setting the "EXT4_STATE_FC_COMMITTING" state.
 * [5] Unlock the journal by calling jbd2_journal_unlock_updates. This allows
 *     starting of new handles. If new handles try to start an update on
 *     any of the inodes that are being committed, ext4_fc_track_inode()
 *     will block until those inodes have finished the fast commit.
 * [6] Commit all the directory entry updates in the fast commit space.
 * [7] Commit all the changed inodes in the fast commit space and clear
 *     "EXT4_STATE_FC_COMMITTING" for these inodes.
 * [8] Write the tail tag (this tag ensures atomicity; please read the
 *     following section for more details).
 *
 * All the inode updates must be enclosed within jbd2_journal_start()
 * and jbd2_journal_stop() similar to JBD2 journaling.
 *
 * Fast Commit Ineligibility
 * -------------------------
 *
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling
 * ext4_fc_mark_ineligible(): this makes the next fast commit operation fall
 * back to a full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses the "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. The
 * tail tag contains the CRC of the contents and the TID of the transaction
 * after which this fast commit should be applied. Recovery code replays fast
 * commit logs only if there's at least one valid tail present. For every fast
 * commit operation, there is one tail. This means we may end up with multiple
 * tails in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<---  Fast Commit 1  --->|<---    Fast Commit 2     ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * Fast Commit Replay Idempotence
 * ------------------------------
 *
 * Fast commit tags are idempotent in nature provided the recovery code follows
 * certain rules. The guiding principle that the commit path follows while
 * committing is that it stores the result of a particular operation instead of
 * storing the procedure.
 *
 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 * was associated with inode 10. During fast commit, instead of storing this
 * operation as a procedure "rename a to b", we store the resulting file system
 * state as a "series" of outcomes:
 *
 * - Link dirent b to inode 10
 * - Unlink dirent a
 * - Inode <10> with valid refcount
 *
 * Now when the recovery code runs, it needs to "enforce" this state on the
 * file system. This is what guarantees idempotence of fast commit replay.
 *
 * Let's take an example of a procedure that is not idempotent and see how fast
 * commits make it idempotent. Consider the following sequence of operations:
 *
 *     rm A;    mv B A;    read A
 *      (x)       (y)        (z)
 *
 * (x), (y) and (z) are the points at which we can crash. If we store this
 * sequence of operations as is then the replay is not idempotent. Let's say
 * that while in replay, we crash at (z). During the second replay, file A
 * (which was actually created as a result of the "mv B A" operation) would get
 * deleted. Thus, a file named A would be absent when we try to read A. So,
 * this sequence of operations is not idempotent. However, as mentioned above,
 * instead of storing the procedure, fast commits store the outcome of each
 * procedure. Thus the fast commit log for the above procedure would be as
 * follows:
 *
 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 * inode 11 before the replay)
 *
 *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 *        (w)              (x)                (y)          (z)
 *
 * If we crash at (z), we will have file A linked to inode 11. During the second
 * replay, we will remove file A (inode 11). But we will create it back and make
 * it point to inode 11. We won't find B, so we'll just skip that step. At this
 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
 * replay of the last inode 11 tag. Crashes at points (w), (x) and (y) get
 * handled similarly. Thus, by converting a non-idempotent procedure into a
 * series of idempotent outcomes, fast commits ensure idempotence during the
 * replay.
 *
 * Locking
 * -------
 * sbi->s_fc_lock protects the fast commit inodes queue and the fast commit
 * dentry queue. ei->i_fc_lock protects the fast commit related info in a given
 * inode. Most of the code avoids acquiring both the locks, but if one must do
 * that then sbi->s_fc_lock must be acquired before ei->i_fc_lock.
 *
 * TODOs
 * -----
 *
 * 0) Fast commit replay path hardening: Fast commit replay code should use
 *    journal handles to make sure all the updates it does during the replay
 *    path are atomic.
 *    With that, if we crash during fast commit replay, then
 *    after trying to do recovery again, we will find a file system where the
 *    fast commit area is invalid (because a new full commit would be found).
 *    In order to deal with that, fast commit replay code should ensure that
 *    the "FC_REPLAY" superblock state is persisted before starting the
 *    replay, so that after the crash, fast commit recovery code can look at
 *    that flag and perform fast commit recovery even if that area is
 *    invalidated by later full commits.
 *
 * 1) Handle more ineligible cases.
 *
 * 2) Change ext4_fc_commit() to look up the logical to physical mapping using
 *    the extent status tree. This would get rid of the need to call
 *    ext4_fc_track_inode() before acquiring i_data_sem. To do that we would
 *    need to ensure that modified extents from the extent status tree are not
 *    evicted from memory.
 */

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate) {
		ext4_debug("%s: Block %lld up-to-date",
			   __func__, bh->b_blocknr);
		set_buffer_uptodate(bh);
	} else {
		ext4_debug("%s: Block %lld not up-to-date",
			   __func__, bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ei->i_fc_lblk_start = 0;
	ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	INIT_LIST_HEAD(&ei->i_fc_dilist);
	init_waitqueue_head(&ei->i_fc_wait);
}

static bool ext4_fc_disabled(struct super_block *sb)
{
	return (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY));
}

/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_fc_dentry_update *fc_dentry;
	wait_queue_head_t *wq;

	if (ext4_fc_disabled(inode->i_sb))
		return;

	mutex_lock(&sbi->s_fc_lock);
	if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
		mutex_unlock(&sbi->s_fc_lock);
		return;
	}

	/*
	 * Since ext4_fc_del is called from ext4_evict_inode while having a
	 * handle open, there is no need for us to wait here even if a fast
	 * commit is going on. That is because, if this inode is being
	 * committed, ext4_mark_inode_dirty would have waited for the inode
	 * commit operation to finish before we come here. So, by the time we
	 * come here, the inode's EXT4_STATE_FC_COMMITTING would have been
	 * cleared. So, we shouldn't see EXT4_STATE_FC_COMMITTING set on this
	 * inode here.
	 *
	 * We may come here without any handles open in the "no_delete" case of
	 * ext4_evict_inode as well. However, if that happens, we first mark the
	 * file system as fast commit ineligible anyway. So, even in that case,
	 * it is okay to remove the inode from the fc list.
	 */
	WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING) &&
		!ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE));
	while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
#if (BITS_PER_LONG < 64)
		DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
				EXT4_STATE_FC_FLUSHING_DATA);
		wq = bit_waitqueue(&ei->i_state_flags,
				   EXT4_STATE_FC_FLUSHING_DATA);
#else
		DEFINE_WAIT_BIT(wait, &ei->i_flags,
				EXT4_STATE_FC_FLUSHING_DATA);
		wq = bit_waitqueue(&ei->i_flags,
				   EXT4_STATE_FC_FLUSHING_DATA);
#endif
		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
		if (ext4_test_inode_state(inode,
					  EXT4_STATE_FC_FLUSHING_DATA)) {
			mutex_unlock(&sbi->s_fc_lock);
			schedule();
			mutex_lock(&sbi->s_fc_lock);
		}
		finish_wait(wq, &wait.wq_entry);
	}
	list_del_init(&ei->i_fc_list);

	/*
	 * Since this inode is getting removed, let's also remove all FC
	 * dentry create references, since there is no need to log them
	 * anyway.
	 */
	if (list_empty(&ei->i_fc_dilist)) {
		mutex_unlock(&sbi->s_fc_lock);
		return;
	}

	fc_dentry = list_first_entry(&ei->i_fc_dilist,
				     struct ext4_fc_dentry_update, fcd_dilist);
	WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
	list_del_init(&fc_dentry->fcd_list);
	list_del_init(&fc_dentry->fcd_dilist);
	WARN_ON(!list_empty(&ei->i_fc_dilist));
	mutex_unlock(&sbi->s_fc_lock);

	release_dentry_name_snapshot(&fc_dentry->fcd_name);
	kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
}

/*
 * Mark the file system as fast commit ineligible, and record the latest
 * ineligible transaction tid. This means that until the recorded
 * transaction, a commit operation will result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason,
			     handle_t *handle)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	tid_t tid;
	bool has_transaction = true;
	bool is_ineligible;

	if (ext4_fc_disabled(sb))
		return;

	if (handle && !IS_ERR(handle))
		tid = handle->h_transaction->t_tid;
	else {
		read_lock(&sbi->s_journal->j_state_lock);
		if (sbi->s_journal->j_running_transaction)
			tid = sbi->s_journal->j_running_transaction->t_tid;
		else
			has_transaction = false;
		read_unlock(&sbi->s_journal->j_state_lock);
	}
	mutex_lock(&sbi->s_fc_lock);
	is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	if (has_transaction &&
	    (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid)))
		sbi->s_fc_ineligible_tid = tid;
	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	mutex_unlock(&sbi->s_fc_lock);
	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it needs to just
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in the fast commit list.
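 *
 * A minimal usage sketch (mirroring __ext4_fc_track_unlink() below;
 * illustrative only, not an additional entry point):
 *
 *	struct __track_dentry_update_args args;
 *
 *	args.dentry = dentry;
 *	args.op = EXT4_FC_TAG_UNLINK;
 *	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 *				     (void *)&args, 0);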
*/ static int ext4_fc_track_template( handle_t *handle, struct inode *inode, int (*__fc_track_fn)(handle_t *handle, struct inode *, void *, bool), void *args, int enqueue) { bool update = false; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); tid_t tid = 0; int ret; tid = handle->h_transaction->t_tid; spin_lock(&ei->i_fc_lock); if (tid == ei->i_sync_tid) { update = true; } else { ext4_fc_reset_inode(inode); ei->i_sync_tid = tid; } ret = __fc_track_fn(handle, inode, args, update); spin_unlock(&ei->i_fc_lock); if (!enqueue) return ret; mutex_lock(&sbi->s_fc_lock); if (list_empty(&EXT4_I(inode)->i_fc_list)) list_add_tail(&EXT4_I(inode)->i_fc_list, (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ? &sbi->s_fc_q[FC_Q_STAGING] : &sbi->s_fc_q[FC_Q_MAIN]); mutex_unlock(&sbi->s_fc_lock); return ret; } struct __track_dentry_update_args { struct dentry *dentry; int op; }; /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */ static int __track_dentry_update(handle_t *handle, struct inode *inode, void *arg, bool update) { struct ext4_fc_dentry_update *node; struct ext4_inode_info *ei = EXT4_I(inode); struct __track_dentry_update_args *dentry_update = (struct __track_dentry_update_args *)arg; struct dentry *dentry = dentry_update->dentry; struct inode *dir = dentry->d_parent->d_inode; struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); spin_unlock(&ei->i_fc_lock); if (IS_ENCRYPTED(dir)) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME, handle); spin_lock(&ei->i_fc_lock); return -EOPNOTSUPP; } node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); if (!node) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle); spin_lock(&ei->i_fc_lock); return -ENOMEM; } node->fcd_op = dentry_update->op; node->fcd_parent = dir->i_ino; node->fcd_ino = inode->i_ino; take_dentry_name_snapshot(&node->fcd_name, dentry); INIT_LIST_HEAD(&node->fcd_dilist); INIT_LIST_HEAD(&node->fcd_list); mutex_lock(&sbi->s_fc_lock); if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_STAGING]); else list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]); /* * This helps us keep a track of all fc_dentry updates which is part of * this ext4 inode. So in case the inode is getting unlinked, before * even we get a chance to fsync, we could remove all fc_dentry * references while evicting the inode in ext4_fc_del(). * Also with this, we don't need to loop over all the inodes in * sbi->s_fc_q to get the corresponding inode in * ext4_fc_commit_dentry_updates(). 
*/ if (dentry_update->op == EXT4_FC_TAG_CREAT) { WARN_ON(!list_empty(&ei->i_fc_dilist)); list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist); } mutex_unlock(&sbi->s_fc_lock); spin_lock(&ei->i_fc_lock); return 0; } void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode, struct dentry *dentry) { struct __track_dentry_update_args args; int ret; args.dentry = dentry; args.op = EXT4_FC_TAG_UNLINK; ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); trace_ext4_fc_track_unlink(handle, inode, dentry, ret); } void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; __ext4_fc_track_unlink(handle, inode, dentry); } void __ext4_fc_track_link(handle_t *handle, struct inode *inode, struct dentry *dentry) { struct __track_dentry_update_args args; int ret; args.dentry = dentry; args.op = EXT4_FC_TAG_LINK; ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); trace_ext4_fc_track_link(handle, inode, dentry, ret); } void ext4_fc_track_link(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; __ext4_fc_track_link(handle, inode, dentry); } void __ext4_fc_track_create(handle_t *handle, struct inode *inode, struct dentry *dentry) { struct __track_dentry_update_args args; int ret; args.dentry = dentry; args.op = EXT4_FC_TAG_CREAT; ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); trace_ext4_fc_track_create(handle, inode, dentry, ret); } void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; __ext4_fc_track_create(handle, inode, dentry); } /* __track_fn for inode tracking */ static int __track_inode(handle_t *handle, struct inode *inode, void *arg, bool update) { if (update) return -EEXIST; EXT4_I(inode)->i_fc_lblk_len = 0; return 0; } void ext4_fc_track_inode(handle_t *handle, struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); wait_queue_head_t *wq; int ret; if (S_ISDIR(inode->i_mode)) return; if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_should_journal_data(inode)) { ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_INODE_JOURNAL_DATA, handle); return; } if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; /* * If we come here, we may sleep while waiting for the inode to * commit. We shouldn't be holding i_data_sem when we go to sleep since * the commit path needs to grab the lock while committing the inode. */ lockdep_assert_not_held(&ei->i_data_sem); while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { #if (BITS_PER_LONG < 64) DEFINE_WAIT_BIT(wait, &ei->i_state_flags, EXT4_STATE_FC_COMMITTING); wq = bit_waitqueue(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING); #else DEFINE_WAIT_BIT(wait, &ei->i_flags, EXT4_STATE_FC_COMMITTING); wq = bit_waitqueue(&ei->i_flags, EXT4_STATE_FC_COMMITTING); #endif prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) schedule(); finish_wait(wq, &wait.wq_entry); } /* * From this point on, this inode will not be committed either * by fast or full commit as long as the handle is open. 
*/ ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1); trace_ext4_fc_track_inode(handle, inode, ret); } struct __track_range_args { ext4_lblk_t start, end; }; /* __track_fn for tracking data updates */ static int __track_range(handle_t *handle, struct inode *inode, void *arg, bool update) { struct ext4_inode_info *ei = EXT4_I(inode); ext4_lblk_t oldstart; struct __track_range_args *__arg = (struct __track_range_args *)arg; if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) { ext4_debug("Special inode %ld being modified\n", inode->i_ino); return -ECANCELED; } oldstart = ei->i_fc_lblk_start; if (update && ei->i_fc_lblk_len > 0) { ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start); ei->i_fc_lblk_len = max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) - ei->i_fc_lblk_start + 1; } else { ei->i_fc_lblk_start = __arg->start; ei->i_fc_lblk_len = __arg->end - __arg->start + 1; } return 0; } void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end) { struct __track_range_args args; int ret; if (S_ISDIR(inode->i_mode)) return; if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; if (ext4_has_inline_data(inode)) { ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); return; } args.start = start; args.end = end; ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1); trace_ext4_fc_track_range(handle, inode, start, end, ret); } static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) { blk_opf_t write_flags = REQ_SYNC; struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh; /* Add REQ_FUA | REQ_PREFLUSH only its tail */ if (test_opt(sb, BARRIER) && is_tail) write_flags |= REQ_FUA | REQ_PREFLUSH; lock_buffer(bh); set_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = ext4_end_buffer_io_sync; submit_bh(REQ_OP_WRITE | write_flags, bh); EXT4_SB(sb)->s_fc_bh = NULL; } /* Ext4 commit path routines */ /* * Allocate len bytes on a fast commit buffer. * * During the commit time this function is used to manage fast commit * block space. We don't split a fast commit log onto different * blocks. So this function makes sure that if there's not enough space * on the current block, the remaining space in the current block is * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case, * new block is from jbd2 and CRC is updated to reflect the padding * we added. */ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) { struct ext4_fc_tl tl; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh; int bsize = sbi->s_journal->j_blocksize; int ret, off = sbi->s_fc_bytes % bsize; int remaining; u8 *dst; /* * If 'len' is too long to fit in any block alongside a PAD tlv, then we * cannot fulfill the request. */ if (len > bsize - EXT4_FC_TAG_BASE_LEN) return NULL; if (!sbi->s_fc_bh) { ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); if (ret) return NULL; sbi->s_fc_bh = bh; } dst = sbi->s_fc_bh->b_data + off; /* * Allocate the bytes in the current block if we can do so while still * leaving enough space for a PAD tlv. */ remaining = bsize - EXT4_FC_TAG_BASE_LEN - off; if (len <= remaining) { sbi->s_fc_bytes += len; return dst; } /* * Else, terminate the current block with a PAD tlv, then allocate a new * block and allocate the bytes at the start of that new block. 
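 *
 * Sketch of the layout this produces (illustrative only, not to scale):
 *
 *	|<--------- current block --------->|<-------- new block --------
 *	| ...in use... | PAD tl | zeroes    | len requested bytes ...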
*/ tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); tl.fc_len = cpu_to_le16(remaining); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining); *crc = ext4_chksum(*crc, sbi->s_fc_bh->b_data, bsize); ext4_fc_submit_bh(sb, false); ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); if (ret) return NULL; sbi->s_fc_bh = bh; sbi->s_fc_bytes += bsize - off + len; return sbi->s_fc_bh->b_data; } /* * Complete a fast commit by writing tail tag. * * Writing tail tag marks the end of a fast commit. In order to guarantee * atomicity, after writing tail tag, even if there's space remaining * in the block, next commit shouldn't use it. That's why tail tag * has the length as that of the remaining space on the block. */ static int ext4_fc_write_tail(struct super_block *sb, u32 crc) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_tl tl; struct ext4_fc_tail tail; int off, bsize = sbi->s_journal->j_blocksize; u8 *dst; /* * ext4_fc_reserve_space takes care of allocating an extra block if * there's no enough space on this block for accommodating this tail. */ dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc); if (!dst) return -ENOSPC; off = sbi->s_fc_bytes % bsize; tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL); tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail)); sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); dst += EXT4_FC_TAG_BASE_LEN; tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid)); dst += sizeof(tail.fc_tid); crc = ext4_chksum(crc, sbi->s_fc_bh->b_data, dst - (u8 *)sbi->s_fc_bh->b_data); tail.fc_crc = cpu_to_le32(crc); memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc)); dst += sizeof(tail.fc_crc); memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */ ext4_fc_submit_bh(sb, true); return 0; } /* * Adds tag, length, value and updates CRC. Returns true if tlv was added. * Returns false if there's not enough space. */ static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, u32 *crc) { struct ext4_fc_tl tl; u8 *dst; dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc); if (!dst) return false; tl.fc_tag = cpu_to_le16(tag); tl.fc_len = cpu_to_le16(len); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len); return true; } /* Same as above, but adds dentry tlv. */ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, struct ext4_fc_dentry_update *fc_dentry) { struct ext4_fc_dentry_info fcd; struct ext4_fc_tl tl; int dlen = fc_dentry->fcd_name.name.len; u8 *dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc); if (!dst) return false; fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent); fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino); tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op); tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); dst += EXT4_FC_TAG_BASE_LEN; memcpy(dst, &fcd, sizeof(fcd)); dst += sizeof(fcd); memcpy(dst, fc_dentry->fcd_name.name.name, dlen); return true; } /* * Writes inode in the fast commit space under TLV with tag @tag. * Returns 0 on success, error on failure. 
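 *
 * The value written is the inode number followed by the raw on-disk inode,
 * i.e. the TLV looks like this (sketch, per struct ext4_fc_tl and struct
 * ext4_fc_inode as used in the body below):
 *
 *	[fc_tag = EXT4_FC_TAG_INODE][fc_len][fc_ino][raw struct ext4_inode]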
*/ static int ext4_fc_write_inode(struct inode *inode, u32 *crc) { struct ext4_inode_info *ei = EXT4_I(inode); int inode_len = EXT4_GOOD_OLD_INODE_SIZE; int ret; struct ext4_iloc iloc; struct ext4_fc_inode fc_inode; struct ext4_fc_tl tl; u8 *dst; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) inode_len = EXT4_INODE_SIZE(inode->i_sb); else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) inode_len += ei->i_extra_isize; fc_inode.fc_ino = cpu_to_le32(inode->i_ino); tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE); tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino)); ret = -ECANCELED; dst = ext4_fc_reserve_space(inode->i_sb, EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc); if (!dst) goto err; memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); dst += EXT4_FC_TAG_BASE_LEN; memcpy(dst, &fc_inode, sizeof(fc_inode)); dst += sizeof(fc_inode); memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len); ret = 0; err: brelse(iloc.bh); return ret; } /* * Writes updated data ranges for the inode in question. Updates CRC. * Returns 0 on success, error otherwise. */ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) { ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_map_blocks map; struct ext4_fc_add_range fc_ext; struct ext4_fc_del_range lrange; struct ext4_extent *ex; int ret; spin_lock(&ei->i_fc_lock); if (ei->i_fc_lblk_len == 0) { spin_unlock(&ei->i_fc_lock); return 0; } old_blk_size = ei->i_fc_lblk_start; new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; ei->i_fc_lblk_len = 0; spin_unlock(&ei->i_fc_lock); cur_lblk_off = old_blk_size; ext4_debug("will try writing %d to %d for inode %ld\n", cur_lblk_off, new_blk_size, inode->i_ino); while (cur_lblk_off <= new_blk_size) { map.m_lblk = cur_lblk_off; map.m_len = new_blk_size - cur_lblk_off + 1; ret = ext4_map_blocks(NULL, inode, &map, EXT4_GET_BLOCKS_IO_SUBMIT | EXT4_EX_NOCACHE); if (ret < 0) return -ECANCELED; if (map.m_len == 0) { cur_lblk_off++; continue; } if (ret == 0) { lrange.fc_ino = cpu_to_le32(inode->i_ino); lrange.fc_lblk = cpu_to_le32(map.m_lblk); lrange.fc_len = cpu_to_le32(map.m_len); if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE, sizeof(lrange), (u8 *)&lrange, crc)) return -ENOSPC; } else { unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ? EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN; /* Limit the number of blocks in one extent */ map.m_len = min(max, map.m_len); fc_ext.fc_ino = cpu_to_le32(inode->i_ino); ex = (struct ext4_extent *)&fc_ext.fc_ex; ex->ee_block = cpu_to_le32(map.m_lblk); ex->ee_len = cpu_to_le16(map.m_len); ext4_ext_store_pblock(ex, map.m_pblk); if (map.m_flags & EXT4_MAP_UNWRITTEN) ext4_ext_mark_unwritten(ex); else ext4_ext_mark_initialized(ex); if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE, sizeof(fc_ext), (u8 *)&fc_ext, crc)) return -ENOSPC; } cur_lblk_off += map.m_len; } return 0; } /* Flushes data of all the inodes in the commit queue. 
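 * This is done in two passes over sbi->s_fc_q[FC_Q_MAIN]: first submit the
 * data for every inode, then wait on all of it, so writeback for the whole
 * queue is in flight before we block on any single inode.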
*/ static int ext4_fc_flush_data(journal_t *journal) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *ei; int ret = 0; list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ret = jbd2_submit_inode_data(journal, ei->jinode); if (ret) return ret; } list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ret = jbd2_wait_inode_data(journal, ei->jinode); if (ret) return ret; } return 0; } /* Commit all the directory entry updates */ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n; struct inode *inode; struct ext4_inode_info *ei; int ret; if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) return 0; list_for_each_entry_safe(fc_dentry, fc_dentry_n, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) return -ENOSPC; continue; } /* * With fcd_dilist we need not loop in sbi->s_fc_q to get the * corresponding inode. Also, the corresponding inode could have been * deleted, in which case, we don't need to do anything. */ if (list_empty(&fc_dentry->fcd_dilist)) continue; ei = list_first_entry(&fc_dentry->fcd_dilist, struct ext4_inode_info, i_fc_dilist); inode = &ei->vfs_inode; WARN_ON(inode->i_ino != fc_dentry->fcd_ino); /* * We first write the inode and then the create dirent. This * allows the recovery code to create an unnamed inode first * and then link it to a directory entry. This allows us * to use namei.c routines almost as is and simplifies * the recovery code. */ ret = ext4_fc_write_inode(inode, crc); if (ret) return ret; ret = ext4_fc_write_inode_data(inode, crc); if (ret) return ret; if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) return -ENOSPC; } return 0; } static int ext4_fc_perform_commit(journal_t *journal) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *iter; struct ext4_fc_head head; struct inode *inode; struct blk_plug plug; int ret = 0; u32 crc = 0; /* * Step 1: Mark all inodes on s_fc_q[MAIN] with * EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being * freed until the data flush is over. */ mutex_lock(&sbi->s_fc_lock); list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_set_inode_state(&iter->vfs_inode, EXT4_STATE_FC_FLUSHING_DATA); } mutex_unlock(&sbi->s_fc_lock); /* Step 2: Flush data for all the eligible inodes. */ ret = ext4_fc_flush_data(journal); /* * Step 3: Clear EXT4_STATE_FC_FLUSHING_DATA flag, before returning * any error from step 2. This ensures that waiters waiting on * EXT4_STATE_FC_FLUSHING_DATA can resume. */ mutex_lock(&sbi->s_fc_lock); list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_clear_inode_state(&iter->vfs_inode, EXT4_STATE_FC_FLUSHING_DATA); #if (BITS_PER_LONG < 64) wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA); #else wake_up_bit(&iter->i_flags, EXT4_STATE_FC_FLUSHING_DATA); #endif } /* * Make sure clearing of EXT4_STATE_FC_FLUSHING_DATA is visible before * the waiter checks the bit. Pairs with implicit barrier in * prepare_to_wait() in ext4_fc_del(). */ smp_mb(); mutex_unlock(&sbi->s_fc_lock); /* * If we encountered error in Step 2, return it now after clearing * EXT4_STATE_FC_FLUSHING_DATA bit. */ if (ret) return ret; /* Step 4: Mark all inodes as being committed. 
*/ jbd2_journal_lock_updates(journal); /* * The journal is now locked. No more handles can start and all the * previous handles are now drained. We now mark the inodes on the * commit queue as being committed. */ mutex_lock(&sbi->s_fc_lock); list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_set_inode_state(&iter->vfs_inode, EXT4_STATE_FC_COMMITTING); } mutex_unlock(&sbi->s_fc_lock); jbd2_journal_unlock_updates(journal); /* * Step 5: If file system device is different from journal device, * issue a cache flush before we start writing fast commit blocks. */ if (journal->j_fs_dev != journal->j_dev) blkdev_issue_flush(journal->j_fs_dev); blk_start_plug(&plug); /* Step 6: Write fast commit blocks to disk. */ if (sbi->s_fc_bytes == 0) { /* * Step 6.1: Add a head tag only if this is the first fast * commit in this TID. */ head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); head.fc_tid = cpu_to_le32( sbi->s_journal->j_running_transaction->t_tid); if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head), (u8 *)&head, &crc)) { ret = -ENOSPC; goto out; } } /* Step 6.2: Now write all the dentry updates. */ mutex_lock(&sbi->s_fc_lock); ret = ext4_fc_commit_dentry_updates(journal, &crc); if (ret) goto out; /* Step 6.3: Now write all the changed inodes to disk. */ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { inode = &iter->vfs_inode; if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) continue; ret = ext4_fc_write_inode_data(inode, &crc); if (ret) goto out; ret = ext4_fc_write_inode(inode, &crc); if (ret) goto out; } /* Step 6.4: Finally write tail tag to conclude this fast commit. */ ret = ext4_fc_write_tail(sb, crc); out: mutex_unlock(&sbi->s_fc_lock); blk_finish_plug(&plug); return ret; } static void ext4_fc_update_stats(struct super_block *sb, int status, u64 commit_time, int nblks, tid_t commit_tid) { struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats; ext4_debug("Fast commit ended with status = %d for tid %u", status, commit_tid); if (status == EXT4_FC_STATUS_OK) { stats->fc_num_commits++; stats->fc_numblks += nblks; if (likely(stats->s_fc_avg_commit_time)) stats->s_fc_avg_commit_time = (commit_time + stats->s_fc_avg_commit_time * 3) / 4; else stats->s_fc_avg_commit_time = commit_time; } else if (status == EXT4_FC_STATUS_FAILED || status == EXT4_FC_STATUS_INELIGIBLE) { if (status == EXT4_FC_STATUS_FAILED) stats->fc_failed_commits++; stats->fc_ineligible_commits++; } else { stats->fc_skipped_commits++; } trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid); } /* * The main commit entry point. Performs a fast commit for transaction * commit_tid if needed. If it's not possible to perform a fast commit * due to various reasons, we fall back to full commit. Returns 0 * on success, error otherwise. 
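 *
 * A minimal sketch of a call site (illustrative only; tid_of_interest is a
 * hypothetical variable naming the transaction the caller wants on stable
 * storage):
 *
 *	ret = ext4_fc_commit(journal, tid_of_interest);
 *	(on success, everything tracked up to that tid is durable)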
*/ int ext4_fc_commit(journal_t *journal, tid_t commit_tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); int nblks = 0, ret, bsize = journal->j_blocksize; int subtid = atomic_read(&sbi->s_fc_subtid); int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0; ktime_t start_time, commit_time; int old_ioprio, journal_ioprio; if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) return jbd2_complete_transaction(journal, commit_tid); trace_ext4_fc_commit_start(sb, commit_tid); start_time = ktime_get(); old_ioprio = get_current_ioprio(); restart_fc: ret = jbd2_fc_begin_commit(journal, commit_tid); if (ret == -EALREADY) { /* There was an ongoing commit, check if we need to restart */ if (atomic_read(&sbi->s_fc_subtid) <= subtid && tid_gt(commit_tid, journal->j_commit_sequence)) goto restart_fc; ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0, commit_tid); return 0; } else if (ret) { /* * Commit couldn't start. Just update stats and perform a * full commit. */ ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0, commit_tid); return jbd2_complete_transaction(journal, commit_tid); } /* * After establishing journal barrier via jbd2_fc_begin_commit(), check * if we are fast commit ineligible. */ if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) { status = EXT4_FC_STATUS_INELIGIBLE; goto fallback; } /* * Now that we know that this thread is going to do a fast commit, * elevate the priority to match that of the journal thread. */ if (journal->j_task->io_context) journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; else journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; set_task_ioprio(current, journal_ioprio); fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; ret = ext4_fc_perform_commit(journal); if (ret < 0) { status = EXT4_FC_STATUS_FAILED; goto fallback; } nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before; ret = jbd2_fc_wait_bufs(journal, nblks); if (ret < 0) { status = EXT4_FC_STATUS_FAILED; goto fallback; } atomic_inc(&sbi->s_fc_subtid); ret = jbd2_fc_end_commit(journal); set_task_ioprio(current, old_ioprio); /* * weight the commit time higher than the average time so we * don't react too strongly to vast changes in the commit time */ commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid); return ret; fallback: set_task_ioprio(current, old_ioprio); ret = jbd2_fc_end_commit_fallback(journal); ext4_fc_update_stats(sb, status, 0, 0, commit_tid); return ret; } /* * Fast commit cleanup routine. This is called after every fast commit and * full commit. full is true if we are called after a full commit. */ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *ei; struct ext4_fc_dentry_update *fc_dentry; if (full && sbi->s_fc_bh) sbi->s_fc_bh = NULL; trace_ext4_fc_cleanup(journal, full, tid); jbd2_fc_release_bufs(journal); mutex_lock(&sbi->s_fc_lock); while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) { ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN], struct ext4_inode_info, i_fc_list); list_del_init(&ei->i_fc_list); ext4_clear_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); if (tid_geq(tid, ei->i_sync_tid)) { ext4_fc_reset_inode(&ei->vfs_inode); } else if (full) { /* * We are called after a full commit, inode has been * modified while the commit was running. Re-enqueue * the inode into STAGING, which will then be splice * back into MAIN. 
This cannot happen during * fastcommit because the journal is locked all the * time in that case (and tid doesn't increase so * tid check above isn't reliable). */ list_add_tail(&ei->i_fc_list, &sbi->s_fc_q[FC_Q_STAGING]); } /* * Make sure clearing of EXT4_STATE_FC_COMMITTING is * visible before we send the wakeup. Pairs with implicit * barrier in prepare_to_wait() in ext4_fc_track_inode(). */ smp_mb(); #if (BITS_PER_LONG < 64) wake_up_bit(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING); #else wake_up_bit(&ei->i_flags, EXT4_STATE_FC_COMMITTING); #endif } while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) { fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN], struct ext4_fc_dentry_update, fcd_list); list_del_init(&fc_dentry->fcd_list); list_del_init(&fc_dentry->fcd_dilist); release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); } list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], &sbi->s_fc_dentry_q[FC_Q_MAIN]); list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], &sbi->s_fc_q[FC_Q_MAIN]); if (tid_geq(tid, sbi->s_fc_ineligible_tid)) { sbi->s_fc_ineligible_tid = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); } if (full) sbi->s_fc_bytes = 0; mutex_unlock(&sbi->s_fc_lock); trace_ext4_fc_stats(sb); } /* Ext4 Replay Path Routines */ /* Helper struct for dentry replay routines */ struct dentry_info_args { int parent_ino, dname_len, ino, inode_len; char *dname; }; /* Same as struct ext4_fc_tl, but uses native endianness fields */ struct ext4_fc_tl_mem { u16 fc_tag; u16 fc_len; }; static inline void tl_to_darg(struct dentry_info_args *darg, struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_dentry_info fcd; memcpy(&fcd, val, sizeof(fcd)); darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino); darg->ino = le32_to_cpu(fcd.fc_ino); darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname); darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info); } static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_tl tl_disk; memcpy(&tl_disk, val, EXT4_FC_TAG_BASE_LEN); tl->fc_len = le16_to_cpu(tl_disk.fc_len); tl->fc_tag = le16_to_cpu(tl_disk.fc_tag); } /* Unlink replay function */ static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct inode *inode, *old_parent; struct qstr entry; struct dentry_info_args darg; int ret = 0; tl_to_darg(&darg, tl, val); trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino, darg.parent_ino, darg.dname_len); entry.name = darg.dname; entry.len = darg.dname_len; inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode %d not found", darg.ino); return 0; } old_parent = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); if (IS_ERR(old_parent)) { ext4_debug("Dir with inode %d not found", darg.parent_ino); iput(inode); return 0; } ret = __ext4_unlink(old_parent, &entry, inode, NULL); /* -ENOENT ok coz it might not exist anymore. 
*/ if (ret == -ENOENT) ret = 0; iput(old_parent); iput(inode); return ret; } static int ext4_fc_replay_link_internal(struct super_block *sb, struct dentry_info_args *darg, struct inode *inode) { struct inode *dir = NULL; struct dentry *dentry_dir = NULL, *dentry_inode = NULL; struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len); int ret = 0; dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL); if (IS_ERR(dir)) { ext4_debug("Dir with inode %d not found.", darg->parent_ino); dir = NULL; goto out; } dentry_dir = d_obtain_alias(dir); if (IS_ERR(dentry_dir)) { ext4_debug("Failed to obtain dentry"); dentry_dir = NULL; goto out; } dentry_inode = d_alloc(dentry_dir, &qstr_dname); if (!dentry_inode) { ext4_debug("Inode dentry not created."); ret = -ENOMEM; goto out; } ret = __ext4_link(dir, inode, dentry_inode); /* * It's possible that link already existed since data blocks * for the dir in question got persisted before we crashed OR * we replayed this tag and crashed before the entire replay * could complete. */ if (ret && ret != -EEXIST) { ext4_debug("Failed to link\n"); goto out; } ret = 0; out: if (dentry_dir) { d_drop(dentry_dir); dput(dentry_dir); } else if (dir) { iput(dir); } if (dentry_inode) { d_drop(dentry_inode); dput(dentry_inode); } return ret; } /* Link replay function */ static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct inode *inode; struct dentry_info_args darg; int ret = 0; tl_to_darg(&darg, tl, val); trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino, darg.parent_ino, darg.dname_len); inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode not found."); return 0; } ret = ext4_fc_replay_link_internal(sb, &darg, inode); iput(inode); return ret; } /* * Record all the modified inodes during replay. We use this later to setup * block bitmaps correctly. 
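 *
 * For example, replaying an ADD_RANGE tag for inode N records N here, so
 * that ext4_fc_set_bitmaps_and_counters() can later walk inode N's extent
 * tree and mark the blocks it references as in use.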
*/ static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) { struct ext4_fc_replay_state *state; int i; state = &EXT4_SB(sb)->s_fc_replay_state; for (i = 0; i < state->fc_modified_inodes_used; i++) if (state->fc_modified_inodes[i] == ino) return 0; if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) { int *fc_modified_inodes; fc_modified_inodes = krealloc(state->fc_modified_inodes, sizeof(int) * (state->fc_modified_inodes_size + EXT4_FC_REPLAY_REALLOC_INCREMENT), GFP_KERNEL); if (!fc_modified_inodes) return -ENOMEM; state->fc_modified_inodes = fc_modified_inodes; state->fc_modified_inodes_size += EXT4_FC_REPLAY_REALLOC_INCREMENT; } state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino; return 0; } /* * Inode replay function */ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_inode fc_inode; struct ext4_inode *raw_inode; struct ext4_inode *raw_fc_inode; struct inode *inode = NULL; struct ext4_iloc iloc; int inode_len, ino, ret, tag = tl->fc_tag; struct ext4_extent_header *eh; size_t off_gen = offsetof(struct ext4_inode, i_generation); memcpy(&fc_inode, val, sizeof(fc_inode)); ino = le32_to_cpu(fc_inode.fc_ino); trace_ext4_fc_replay(sb, tag, ino, 0, 0); inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (!IS_ERR(inode)) { ext4_ext_clear_bb(inode); iput(inode); } inode = NULL; ret = ext4_fc_record_modified_inode(sb, ino); if (ret) goto out; raw_fc_inode = (struct ext4_inode *) (val + offsetof(struct ext4_fc_inode, fc_raw_inode)); ret = ext4_get_fc_inode_loc(sb, ino, &iloc); if (ret) goto out; inode_len = tl->fc_len - sizeof(struct ext4_fc_inode); raw_inode = ext4_raw_inode(&iloc); memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block)); memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen, inode_len - off_gen); if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) { eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]); if (eh->eh_magic != EXT4_EXT_MAGIC) { memset(eh, 0, sizeof(*eh)); eh->eh_magic = EXT4_EXT_MAGIC; eh->eh_max = cpu_to_le16( (sizeof(raw_inode->i_block) - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent)); } } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) { memcpy(raw_inode->i_block, raw_fc_inode->i_block, sizeof(raw_inode->i_block)); } /* Immediately update the inode on disk. */ ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); if (ret) goto out; ret = sync_dirty_buffer(iloc.bh); if (ret) goto out; ret = ext4_mark_inode_used(sb, ino); if (ret) goto out; /* Given that we just wrote the inode on disk, this SHOULD succeed. */ inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode not found."); return -EFSCORRUPTED; } /* * Our allocator could have made different decisions than before * crashing. This should be fixed but until then, we calculate * the number of blocks the inode. */ if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) ext4_ext_replay_set_iblocks(inode); inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation); ext4_reset_inode_seed(inode); ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode)); ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); sync_dirty_buffer(iloc.bh); brelse(iloc.bh); out: iput(inode); if (!ret) blkdev_issue_flush(sb->s_bdev); return 0; } /* * Dentry create replay function. * * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. 
 * Which means the inode for which we are trying to create a dentry here
 * should already have been replayed before we start here.
 */
static int ext4_fc_replay_create(struct super_block *sb,
				 struct ext4_fc_tl_mem *tl, u8 *val)
{
	int ret = 0;
	struct inode *inode = NULL;
	struct inode *dir = NULL;
	struct dentry_info_args darg;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
			     darg.parent_ino, darg.dname_len);

	/* This takes care of updating the group descriptor and other metadata */
	ret = ext4_mark_inode_used(sb, darg.ino);
	if (ret)
		goto out;

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		ext4_debug("inode %d not found.", darg.ino);
		inode = NULL;
		ret = -EINVAL;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		/*
		 * If we are creating a directory, we need to make sure that
		 * the dot and dot dot dirents are set up properly.
		 */
		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
		if (IS_ERR(dir)) {
			ext4_debug("Dir with inode %d not found.",
				   darg.parent_ino);
			goto out;
		}
		ret = ext4_init_new_dir(NULL, dir, inode);
		iput(dir);
		if (ret) {
			ret = 0;
			goto out;
		}
	}
	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	if (ret)
		goto out;
	set_nlink(inode, 1);
	ext4_mark_inode_dirty(NULL, inode);
out:
	iput(inode);
	return ret;
}

/*
 * Record physical disk regions which are in use as per the fast commit area,
 * and used by inodes during the replay phase. Our simple replay phase
 * allocator excludes these regions from allocation.
 */
int ext4_fc_record_regions(struct super_block *sb, int ino,
			   ext4_lblk_t lblk, ext4_fsblk_t pblk,
			   int len, int replay)
{
	struct ext4_fc_replay_state *state;
	struct ext4_fc_alloc_region *region;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	/*
	 * During the replay phase, fc_regions_valid may not be the same as
	 * fc_regions_used; update it when recording new additions.
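	 * (Regions recorded during the scan phase after the last valid tail
	 * belong to an aborted fast commit and are stale; resetting
	 * fc_regions_used drops those entries before new ones are appended.)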
*/ if (replay && state->fc_regions_used != state->fc_regions_valid) state->fc_regions_used = state->fc_regions_valid; if (state->fc_regions_used == state->fc_regions_size) { struct ext4_fc_alloc_region *fc_regions; fc_regions = krealloc(state->fc_regions, sizeof(struct ext4_fc_alloc_region) * (state->fc_regions_size + EXT4_FC_REPLAY_REALLOC_INCREMENT), GFP_KERNEL); if (!fc_regions) return -ENOMEM; state->fc_regions_size += EXT4_FC_REPLAY_REALLOC_INCREMENT; state->fc_regions = fc_regions; } region = &state->fc_regions[state->fc_regions_used++]; region->ino = ino; region->lblk = lblk; region->pblk = pblk; region->len = len; if (replay) state->fc_regions_valid++; return 0; } /* Replay add range tag */ static int ext4_fc_replay_add_range(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_add_range fc_add_ex; struct ext4_extent newex, *ex; struct inode *inode; ext4_lblk_t start, cur; int remaining, len; ext4_fsblk_t start_pblk; struct ext4_map_blocks map; struct ext4_ext_path *path = NULL; int ret; memcpy(&fc_add_ex, val, sizeof(fc_add_ex)); ex = (struct ext4_extent *)&fc_add_ex.fc_ex; trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE, le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block), ext4_ext_get_actual_len(ex)); inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode not found."); return 0; } ret = ext4_fc_record_modified_inode(sb, inode->i_ino); if (ret) goto out; start = le32_to_cpu(ex->ee_block); start_pblk = ext4_ext_pblock(ex); len = ext4_ext_get_actual_len(ex); cur = start; remaining = len; ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n", start, start_pblk, len, ext4_ext_is_unwritten(ex), inode->i_ino); while (remaining > 0) { map.m_lblk = cur; map.m_len = remaining; map.m_pblk = 0; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) goto out; if (ret == 0) { /* Range is not mapped */ path = ext4_find_extent(inode, cur, path, 0); if (IS_ERR(path)) goto out; memset(&newex, 0, sizeof(newex)); newex.ee_block = cpu_to_le32(cur); ext4_ext_store_pblock( &newex, start_pblk + cur - start); newex.ee_len = cpu_to_le16(map.m_len); if (ext4_ext_is_unwritten(ex)) ext4_ext_mark_unwritten(&newex); down_write(&EXT4_I(inode)->i_data_sem); path = ext4_ext_insert_extent(NULL, inode, path, &newex, 0); up_write((&EXT4_I(inode)->i_data_sem)); if (IS_ERR(path)) goto out; goto next; } if (start_pblk + cur - start != map.m_pblk) { /* * Logical to physical mapping changed. This can happen * if this range was removed and then reallocated to * map to new physical blocks during a fast commit. */ ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, ext4_ext_is_unwritten(ex), start_pblk + cur - start); if (ret) goto out; /* * Mark the old blocks as free since they aren't used * anymore. We maintain an array of all the modified * inodes. In case these blocks are still used at either * a different logical range in the same inode or in * some different inode, we will mark them as allocated * at the end of the FC replay using our array of * modified inodes. */ ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); goto next; } /* Range is mapped and needs a state change */ ext4_debug("Converting from %ld to %d %lld", map.m_flags & EXT4_MAP_UNWRITTEN, ext4_ext_is_unwritten(ex), map.m_pblk); ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, ext4_ext_is_unwritten(ex), map.m_pblk); if (ret) goto out; /* * We may have split the extent tree while toggling the state. 
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	ext4_ext_replay_shrink_inode(inode,
		i_size_read(inode) >> sb->s_blocksize_bits);
out:
	ext4_free_ext_path(path);
	iput(inode);
	return 0;
}

/* Replay DEL_RANGE tag */
static int ext4_fc_replay_del_range(struct super_block *sb,
				    struct ext4_fc_tl_mem *tl, u8 *val)
{
	struct inode *inode;
	struct ext4_fc_del_range lrange;
	struct ext4_map_blocks map;
	ext4_lblk_t cur, remaining;
	int ret;

	memcpy(&lrange, val, sizeof(lrange));
	cur = le32_to_cpu(lrange.fc_lblk);
	remaining = le32_to_cpu(lrange.fc_len);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
			     le32_to_cpu(lrange.fc_ino), cur, remaining);

	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino));
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
	if (ret)
		goto out;

	ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n",
		   inode->i_ino, le32_to_cpu(lrange.fc_lblk),
		   le32_to_cpu(lrange.fc_len));
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;

		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			remaining -= ret;
			cur += ret;
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len,
					false);
		} else {
			remaining -= map.m_len;
			cur += map.m_len;
		}
	}

	down_write(&EXT4_I(inode)->i_data_sem);
	ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
				    le32_to_cpu(lrange.fc_lblk) +
				    le32_to_cpu(lrange.fc_len) - 1);
	up_write(&EXT4_I(inode)->i_data_sem);
	if (ret)
		goto out;
	ext4_ext_replay_shrink_inode(inode,
		i_size_read(inode) >> sb->s_blocksize_bits);
	ext4_mark_inode_dirty(NULL, inode);
out:
	iput(inode);
	return 0;
}

static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
				  EXT4_IGET_NORMAL);
		if (IS_ERR(inode)) {
			ext4_debug("Inode %d not found.",
				   state->fc_modified_inodes[i]);
			continue;
		}
		cur = 0;
		end = EXT_MAX_BLOCKS;
		if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
			iput(inode);
			continue;
		}
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				path = ext4_find_extent(inode, map.m_lblk,
							path, 0);
				if (!IS_ERR(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1,
							true);
				} else {
					path = NULL;
				}
				cur += ret;
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
						map.m_len, true);
			} else {
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
	ext4_free_ext_path(path);
}

/*
 * Check if a block is in the excluded regions for block allocation. The simple
 * allocator that runs during the replay phase calls this function to see if it
 * is okay to use a block.
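 *
 * Intended use by such an allocator (sketch; get_candidate_pblk() is a
 * hypothetical helper):
 *
 *	do {
 *		pblk = get_candidate_pblk(sb);
 *	} while (ext4_fc_replay_check_excluded(sb, pblk));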
*/ bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk) { int i; struct ext4_fc_replay_state *state; state = &EXT4_SB(sb)->s_fc_replay_state; for (i = 0; i < state->fc_regions_valid; i++) { if (state->fc_regions[i].ino == 0 || state->fc_regions[i].len == 0) continue; if (in_range(blk, state->fc_regions[i].pblk, state->fc_regions[i].len)) return true; } return false; } /* Cleanup function called after replay */ void ext4_fc_replay_cleanup(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); sbi->s_mount_state &= ~EXT4_FC_REPLAY; kfree(sbi->s_fc_replay_state.fc_regions); kfree(sbi->s_fc_replay_state.fc_modified_inodes); } static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi, int tag, int len) { switch (tag) { case EXT4_FC_TAG_ADD_RANGE: return len == sizeof(struct ext4_fc_add_range); case EXT4_FC_TAG_DEL_RANGE: return len == sizeof(struct ext4_fc_del_range); case EXT4_FC_TAG_CREAT: case EXT4_FC_TAG_LINK: case EXT4_FC_TAG_UNLINK: len -= sizeof(struct ext4_fc_dentry_info); return len >= 1 && len <= EXT4_NAME_LEN; case EXT4_FC_TAG_INODE: len -= sizeof(struct ext4_fc_inode); return len >= EXT4_GOOD_OLD_INODE_SIZE && len <= sbi->s_inode_size; case EXT4_FC_TAG_PAD: return true; /* padding can have any length */ case EXT4_FC_TAG_TAIL: return len >= sizeof(struct ext4_fc_tail); case EXT4_FC_TAG_HEAD: return len == sizeof(struct ext4_fc_head); } return false; } /* * Recovery Scan phase handler * * This function is called during the scan phase and is responsible * for doing following things: * - Make sure the fast commit area has valid tags for replay * - Count number of tags that need to be replayed by the replay handler * - Verify CRC * - Create a list of excluded blocks for allocation during replay phase * * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP * to indicate that scan has finished and JBD2 can now start replay phase. * It returns a negative error to indicate that there was an error. At the end * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set * to indicate the number of tags that need to replayed during the replay phase. */ static int ext4_fc_replay_scan(journal_t *journal, struct buffer_head *bh, int off, tid_t expected_tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_replay_state *state; int ret = JBD2_FC_REPLAY_CONTINUE; struct ext4_fc_add_range ext; struct ext4_fc_tl_mem tl; struct ext4_fc_tail tail; __u8 *start, *end, *cur, *val; struct ext4_fc_head head; struct ext4_extent *ex; state = &sbi->s_fc_replay_state; start = (u8 *)bh->b_data; end = start + journal->j_blocksize; if (state->fc_replay_expected_off == 0) { state->fc_cur_tag = 0; state->fc_replay_num_tags = 0; state->fc_crc = 0; state->fc_regions = NULL; state->fc_regions_valid = state->fc_regions_used = state->fc_regions_size = 0; /* Check if we can stop early */ if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag) != EXT4_FC_TAG_HEAD) return 0; } if (off != state->fc_replay_expected_off) { ret = -EFSCORRUPTED; goto out_err; } state->fc_replay_expected_off++; for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { ext4_fc_get_tl(&tl, cur); val = cur + EXT4_FC_TAG_BASE_LEN; if (tl.fc_len > end - val || !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) { ret = state->fc_replay_num_tags ? 
JBD2_FC_REPLAY_STOP : -ECANCELED; goto out_err; } ext4_debug("Scan phase, tag:%s, blk %lld\n", tag2str(tl.fc_tag), bh->b_blocknr); switch (tl.fc_tag) { case EXT4_FC_TAG_ADD_RANGE: memcpy(&ext, val, sizeof(ext)); ex = (struct ext4_extent *)&ext.fc_ex; ret = ext4_fc_record_regions(sb, le32_to_cpu(ext.fc_ino), le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), ext4_ext_get_actual_len(ex), 0); if (ret < 0) break; ret = JBD2_FC_REPLAY_CONTINUE; fallthrough; case EXT4_FC_TAG_DEL_RANGE: case EXT4_FC_TAG_LINK: case EXT4_FC_TAG_UNLINK: case EXT4_FC_TAG_CREAT: case EXT4_FC_TAG_INODE: case EXT4_FC_TAG_PAD: state->fc_cur_tag++; state->fc_crc = ext4_chksum(state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; case EXT4_FC_TAG_TAIL: state->fc_cur_tag++; memcpy(&tail, val, sizeof(tail)); state->fc_crc = ext4_chksum(state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + offsetof(struct ext4_fc_tail, fc_crc)); if (le32_to_cpu(tail.fc_tid) == expected_tid && le32_to_cpu(tail.fc_crc) == state->fc_crc) { state->fc_replay_num_tags = state->fc_cur_tag; state->fc_regions_valid = state->fc_regions_used; } else { ret = state->fc_replay_num_tags ? JBD2_FC_REPLAY_STOP : -EFSBADCRC; } state->fc_crc = 0; break; case EXT4_FC_TAG_HEAD: memcpy(&head, val, sizeof(head)); if (le32_to_cpu(head.fc_features) & ~EXT4_FC_SUPPORTED_FEATURES) { ret = -EOPNOTSUPP; break; } if (le32_to_cpu(head.fc_tid) != expected_tid) { ret = JBD2_FC_REPLAY_STOP; break; } state->fc_cur_tag++; state->fc_crc = ext4_chksum(state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; default: ret = state->fc_replay_num_tags ? JBD2_FC_REPLAY_STOP : -ECANCELED; } if (ret < 0 || ret == JBD2_FC_REPLAY_STOP) break; } out_err: trace_ext4_fc_replay_scan(sb, ret, off); return ret; } /* * Main recovery path entry point. * The meaning of return codes is similar as above. 
*/ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, enum passtype pass, int off, tid_t expected_tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_tl_mem tl; __u8 *start, *end, *cur, *val; int ret = JBD2_FC_REPLAY_CONTINUE; struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state; struct ext4_fc_tail tail; if (pass == PASS_SCAN) { state->fc_current_pass = PASS_SCAN; return ext4_fc_replay_scan(journal, bh, off, expected_tid); } if (state->fc_current_pass != pass) { state->fc_current_pass = pass; sbi->s_mount_state |= EXT4_FC_REPLAY; } if (!sbi->s_fc_replay_state.fc_replay_num_tags) { ext4_debug("Replay stops\n"); ext4_fc_set_bitmaps_and_counters(sb); return 0; } #ifdef CONFIG_EXT4_DEBUG if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) { pr_warn("Dropping fc block %d because max_replay set\n", off); return JBD2_FC_REPLAY_STOP; } #endif start = (u8 *)bh->b_data; end = start + journal->j_blocksize; for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { ext4_fc_get_tl(&tl, cur); val = cur + EXT4_FC_TAG_BASE_LEN; if (state->fc_replay_num_tags == 0) { ret = JBD2_FC_REPLAY_STOP; ext4_fc_set_bitmaps_and_counters(sb); break; } ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag)); state->fc_replay_num_tags--; switch (tl.fc_tag) { case EXT4_FC_TAG_LINK: ret = ext4_fc_replay_link(sb, &tl, val); break; case EXT4_FC_TAG_UNLINK: ret = ext4_fc_replay_unlink(sb, &tl, val); break; case EXT4_FC_TAG_ADD_RANGE: ret = ext4_fc_replay_add_range(sb, &tl, val); break; case EXT4_FC_TAG_CREAT: ret = ext4_fc_replay_create(sb, &tl, val); break; case EXT4_FC_TAG_DEL_RANGE: ret = ext4_fc_replay_del_range(sb, &tl, val); break; case EXT4_FC_TAG_INODE: ret = ext4_fc_replay_inode(sb, &tl, val); break; case EXT4_FC_TAG_PAD: trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0, tl.fc_len, 0); break; case EXT4_FC_TAG_TAIL: trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0, tl.fc_len, 0); memcpy(&tail, val, sizeof(tail)); WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid); break; case EXT4_FC_TAG_HEAD: break; default: trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0); ret = -ECANCELED; break; } if (ret < 0) break; ret = JBD2_FC_REPLAY_CONTINUE; } return ret; } void ext4_fc_init(struct super_block *sb, journal_t *journal) { /* * We set replay callback even if fast commit disabled because we may * could still have fast commit blocks that need to be replayed even if * fast commit has now been turned off. 
 */
	journal->j_fc_replay_callback = ext4_fc_replay;
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return;
	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}

static const char * const fc_ineligible_reasons[] = {
	[EXT4_FC_REASON_XATTR] = "Extended attributes changed",
	[EXT4_FC_REASON_CROSS_RENAME] = "Cross rename",
	[EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed",
	[EXT4_FC_REASON_NOMEM] = "Insufficient memory",
	[EXT4_FC_REASON_SWAP_BOOT] = "Swap boot",
	[EXT4_FC_REASON_RESIZE] = "Resize",
	[EXT4_FC_REASON_RENAME_DIR] = "Dir renamed",
	[EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op",
	[EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling",
	[EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename",
};

int ext4_fc_info_show(struct seq_file *seq, void *v)
{
	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
	int i;

	if (v != SEQ_START_TOKEN)
		return 0;

	seq_printf(seq,
		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
		   stats->fc_num_commits, stats->fc_ineligible_commits,
		   stats->fc_numblks,
		   div_u64(stats->s_fc_avg_commit_time, 1000));
	seq_puts(seq, "Ineligible reasons:\n");
	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
			stats->fc_ineligible_reason_count[i]);

	return 0;
}

int __init ext4_fc_init_dentry_cache(void)
{
	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
					   SLAB_RECLAIM_ACCOUNT);
	if (ext4_fc_dentry_cachep == NULL)
		return -ENOMEM;

	return 0;
}

void ext4_fc_destroy_dentry_cache(void)
{
	kmem_cache_destroy(ext4_fc_dentry_cachep);
}
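/*
 * Editor's illustration (not part of fast_commit.c): both ext4_fc_replay_scan()
 * and ext4_fc_replay() above walk the fast-commit area as a flat
 * tag-length-value (TLV) stream, advancing by EXT4_FC_TAG_BASE_LEN plus the
 * decoded value length on each step and bailing out if a value would overrun
 * the block. A minimal user-space sketch of that walk; the header struct and
 * constants here are simplified stand-ins, not the real little-endian
 * on-disk struct ext4_fc_tl:
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct tlv_hdr {		/* stand-in for struct ext4_fc_tl */
	uint16_t tag;		/* EXT4_FC_TAG_* analogue */
	uint16_t len;		/* length of the value that follows */
};
#define TLV_HDR_LEN	sizeof(struct tlv_hdr)	/* EXT4_FC_TAG_BASE_LEN analogue */

static void walk_tlv(const uint8_t *start, size_t size)
{
	const uint8_t *end = start + size;
	const uint8_t *cur;
	struct tlv_hdr tl;

	for (cur = start; cur + TLV_HDR_LEN <= end;
	     cur += TLV_HDR_LEN + tl.len) {
		memcpy(&tl, cur, TLV_HDR_LEN);	/* ext4_fc_get_tl() analogue */
		/* Reject a value that would run past the area, as the
		 * kernel's scan loop does before dispatching on the tag. */
		if (tl.len > (size_t)(end - cur - TLV_HDR_LEN))
			break;
		printf("tag %u, len %u\n", tl.tag, tl.len);
	}
}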
// SPDX-License-Identifier: GPL-2.0

#include <linux/types.h>
#include <linux/kconfig.h>
#include <linux/list.h>
#include <linux/security.h>
#include <linux/umh.h>
#include <linux/sysctl.h>
#include <linux/module.h>

#include "fallback.h"
#include "firmware.h"

/*
 * firmware fallback mechanism
 */

/*
 * Use a small loading timeout for caching devices' firmware because all these
 * firmware images have been loaded successfully at least once, and the system
 * is now ready to complete firmware loading. The maximum size of firmware in
 * current distributions is about 2M bytes, so 10 secs should be enough.
 */
void fw_fallback_set_cache_timeout(void)
{
	fw_fallback_config.old_timeout = __firmware_loading_timeout();
	__fw_fallback_set_timeout(10);
}

/* Restores the timeout to the value last configured during normal operation */
void fw_fallback_set_default_timeout(void)
{
	__fw_fallback_set_timeout(fw_fallback_config.old_timeout);
}

static long firmware_loading_timeout(void)
{
	return __firmware_loading_timeout() > 0 ?
		__firmware_loading_timeout() * HZ : MAX_JIFFY_OFFSET;
}

static inline int fw_sysfs_wait_timeout(struct fw_priv *fw_priv, long timeout)
{
	return __fw_state_wait_common(fw_priv, timeout);
}

static LIST_HEAD(pending_fw_head);

void kill_pending_fw_fallback_reqs(bool kill_all)
{
	struct fw_priv *fw_priv;
	struct fw_priv *next;

	mutex_lock(&fw_lock);
	list_for_each_entry_safe(fw_priv, next, &pending_fw_head,
				 pending_list) {
		if (kill_all || !fw_priv->need_uevent)
			__fw_load_abort(fw_priv);
	}

	if (kill_all)
		fw_load_abort_all = true;

	mutex_unlock(&fw_lock);
}

/**
 * fw_load_sysfs_fallback() - load a firmware via the sysfs fallback mechanism
 * @fw_sysfs: firmware sysfs information for the firmware to load
 * @timeout: timeout to wait for the load
 *
 * In charge of constructing a sysfs fallback interface for firmware loading.
 **/
static int fw_load_sysfs_fallback(struct fw_sysfs *fw_sysfs, long timeout)
{
	int retval = 0;
	struct device *f_dev = &fw_sysfs->dev;
	struct fw_priv *fw_priv = fw_sysfs->fw_priv;

	/* fall back on userspace loading */
	if (!fw_priv->data)
		fw_priv->is_paged_buf = true;

	dev_set_uevent_suppress(f_dev, true);
	retval = device_add(f_dev);
	if (retval) {
		dev_err(f_dev, "%s: device_register failed\n", __func__);
		goto err_put_dev;
	}

	mutex_lock(&fw_lock);
	if (fw_load_abort_all || fw_state_is_aborted(fw_priv)) {
		mutex_unlock(&fw_lock);
		retval = -EINTR;
		goto out;
	}
	list_add(&fw_priv->pending_list, &pending_fw_head);
	mutex_unlock(&fw_lock);

	if (fw_priv->opt_flags & FW_OPT_UEVENT) {
		fw_priv->need_uevent = true;
		dev_set_uevent_suppress(f_dev, false);
		dev_dbg(f_dev, "firmware: requesting %s\n", fw_priv->fw_name);
		kobject_uevent(&fw_sysfs->dev.kobj, KOBJ_ADD);
	} else {
		timeout = MAX_JIFFY_OFFSET;
	}

	retval = fw_sysfs_wait_timeout(fw_priv, timeout);
	if (retval < 0 && retval != -ENOENT) {
		mutex_lock(&fw_lock);
		fw_load_abort(fw_sysfs);
		mutex_unlock(&fw_lock);
	}

	if (fw_state_is_aborted(fw_priv)) {
		if (retval == -ERESTARTSYS)
			retval = -EINTR;
	} else if (fw_priv->is_paged_buf && !fw_priv->data)
		retval = -ENOMEM;

out:
	device_del(f_dev);
err_put_dev:
	put_device(f_dev);
	return retval;
}

static int fw_load_from_user_helper(struct firmware *firmware,
				    const char *name, struct device *device,
				    u32 opt_flags)
{
	struct fw_sysfs *fw_sysfs;
	long timeout;
	int ret;

	timeout = firmware_loading_timeout();
	if (opt_flags & FW_OPT_NOWAIT) {
		timeout = usermodehelper_read_lock_wait(timeout);
		if (!timeout) {
			dev_dbg(device, "firmware: %s loading timed out\n",
				name);
			return -EBUSY;
		}
	} else {
		ret = usermodehelper_read_trylock();
		if (WARN_ON(ret)) {
			dev_err(device, "firmware: %s will not be loaded\n",
				name);
			return ret;
		}
	}

	fw_sysfs = fw_create_instance(firmware, name, device, opt_flags);
	if (IS_ERR(fw_sysfs)) {
		ret = PTR_ERR(fw_sysfs);
		goto out_unlock;
	}

	fw_sysfs->fw_priv = firmware->priv;
	ret = fw_load_sysfs_fallback(fw_sysfs, timeout);

	if (!ret)
		ret = assign_fw(firmware, device);

out_unlock:
	usermodehelper_read_unlock();

	return ret;
}

static bool fw_force_sysfs_fallback(u32 opt_flags)
{
	if (fw_fallback_config.force_sysfs_fallback)
		return true;
	if (!(opt_flags & FW_OPT_USERHELPER))
		return false;
	return true;
}

static bool fw_run_sysfs_fallback(u32 opt_flags)
{
	int ret;

	if (fw_fallback_config.ignore_sysfs_fallback) {
		pr_info_once("Ignoring firmware sysfs fallback due to sysctl knob\n");
		return false;
	}

	if ((opt_flags & FW_OPT_NOFALLBACK_SYSFS))
		return false;

	/* Also permit LSMs and IMA to fail firmware sysfs fallback */
	ret = security_kernel_load_data(LOADING_FIRMWARE, true);
	if (ret < 0)
		return false;

	return fw_force_sysfs_fallback(opt_flags);
}

/**
 * firmware_fallback_sysfs() - use the fallback mechanism to find firmware
 * @fw: pointer to firmware image
 * @name: name of firmware file to look for
 * @device: device for which firmware is being loaded
 * @opt_flags: options to control firmware loading behaviour, as defined by
 *	       &enum fw_opt
 * @ret: return value from direct lookup which triggered the fallback mechanism
 *
 * This function is called if direct lookup for the firmware failed, it enables
 * a fallback mechanism through userspace by exposing a sysfs loading
 * interface. Userspace is in charge of loading the firmware through the sysfs
 * loading interface. This sysfs fallback mechanism may be disabled completely
 * on a system by setting the proc sysctl value ignore_sysfs_fallback to true.
 * If this sysctl is false, we check whether the internal API caller set the
 * @FW_OPT_NOFALLBACK_SYSFS flag; if so, that also disables the fallback
 * mechanism. A system may want to enforce the sysfs fallback mechanism at all
 * times; it can do this by setting ignore_sysfs_fallback to false and
 * force_sysfs_fallback to true.
 * Enabling force_sysfs_fallback is functionally equivalent to building a
 * kernel with CONFIG_FW_LOADER_USER_HELPER_FALLBACK.
 **/
int firmware_fallback_sysfs(struct firmware *fw, const char *name,
			    struct device *device,
			    u32 opt_flags,
			    int ret)
{
	if (!fw_run_sysfs_fallback(opt_flags))
		return ret;

	if (!(opt_flags & FW_OPT_NO_WARN))
		dev_warn(device, "Falling back to sysfs fallback for: %s\n",
			 name);
	else
		dev_dbg(device, "Falling back to sysfs fallback for: %s\n",
			name);
	return fw_load_from_user_helper(fw, name, device, opt_flags);
}
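/*
 * Editor's illustration (not part of fallback.c): the interface built by
 * fw_load_sysfs_fallback() is driven from user space through the "loading"
 * and "data" attributes of the pending firmware device. The documented
 * protocol is: write "1" to loading, stream the image into data, then write
 * "0" to commit (or "-1" to abort), which completes the wait in
 * fw_sysfs_wait_timeout(). A rough user-space helper following that
 * protocol; the /sys/class/firmware/<name>/ path is illustrative (the
 * firmware uevent reports the exact $DEVPATH) and error handling is minimal:
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int load_firmware(const char *fw_name, const char *image_path)
{
	char path[256], buf[4096];
	int loading_fd, data_fd, image_fd, err = -1;
	ssize_t n;

	snprintf(path, sizeof(path), "/sys/class/firmware/%s/loading", fw_name);
	loading_fd = open(path, O_WRONLY);
	snprintf(path, sizeof(path), "/sys/class/firmware/%s/data", fw_name);
	data_fd = open(path, O_WRONLY);
	image_fd = open(image_path, O_RDONLY);
	if (loading_fd < 0 || data_fd < 0 || image_fd < 0)
		goto out;

	write(loading_fd, "1", 1);		/* begin the transfer */
	while ((n = read(image_fd, buf, sizeof(buf))) > 0)
		write(data_fd, buf, n);		/* stream the firmware image */
	if (n == 0) {
		write(loading_fd, "0", 1);	/* commit the image */
		err = 0;
	} else {
		write(loading_fd, "-1", 2);	/* abort on read error */
	}
out:
	if (loading_fd >= 0)
		close(loading_fd);
	if (data_fd >= 0)
		close(data_fd);
	if (image_fd >= 0)
		close(image_fd);
	return err;
}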
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Multiplex several virtual IPIs over a single HW IPI.
 *
 * Copyright The Asahi Linux Contributors
 * Copyright (c) 2022 Ventana Micro Systems Inc.
 */

#define pr_fmt(fmt) "ipi-mux: " fmt
#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/irq.h>
#include <linux/irqchip.h>
#include <linux/irqchip/chained_irq.h>
#include <linux/irqdomain.h>
#include <linux/jump_label.h>
#include <linux/percpu.h>
#include <linux/smp.h>

struct ipi_mux_cpu {
	atomic_t			enable;
	atomic_t			bits;
};

static struct ipi_mux_cpu __percpu *ipi_mux_pcpu;
static struct irq_domain *ipi_mux_domain;
static void (*ipi_mux_send)(unsigned int cpu);

static void ipi_mux_mask(struct irq_data *d)
{
	struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);

	atomic_andnot(BIT(irqd_to_hwirq(d)), &icpu->enable);
}

static void ipi_mux_unmask(struct irq_data *d)
{
	struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
	u32 ibit = BIT(irqd_to_hwirq(d));

	atomic_or(ibit, &icpu->enable);

	/*
	 * The atomic_or() above must complete before the atomic_read()
	 * below to avoid racing ipi_mux_send_mask().
	 */
	smp_mb__after_atomic();

	/* If a pending IPI was unmasked, raise a parent IPI immediately. */
	if (atomic_read(&icpu->bits) & ibit)
		ipi_mux_send(smp_processor_id());
}

static void ipi_mux_send_mask(struct irq_data *d, const struct cpumask *mask)
{
	struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
	u32 ibit = BIT(irqd_to_hwirq(d));
	unsigned long pending;
	int cpu;

	for_each_cpu(cpu, mask) {
		icpu = per_cpu_ptr(ipi_mux_pcpu, cpu);

		/*
		 * This sequence is the mirror of the one in ipi_mux_unmask();
		 * see the comment there. Additionally, release semantics
		 * ensure that the vIPI flag set is ordered after any shared
		 * memory accesses that precede it. This therefore also pairs
		 * with the atomic_fetch_andnot in ipi_mux_process().
		 */
		pending = atomic_fetch_or_release(ibit, &icpu->bits);

		/*
		 * The atomic_fetch_or_release() above must complete
		 * before the atomic_read() below to avoid racing with
		 * ipi_mux_unmask().
		 */
		smp_mb__after_atomic();

		/*
		 * The flag writes must complete before the physical IPI is
		 * issued to another CPU. This is implied by the control
		 * dependency on the result of atomic_read() below, which is
		 * itself already ordered after the vIPI flag write.
		 */
		if (!(pending & ibit) &&
		    (atomic_read(&icpu->enable) & ibit))
			ipi_mux_send(cpu);
	}
}

static const struct irq_chip ipi_mux_chip = {
	.name		= "IPI Mux",
	.irq_mask	= ipi_mux_mask,
	.irq_unmask	= ipi_mux_unmask,
	.ipi_send_mask	= ipi_mux_send_mask,
};

static int ipi_mux_domain_alloc(struct irq_domain *d, unsigned int virq,
				unsigned int nr_irqs, void *arg)
{
	int i;

	for (i = 0; i < nr_irqs; i++) {
		irq_set_percpu_devid(virq + i);
		irq_domain_set_info(d, virq + i, i, &ipi_mux_chip, NULL,
				    handle_percpu_devid_irq, NULL, NULL);
	}

	return 0;
}

static const struct irq_domain_ops ipi_mux_domain_ops = {
	.alloc		= ipi_mux_domain_alloc,
	.free		= irq_domain_free_irqs_top,
};

/**
 * ipi_mux_process - Process multiplexed virtual IPIs
 */
void ipi_mux_process(void)
{
	struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
	irq_hw_number_t hwirq;
	unsigned long ipis;
	unsigned int en;

	/*
	 * Reading the enable mask does not need to be ordered as long as
	 * this function is called from the interrupt handler, because only
	 * the CPU itself can change its own enable mask.
	 */
	en = atomic_read(&icpu->enable);

	/*
	 * Clear the IPIs we are about to handle. This pairs with the
	 * atomic_fetch_or_release() in ipi_mux_send_mask().
	 */
	ipis = atomic_fetch_andnot(en, &icpu->bits) & en;

	for_each_set_bit(hwirq, &ipis, BITS_PER_TYPE(int))
		generic_handle_domain_irq(ipi_mux_domain, hwirq);
}

/**
 * ipi_mux_create - Create virtual IPIs multiplexed on top of a single
 * parent IPI.
 * @nr_ipi:		number of virtual IPIs to create. This should
 *			be <= BITS_PER_TYPE(int)
 * @mux_send:		callback to trigger parent IPI for a particular CPU
 *
 * Returns first virq of the newly created virtual IPIs upon success
 * or <=0 upon failure
 */
int ipi_mux_create(unsigned int nr_ipi, void (*mux_send)(unsigned int cpu))
{
	struct fwnode_handle *fwnode;
	struct irq_domain *domain;
	int rc;

	if (ipi_mux_domain)
		return -EEXIST;

	if (BITS_PER_TYPE(int) < nr_ipi || !mux_send)
		return -EINVAL;

	ipi_mux_pcpu = alloc_percpu(typeof(*ipi_mux_pcpu));
	if (!ipi_mux_pcpu)
		return -ENOMEM;

	fwnode = irq_domain_alloc_named_fwnode("IPI-Mux");
	if (!fwnode) {
		pr_err("unable to create IPI Mux fwnode\n");
		rc = -ENOMEM;
		goto fail_free_cpu;
	}

	domain = irq_domain_create_linear(fwnode, nr_ipi,
					  &ipi_mux_domain_ops, NULL);
	if (!domain) {
		pr_err("unable to add IPI Mux domain\n");
		rc = -ENOMEM;
		goto fail_free_fwnode;
	}

	domain->flags |= IRQ_DOMAIN_FLAG_IPI_SINGLE;
	irq_domain_update_bus_token(domain, DOMAIN_BUS_IPI);

	rc = irq_domain_alloc_irqs(domain, nr_ipi, NUMA_NO_NODE, NULL);
	if (rc <= 0) {
		pr_err("unable to alloc IRQs from IPI Mux domain\n");
		goto fail_free_domain;
	}

	ipi_mux_domain = domain;
	ipi_mux_send = mux_send;

	return rc;

fail_free_domain:
	irq_domain_remove(domain);
fail_free_fwnode:
	irq_domain_free_fwnode(fwnode);
fail_free_cpu:
	free_percpu(ipi_mux_pcpu);
	return rc;
}
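/*
 * Editor's illustration (not part of ipi-mux.c): how an irqchip driver is
 * expected to sit on top of this multiplexer. my_hw_send_ipi(),
 * my_hw_ipi_handler() and NR_VIRTUAL_IPIS are hypothetical stand-ins for a
 * driver's real hardware hooks, not kernel symbols:
 */
#include <linux/irq.h>

#define NR_VIRTUAL_IPIS	8	/* must be <= BITS_PER_TYPE(int) */

/* Kick the single hardware IPI towards @cpu. */
static void my_hw_send_ipi(unsigned int cpu)
{
	/* poke the interrupt controller here */
}

/* Parent (hardware) IPI handler: demultiplex all pending virtual IPIs. */
static void my_hw_ipi_handler(void)
{
	ipi_mux_process();
}

static int __init my_driver_init_ipis(void)
{
	int first_virq;

	first_virq = ipi_mux_create(NR_VIRTUAL_IPIS, my_hw_send_ipi);
	if (first_virq <= 0)
		return first_virq ? first_virq : -ENODEV;

	/*
	 * Virtual IPIs first_virq .. first_virq + NR_VIRTUAL_IPIS - 1 can
	 * now be handed to the architecture's SMP code; raising one of them
	 * goes through the usual irqchip machinery and ends up in
	 * ipi_mux_send_mask() above.
	 */
	return 0;
}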
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2005-2006 Dell Inc.
 *
 * Serial Attached SCSI (SAS) transport class.
 *
 * The SAS transport class contains common code to deal with SAS HBAs,
 * an approximated representation of SAS topologies in the driver model,
 * and various sysfs attributes to expose these topologies and management
 * interfaces to userspace.
 *
 * In addition to the basic SCSI core objects this transport class
 * introduces two additional intermediate objects: The SAS PHY
 * as represented by struct sas_phy defines an "outgoing" PHY on
 * a SAS HBA or Expander, and the SAS remote PHY represented by
 * struct sas_rphy defines an "incoming" PHY on a SAS Expander or
 * end device. Note that this is purely a software concept; the
 * underlying hardware for a PHY and a remote PHY is exactly
 * the same.
 *
 * There is no concept of a SAS port in this code; users can see
 * what PHYs form a wide port based on the port_identifier attribute,
 * which is the same for all PHYs in a port.
 */

#include <linux/init.h>
#include <linux/module.h>
#include <linux/jiffies.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/blkdev.h>
#include <linux/bsg.h>

#include <scsi/scsi.h>
#include <scsi/scsi_cmnd.h>
#include <scsi/scsi_device.h>
#include <scsi/scsi_host.h>
#include <scsi/scsi_transport.h>
#include <scsi/scsi_transport_sas.h>

#include "scsi_sas_internal.h"
#include "scsi_priv.h"

struct sas_host_attrs {
	struct list_head rphy_list;
	struct mutex lock;
	struct request_queue *q;
	u32 next_target_id;
	u32 next_expander_id;
	int next_port_id;
};
#define to_sas_host_attrs(host)	((struct sas_host_attrs *)(host)->shost_data)

/*
 * Hack to allow attributes of the same name in different objects.
*/ #define SAS_DEVICE_ATTR(_prefix,_name,_mode,_show,_store) \ struct device_attribute dev_attr_##_prefix##_##_name = \ __ATTR(_name,_mode,_show,_store) /* * Pretty printing helpers */ #define sas_bitfield_name_match(title, table) \ static ssize_t \ get_sas_##title##_names(u32 table_key, char *buf) \ { \ char *prefix = ""; \ ssize_t len = 0; \ int i; \ \ for (i = 0; i < ARRAY_SIZE(table); i++) { \ if (table[i].value & table_key) { \ len += sprintf(buf + len, "%s%s", \ prefix, table[i].name); \ prefix = ", "; \ } \ } \ len += sprintf(buf + len, "\n"); \ return len; \ } #define sas_bitfield_name_set(title, table) \ static ssize_t \ set_sas_##title##_names(u32 *table_key, const char *buf) \ { \ ssize_t len = 0; \ int i; \ \ for (i = 0; i < ARRAY_SIZE(table); i++) { \ len = strlen(table[i].name); \ if (strncmp(buf, table[i].name, len) == 0 && \ (buf[len] == '\n' || buf[len] == '\0')) { \ *table_key = table[i].value; \ return 0; \ } \ } \ return -EINVAL; \ } #define sas_bitfield_name_search(title, table) \ static ssize_t \ get_sas_##title##_names(u32 table_key, char *buf) \ { \ ssize_t len = 0; \ int i; \ \ for (i = 0; i < ARRAY_SIZE(table); i++) { \ if (table[i].value == table_key) { \ len += sprintf(buf + len, "%s", \ table[i].name); \ break; \ } \ } \ len += sprintf(buf + len, "\n"); \ return len; \ } static struct { u32 value; char *name; } sas_device_type_names[] = { { SAS_PHY_UNUSED, "unused" }, { SAS_END_DEVICE, "end device" }, { SAS_EDGE_EXPANDER_DEVICE, "edge expander" }, { SAS_FANOUT_EXPANDER_DEVICE, "fanout expander" }, }; sas_bitfield_name_search(device_type, sas_device_type_names) static struct { u32 value; char *name; } sas_protocol_names[] = { { SAS_PROTOCOL_SATA, "sata" }, { SAS_PROTOCOL_SMP, "smp" }, { SAS_PROTOCOL_STP, "stp" }, { SAS_PROTOCOL_SSP, "ssp" }, }; sas_bitfield_name_match(protocol, sas_protocol_names) static struct { u32 value; char *name; } sas_linkspeed_names[] = { { SAS_LINK_RATE_UNKNOWN, "Unknown" }, { SAS_PHY_DISABLED, "Phy disabled" }, { SAS_LINK_RATE_FAILED, "Link Rate failed" }, { SAS_SATA_SPINUP_HOLD, "Spin-up hold" }, { SAS_LINK_RATE_1_5_GBPS, "1.5 Gbit" }, { SAS_LINK_RATE_3_0_GBPS, "3.0 Gbit" }, { SAS_LINK_RATE_6_0_GBPS, "6.0 Gbit" }, { SAS_LINK_RATE_12_0_GBPS, "12.0 Gbit" }, { SAS_LINK_RATE_22_5_GBPS, "22.5 Gbit" }, }; sas_bitfield_name_search(linkspeed, sas_linkspeed_names) sas_bitfield_name_set(linkspeed, sas_linkspeed_names) static struct sas_end_device *sas_sdev_to_rdev(struct scsi_device *sdev) { struct sas_rphy *rphy = target_to_rphy(sdev->sdev_target); struct sas_end_device *rdev; BUG_ON(rphy->identify.device_type != SAS_END_DEVICE); rdev = rphy_to_end_device(rphy); return rdev; } static int sas_smp_dispatch(struct bsg_job *job) { struct Scsi_Host *shost = dev_to_shost(job->dev); struct sas_rphy *rphy = NULL; if (!scsi_is_host_device(job->dev)) rphy = dev_to_rphy(job->dev); if (!job->reply_payload.payload_len) { dev_warn(job->dev, "space for a smp response is missing\n"); bsg_job_done(job, -EINVAL, 0); return 0; } to_sas_internal(shost->transportt)->f->smp_handler(job, shost, rphy); return 0; } static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy) { struct request_queue *q; if (!to_sas_internal(shost->transportt)->f->smp_handler) { printk("%s can't handle SMP requests\n", shost->hostt->name); return 0; } if (rphy) { q = bsg_setup_queue(&rphy->dev, dev_name(&rphy->dev), NULL, sas_smp_dispatch, NULL, 0); if (IS_ERR(q)) return PTR_ERR(q); rphy->q = q; } else { char name[20]; snprintf(name, sizeof(name), "sas_host%d", 
shost->host_no); q = bsg_setup_queue(&shost->shost_gendev, name, NULL, sas_smp_dispatch, NULL, 0); if (IS_ERR(q)) return PTR_ERR(q); to_sas_host_attrs(shost)->q = q; } return 0; } /* * SAS host attributes */ static int sas_host_setup(struct transport_container *tc, struct device *dev, struct device *cdev) { struct Scsi_Host *shost = dev_to_shost(dev); struct sas_host_attrs *sas_host = to_sas_host_attrs(shost); struct device *dma_dev = shost->dma_dev; INIT_LIST_HEAD(&sas_host->rphy_list); mutex_init(&sas_host->lock); sas_host->next_target_id = 0; sas_host->next_expander_id = 0; sas_host->next_port_id = 0; if (sas_bsg_initialize(shost, NULL)) dev_printk(KERN_ERR, dev, "fail to a bsg device %d\n", shost->host_no); if (dma_dev->dma_mask) { shost->opt_sectors = min_t(unsigned int, shost->max_sectors, dma_opt_mapping_size(dma_dev) >> SECTOR_SHIFT); } return 0; } static int sas_host_remove(struct transport_container *tc, struct device *dev, struct device *cdev) { struct Scsi_Host *shost = dev_to_shost(dev); struct request_queue *q = to_sas_host_attrs(shost)->q; bsg_remove_queue(q); return 0; } static DECLARE_TRANSPORT_CLASS(sas_host_class, "sas_host", sas_host_setup, sas_host_remove, NULL); static int sas_host_match(struct attribute_container *cont, struct device *dev) { struct Scsi_Host *shost; struct sas_internal *i; if (!scsi_is_host_device(dev)) return 0; shost = dev_to_shost(dev); if (!shost->transportt) return 0; if (shost->transportt->host_attrs.ac.class != &sas_host_class.class) return 0; i = to_sas_internal(shost->transportt); return &i->t.host_attrs.ac == cont; } static int do_sas_phy_delete(struct device *dev, void *data) { int pass = (int)(unsigned long)data; if (pass == 0 && scsi_is_sas_port(dev)) sas_port_delete(dev_to_sas_port(dev)); else if (pass == 1 && scsi_is_sas_phy(dev)) sas_phy_delete(dev_to_phy(dev)); return 0; } /** * sas_remove_children - tear down a devices SAS data structures * @dev: device belonging to the sas object * * Removes all SAS PHYs and remote PHYs for a given object */ void sas_remove_children(struct device *dev) { device_for_each_child(dev, (void *)0, do_sas_phy_delete); device_for_each_child(dev, (void *)1, do_sas_phy_delete); } EXPORT_SYMBOL(sas_remove_children); /** * sas_remove_host - tear down a Scsi_Host's SAS data structures * @shost: Scsi Host that is torn down * * Removes all SAS PHYs and remote PHYs for a given Scsi_Host and remove the * Scsi_Host as well. * * Note: Do not call scsi_remove_host() on the Scsi_Host any more, as it is * already removed. */ void sas_remove_host(struct Scsi_Host *shost) { sas_remove_children(&shost->shost_gendev); scsi_remove_host(shost); } EXPORT_SYMBOL(sas_remove_host); /** * sas_get_address - return the SAS address of the device * @sdev: scsi device * * Returns the SAS address of the scsi device */ u64 sas_get_address(struct scsi_device *sdev) { struct sas_end_device *rdev = sas_sdev_to_rdev(sdev); return rdev->rphy.identify.sas_address; } EXPORT_SYMBOL(sas_get_address); /** * sas_tlr_supported - checking TLR bit in vpd 0x90 * @sdev: scsi device struct * * Check Transport Layer Retries are supported or not. * If vpd page 0x90 is present, TRL is supported. 
* */ unsigned int sas_tlr_supported(struct scsi_device *sdev) { const int vpd_len = 32; struct sas_end_device *rdev = sas_sdev_to_rdev(sdev); char *buffer = kzalloc(vpd_len, GFP_KERNEL); int ret = 0; if (!buffer) goto out; if (scsi_get_vpd_page(sdev, 0x90, buffer, vpd_len)) goto out; /* * Magic numbers: the VPD Protocol page (0x90) * has a 4 byte header and then one entry per device port * the TLR bit is at offset 8 on each port entry * if we take the first port, that's at total offset 12 */ ret = buffer[12] & 0x01; out: kfree(buffer); rdev->tlr_supported = ret; return ret; } EXPORT_SYMBOL_GPL(sas_tlr_supported); /** * sas_disable_tlr - setting TLR flags * @sdev: scsi device struct * * Seting tlr_enabled flag to 0. * */ void sas_disable_tlr(struct scsi_device *sdev) { struct sas_end_device *rdev = sas_sdev_to_rdev(sdev); rdev->tlr_enabled = 0; } EXPORT_SYMBOL_GPL(sas_disable_tlr); /** * sas_enable_tlr - setting TLR flags * @sdev: scsi device struct * * Seting tlr_enabled flag 1. * */ void sas_enable_tlr(struct scsi_device *sdev) { unsigned int tlr_supported = 0; tlr_supported = sas_tlr_supported(sdev); if (tlr_supported) { struct sas_end_device *rdev = sas_sdev_to_rdev(sdev); rdev->tlr_enabled = 1; } return; } EXPORT_SYMBOL_GPL(sas_enable_tlr); unsigned int sas_is_tlr_enabled(struct scsi_device *sdev) { struct sas_end_device *rdev = sas_sdev_to_rdev(sdev); return rdev->tlr_enabled; } EXPORT_SYMBOL_GPL(sas_is_tlr_enabled); /** * sas_ata_ncq_prio_supported - Check for ATA NCQ command priority support * @sdev: SCSI device * * Check if an ATA device supports NCQ priority using VPD page 89h (ATA * Information). Since this VPD page is implemented only for ATA devices, * this function always returns false for SCSI devices. */ bool sas_ata_ncq_prio_supported(struct scsi_device *sdev) { struct scsi_vpd *vpd; bool ncq_prio_supported = false; rcu_read_lock(); vpd = rcu_dereference(sdev->vpd_pg89); if (vpd && vpd->len >= 214) ncq_prio_supported = (vpd->data[213] >> 4) & 1; rcu_read_unlock(); return ncq_prio_supported; } EXPORT_SYMBOL_GPL(sas_ata_ncq_prio_supported); /* * SAS Phy attributes */ #define sas_phy_show_simple(field, name, format_string, cast) \ static ssize_t \ show_sas_phy_##name(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct sas_phy *phy = transport_class_to_phy(dev); \ \ return snprintf(buf, 20, format_string, cast phy->field); \ } #define sas_phy_simple_attr(field, name, format_string, type) \ sas_phy_show_simple(field, name, format_string, (type)) \ static DEVICE_ATTR(name, S_IRUGO, show_sas_phy_##name, NULL) #define sas_phy_show_protocol(field, name) \ static ssize_t \ show_sas_phy_##name(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct sas_phy *phy = transport_class_to_phy(dev); \ \ if (!phy->field) \ return snprintf(buf, 20, "none\n"); \ return get_sas_protocol_names(phy->field, buf); \ } #define sas_phy_protocol_attr(field, name) \ sas_phy_show_protocol(field, name) \ static DEVICE_ATTR(name, S_IRUGO, show_sas_phy_##name, NULL) #define sas_phy_show_linkspeed(field) \ static ssize_t \ show_sas_phy_##field(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct sas_phy *phy = transport_class_to_phy(dev); \ \ return get_sas_linkspeed_names(phy->field, buf); \ } /* Fudge to tell if we're minimum or maximum */ #define sas_phy_store_linkspeed(field) \ static ssize_t \ store_sas_phy_##field(struct device *dev, \ struct device_attribute *attr, \ const char *buf, size_t count) \ { \ struct sas_phy *phy = 
transport_class_to_phy(dev); \ struct Scsi_Host *shost = dev_to_shost(phy->dev.parent); \ struct sas_internal *i = to_sas_internal(shost->transportt); \ u32 value; \ struct sas_phy_linkrates rates = {0}; \ int error; \ \ error = set_sas_linkspeed_names(&value, buf); \ if (error) \ return error; \ rates.field = value; \ error = i->f->set_phy_speed(phy, &rates); \ \ return error ? error : count; \ } #define sas_phy_linkspeed_rw_attr(field) \ sas_phy_show_linkspeed(field) \ sas_phy_store_linkspeed(field) \ static DEVICE_ATTR(field, S_IRUGO, show_sas_phy_##field, \ store_sas_phy_##field) #define sas_phy_linkspeed_attr(field) \ sas_phy_show_linkspeed(field) \ static DEVICE_ATTR(field, S_IRUGO, show_sas_phy_##field, NULL) #define sas_phy_show_linkerror(field) \ static ssize_t \ show_sas_phy_##field(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct sas_phy *phy = transport_class_to_phy(dev); \ struct Scsi_Host *shost = dev_to_shost(phy->dev.parent); \ struct sas_internal *i = to_sas_internal(shost->transportt); \ int error; \ \ error = i->f->get_linkerrors ? i->f->get_linkerrors(phy) : 0; \ if (error) \ return error; \ return snprintf(buf, 20, "%u\n", phy->field); \ } #define sas_phy_linkerror_attr(field) \ sas_phy_show_linkerror(field) \ static DEVICE_ATTR(field, S_IRUGO, show_sas_phy_##field, NULL) static ssize_t show_sas_device_type(struct device *dev, struct device_attribute *attr, char *buf) { struct sas_phy *phy = transport_class_to_phy(dev); if (!phy->identify.device_type) return snprintf(buf, 20, "none\n"); return get_sas_device_type_names(phy->identify.device_type, buf); } static DEVICE_ATTR(device_type, S_IRUGO, show_sas_device_type, NULL); static ssize_t do_sas_phy_enable(struct device *dev, size_t count, int enable) { struct sas_phy *phy = transport_class_to_phy(dev); struct Scsi_Host *shost = dev_to_shost(phy->dev.parent); struct sas_internal *i = to_sas_internal(shost->transportt); int error; error = i->f->phy_enable(phy, enable); if (error) return error; phy->enabled = enable; return count; }; static ssize_t store_sas_phy_enable(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { if (count < 1) return -EINVAL; switch (buf[0]) { case '0': do_sas_phy_enable(dev, count, 0); break; case '1': do_sas_phy_enable(dev, count, 1); break; default: return -EINVAL; } return count; } static ssize_t show_sas_phy_enable(struct device *dev, struct device_attribute *attr, char *buf) { struct sas_phy *phy = transport_class_to_phy(dev); return snprintf(buf, 20, "%d\n", phy->enabled); } static DEVICE_ATTR(enable, S_IRUGO | S_IWUSR, show_sas_phy_enable, store_sas_phy_enable); static ssize_t do_sas_phy_reset(struct device *dev, size_t count, int hard_reset) { struct sas_phy *phy = transport_class_to_phy(dev); struct Scsi_Host *shost = dev_to_shost(phy->dev.parent); struct sas_internal *i = to_sas_internal(shost->transportt); int error; error = i->f->phy_reset(phy, hard_reset); if (error) return error; phy->enabled = 1; return count; }; static ssize_t store_sas_link_reset(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { return do_sas_phy_reset(dev, count, 0); } static DEVICE_ATTR(link_reset, S_IWUSR, NULL, store_sas_link_reset); static ssize_t store_sas_hard_reset(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { return do_sas_phy_reset(dev, count, 1); } static DEVICE_ATTR(hard_reset, S_IWUSR, NULL, store_sas_hard_reset); sas_phy_protocol_attr(identify.initiator_port_protocols, 
initiator_port_protocols); sas_phy_protocol_attr(identify.target_port_protocols, target_port_protocols); sas_phy_simple_attr(identify.sas_address, sas_address, "0x%016llx\n", unsigned long long); sas_phy_simple_attr(identify.phy_identifier, phy_identifier, "%d\n", u8); sas_phy_linkspeed_attr(negotiated_linkrate); sas_phy_linkspeed_attr(minimum_linkrate_hw); sas_phy_linkspeed_rw_attr(minimum_linkrate); sas_phy_linkspeed_attr(maximum_linkrate_hw); sas_phy_linkspeed_rw_attr(maximum_linkrate); sas_phy_linkerror_attr(invalid_dword_count); sas_phy_linkerror_attr(running_disparity_error_count); sas_phy_linkerror_attr(loss_of_dword_sync_count); sas_phy_linkerror_attr(phy_reset_problem_count); static int sas_phy_setup(struct transport_container *tc, struct device *dev, struct device *cdev) { struct sas_phy *phy = dev_to_phy(dev); struct Scsi_Host *shost = dev_to_shost(phy->dev.parent); struct sas_internal *i = to_sas_internal(shost->transportt); if (i->f->phy_setup) i->f->phy_setup(phy); return 0; } static DECLARE_TRANSPORT_CLASS(sas_phy_class, "sas_phy", sas_phy_setup, NULL, NULL); static int sas_phy_match(struct attribute_container *cont, struct device *dev) { struct Scsi_Host *shost; struct sas_internal *i; if (!scsi_is_sas_phy(dev)) return 0; shost = dev_to_shost(dev->parent); if (!shost->transportt) return 0; if (shost->transportt->host_attrs.ac.class != &sas_host_class.class) return 0; i = to_sas_internal(shost->transportt); return &i->phy_attr_cont.ac == cont; } static void sas_phy_release(struct device *dev) { struct sas_phy *phy = dev_to_phy(dev); struct Scsi_Host *shost = dev_to_shost(phy->dev.parent); struct sas_internal *i = to_sas_internal(shost->transportt); if (i->f->phy_release) i->f->phy_release(phy); put_device(dev->parent); kfree(phy); } /** * sas_phy_alloc - allocates and initialize a SAS PHY structure * @parent: Parent device * @number: Phy index * * Allocates an SAS PHY structure. It will be added in the device tree * below the device specified by @parent, which has to be either a Scsi_Host * or sas_rphy. * * Returns: * SAS PHY allocated or %NULL if the allocation failed. */ struct sas_phy *sas_phy_alloc(struct device *parent, int number) { struct Scsi_Host *shost = dev_to_shost(parent); struct sas_phy *phy; phy = kzalloc(sizeof(*phy), GFP_KERNEL); if (!phy) return NULL; phy->number = number; phy->enabled = 1; device_initialize(&phy->dev); phy->dev.parent = get_device(parent); phy->dev.release = sas_phy_release; INIT_LIST_HEAD(&phy->port_siblings); if (scsi_is_sas_expander_device(parent)) { struct sas_rphy *rphy = dev_to_rphy(parent); dev_set_name(&phy->dev, "phy-%d:%d:%d", shost->host_no, rphy->scsi_target_id, number); } else dev_set_name(&phy->dev, "phy-%d:%d", shost->host_no, number); transport_setup_device(&phy->dev); return phy; } EXPORT_SYMBOL(sas_phy_alloc); /** * sas_phy_add - add a SAS PHY to the device hierarchy * @phy: The PHY to be added * * Publishes a SAS PHY to the rest of the system. */ int sas_phy_add(struct sas_phy *phy) { int error; error = device_add(&phy->dev); if (error) return error; error = transport_add_device(&phy->dev); if (error) { device_del(&phy->dev); return error; } transport_configure_device(&phy->dev); return 0; } EXPORT_SYMBOL(sas_phy_add); /** * sas_phy_free - free a SAS PHY * @phy: SAS PHY to free * * Frees the specified SAS PHY. * * Note: * This function must only be called on a PHY that has not * successfully been added using sas_phy_add(). 
*/ void sas_phy_free(struct sas_phy *phy) { transport_destroy_device(&phy->dev); put_device(&phy->dev); } EXPORT_SYMBOL(sas_phy_free); /** * sas_phy_delete - remove SAS PHY * @phy: SAS PHY to remove * * Removes the specified SAS PHY. If the SAS PHY has an * associated remote PHY it is removed before. */ void sas_phy_delete(struct sas_phy *phy) { struct device *dev = &phy->dev; /* this happens if the phy is still part of a port when deleted */ BUG_ON(!list_empty(&phy->port_siblings)); transport_remove_device(dev); device_del(dev); transport_destroy_device(dev); put_device(dev); } EXPORT_SYMBOL(sas_phy_delete); /** * scsi_is_sas_phy - check if a struct device represents a SAS PHY * @dev: device to check * * Returns: * %1 if the device represents a SAS PHY, %0 else */ int scsi_is_sas_phy(const struct device *dev) { return dev->release == sas_phy_release; } EXPORT_SYMBOL(scsi_is_sas_phy); /* * SAS Port attributes */ #define sas_port_show_simple(field, name, format_string, cast) \ static ssize_t \ show_sas_port_##name(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct sas_port *port = transport_class_to_sas_port(dev); \ \ return snprintf(buf, 20, format_string, cast port->field); \ } #define sas_port_simple_attr(field, name, format_string, type) \ sas_port_show_simple(field, name, format_string, (type)) \ static DEVICE_ATTR(name, S_IRUGO, show_sas_port_##name, NULL) sas_port_simple_attr(num_phys, num_phys, "%d\n", int); static DECLARE_TRANSPORT_CLASS(sas_port_class, "sas_port", NULL, NULL, NULL); static int sas_port_match(struct attribute_container *cont, struct device *dev) { struct Scsi_Host *shost; struct sas_internal *i; if (!scsi_is_sas_port(dev)) return 0; shost = dev_to_shost(dev->parent); if (!shost->transportt) return 0; if (shost->transportt->host_attrs.ac.class != &sas_host_class.class) return 0; i = to_sas_internal(shost->transportt); return &i->port_attr_cont.ac == cont; } static void sas_port_release(struct device *dev) { struct sas_port *port = dev_to_sas_port(dev); BUG_ON(!list_empty(&port->phy_list)); put_device(dev->parent); kfree(port); } static void sas_port_create_link(struct sas_port *port, struct sas_phy *phy) { int res; res = sysfs_create_link(&port->dev.kobj, &phy->dev.kobj, dev_name(&phy->dev)); if (res) goto err; res = sysfs_create_link(&phy->dev.kobj, &port->dev.kobj, "port"); if (res) goto err; return; err: printk(KERN_ERR "%s: Cannot create port links, err=%d\n", __func__, res); } static void sas_port_delete_link(struct sas_port *port, struct sas_phy *phy) { sysfs_remove_link(&port->dev.kobj, dev_name(&phy->dev)); sysfs_remove_link(&phy->dev.kobj, "port"); } /** * sas_port_alloc - allocate and initialize a SAS port structure * * @parent: parent device * @port_id: port number * * Allocates a SAS port structure. It will be added to the device tree * below the device specified by @parent which must be either a Scsi_Host * or a sas_expander_device. 
* * Returns: %NULL on error */ struct sas_port *sas_port_alloc(struct device *parent, int port_id) { struct Scsi_Host *shost = dev_to_shost(parent); struct sas_port *port; port = kzalloc(sizeof(*port), GFP_KERNEL); if (!port) return NULL; port->port_identifier = port_id; device_initialize(&port->dev); port->dev.parent = get_device(parent); port->dev.release = sas_port_release; mutex_init(&port->phy_list_mutex); INIT_LIST_HEAD(&port->phy_list); if (scsi_is_sas_expander_device(parent)) { struct sas_rphy *rphy = dev_to_rphy(parent); dev_set_name(&port->dev, "port-%d:%d:%d", shost->host_no, rphy->scsi_target_id, port->port_identifier); } else dev_set_name(&port->dev, "port-%d:%d", shost->host_no, port->port_identifier); transport_setup_device(&port->dev); return port; } EXPORT_SYMBOL(sas_port_alloc); /** * sas_port_alloc_num - allocate and initialize a SAS port structure * * @parent: parent device * * Allocates a SAS port structure and a number to go with it. This * interface is really for adapters where the port number has no * meansing, so the sas class should manage them. It will be added to * the device tree below the device specified by @parent which must be * either a Scsi_Host or a sas_expander_device. * * Returns: %NULL on error */ struct sas_port *sas_port_alloc_num(struct device *parent) { int index; struct Scsi_Host *shost = dev_to_shost(parent); struct sas_host_attrs *sas_host = to_sas_host_attrs(shost); /* FIXME: use idr for this eventually */ mutex_lock(&sas_host->lock); if (scsi_is_sas_expander_device(parent)) { struct sas_rphy *rphy = dev_to_rphy(parent); struct sas_expander_device *exp = rphy_to_expander_device(rphy); index = exp->next_port_id++; } else index = sas_host->next_port_id++; mutex_unlock(&sas_host->lock); return sas_port_alloc(parent, index); } EXPORT_SYMBOL(sas_port_alloc_num); /** * sas_port_add - add a SAS port to the device hierarchy * @port: port to be added * * publishes a port to the rest of the system */ int sas_port_add(struct sas_port *port) { int error; /* No phys should be added until this is made visible */ BUG_ON(!list_empty(&port->phy_list)); error = device_add(&port->dev); if (error) return error; transport_add_device(&port->dev); transport_configure_device(&port->dev); return 0; } EXPORT_SYMBOL(sas_port_add); /** * sas_port_free - free a SAS PORT * @port: SAS PORT to free * * Frees the specified SAS PORT. * * Note: * This function must only be called on a PORT that has not * successfully been added using sas_port_add(). */ void sas_port_free(struct sas_port *port) { transport_destroy_device(&port->dev); put_device(&port->dev); } EXPORT_SYMBOL(sas_port_free); /** * sas_port_delete - remove SAS PORT * @port: SAS PORT to remove * * Removes the specified SAS PORT. If the SAS PORT has an * associated phys, unlink them from the port as well. 
*/ void sas_port_delete(struct sas_port *port) { struct device *dev = &port->dev; struct sas_phy *phy, *tmp_phy; if (port->rphy) { sas_rphy_delete(port->rphy); port->rphy = NULL; } mutex_lock(&port->phy_list_mutex); list_for_each_entry_safe(phy, tmp_phy, &port->phy_list, port_siblings) { sas_port_delete_link(port, phy); list_del_init(&phy->port_siblings); } mutex_unlock(&port->phy_list_mutex); if (port->is_backlink) { struct device *parent = port->dev.parent; sysfs_remove_link(&port->dev.kobj, dev_name(parent)); port->is_backlink = 0; } transport_remove_device(dev); device_del(dev); transport_destroy_device(dev); put_device(dev); } EXPORT_SYMBOL(sas_port_delete); /** * scsi_is_sas_port - check if a struct device represents a SAS port * @dev: device to check * * Returns: * %1 if the device represents a SAS Port, %0 else */ int scsi_is_sas_port(const struct device *dev) { return dev->release == sas_port_release; } EXPORT_SYMBOL(scsi_is_sas_port); /** * sas_port_get_phy - try to take a reference on a port member * @port: port to check */ struct sas_phy *sas_port_get_phy(struct sas_port *port) { struct sas_phy *phy; mutex_lock(&port->phy_list_mutex); if (list_empty(&port->phy_list)) phy = NULL; else { struct list_head *ent = port->phy_list.next; phy = list_entry(ent, typeof(*phy), port_siblings); get_device(&phy->dev); } mutex_unlock(&port->phy_list_mutex); return phy; } EXPORT_SYMBOL(sas_port_get_phy); /** * sas_port_add_phy - add another phy to a port to form a wide port * @port: port to add the phy to * @phy: phy to add * * When a port is initially created, it is empty (has no phys). All * ports must have at least one phy to operated, and all wide ports * must have at least two. The current code makes no difference * between ports and wide ports, but the only object that can be * connected to a remote device is a port, so ports must be formed on * all devices with phys if they're connected to anything. */ void sas_port_add_phy(struct sas_port *port, struct sas_phy *phy) { mutex_lock(&port->phy_list_mutex); if (unlikely(!list_empty(&phy->port_siblings))) { /* make sure we're already on this port */ struct sas_phy *tmp; list_for_each_entry(tmp, &port->phy_list, port_siblings) if (tmp == phy) break; /* If this trips, you added a phy that was already * part of a different port */ if (unlikely(tmp != phy)) { dev_printk(KERN_ERR, &port->dev, "trying to add phy %s fails: it's already part of another port\n", dev_name(&phy->dev)); BUG(); } } else { sas_port_create_link(port, phy); list_add_tail(&phy->port_siblings, &port->phy_list); port->num_phys++; } mutex_unlock(&port->phy_list_mutex); } EXPORT_SYMBOL(sas_port_add_phy); /** * sas_port_delete_phy - remove a phy from a port or wide port * @port: port to remove the phy from * @phy: phy to remove * * This operation is used for tearing down ports again. It must be * done to every port or wide port before calling sas_port_delete. 
*/ void sas_port_delete_phy(struct sas_port *port, struct sas_phy *phy) { mutex_lock(&port->phy_list_mutex); sas_port_delete_link(port, phy); list_del_init(&phy->port_siblings); port->num_phys--; mutex_unlock(&port->phy_list_mutex); } EXPORT_SYMBOL(sas_port_delete_phy); void sas_port_mark_backlink(struct sas_port *port) { int res; struct device *parent = port->dev.parent->parent->parent; if (port->is_backlink) return; port->is_backlink = 1; res = sysfs_create_link(&port->dev.kobj, &parent->kobj, dev_name(parent)); if (res) goto err; return; err: printk(KERN_ERR "%s: Cannot create port backlink, err=%d\n", __func__, res); } EXPORT_SYMBOL(sas_port_mark_backlink); /* * SAS remote PHY attributes. */ #define sas_rphy_show_simple(field, name, format_string, cast) \ static ssize_t \ show_sas_rphy_##name(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct sas_rphy *rphy = transport_class_to_rphy(dev); \ \ return snprintf(buf, 20, format_string, cast rphy->field); \ } #define sas_rphy_simple_attr(field, name, format_string, type) \ sas_rphy_show_simple(field, name, format_string, (type)) \ static SAS_DEVICE_ATTR(rphy, name, S_IRUGO, \ show_sas_rphy_##name, NULL) #define sas_rphy_show_protocol(field, name) \ static ssize_t \ show_sas_rphy_##name(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct sas_rphy *rphy = transport_class_to_rphy(dev); \ \ if (!rphy->field) \ return snprintf(buf, 20, "none\n"); \ return get_sas_protocol_names(rphy->field, buf); \ } #define sas_rphy_protocol_attr(field, name) \ sas_rphy_show_protocol(field, name) \ static SAS_DEVICE_ATTR(rphy, name, S_IRUGO, \ show_sas_rphy_##name, NULL) static ssize_t show_sas_rphy_device_type(struct device *dev, struct device_attribute *attr, char *buf) { struct sas_rphy *rphy = transport_class_to_rphy(dev); if (!rphy->identify.device_type) return snprintf(buf, 20, "none\n"); return get_sas_device_type_names( rphy->identify.device_type, buf); } static SAS_DEVICE_ATTR(rphy, device_type, S_IRUGO, show_sas_rphy_device_type, NULL); static ssize_t show_sas_rphy_enclosure_identifier(struct device *dev, struct device_attribute *attr, char *buf) { struct sas_rphy *rphy = transport_class_to_rphy(dev); struct sas_phy *phy = dev_to_phy(rphy->dev.parent); struct Scsi_Host *shost = dev_to_shost(phy->dev.parent); struct sas_internal *i = to_sas_internal(shost->transportt); u64 identifier; int error; error = i->f->get_enclosure_identifier(rphy, &identifier); if (error) return error; return sprintf(buf, "0x%llx\n", (unsigned long long)identifier); } static SAS_DEVICE_ATTR(rphy, enclosure_identifier, S_IRUGO, show_sas_rphy_enclosure_identifier, NULL); static ssize_t show_sas_rphy_bay_identifier(struct device *dev, struct device_attribute *attr, char *buf) { struct sas_rphy *rphy = transport_class_to_rphy(dev); struct sas_phy *phy = dev_to_phy(rphy->dev.parent); struct Scsi_Host *shost = dev_to_shost(phy->dev.parent); struct sas_internal *i = to_sas_internal(shost->transportt); int val; val = i->f->get_bay_identifier(rphy); if (val < 0) return val; return sprintf(buf, "%d\n", val); } static SAS_DEVICE_ATTR(rphy, bay_identifier, S_IRUGO, show_sas_rphy_bay_identifier, NULL); sas_rphy_protocol_attr(identify.initiator_port_protocols, initiator_port_protocols); sas_rphy_protocol_attr(identify.target_port_protocols, target_port_protocols); sas_rphy_simple_attr(identify.sas_address, sas_address, "0x%016llx\n", unsigned long long); sas_rphy_simple_attr(identify.phy_identifier, phy_identifier, "%d\n", u8); 
sas_rphy_simple_attr(scsi_target_id, scsi_target_id, "%d\n", u32); /* only need 8 bytes of data plus header (4 or 8) */ #define BUF_SIZE 64 int sas_read_port_mode_page(struct scsi_device *sdev) { char *buffer = kzalloc(BUF_SIZE, GFP_KERNEL), *msdata; struct sas_end_device *rdev = sas_sdev_to_rdev(sdev); struct scsi_mode_data mode_data; int error; if (!buffer) return -ENOMEM; error = scsi_mode_sense(sdev, 1, 0x19, 0, buffer, BUF_SIZE, 30*HZ, 3, &mode_data, NULL); if (error) goto out; msdata = buffer + mode_data.header_length + mode_data.block_descriptor_length; if (msdata - buffer > BUF_SIZE - 8) goto out; error = 0; rdev->ready_led_meaning = msdata[2] & 0x10 ? 1 : 0; rdev->I_T_nexus_loss_timeout = (msdata[4] << 8) + msdata[5]; rdev->initiator_response_timeout = (msdata[6] << 8) + msdata[7]; out: kfree(buffer); return error; } EXPORT_SYMBOL(sas_read_port_mode_page); static DECLARE_TRANSPORT_CLASS(sas_end_dev_class, "sas_end_device", NULL, NULL, NULL); #define sas_end_dev_show_simple(field, name, format_string, cast) \ static ssize_t \ show_sas_end_dev_##name(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct sas_rphy *rphy = transport_class_to_rphy(dev); \ struct sas_end_device *rdev = rphy_to_end_device(rphy); \ \ return snprintf(buf, 20, format_string, cast rdev->field); \ } #define sas_end_dev_simple_attr(field, name, format_string, type) \ sas_end_dev_show_simple(field, name, format_string, (type)) \ static SAS_DEVICE_ATTR(end_dev, name, S_IRUGO, \ show_sas_end_dev_##name, NULL) sas_end_dev_simple_attr(ready_led_meaning, ready_led_meaning, "%d\n", int); sas_end_dev_simple_attr(I_T_nexus_loss_timeout, I_T_nexus_loss_timeout, "%d\n", int); sas_end_dev_simple_attr(initiator_response_timeout, initiator_response_timeout, "%d\n", int); sas_end_dev_simple_attr(tlr_supported, tlr_supported, "%d\n", int); sas_end_dev_simple_attr(tlr_enabled, tlr_enabled, "%d\n", int); static DECLARE_TRANSPORT_CLASS(sas_expander_class, "sas_expander", NULL, NULL, NULL); #define sas_expander_show_simple(field, name, format_string, cast) \ static ssize_t \ show_sas_expander_##name(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct sas_rphy *rphy = transport_class_to_rphy(dev); \ struct sas_expander_device *edev = rphy_to_expander_device(rphy); \ \ return snprintf(buf, 20, format_string, cast edev->field); \ } #define sas_expander_simple_attr(field, name, format_string, type) \ sas_expander_show_simple(field, name, format_string, (type)) \ static SAS_DEVICE_ATTR(expander, name, S_IRUGO, \ show_sas_expander_##name, NULL) sas_expander_simple_attr(vendor_id, vendor_id, "%s\n", char *); sas_expander_simple_attr(product_id, product_id, "%s\n", char *); sas_expander_simple_attr(product_rev, product_rev, "%s\n", char *); sas_expander_simple_attr(component_vendor_id, component_vendor_id, "%s\n", char *); sas_expander_simple_attr(component_id, component_id, "%u\n", unsigned int); sas_expander_simple_attr(component_revision_id, component_revision_id, "%u\n", unsigned int); sas_expander_simple_attr(level, level, "%d\n", int); static DECLARE_TRANSPORT_CLASS(sas_rphy_class, "sas_device", NULL, NULL, NULL); static int sas_rphy_match(struct attribute_container *cont, struct device *dev) { struct Scsi_Host *shost; struct sas_internal *i; if (!scsi_is_sas_rphy(dev)) return 0; shost = dev_to_shost(dev->parent->parent); if (!shost->transportt) return 0; if (shost->transportt->host_attrs.ac.class != &sas_host_class.class) return 0; i = to_sas_internal(shost->transportt); return 
	       &i->rphy_attr_cont.ac == cont;
}

static int sas_end_dev_match(struct attribute_container *cont,
			     struct device *dev)
{
	struct Scsi_Host *shost;
	struct sas_internal *i;
	struct sas_rphy *rphy;

	if (!scsi_is_sas_rphy(dev))
		return 0;
	shost = dev_to_shost(dev->parent->parent);
	rphy = dev_to_rphy(dev);

	if (!shost->transportt)
		return 0;
	if (shost->transportt->host_attrs.ac.class !=
	    &sas_host_class.class)
		return 0;

	i = to_sas_internal(shost->transportt);
	return &i->end_dev_attr_cont.ac == cont &&
		rphy->identify.device_type == SAS_END_DEVICE;
}

static int sas_expander_match(struct attribute_container *cont,
			      struct device *dev)
{
	struct Scsi_Host *shost;
	struct sas_internal *i;
	struct sas_rphy *rphy;

	if (!scsi_is_sas_rphy(dev))
		return 0;
	shost = dev_to_shost(dev->parent->parent);
	rphy = dev_to_rphy(dev);

	if (!shost->transportt)
		return 0;
	if (shost->transportt->host_attrs.ac.class !=
	    &sas_host_class.class)
		return 0;

	i = to_sas_internal(shost->transportt);
	return &i->expander_attr_cont.ac == cont &&
		(rphy->identify.device_type == SAS_EDGE_EXPANDER_DEVICE ||
		 rphy->identify.device_type == SAS_FANOUT_EXPANDER_DEVICE);
}

static void sas_expander_release(struct device *dev)
{
	struct sas_rphy *rphy = dev_to_rphy(dev);
	struct sas_expander_device *edev = rphy_to_expander_device(rphy);

	put_device(dev->parent);
	kfree(edev);
}

static void sas_end_device_release(struct device *dev)
{
	struct sas_rphy *rphy = dev_to_rphy(dev);
	struct sas_end_device *edev = rphy_to_end_device(rphy);

	put_device(dev->parent);
	kfree(edev);
}

/**
 * sas_rphy_initialize - common rphy initialization
 * @rphy: rphy to initialise
 *
 * Used by both sas_end_device_alloc() and sas_expander_alloc() to
 * initialise the common rphy component of each.
 */
static void sas_rphy_initialize(struct sas_rphy *rphy)
{
	INIT_LIST_HEAD(&rphy->list);
}

/**
 * sas_end_device_alloc - allocate an rphy for an end device
 * @parent: which port
 *
 * Allocates a SAS remote PHY structure, connected to @parent.
 *
 * Returns:
 *   SAS PHY allocated or %NULL if the allocation failed.
 */
struct sas_rphy *sas_end_device_alloc(struct sas_port *parent)
{
	struct Scsi_Host *shost = dev_to_shost(&parent->dev);
	struct sas_end_device *rdev;

	rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
	if (!rdev)
		return NULL;

	device_initialize(&rdev->rphy.dev);
	rdev->rphy.dev.parent = get_device(&parent->dev);
	rdev->rphy.dev.release = sas_end_device_release;
	if (scsi_is_sas_expander_device(parent->dev.parent)) {
		struct sas_rphy *rphy = dev_to_rphy(parent->dev.parent);

		dev_set_name(&rdev->rphy.dev, "end_device-%d:%d:%d",
			     shost->host_no, rphy->scsi_target_id,
			     parent->port_identifier);
	} else
		dev_set_name(&rdev->rphy.dev, "end_device-%d:%d",
			     shost->host_no, parent->port_identifier);
	rdev->rphy.identify.device_type = SAS_END_DEVICE;
	sas_rphy_initialize(&rdev->rphy);
	transport_setup_device(&rdev->rphy.dev);

	return &rdev->rphy;
}
EXPORT_SYMBOL(sas_end_device_alloc);

/**
 * sas_expander_alloc - allocate an rphy for an expander device
 * @parent: which port
 * @type: SAS_EDGE_EXPANDER_DEVICE or SAS_FANOUT_EXPANDER_DEVICE
 *
 * Allocates a SAS remote PHY structure, connected to @parent.
 *
 * Returns:
 *   SAS PHY allocated or %NULL if the allocation failed.
 */
struct sas_rphy *sas_expander_alloc(struct sas_port *parent,
				    enum sas_device_type type)
{
	struct Scsi_Host *shost = dev_to_shost(&parent->dev);
	struct sas_expander_device *rdev;
	struct sas_host_attrs *sas_host = to_sas_host_attrs(shost);

	BUG_ON(type != SAS_EDGE_EXPANDER_DEVICE &&
	       type != SAS_FANOUT_EXPANDER_DEVICE);

	rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
	if (!rdev)
		return NULL;

	device_initialize(&rdev->rphy.dev);
	rdev->rphy.dev.parent = get_device(&parent->dev);
	rdev->rphy.dev.release = sas_expander_release;
	mutex_lock(&sas_host->lock);
	rdev->rphy.scsi_target_id = sas_host->next_expander_id++;
	mutex_unlock(&sas_host->lock);
	dev_set_name(&rdev->rphy.dev, "expander-%d:%d",
		     shost->host_no, rdev->rphy.scsi_target_id);
	rdev->rphy.identify.device_type = type;
	sas_rphy_initialize(&rdev->rphy);
	transport_setup_device(&rdev->rphy.dev);

	return &rdev->rphy;
}
EXPORT_SYMBOL(sas_expander_alloc);

/**
 * sas_rphy_add - add a SAS remote PHY to the device hierarchy
 * @rphy: The remote PHY to be added
 *
 * Publishes a SAS remote PHY to the rest of the system.
 */
int sas_rphy_add(struct sas_rphy *rphy)
{
	struct sas_port *parent = dev_to_sas_port(rphy->dev.parent);
	struct Scsi_Host *shost = dev_to_shost(parent->dev.parent);
	struct sas_host_attrs *sas_host = to_sas_host_attrs(shost);
	struct sas_identify *identify = &rphy->identify;
	int error;

	if (parent->rphy)
		return -ENXIO;
	parent->rphy = rphy;

	error = device_add(&rphy->dev);
	if (error)
		return error;
	transport_add_device(&rphy->dev);
	transport_configure_device(&rphy->dev);
	if (sas_bsg_initialize(shost, rphy))
		printk("failed to add a bsg device %s\n",
		       dev_name(&rphy->dev));

	mutex_lock(&sas_host->lock);
	list_add_tail(&rphy->list, &sas_host->rphy_list);
	if (identify->device_type == SAS_END_DEVICE &&
	    (identify->target_port_protocols &
	     (SAS_PROTOCOL_SSP | SAS_PROTOCOL_STP | SAS_PROTOCOL_SATA)))
		rphy->scsi_target_id = sas_host->next_target_id++;
	else if (identify->device_type == SAS_END_DEVICE)
		rphy->scsi_target_id = -1;
	mutex_unlock(&sas_host->lock);

	if (identify->device_type == SAS_END_DEVICE &&
	    rphy->scsi_target_id != -1) {
		int lun;

		if (identify->target_port_protocols & SAS_PROTOCOL_SSP)
			lun = SCAN_WILD_CARD;
		else
			lun = 0;

		scsi_scan_target(&rphy->dev, 0, rphy->scsi_target_id, lun,
				 SCSI_SCAN_INITIAL);
	}

	return 0;
}
EXPORT_SYMBOL(sas_rphy_add);

/**
 * sas_rphy_free - free a SAS remote PHY
 * @rphy: SAS remote PHY to free
 *
 * Frees the specified SAS remote PHY.
 *
 * Note:
 *   This function must only be called on a remote
 *   PHY that has not successfully been added using
 *   sas_rphy_add() (or has been sas_rphy_remove()'d)
 */
void sas_rphy_free(struct sas_rphy *rphy)
{
	struct device *dev = &rphy->dev;
	struct Scsi_Host *shost = dev_to_shost(rphy->dev.parent->parent);
	struct sas_host_attrs *sas_host = to_sas_host_attrs(shost);

	mutex_lock(&sas_host->lock);
	list_del(&rphy->list);
	mutex_unlock(&sas_host->lock);

	transport_destroy_device(dev);

	put_device(dev);
}
EXPORT_SYMBOL(sas_rphy_free);

/**
 * sas_rphy_delete - remove and free SAS remote PHY
 * @rphy: SAS remote PHY to remove and free
 *
 * Removes the specified SAS remote PHY and frees it.
*/ void sas_rphy_delete(struct sas_rphy *rphy) { sas_rphy_remove(rphy); sas_rphy_free(rphy); } EXPORT_SYMBOL(sas_rphy_delete); /** * sas_rphy_unlink - unlink SAS remote PHY * @rphy: SAS remote phy to unlink from its parent port * * Removes port reference to an rphy */ void sas_rphy_unlink(struct sas_rphy *rphy) { struct sas_port *parent = dev_to_sas_port(rphy->dev.parent); parent->rphy = NULL; } EXPORT_SYMBOL(sas_rphy_unlink); /** * sas_rphy_remove - remove SAS remote PHY * @rphy: SAS remote phy to remove * * Removes the specified SAS remote PHY. */ void sas_rphy_remove(struct sas_rphy *rphy) { struct device *dev = &rphy->dev; switch (rphy->identify.device_type) { case SAS_END_DEVICE: scsi_remove_target(dev); break; case SAS_EDGE_EXPANDER_DEVICE: case SAS_FANOUT_EXPANDER_DEVICE: sas_remove_children(dev); break; default: break; } sas_rphy_unlink(rphy); bsg_remove_queue(rphy->q); transport_remove_device(dev); device_del(dev); } EXPORT_SYMBOL(sas_rphy_remove); /** * scsi_is_sas_rphy - check if a struct device represents a SAS remote PHY * @dev: device to check * * Returns: * %1 if the device represents a SAS remote PHY, %0 else */ int scsi_is_sas_rphy(const struct device *dev) { return dev->release == sas_end_device_release || dev->release == sas_expander_release; } EXPORT_SYMBOL(scsi_is_sas_rphy); static void scan_channel_zero(struct Scsi_Host *shost, uint id, u64 lun) { struct sas_host_attrs *sas_host = to_sas_host_attrs(shost); struct sas_rphy *rphy; list_for_each_entry(rphy, &sas_host->rphy_list, list) { if (rphy->identify.device_type != SAS_END_DEVICE || rphy->scsi_target_id == -1) continue; if (id == SCAN_WILD_CARD || id == rphy->scsi_target_id) { scsi_scan_target(&rphy->dev, 0, rphy->scsi_target_id, lun, SCSI_SCAN_MANUAL); } } } /* * SCSI scan helper */ static int sas_user_scan(struct Scsi_Host *shost, uint channel, uint id, u64 lun) { struct sas_host_attrs *sas_host = to_sas_host_attrs(shost); int res = 0; int i; switch (channel) { case 0: mutex_lock(&sas_host->lock); scan_channel_zero(shost, id, lun); mutex_unlock(&sas_host->lock); break; case SCAN_WILD_CARD: mutex_lock(&sas_host->lock); scan_channel_zero(shost, id, lun); mutex_unlock(&sas_host->lock); for (i = 1; i <= shost->max_channel; i++) { res = scsi_scan_host_selected(shost, i, id, lun, SCSI_SCAN_MANUAL); if (res) goto exit_scan; } break; default: if (channel < shost->max_channel) { res = scsi_scan_host_selected(shost, channel, id, lun, SCSI_SCAN_MANUAL); } else { res = -EINVAL; } break; } exit_scan: return res; } /* * Setup / Teardown code */ #define SETUP_TEMPLATE(attrb, field, perm, test) \ i->private_##attrb[count] = dev_attr_##field; \ i->private_##attrb[count].attr.mode = perm; \ i->attrb[count] = &i->private_##attrb[count]; \ if (test) \ count++ #define SETUP_TEMPLATE_RW(attrb, field, perm, test, ro_test, ro_perm) \ i->private_##attrb[count] = dev_attr_##field; \ i->private_##attrb[count].attr.mode = perm; \ if (ro_test) { \ i->private_##attrb[count].attr.mode = ro_perm; \ i->private_##attrb[count].store = NULL; \ } \ i->attrb[count] = &i->private_##attrb[count]; \ if (test) \ count++ #define SETUP_RPORT_ATTRIBUTE(field) \ SETUP_TEMPLATE(rphy_attrs, field, S_IRUGO, 1) #define SETUP_OPTIONAL_RPORT_ATTRIBUTE(field, func) \ SETUP_TEMPLATE(rphy_attrs, field, S_IRUGO, i->f->func) #define SETUP_PHY_ATTRIBUTE(field) \ SETUP_TEMPLATE(phy_attrs, field, S_IRUGO, 1) #define SETUP_PHY_ATTRIBUTE_RW(field) \ SETUP_TEMPLATE_RW(phy_attrs, field, S_IRUGO | S_IWUSR, 1, \ !i->f->set_phy_speed, S_IRUGO) #define 
SETUP_OPTIONAL_PHY_ATTRIBUTE_RW(field, func) \ SETUP_TEMPLATE_RW(phy_attrs, field, S_IRUGO | S_IWUSR, 1, \ !i->f->func, S_IRUGO) #define SETUP_PORT_ATTRIBUTE(field) \ SETUP_TEMPLATE(port_attrs, field, S_IRUGO, 1) #define SETUP_OPTIONAL_PHY_ATTRIBUTE(field, func) \ SETUP_TEMPLATE(phy_attrs, field, S_IRUGO, i->f->func) #define SETUP_PHY_ATTRIBUTE_WRONLY(field) \ SETUP_TEMPLATE(phy_attrs, field, S_IWUSR, 1) #define SETUP_OPTIONAL_PHY_ATTRIBUTE_WRONLY(field, func) \ SETUP_TEMPLATE(phy_attrs, field, S_IWUSR, i->f->func) #define SETUP_END_DEV_ATTRIBUTE(field) \ SETUP_TEMPLATE(end_dev_attrs, field, S_IRUGO, 1) #define SETUP_EXPANDER_ATTRIBUTE(field) \ SETUP_TEMPLATE(expander_attrs, expander_##field, S_IRUGO, 1) /** * sas_attach_transport - instantiate SAS transport template * @ft: SAS transport class function template */ struct scsi_transport_template * sas_attach_transport(struct sas_function_template *ft) { struct sas_internal *i; int count; i = kzalloc(sizeof(struct sas_internal), GFP_KERNEL); if (!i) return NULL; i->t.user_scan = sas_user_scan; i->t.host_attrs.ac.attrs = &i->host_attrs[0]; i->t.host_attrs.ac.class = &sas_host_class.class; i->t.host_attrs.ac.match = sas_host_match; transport_container_register(&i->t.host_attrs); i->t.host_size = sizeof(struct sas_host_attrs); i->phy_attr_cont.ac.class = &sas_phy_class.class; i->phy_attr_cont.ac.attrs = &i->phy_attrs[0]; i->phy_attr_cont.ac.match = sas_phy_match; transport_container_register(&i->phy_attr_cont); i->port_attr_cont.ac.class = &sas_port_class.class; i->port_attr_cont.ac.attrs = &i->port_attrs[0]; i->port_attr_cont.ac.match = sas_port_match; transport_container_register(&i->port_attr_cont); i->rphy_attr_cont.ac.class = &sas_rphy_class.class; i->rphy_attr_cont.ac.attrs = &i->rphy_attrs[0]; i->rphy_attr_cont.ac.match = sas_rphy_match; transport_container_register(&i->rphy_attr_cont); i->end_dev_attr_cont.ac.class = &sas_end_dev_class.class; i->end_dev_attr_cont.ac.attrs = &i->end_dev_attrs[0]; i->end_dev_attr_cont.ac.match = sas_end_dev_match; transport_container_register(&i->end_dev_attr_cont); i->expander_attr_cont.ac.class = &sas_expander_class.class; i->expander_attr_cont.ac.attrs = &i->expander_attrs[0]; i->expander_attr_cont.ac.match = sas_expander_match; transport_container_register(&i->expander_attr_cont); i->f = ft; count = 0; SETUP_PHY_ATTRIBUTE(initiator_port_protocols); SETUP_PHY_ATTRIBUTE(target_port_protocols); SETUP_PHY_ATTRIBUTE(device_type); SETUP_PHY_ATTRIBUTE(sas_address); SETUP_PHY_ATTRIBUTE(phy_identifier); SETUP_PHY_ATTRIBUTE(negotiated_linkrate); SETUP_PHY_ATTRIBUTE(minimum_linkrate_hw); SETUP_PHY_ATTRIBUTE_RW(minimum_linkrate); SETUP_PHY_ATTRIBUTE(maximum_linkrate_hw); SETUP_PHY_ATTRIBUTE_RW(maximum_linkrate); SETUP_PHY_ATTRIBUTE(invalid_dword_count); SETUP_PHY_ATTRIBUTE(running_disparity_error_count); SETUP_PHY_ATTRIBUTE(loss_of_dword_sync_count); SETUP_PHY_ATTRIBUTE(phy_reset_problem_count); SETUP_OPTIONAL_PHY_ATTRIBUTE_WRONLY(link_reset, phy_reset); SETUP_OPTIONAL_PHY_ATTRIBUTE_WRONLY(hard_reset, phy_reset); SETUP_OPTIONAL_PHY_ATTRIBUTE_RW(enable, phy_enable); i->phy_attrs[count] = NULL; count = 0; SETUP_PORT_ATTRIBUTE(num_phys); i->port_attrs[count] = NULL; count = 0; SETUP_RPORT_ATTRIBUTE(rphy_initiator_port_protocols); SETUP_RPORT_ATTRIBUTE(rphy_target_port_protocols); SETUP_RPORT_ATTRIBUTE(rphy_device_type); SETUP_RPORT_ATTRIBUTE(rphy_sas_address); SETUP_RPORT_ATTRIBUTE(rphy_phy_identifier); SETUP_RPORT_ATTRIBUTE(rphy_scsi_target_id); SETUP_OPTIONAL_RPORT_ATTRIBUTE(rphy_enclosure_identifier, 
				       get_enclosure_identifier);
	SETUP_OPTIONAL_RPORT_ATTRIBUTE(rphy_bay_identifier,
				       get_bay_identifier);
	i->rphy_attrs[count] = NULL;

	count = 0;
	SETUP_END_DEV_ATTRIBUTE(end_dev_ready_led_meaning);
	SETUP_END_DEV_ATTRIBUTE(end_dev_I_T_nexus_loss_timeout);
	SETUP_END_DEV_ATTRIBUTE(end_dev_initiator_response_timeout);
	SETUP_END_DEV_ATTRIBUTE(end_dev_tlr_supported);
	SETUP_END_DEV_ATTRIBUTE(end_dev_tlr_enabled);
	i->end_dev_attrs[count] = NULL;

	count = 0;
	SETUP_EXPANDER_ATTRIBUTE(vendor_id);
	SETUP_EXPANDER_ATTRIBUTE(product_id);
	SETUP_EXPANDER_ATTRIBUTE(product_rev);
	SETUP_EXPANDER_ATTRIBUTE(component_vendor_id);
	SETUP_EXPANDER_ATTRIBUTE(component_id);
	SETUP_EXPANDER_ATTRIBUTE(component_revision_id);
	SETUP_EXPANDER_ATTRIBUTE(level);
	i->expander_attrs[count] = NULL;

	return &i->t;
}
EXPORT_SYMBOL(sas_attach_transport);

/**
 * sas_release_transport - release SAS transport template instance
 * @t: transport template instance
 */
void sas_release_transport(struct scsi_transport_template *t)
{
	struct sas_internal *i = to_sas_internal(t);

	transport_container_unregister(&i->t.host_attrs);
	transport_container_unregister(&i->phy_attr_cont);
	transport_container_unregister(&i->port_attr_cont);
	transport_container_unregister(&i->rphy_attr_cont);
	transport_container_unregister(&i->end_dev_attr_cont);
	transport_container_unregister(&i->expander_attr_cont);

	kfree(i);
}
EXPORT_SYMBOL(sas_release_transport);

static __init int sas_transport_init(void)
{
	int error;

	error = transport_class_register(&sas_host_class);
	if (error)
		goto out;
	error = transport_class_register(&sas_phy_class);
	if (error)
		goto out_unregister_transport;
	error = transport_class_register(&sas_port_class);
	if (error)
		goto out_unregister_phy;
	error = transport_class_register(&sas_rphy_class);
	if (error)
		goto out_unregister_port;
	error = transport_class_register(&sas_end_dev_class);
	if (error)
		goto out_unregister_rphy;
	error = transport_class_register(&sas_expander_class);
	if (error)
		goto out_unregister_end_dev;

	return 0;

 out_unregister_end_dev:
	transport_class_unregister(&sas_end_dev_class);
 out_unregister_rphy:
	transport_class_unregister(&sas_rphy_class);
 out_unregister_port:
	transport_class_unregister(&sas_port_class);
 out_unregister_phy:
	transport_class_unregister(&sas_phy_class);
 out_unregister_transport:
	transport_class_unregister(&sas_host_class);
 out:
	return error;
}

static void __exit sas_transport_exit(void)
{
	transport_class_unregister(&sas_host_class);
	transport_class_unregister(&sas_phy_class);
	transport_class_unregister(&sas_port_class);
	transport_class_unregister(&sas_rphy_class);
	transport_class_unregister(&sas_end_dev_class);
	transport_class_unregister(&sas_expander_class);
}

MODULE_AUTHOR("Christoph Hellwig");
MODULE_DESCRIPTION("SAS Transport Attributes");
MODULE_LICENSE("GPL");

module_init(sas_transport_init);
module_exit(sas_transport_exit);
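/*
 * Illustrative sketch (not from the original file): one way an LLDD could
 * use the interfaces above to publish a narrow port with a single attached
 * SSP end device.  example_attach_end_device() and its parameters are
 * hypothetical; @phy is assumed to have been created and added beforehand
 * with sas_phy_alloc()/sas_phy_add() from <scsi/scsi_transport_sas.h>.
 * The error paths follow the rules stated in the kernel-doc above:
 * sas_port_free() only before a successful sas_port_add(), and
 * sas_rphy_free() only before a successful sas_rphy_add().
 */
static int example_attach_end_device(struct Scsi_Host *shost,
				     struct sas_phy *phy, u64 sas_address)
{
	struct sas_port *port;
	struct sas_rphy *rphy;
	int error;

	/* Let the sas class pick the port number below the host. */
	port = sas_port_alloc_num(&shost->shost_gendev);
	if (!port)
		return -ENOMEM;

	/* Phys may only be added once the port is visible. */
	error = sas_port_add(port);
	if (error) {
		sas_port_free(port);
		return error;
	}
	sas_port_add_phy(port, phy);	/* a second phy would form a wide port */

	rphy = sas_end_device_alloc(port);
	if (!rphy) {
		error = -ENOMEM;
		goto delete_port;
	}
	rphy->identify.sas_address = sas_address;
	rphy->identify.target_port_protocols = SAS_PROTOCOL_SSP;

	error = sas_rphy_add(rphy);	/* triggers the SCSI target scan */
	if (error) {
		sas_rphy_unlink(rphy);	/* drop port->rphy before freeing */
		sas_rphy_free(rphy);
		goto delete_port;
	}
	return 0;

delete_port:
	sas_port_delete(port);
	return error;
}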
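/*
 * For reference, this is approximately what a single attribute line in the
 * SAS transport code above, such as
 *
 *	sas_rphy_simple_attr(identify.phy_identifier, phy_identifier,
 *			     "%d\n", u8);
 *
 * expands to, given the sas_rphy_show_simple()/sas_rphy_simple_attr()
 * macro definitions shown earlier:
 */
static ssize_t show_sas_rphy_phy_identifier(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct sas_rphy *rphy = transport_class_to_rphy(dev);

	return snprintf(buf, 20, "%d\n", (u8) rphy->identify.phy_identifier);
}
static SAS_DEVICE_ATTR(rphy, phy_identifier, S_IRUGO,
		       show_sas_rphy_phy_identifier, NULL);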
// SPDX-License-Identifier: GPL-2.0
/*
 * security/tomoyo/condition.c
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#include "common.h"
#include <linux/slab.h>

/* List of "struct tomoyo_condition". */
LIST_HEAD(tomoyo_condition_list);

/**
 * tomoyo_argv - Check argv[] in "struct linux_binprm".
 *
 * @index:   Index number of @arg_ptr.
 * @arg_ptr: Contents of argv[@index].
 * @argc:    Length of @argv.
 * @argv:    Pointer to "struct tomoyo_argv".
 * @checked: Set to true if @argv[@index] was found.
 *
 * Returns true on success, false otherwise.
 */
static bool tomoyo_argv(const unsigned int index, const char *arg_ptr,
			const int argc, const struct tomoyo_argv *argv,
			u8 *checked)
{
	int i;
	struct tomoyo_path_info arg;

	arg.name = arg_ptr;
	for (i = 0; i < argc; argv++, checked++, i++) {
		bool result;

		if (index != argv->index)
			continue;
		*checked = 1;
		tomoyo_fill_path_info(&arg);
		result = tomoyo_path_matches_pattern(&arg, argv->value);
		if (argv->is_not)
			result = !result;
		if (!result)
			return false;
	}
	return true;
}

/**
 * tomoyo_envp - Check envp[] in "struct linux_binprm".
 *
 * @env_name:  The name of environment variable.
 * @env_value: The value of environment variable.
 * @envc:      Length of @envp.
 * @envp:      Pointer to "struct tomoyo_envp".
 * @checked:   Set to true if @envp[@env_name] was found.
 *
 * Returns true on success, false otherwise.
 */
static bool tomoyo_envp(const char *env_name, const char *env_value,
			const int envc, const struct tomoyo_envp *envp,
			u8 *checked)
{
	int i;
	struct tomoyo_path_info name;
	struct tomoyo_path_info value;

	name.name = env_name;
	tomoyo_fill_path_info(&name);
	value.name = env_value;
	tomoyo_fill_path_info(&value);
	for (i = 0; i < envc; envp++, checked++, i++) {
		bool result;

		if (!tomoyo_path_matches_pattern(&name, envp->name))
			continue;
		*checked = 1;
		if (envp->value) {
			result = tomoyo_path_matches_pattern(&value,
							     envp->value);
			if (envp->is_not)
				result = !result;
		} else {
			result = true;
			if (!envp->is_not)
				result = !result;
		}
		if (!result)
			return false;
	}
	return true;
}

/**
 * tomoyo_scan_bprm - Scan "struct linux_binprm".
 *
 * @ee:   Pointer to "struct tomoyo_execve".
 * @argc: Length of @argv.
 * @argv: Pointer to "struct tomoyo_argv".
 * @envc: Length of @envp.
 * @envp: Pointer to "struct tomoyo_envp".
 *
 * Returns true on success, false otherwise.
*/ static bool tomoyo_scan_bprm(struct tomoyo_execve *ee, const u16 argc, const struct tomoyo_argv *argv, const u16 envc, const struct tomoyo_envp *envp) { struct linux_binprm *bprm = ee->bprm; struct tomoyo_page_dump *dump = &ee->dump; char *arg_ptr = ee->tmp; int arg_len = 0; unsigned long pos = bprm->p; int offset = pos % PAGE_SIZE; int argv_count = bprm->argc; int envp_count = bprm->envc; bool result = true; u8 local_checked[32]; u8 *checked; if (argc + envc <= sizeof(local_checked)) { checked = local_checked; memset(local_checked, 0, sizeof(local_checked)); } else { checked = kzalloc(argc + envc, GFP_NOFS); if (!checked) return false; } while (argv_count || envp_count) { if (!tomoyo_dump_page(bprm, pos, dump)) { result = false; goto out; } pos += PAGE_SIZE - offset; while (offset < PAGE_SIZE) { /* Read. */ const char *kaddr = dump->data; const unsigned char c = kaddr[offset++]; if (c && arg_len < TOMOYO_EXEC_TMPSIZE - 10) { if (c == '\\') { arg_ptr[arg_len++] = '\\'; arg_ptr[arg_len++] = '\\'; } else if (c > ' ' && c < 127) { arg_ptr[arg_len++] = c; } else { arg_ptr[arg_len++] = '\\'; arg_ptr[arg_len++] = (c >> 6) + '0'; arg_ptr[arg_len++] = ((c >> 3) & 7) + '0'; arg_ptr[arg_len++] = (c & 7) + '0'; } } else { arg_ptr[arg_len] = '\0'; } if (c) continue; /* Check. */ if (argv_count) { if (!tomoyo_argv(bprm->argc - argv_count, arg_ptr, argc, argv, checked)) { result = false; break; } argv_count--; } else if (envp_count) { char *cp = strchr(arg_ptr, '='); if (cp) { *cp = '\0'; if (!tomoyo_envp(arg_ptr, cp + 1, envc, envp, checked + argc)) { result = false; break; } } envp_count--; } else { break; } arg_len = 0; } offset = 0; if (!result) break; } out: if (result) { int i; /* Check not-yet-checked entries. */ for (i = 0; i < argc; i++) { if (checked[i]) continue; /* * Return true only if all unchecked indexes in * bprm->argv[] are not matched. */ if (argv[i].is_not) continue; result = false; break; } for (i = 0; i < envc; envp++, i++) { if (checked[argc + i]) continue; /* * Return true only if all unchecked environ variables * in bprm->envp[] are either undefined or not matched. */ if ((!envp->value && !envp->is_not) || (envp->value && envp->is_not)) continue; result = false; break; } } if (checked != local_checked) kfree(checked); return result; } /** * tomoyo_scan_exec_realpath - Check "exec.realpath" parameter of "struct tomoyo_condition". * * @file: Pointer to "struct file". * @ptr: Pointer to "struct tomoyo_name_union". * @match: True if "exec.realpath=", false if "exec.realpath!=". * * Returns true on success, false otherwise. */ static bool tomoyo_scan_exec_realpath(struct file *file, const struct tomoyo_name_union *ptr, const bool match) { bool result; struct tomoyo_path_info exe; if (!file) return false; exe.name = tomoyo_realpath_from_path(&file->f_path); if (!exe.name) return false; tomoyo_fill_path_info(&exe); result = tomoyo_compare_name_union(&exe, ptr); kfree(exe.name); return result == match; } /** * tomoyo_get_dqword - tomoyo_get_name() for a quoted string. * * @start: String to save. * * Returns pointer to "struct tomoyo_path_info" on success, NULL otherwise. */ static const struct tomoyo_path_info *tomoyo_get_dqword(char *start) { char *cp = start + strlen(start) - 1; if (cp == start || *start++ != '"' || *cp != '"') return NULL; *cp = '\0'; if (*start && !tomoyo_correct_word(start)) return NULL; return tomoyo_get_name(start); } /** * tomoyo_parse_name_union_quoted - Parse a quoted word. * * @param: Pointer to "struct tomoyo_acl_param". 
 * @ptr:   Pointer to "struct tomoyo_name_union".
 *
 * Returns true on success, false otherwise.
 */
static bool tomoyo_parse_name_union_quoted(struct tomoyo_acl_param *param,
					   struct tomoyo_name_union *ptr)
{
	char *filename = param->data;

	if (*filename == '@')
		return tomoyo_parse_name_union(param, ptr);
	ptr->filename = tomoyo_get_dqword(filename);
	return ptr->filename != NULL;
}

/**
 * tomoyo_parse_argv - Parse an argv[] condition part.
 *
 * @left:  Lefthand value.
 * @right: Righthand value.
 * @argv:  Pointer to "struct tomoyo_argv".
 *
 * Returns true on success, false otherwise.
 */
static bool tomoyo_parse_argv(char *left, char *right,
			      struct tomoyo_argv *argv)
{
	if (tomoyo_parse_ulong(&argv->index, &left) !=
	    TOMOYO_VALUE_TYPE_DECIMAL || *left++ != ']' || *left)
		return false;
	argv->value = tomoyo_get_dqword(right);
	return argv->value != NULL;
}

/**
 * tomoyo_parse_envp - Parse an envp[] condition part.
 *
 * @left:  Lefthand value.
 * @right: Righthand value.
 * @envp:  Pointer to "struct tomoyo_envp".
 *
 * Returns true on success, false otherwise.
 */
static bool tomoyo_parse_envp(char *left, char *right,
			      struct tomoyo_envp *envp)
{
	const struct tomoyo_path_info *name;
	const struct tomoyo_path_info *value;
	char *cp = left + strlen(left) - 1;

	if (*cp-- != ']' || *cp != '"')
		goto out;
	*cp = '\0';
	if (!tomoyo_correct_word(left))
		goto out;
	name = tomoyo_get_name(left);
	if (!name)
		goto out;
	if (!strcmp(right, "NULL")) {
		value = NULL;
	} else {
		value = tomoyo_get_dqword(right);
		if (!value) {
			tomoyo_put_name(name);
			goto out;
		}
	}
	envp->name = name;
	envp->value = value;
	return true;
out:
	return false;
}

/**
 * tomoyo_same_condition - Check for duplicated "struct tomoyo_condition" entry.
 *
 * @a: Pointer to "struct tomoyo_condition".
 * @b: Pointer to "struct tomoyo_condition".
 *
 * Returns true if @a == @b, false otherwise.
 */
static inline bool tomoyo_same_condition(const struct tomoyo_condition *a,
					 const struct tomoyo_condition *b)
{
	return a->size == b->size && a->condc == b->condc &&
		a->numbers_count == b->numbers_count &&
		a->names_count == b->names_count &&
		a->argc == b->argc && a->envc == b->envc &&
		a->grant_log == b->grant_log && a->transit == b->transit &&
		!memcmp(a + 1, b + 1, a->size - sizeof(*a));
}

/**
 * tomoyo_condition_type - Get condition type.
 *
 * @word: Keyword string.
 *
 * Returns one of the values in "enum tomoyo_conditions_index" on success,
 * TOMOYO_MAX_CONDITION_KEYWORD otherwise.
 */
static u8 tomoyo_condition_type(const char *word)
{
	u8 i;

	for (i = 0; i < TOMOYO_MAX_CONDITION_KEYWORD; i++) {
		if (!strcmp(word, tomoyo_condition_keyword[i]))
			break;
	}
	return i;
}

/* Define this to enable debug mode. */
/* #define DEBUG_CONDITION */

#ifdef DEBUG_CONDITION
#define dprintk printk
#else
#define dprintk(...) do { } while (0)
#endif

/**
 * tomoyo_commit_condition - Commit "struct tomoyo_condition".
 *
 * @entry: Pointer to "struct tomoyo_condition".
 *
 * Returns pointer to "struct tomoyo_condition" on success, NULL otherwise.
 *
 * This function merges duplicated entries.  This function returns NULL if
 * @entry is not duplicated but the memory quota for policy has been exceeded.
*/ static struct tomoyo_condition *tomoyo_commit_condition (struct tomoyo_condition *entry) { struct tomoyo_condition *ptr; bool found = false; if (mutex_lock_interruptible(&tomoyo_policy_lock)) { dprintk(KERN_WARNING "%u: %s failed\n", __LINE__, __func__); ptr = NULL; found = true; goto out; } list_for_each_entry(ptr, &tomoyo_condition_list, head.list) { if (!tomoyo_same_condition(ptr, entry) || atomic_read(&ptr->head.users) == TOMOYO_GC_IN_PROGRESS) continue; /* Same entry found. Share this entry. */ atomic_inc(&ptr->head.users); found = true; break; } if (!found) { if (tomoyo_memory_ok(entry)) { atomic_set(&entry->head.users, 1); list_add(&entry->head.list, &tomoyo_condition_list); } else { found = true; ptr = NULL; } } mutex_unlock(&tomoyo_policy_lock); out: if (found) { tomoyo_del_condition(&entry->head.list); kfree(entry); entry = ptr; } return entry; } /** * tomoyo_get_transit_preference - Parse domain transition preference for execve(). * * @param: Pointer to "struct tomoyo_acl_param". * @e: Pointer to "struct tomoyo_condition". * * Returns the condition string part. */ static char *tomoyo_get_transit_preference(struct tomoyo_acl_param *param, struct tomoyo_condition *e) { char * const pos = param->data; bool flag; if (*pos == '<') { e->transit = tomoyo_get_domainname(param); goto done; } { char *cp = strchr(pos, ' '); if (cp) *cp = '\0'; flag = tomoyo_correct_path(pos) || !strcmp(pos, "keep") || !strcmp(pos, "initialize") || !strcmp(pos, "reset") || !strcmp(pos, "child") || !strcmp(pos, "parent"); if (cp) *cp = ' '; } if (!flag) return pos; e->transit = tomoyo_get_name(tomoyo_read_token(param)); done: if (e->transit) return param->data; /* * Return a bad read-only condition string that will let * tomoyo_get_condition() return NULL. */ return "/"; } /** * tomoyo_get_condition - Parse condition part. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns pointer to "struct tomoyo_condition" on success, NULL otherwise. */ struct tomoyo_condition *tomoyo_get_condition(struct tomoyo_acl_param *param) { struct tomoyo_condition *entry = NULL; struct tomoyo_condition_element *condp = NULL; struct tomoyo_number_union *numbers_p = NULL; struct tomoyo_name_union *names_p = NULL; struct tomoyo_argv *argv = NULL; struct tomoyo_envp *envp = NULL; struct tomoyo_condition e = { }; char * const start_of_string = tomoyo_get_transit_preference(param, &e); char * const end_of_string = start_of_string + strlen(start_of_string); char *pos; rerun: pos = start_of_string; while (1) { u8 left = -1; u8 right = -1; char *left_word = pos; char *cp; char *right_word; bool is_not; if (!*left_word) break; /* * Since left-hand condition does not allow use of "path_group" * or "number_group" and environment variable's names do not * accept '=', it is guaranteed that the original line consists * of one or more repetition of $left$operator$right blocks * where "$left is free from '=' and ' '" and "$operator is * either '=' or '!='" and "$right is free from ' '". * Therefore, we can reconstruct the original line at the end * of dry run even if we overwrite $operator with '\0'. */ cp = strchr(pos, ' '); if (cp) { *cp = '\0'; /* Will restore later. */ pos = cp + 1; } else { pos = ""; } right_word = strchr(left_word, '='); if (!right_word || right_word == left_word) goto out; is_not = *(right_word - 1) == '!'; if (is_not) *(right_word++ - 1) = '\0'; /* Will restore later. */ else if (*(right_word + 1) != '=') *right_word++ = '\0'; /* Will restore later. 
*/ else goto out; dprintk(KERN_WARNING "%u: <%s>%s=<%s>\n", __LINE__, left_word, is_not ? "!" : "", right_word); if (!strcmp(left_word, "grant_log")) { if (entry) { if (is_not || entry->grant_log != TOMOYO_GRANTLOG_AUTO) goto out; else if (!strcmp(right_word, "yes")) entry->grant_log = TOMOYO_GRANTLOG_YES; else if (!strcmp(right_word, "no")) entry->grant_log = TOMOYO_GRANTLOG_NO; else goto out; } continue; } if (!strncmp(left_word, "exec.argv[", 10)) { if (!argv) { e.argc++; e.condc++; } else { e.argc--; e.condc--; left = TOMOYO_ARGV_ENTRY; argv->is_not = is_not; if (!tomoyo_parse_argv(left_word + 10, right_word, argv++)) goto out; } goto store_value; } if (!strncmp(left_word, "exec.envp[\"", 11)) { if (!envp) { e.envc++; e.condc++; } else { e.envc--; e.condc--; left = TOMOYO_ENVP_ENTRY; envp->is_not = is_not; if (!tomoyo_parse_envp(left_word + 11, right_word, envp++)) goto out; } goto store_value; } left = tomoyo_condition_type(left_word); dprintk(KERN_WARNING "%u: <%s> left=%u\n", __LINE__, left_word, left); if (left == TOMOYO_MAX_CONDITION_KEYWORD) { if (!numbers_p) { e.numbers_count++; } else { e.numbers_count--; left = TOMOYO_NUMBER_UNION; param->data = left_word; if (*left_word == '@' || !tomoyo_parse_number_union(param, numbers_p++)) goto out; } } if (!condp) e.condc++; else e.condc--; if (left == TOMOYO_EXEC_REALPATH || left == TOMOYO_SYMLINK_TARGET) { if (!names_p) { e.names_count++; } else { e.names_count--; right = TOMOYO_NAME_UNION; param->data = right_word; if (!tomoyo_parse_name_union_quoted(param, names_p++)) goto out; } goto store_value; } right = tomoyo_condition_type(right_word); if (right == TOMOYO_MAX_CONDITION_KEYWORD) { if (!numbers_p) { e.numbers_count++; } else { e.numbers_count--; right = TOMOYO_NUMBER_UNION; param->data = right_word; if (!tomoyo_parse_number_union(param, numbers_p++)) goto out; } } store_value: if (!condp) { dprintk(KERN_WARNING "%u: dry_run left=%u right=%u match=%u\n", __LINE__, left, right, !is_not); continue; } condp->left = left; condp->right = right; condp->equals = !is_not; dprintk(KERN_WARNING "%u: left=%u right=%u match=%u\n", __LINE__, condp->left, condp->right, condp->equals); condp++; } dprintk(KERN_INFO "%u: cond=%u numbers=%u names=%u ac=%u ec=%u\n", __LINE__, e.condc, e.numbers_count, e.names_count, e.argc, e.envc); if (entry) { BUG_ON(e.names_count | e.numbers_count | e.argc | e.envc | e.condc); return tomoyo_commit_condition(entry); } e.size = sizeof(*entry) + e.condc * sizeof(struct tomoyo_condition_element) + e.numbers_count * sizeof(struct tomoyo_number_union) + e.names_count * sizeof(struct tomoyo_name_union) + e.argc * sizeof(struct tomoyo_argv) + e.envc * sizeof(struct tomoyo_envp); entry = kzalloc(e.size, GFP_NOFS); if (!entry) goto out2; *entry = e; e.transit = NULL; condp = (struct tomoyo_condition_element *) (entry + 1); numbers_p = (struct tomoyo_number_union *) (condp + e.condc); names_p = (struct tomoyo_name_union *) (numbers_p + e.numbers_count); argv = (struct tomoyo_argv *) (names_p + e.names_count); envp = (struct tomoyo_envp *) (argv + e.argc); { bool flag = false; for (pos = start_of_string; pos < end_of_string; pos++) { if (*pos) continue; if (flag) /* Restore " ". */ *pos = ' '; else if (*(pos + 1) == '=') /* Restore "!=". */ *pos = '!'; else /* Restore "=". 
*/ *pos = '='; flag = !flag; } } goto rerun; out: dprintk(KERN_WARNING "%u: %s failed\n", __LINE__, __func__); if (entry) { tomoyo_del_condition(&entry->head.list); kfree(entry); } out2: tomoyo_put_name(e.transit); return NULL; } /** * tomoyo_get_attributes - Revalidate "struct inode". * * @obj: Pointer to "struct tomoyo_obj_info". * * Returns nothing. */ void tomoyo_get_attributes(struct tomoyo_obj_info *obj) { u8 i; struct dentry *dentry = NULL; for (i = 0; i < TOMOYO_MAX_PATH_STAT; i++) { struct inode *inode; switch (i) { case TOMOYO_PATH1: dentry = obj->path1.dentry; if (!dentry) continue; break; case TOMOYO_PATH2: dentry = obj->path2.dentry; if (!dentry) continue; break; default: if (!dentry) continue; dentry = dget_parent(dentry); break; } inode = d_backing_inode(dentry); if (inode) { struct tomoyo_mini_stat *stat = &obj->stat[i]; stat->uid = inode->i_uid; stat->gid = inode->i_gid; stat->ino = inode->i_ino; stat->mode = inode->i_mode; stat->dev = inode->i_sb->s_dev; stat->rdev = inode->i_rdev; obj->stat_valid[i] = true; } if (i & 1) /* TOMOYO_PATH1_PARENT or TOMOYO_PATH2_PARENT */ dput(dentry); } } /** * tomoyo_condition - Check condition part. * * @r: Pointer to "struct tomoyo_request_info". * @cond: Pointer to "struct tomoyo_condition". Maybe NULL. * * Returns true on success, false otherwise. * * Caller holds tomoyo_read_lock(). */ bool tomoyo_condition(struct tomoyo_request_info *r, const struct tomoyo_condition *cond) { u32 i; unsigned long min_v[2] = { 0, 0 }; unsigned long max_v[2] = { 0, 0 }; const struct tomoyo_condition_element *condp; const struct tomoyo_number_union *numbers_p; const struct tomoyo_name_union *names_p; const struct tomoyo_argv *argv; const struct tomoyo_envp *envp; struct tomoyo_obj_info *obj; u16 condc; u16 argc; u16 envc; struct linux_binprm *bprm = NULL; if (!cond) return true; condc = cond->condc; argc = cond->argc; envc = cond->envc; obj = r->obj; if (r->ee) bprm = r->ee->bprm; if (!bprm && (argc || envc)) return false; condp = (struct tomoyo_condition_element *) (cond + 1); numbers_p = (const struct tomoyo_number_union *) (condp + condc); names_p = (const struct tomoyo_name_union *) (numbers_p + cond->numbers_count); argv = (const struct tomoyo_argv *) (names_p + cond->names_count); envp = (const struct tomoyo_envp *) (argv + argc); for (i = 0; i < condc; i++) { const bool match = condp->equals; const u8 left = condp->left; const u8 right = condp->right; bool is_bitop[2] = { false, false }; u8 j; condp++; /* Check argv[] and envp[] later. */ if (left == TOMOYO_ARGV_ENTRY || left == TOMOYO_ENVP_ENTRY) continue; /* Check string expressions. */ if (right == TOMOYO_NAME_UNION) { const struct tomoyo_name_union *ptr = names_p++; struct tomoyo_path_info *symlink; struct tomoyo_execve *ee; struct file *file; switch (left) { case TOMOYO_SYMLINK_TARGET: symlink = obj ? obj->symlink_target : NULL; if (!symlink || !tomoyo_compare_name_union(symlink, ptr) == match) goto out; break; case TOMOYO_EXEC_REALPATH: ee = r->ee; file = ee ? ee->bprm->file : NULL; if (!tomoyo_scan_exec_realpath(file, ptr, match)) goto out; break; } continue; } /* Check numeric or bit-op expressions. */ for (j = 0; j < 2; j++) { const u8 index = j ? 
right : left; unsigned long value = 0; switch (index) { case TOMOYO_TASK_UID: value = from_kuid(&init_user_ns, current_uid()); break; case TOMOYO_TASK_EUID: value = from_kuid(&init_user_ns, current_euid()); break; case TOMOYO_TASK_SUID: value = from_kuid(&init_user_ns, current_suid()); break; case TOMOYO_TASK_FSUID: value = from_kuid(&init_user_ns, current_fsuid()); break; case TOMOYO_TASK_GID: value = from_kgid(&init_user_ns, current_gid()); break; case TOMOYO_TASK_EGID: value = from_kgid(&init_user_ns, current_egid()); break; case TOMOYO_TASK_SGID: value = from_kgid(&init_user_ns, current_sgid()); break; case TOMOYO_TASK_FSGID: value = from_kgid(&init_user_ns, current_fsgid()); break; case TOMOYO_TASK_PID: value = tomoyo_sys_getpid(); break; case TOMOYO_TASK_PPID: value = tomoyo_sys_getppid(); break; case TOMOYO_TYPE_IS_SOCKET: value = S_IFSOCK; break; case TOMOYO_TYPE_IS_SYMLINK: value = S_IFLNK; break; case TOMOYO_TYPE_IS_FILE: value = S_IFREG; break; case TOMOYO_TYPE_IS_BLOCK_DEV: value = S_IFBLK; break; case TOMOYO_TYPE_IS_DIRECTORY: value = S_IFDIR; break; case TOMOYO_TYPE_IS_CHAR_DEV: value = S_IFCHR; break; case TOMOYO_TYPE_IS_FIFO: value = S_IFIFO; break; case TOMOYO_MODE_SETUID: value = S_ISUID; break; case TOMOYO_MODE_SETGID: value = S_ISGID; break; case TOMOYO_MODE_STICKY: value = S_ISVTX; break; case TOMOYO_MODE_OWNER_READ: value = 0400; break; case TOMOYO_MODE_OWNER_WRITE: value = 0200; break; case TOMOYO_MODE_OWNER_EXECUTE: value = 0100; break; case TOMOYO_MODE_GROUP_READ: value = 0040; break; case TOMOYO_MODE_GROUP_WRITE: value = 0020; break; case TOMOYO_MODE_GROUP_EXECUTE: value = 0010; break; case TOMOYO_MODE_OTHERS_READ: value = 0004; break; case TOMOYO_MODE_OTHERS_WRITE: value = 0002; break; case TOMOYO_MODE_OTHERS_EXECUTE: value = 0001; break; case TOMOYO_EXEC_ARGC: if (!bprm) goto out; value = bprm->argc; break; case TOMOYO_EXEC_ENVC: if (!bprm) goto out; value = bprm->envc; break; case TOMOYO_NUMBER_UNION: /* Fetch values later. 
*/ break; default: if (!obj) goto out; if (!obj->validate_done) { tomoyo_get_attributes(obj); obj->validate_done = true; } { u8 stat_index; struct tomoyo_mini_stat *stat; switch (index) { case TOMOYO_PATH1_UID: case TOMOYO_PATH1_GID: case TOMOYO_PATH1_INO: case TOMOYO_PATH1_MAJOR: case TOMOYO_PATH1_MINOR: case TOMOYO_PATH1_TYPE: case TOMOYO_PATH1_DEV_MAJOR: case TOMOYO_PATH1_DEV_MINOR: case TOMOYO_PATH1_PERM: stat_index = TOMOYO_PATH1; break; case TOMOYO_PATH2_UID: case TOMOYO_PATH2_GID: case TOMOYO_PATH2_INO: case TOMOYO_PATH2_MAJOR: case TOMOYO_PATH2_MINOR: case TOMOYO_PATH2_TYPE: case TOMOYO_PATH2_DEV_MAJOR: case TOMOYO_PATH2_DEV_MINOR: case TOMOYO_PATH2_PERM: stat_index = TOMOYO_PATH2; break; case TOMOYO_PATH1_PARENT_UID: case TOMOYO_PATH1_PARENT_GID: case TOMOYO_PATH1_PARENT_INO: case TOMOYO_PATH1_PARENT_PERM: stat_index = TOMOYO_PATH1_PARENT; break; case TOMOYO_PATH2_PARENT_UID: case TOMOYO_PATH2_PARENT_GID: case TOMOYO_PATH2_PARENT_INO: case TOMOYO_PATH2_PARENT_PERM: stat_index = TOMOYO_PATH2_PARENT; break; default: goto out; } if (!obj->stat_valid[stat_index]) goto out; stat = &obj->stat[stat_index]; switch (index) { case TOMOYO_PATH1_UID: case TOMOYO_PATH2_UID: case TOMOYO_PATH1_PARENT_UID: case TOMOYO_PATH2_PARENT_UID: value = from_kuid(&init_user_ns, stat->uid); break; case TOMOYO_PATH1_GID: case TOMOYO_PATH2_GID: case TOMOYO_PATH1_PARENT_GID: case TOMOYO_PATH2_PARENT_GID: value = from_kgid(&init_user_ns, stat->gid); break; case TOMOYO_PATH1_INO: case TOMOYO_PATH2_INO: case TOMOYO_PATH1_PARENT_INO: case TOMOYO_PATH2_PARENT_INO: value = stat->ino; break; case TOMOYO_PATH1_MAJOR: case TOMOYO_PATH2_MAJOR: value = MAJOR(stat->dev); break; case TOMOYO_PATH1_MINOR: case TOMOYO_PATH2_MINOR: value = MINOR(stat->dev); break; case TOMOYO_PATH1_TYPE: case TOMOYO_PATH2_TYPE: value = stat->mode & S_IFMT; break; case TOMOYO_PATH1_DEV_MAJOR: case TOMOYO_PATH2_DEV_MAJOR: value = MAJOR(stat->rdev); break; case TOMOYO_PATH1_DEV_MINOR: case TOMOYO_PATH2_DEV_MINOR: value = MINOR(stat->rdev); break; case TOMOYO_PATH1_PERM: case TOMOYO_PATH2_PERM: case TOMOYO_PATH1_PARENT_PERM: case TOMOYO_PATH2_PARENT_PERM: value = stat->mode & S_IALLUGO; break; } } break; } max_v[j] = value; min_v[j] = value; switch (index) { case TOMOYO_MODE_SETUID: case TOMOYO_MODE_SETGID: case TOMOYO_MODE_STICKY: case TOMOYO_MODE_OWNER_READ: case TOMOYO_MODE_OWNER_WRITE: case TOMOYO_MODE_OWNER_EXECUTE: case TOMOYO_MODE_GROUP_READ: case TOMOYO_MODE_GROUP_WRITE: case TOMOYO_MODE_GROUP_EXECUTE: case TOMOYO_MODE_OTHERS_READ: case TOMOYO_MODE_OTHERS_WRITE: case TOMOYO_MODE_OTHERS_EXECUTE: is_bitop[j] = true; } } if (left == TOMOYO_NUMBER_UNION) { /* Fetch values now. */ const struct tomoyo_number_union *ptr = numbers_p++; min_v[0] = ptr->values[0]; max_v[0] = ptr->values[1]; } if (right == TOMOYO_NUMBER_UNION) { /* Fetch values now. */ const struct tomoyo_number_union *ptr = numbers_p++; if (ptr->group) { if (tomoyo_number_matches_group(min_v[0], max_v[0], ptr->group) == match) continue; } else { if ((min_v[0] <= ptr->values[1] && max_v[0] >= ptr->values[0]) == match) continue; } goto out; } /* * Bit operation is valid only when counterpart value * represents permission. 
 */
		if (is_bitop[0] && is_bitop[1]) {
			goto out;
		} else if (is_bitop[0]) {
			switch (right) {
			case TOMOYO_PATH1_PERM:
			case TOMOYO_PATH1_PARENT_PERM:
			case TOMOYO_PATH2_PERM:
			case TOMOYO_PATH2_PARENT_PERM:
				if (!(max_v[0] & max_v[1]) == !match)
					continue;
			}
			goto out;
		} else if (is_bitop[1]) {
			switch (left) {
			case TOMOYO_PATH1_PERM:
			case TOMOYO_PATH1_PARENT_PERM:
			case TOMOYO_PATH2_PERM:
			case TOMOYO_PATH2_PARENT_PERM:
				if (!(max_v[0] & max_v[1]) == !match)
					continue;
			}
			goto out;
		}
		/* Normal value range comparison. */
		if ((min_v[0] <= max_v[1] && max_v[0] >= min_v[1]) == match)
			continue;
out:
		return false;
	}
	/* Check argv[] and envp[] now. */
	if (r->ee && (argc || envc))
		return tomoyo_scan_bprm(r->ee, argc, argv, envc, envp);

	return true;
}
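/*
 * Illustrative sketch (not from the original file): how tomoyo_get_condition()
 * above lays out its single allocation.  For a hypothetical ACL line whose
 * condition part is
 *
 *	task.uid=0 exec.argc=1 exec.argv[0]="/bin/cat"
 *
 * the first (dry-run) pass counts condc=3, numbers_count=2 (the "0" and
 * the "1"), names_count=0, argc=1 and envc=0, so e.size covers one blob:
 *
 *	struct tomoyo_condition		(header)
 *	struct tomoyo_condition_element	[3]
 *	struct tomoyo_number_union	[2]
 *	struct tomoyo_argv		[1]
 *
 * The separators that were overwritten with '\0' during the dry run are then
 * restored, and the second pass ("rerun") fills these arrays in place before
 * tomoyo_commit_condition() deduplicates the result.
 */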
2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 | // SPDX-License-Identifier: GPL-2.0-or-later /* auditsc.c -- System-call auditing support * Handles all 
system-call specific auditing features. * * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. * Copyright 2005 Hewlett-Packard Development Company, L.P. * Copyright (C) 2005, 2006 IBM Corporation * All Rights Reserved. * * Written by Rickard E. (Rik) Faith <faith@redhat.com> * * Many of the ideas implemented here are from Stephen C. Tweedie, * especially the idea of avoiding a copy by using getname. * * The method for actual interception of syscall entry and exit (not in * this file -- see entry.S) is based on a GPL'd patch written by * okir@suse.de and Copyright 2003 SuSE Linux AG. * * POSIX message queue support added by George Wilson <ltcgcw@us.ibm.com>, * 2006. * * Support for the additional filter rule comparators (>, <, >=, <=) was * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005. * * Modified by Amy Griffis <amy.griffis@hp.com> to collect additional * filesystem information. * * Subject and object context labeling support added by <danjones@us.ibm.com> * and <dustin.kirkland@us.ibm.com> for LSPP certification compliance. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/init.h> #include <asm/types.h> #include <linux/atomic.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/mm.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/mount.h> #include <linux/socket.h> #include <linux/mqueue.h> #include <linux/audit.h> #include <linux/personality.h> #include <linux/time.h> #include <linux/netlink.h> #include <linux/compiler.h> #include <asm/unistd.h> #include <linux/security.h> #include <linux/list.h> #include <linux/binfmts.h> #include <linux/highmem.h> #include <linux/syscalls.h> #include <asm/syscall.h> #include <linux/capability.h> #include <linux/fs_struct.h> #include <linux/compat.h> #include <linux/ctype.h> #include <linux/string.h> #include <linux/uaccess.h> #include <linux/fsnotify_backend.h> #include <uapi/linux/limits.h> #include <uapi/linux/netfilter/nf_tables.h> #include <uapi/linux/openat2.h> // struct open_how #include <uapi/linux/fanotify.h> #include "audit.h" /* flags stating the success for a syscall */ #define AUDITSC_INVALID 0 #define AUDITSC_SUCCESS 1 #define AUDITSC_FAILURE 2 /* no execve audit message should be longer than this (userspace limits), * see the note near the top of audit_log_execve_info() about this value */ #define MAX_EXECVE_AUDIT_LEN 7500 /* max length to print of cmdline/proctitle value during audit */ #define MAX_PROCTITLE_AUDIT_LEN 128 /* number of audit rules */ int audit_n_rules; /* determines whether we collect data for signals sent */ int audit_signals; struct audit_aux_data { struct audit_aux_data *next; int type; }; /* Number of target pids per aux struct. 
*/ #define AUDIT_AUX_PIDS 16 struct audit_aux_data_pids { struct audit_aux_data d; pid_t target_pid[AUDIT_AUX_PIDS]; kuid_t target_auid[AUDIT_AUX_PIDS]; kuid_t target_uid[AUDIT_AUX_PIDS]; unsigned int target_sessionid[AUDIT_AUX_PIDS]; struct lsm_prop target_ref[AUDIT_AUX_PIDS]; char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN]; int pid_count; }; struct audit_aux_data_bprm_fcaps { struct audit_aux_data d; struct audit_cap_data fcap; unsigned int fcap_ver; struct audit_cap_data old_pcap; struct audit_cap_data new_pcap; }; struct audit_tree_refs { struct audit_tree_refs *next; struct audit_chunk *c[31]; }; struct audit_nfcfgop_tab { enum audit_nfcfgop op; const char *s; }; static const struct audit_nfcfgop_tab audit_nfcfgs[] = { { AUDIT_XT_OP_REGISTER, "xt_register" }, { AUDIT_XT_OP_REPLACE, "xt_replace" }, { AUDIT_XT_OP_UNREGISTER, "xt_unregister" }, { AUDIT_NFT_OP_TABLE_REGISTER, "nft_register_table" }, { AUDIT_NFT_OP_TABLE_UNREGISTER, "nft_unregister_table" }, { AUDIT_NFT_OP_CHAIN_REGISTER, "nft_register_chain" }, { AUDIT_NFT_OP_CHAIN_UNREGISTER, "nft_unregister_chain" }, { AUDIT_NFT_OP_RULE_REGISTER, "nft_register_rule" }, { AUDIT_NFT_OP_RULE_UNREGISTER, "nft_unregister_rule" }, { AUDIT_NFT_OP_SET_REGISTER, "nft_register_set" }, { AUDIT_NFT_OP_SET_UNREGISTER, "nft_unregister_set" }, { AUDIT_NFT_OP_SETELEM_REGISTER, "nft_register_setelem" }, { AUDIT_NFT_OP_SETELEM_UNREGISTER, "nft_unregister_setelem" }, { AUDIT_NFT_OP_GEN_REGISTER, "nft_register_gen" }, { AUDIT_NFT_OP_OBJ_REGISTER, "nft_register_obj" }, { AUDIT_NFT_OP_OBJ_UNREGISTER, "nft_unregister_obj" }, { AUDIT_NFT_OP_OBJ_RESET, "nft_reset_obj" }, { AUDIT_NFT_OP_FLOWTABLE_REGISTER, "nft_register_flowtable" }, { AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, "nft_unregister_flowtable" }, { AUDIT_NFT_OP_SETELEM_RESET, "nft_reset_setelem" }, { AUDIT_NFT_OP_RULE_RESET, "nft_reset_rule" }, { AUDIT_NFT_OP_INVALID, "nft_invalid" }, }; static int audit_match_perm(struct audit_context *ctx, int mask) { unsigned n; if (unlikely(!ctx)) return 0; n = ctx->major; switch (audit_classify_syscall(ctx->arch, n)) { case AUDITSC_NATIVE: if ((mask & AUDIT_PERM_WRITE) && audit_match_class(AUDIT_CLASS_WRITE, n)) return 1; if ((mask & AUDIT_PERM_READ) && audit_match_class(AUDIT_CLASS_READ, n)) return 1; if ((mask & AUDIT_PERM_ATTR) && audit_match_class(AUDIT_CLASS_CHATTR, n)) return 1; return 0; case AUDITSC_COMPAT: /* 32bit on biarch */ if ((mask & AUDIT_PERM_WRITE) && audit_match_class(AUDIT_CLASS_WRITE_32, n)) return 1; if ((mask & AUDIT_PERM_READ) && audit_match_class(AUDIT_CLASS_READ_32, n)) return 1; if ((mask & AUDIT_PERM_ATTR) && audit_match_class(AUDIT_CLASS_CHATTR_32, n)) return 1; return 0; case AUDITSC_OPEN: return mask & ACC_MODE(ctx->argv[1]); case AUDITSC_OPENAT: return mask & ACC_MODE(ctx->argv[2]); case AUDITSC_SOCKETCALL: return ((mask & AUDIT_PERM_WRITE) && ctx->argv[0] == SYS_BIND); case AUDITSC_EXECVE: return mask & AUDIT_PERM_EXEC; case AUDITSC_OPENAT2: return mask & ACC_MODE((u32)ctx->openat2.flags); default: return 0; } } static int audit_match_filetype(struct audit_context *ctx, int val) { struct audit_names *n; umode_t mode = (umode_t)val; if (unlikely(!ctx)) return 0; list_for_each_entry(n, &ctx->names_list, list) { if ((n->ino != AUDIT_INO_UNSET) && ((n->mode & S_IFMT) == mode)) return 1; } return 0; } /* * We keep a linked list of fixed-sized (31 pointer) arrays of audit_chunk *; * ->first_trees points to its beginning, ->trees - to the current end of data. * ->tree_count is the number of free entries in array pointed to by ->trees. 
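 * (For example, with two arrays A and B in the chain, first_trees = A,
 * trees = B and tree_count = 5 means B still has five free slots.)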
* Original condition is (NULL, NULL, 0); as soon as it grows we never revert to NULL, * "empty" becomes (p, p, 31) afterwards. We don't shrink the list (and seriously, * it's going to remain 1-element for almost any setup) until we free context itself. * References in it _are_ dropped - at the same time we free/drop aux stuff. */ static void audit_set_auditable(struct audit_context *ctx) { if (!ctx->prio) { ctx->prio = 1; ctx->current_state = AUDIT_STATE_RECORD; } } static int put_tree_ref(struct audit_context *ctx, struct audit_chunk *chunk) { struct audit_tree_refs *p = ctx->trees; int left = ctx->tree_count; if (likely(left)) { p->c[--left] = chunk; ctx->tree_count = left; return 1; } if (!p) return 0; p = p->next; if (p) { p->c[30] = chunk; ctx->trees = p; ctx->tree_count = 30; return 1; } return 0; } static int grow_tree_refs(struct audit_context *ctx) { struct audit_tree_refs *p = ctx->trees; ctx->trees = kzalloc(sizeof(struct audit_tree_refs), GFP_KERNEL); if (!ctx->trees) { ctx->trees = p; return 0; } if (p) p->next = ctx->trees; else ctx->first_trees = ctx->trees; ctx->tree_count = 31; return 1; } static void unroll_tree_refs(struct audit_context *ctx, struct audit_tree_refs *p, int count) { struct audit_tree_refs *q; int n; if (!p) { /* we started with empty chain */ p = ctx->first_trees; count = 31; /* if the very first allocation has failed, nothing to do */ if (!p) return; } n = count; for (q = p; q != ctx->trees; q = q->next, n = 31) { while (n--) { audit_put_chunk(q->c[n]); q->c[n] = NULL; } } while (n-- > ctx->tree_count) { audit_put_chunk(q->c[n]); q->c[n] = NULL; } ctx->trees = p; ctx->tree_count = count; } static void free_tree_refs(struct audit_context *ctx) { struct audit_tree_refs *p, *q; for (p = ctx->first_trees; p; p = q) { q = p->next; kfree(p); } } static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) { struct audit_tree_refs *p; int n; if (!tree) return 0; /* full ones */ for (p = ctx->first_trees; p != ctx->trees; p = p->next) { for (n = 0; n < 31; n++) if (audit_tree_match(p->c[n], tree)) return 1; } /* partial */ if (p) { for (n = ctx->tree_count; n < 31; n++) if (audit_tree_match(p->c[n], tree)) return 1; } return 0; } static int audit_compare_uid(kuid_t uid, struct audit_names *name, struct audit_field *f, struct audit_context *ctx) { struct audit_names *n; int rc; if (name) { rc = audit_uid_comparator(uid, f->op, name->uid); if (rc) return rc; } if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { rc = audit_uid_comparator(uid, f->op, n->uid); if (rc) return rc; } } return 0; } static int audit_compare_gid(kgid_t gid, struct audit_names *name, struct audit_field *f, struct audit_context *ctx) { struct audit_names *n; int rc; if (name) { rc = audit_gid_comparator(gid, f->op, name->gid); if (rc) return rc; } if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { rc = audit_gid_comparator(gid, f->op, n->gid); if (rc) return rc; } } return 0; } static int audit_field_compare(struct task_struct *tsk, const struct cred *cred, struct audit_field *f, struct audit_context *ctx, struct audit_names *name) { switch (f->val) { /* process to file object comparisons */ case AUDIT_COMPARE_UID_TO_OBJ_UID: return audit_compare_uid(cred->uid, name, f, ctx); case AUDIT_COMPARE_GID_TO_OBJ_GID: return audit_compare_gid(cred->gid, name, f, ctx); case AUDIT_COMPARE_EUID_TO_OBJ_UID: return audit_compare_uid(cred->euid, name, f, ctx); case AUDIT_COMPARE_EGID_TO_OBJ_GID: return audit_compare_gid(cred->egid, name, f, ctx); case 
AUDIT_COMPARE_AUID_TO_OBJ_UID: return audit_compare_uid(audit_get_loginuid(tsk), name, f, ctx); case AUDIT_COMPARE_SUID_TO_OBJ_UID: return audit_compare_uid(cred->suid, name, f, ctx); case AUDIT_COMPARE_SGID_TO_OBJ_GID: return audit_compare_gid(cred->sgid, name, f, ctx); case AUDIT_COMPARE_FSUID_TO_OBJ_UID: return audit_compare_uid(cred->fsuid, name, f, ctx); case AUDIT_COMPARE_FSGID_TO_OBJ_GID: return audit_compare_gid(cred->fsgid, name, f, ctx); /* uid comparisons */ case AUDIT_COMPARE_UID_TO_AUID: return audit_uid_comparator(cred->uid, f->op, audit_get_loginuid(tsk)); case AUDIT_COMPARE_UID_TO_EUID: return audit_uid_comparator(cred->uid, f->op, cred->euid); case AUDIT_COMPARE_UID_TO_SUID: return audit_uid_comparator(cred->uid, f->op, cred->suid); case AUDIT_COMPARE_UID_TO_FSUID: return audit_uid_comparator(cred->uid, f->op, cred->fsuid); /* auid comparisons */ case AUDIT_COMPARE_AUID_TO_EUID: return audit_uid_comparator(audit_get_loginuid(tsk), f->op, cred->euid); case AUDIT_COMPARE_AUID_TO_SUID: return audit_uid_comparator(audit_get_loginuid(tsk), f->op, cred->suid); case AUDIT_COMPARE_AUID_TO_FSUID: return audit_uid_comparator(audit_get_loginuid(tsk), f->op, cred->fsuid); /* euid comparisons */ case AUDIT_COMPARE_EUID_TO_SUID: return audit_uid_comparator(cred->euid, f->op, cred->suid); case AUDIT_COMPARE_EUID_TO_FSUID: return audit_uid_comparator(cred->euid, f->op, cred->fsuid); /* suid comparisons */ case AUDIT_COMPARE_SUID_TO_FSUID: return audit_uid_comparator(cred->suid, f->op, cred->fsuid); /* gid comparisons */ case AUDIT_COMPARE_GID_TO_EGID: return audit_gid_comparator(cred->gid, f->op, cred->egid); case AUDIT_COMPARE_GID_TO_SGID: return audit_gid_comparator(cred->gid, f->op, cred->sgid); case AUDIT_COMPARE_GID_TO_FSGID: return audit_gid_comparator(cred->gid, f->op, cred->fsgid); /* egid comparisons */ case AUDIT_COMPARE_EGID_TO_SGID: return audit_gid_comparator(cred->egid, f->op, cred->sgid); case AUDIT_COMPARE_EGID_TO_FSGID: return audit_gid_comparator(cred->egid, f->op, cred->fsgid); /* sgid comparison */ case AUDIT_COMPARE_SGID_TO_FSGID: return audit_gid_comparator(cred->sgid, f->op, cred->fsgid); default: WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); return 0; } return 0; } /* Determine if any context name data matches a rule's watch data */ /* Compare a task_struct with an audit_rule. Return 1 on match, 0 * otherwise. * * If task_creation is true, this is an explicit indication that we are * filtering a task rule at task creation time. This and tsk == current are * the only situations where tsk->cred may be accessed without an rcu read lock. 
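 *
 * Note that a rule matches only if every field comparison in
 * rule->fields[] succeeds ("if (!result) return 0" below); e.g. a rule
 * loaded from userspace with (illustrative auditctl syntax)
 *
 *	auditctl -a always,exit -S openat -F uid=1000
 *
 * requires both the syscall mask and the AUDIT_UID comparator to match
 * before *state is taken from rule->action.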
*/ static int audit_filter_rules(struct task_struct *tsk, struct audit_krule *rule, struct audit_context *ctx, struct audit_names *name, enum audit_state *state, bool task_creation) { const struct cred *cred; int i, need_sid = 1; struct lsm_prop prop = { }; unsigned int sessionid; if (ctx && rule->prio <= ctx->prio) return 0; cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &rule->fields[i]; struct audit_names *n; int result = 0; pid_t pid; switch (f->type) { case AUDIT_PID: pid = task_tgid_nr(tsk); result = audit_comparator(pid, f->op, f->val); break; case AUDIT_PPID: if (ctx) { if (!ctx->ppid) ctx->ppid = task_ppid_nr(tsk); result = audit_comparator(ctx->ppid, f->op, f->val); } break; case AUDIT_EXE: result = audit_exe_compare(tsk, rule->exe); if (f->op == Audit_not_equal) result = !result; break; case AUDIT_UID: result = audit_uid_comparator(cred->uid, f->op, f->uid); break; case AUDIT_EUID: result = audit_uid_comparator(cred->euid, f->op, f->uid); break; case AUDIT_SUID: result = audit_uid_comparator(cred->suid, f->op, f->uid); break; case AUDIT_FSUID: result = audit_uid_comparator(cred->fsuid, f->op, f->uid); break; case AUDIT_GID: result = audit_gid_comparator(cred->gid, f->op, f->gid); if (f->op == Audit_equal) { if (!result) result = groups_search(cred->group_info, f->gid); } else if (f->op == Audit_not_equal) { if (result) result = !groups_search(cred->group_info, f->gid); } break; case AUDIT_EGID: result = audit_gid_comparator(cred->egid, f->op, f->gid); if (f->op == Audit_equal) { if (!result) result = groups_search(cred->group_info, f->gid); } else if (f->op == Audit_not_equal) { if (result) result = !groups_search(cred->group_info, f->gid); } break; case AUDIT_SGID: result = audit_gid_comparator(cred->sgid, f->op, f->gid); break; case AUDIT_FSGID: result = audit_gid_comparator(cred->fsgid, f->op, f->gid); break; case AUDIT_SESSIONID: sessionid = audit_get_sessionid(tsk); result = audit_comparator(sessionid, f->op, f->val); break; case AUDIT_PERS: result = audit_comparator(tsk->personality, f->op, f->val); break; case AUDIT_ARCH: if (ctx) result = audit_comparator(ctx->arch, f->op, f->val); break; case AUDIT_EXIT: if (ctx && ctx->return_valid != AUDITSC_INVALID) result = audit_comparator(ctx->return_code, f->op, f->val); break; case AUDIT_SUCCESS: if (ctx && ctx->return_valid != AUDITSC_INVALID) { if (f->val) result = audit_comparator(ctx->return_valid, f->op, AUDITSC_SUCCESS); else result = audit_comparator(ctx->return_valid, f->op, AUDITSC_FAILURE); } break; case AUDIT_DEVMAJOR: if (name) { if (audit_comparator(MAJOR(name->dev), f->op, f->val) || audit_comparator(MAJOR(name->rdev), f->op, f->val)) ++result; } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { if (audit_comparator(MAJOR(n->dev), f->op, f->val) || audit_comparator(MAJOR(n->rdev), f->op, f->val)) { ++result; break; } } } break; case AUDIT_DEVMINOR: if (name) { if (audit_comparator(MINOR(name->dev), f->op, f->val) || audit_comparator(MINOR(name->rdev), f->op, f->val)) ++result; } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { if (audit_comparator(MINOR(n->dev), f->op, f->val) || audit_comparator(MINOR(n->rdev), f->op, f->val)) { ++result; break; } } } break; case AUDIT_INODE: if (name) result = audit_comparator(name->ino, f->op, f->val); else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { if (audit_comparator(n->ino, f->op, f->val)) { ++result; break; } } } break; case 
AUDIT_OBJ_UID: if (name) { result = audit_uid_comparator(name->uid, f->op, f->uid); } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { if (audit_uid_comparator(n->uid, f->op, f->uid)) { ++result; break; } } } break; case AUDIT_OBJ_GID: if (name) { result = audit_gid_comparator(name->gid, f->op, f->gid); } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { if (audit_gid_comparator(n->gid, f->op, f->gid)) { ++result; break; } } } break; case AUDIT_WATCH: if (name) { result = audit_watch_compare(rule->watch, name->ino, name->dev); if (f->op == Audit_not_equal) result = !result; } break; case AUDIT_DIR: if (ctx) { result = match_tree_refs(ctx, rule->tree); if (f->op == Audit_not_equal) result = !result; } break; case AUDIT_LOGINUID: result = audit_uid_comparator(audit_get_loginuid(tsk), f->op, f->uid); break; case AUDIT_LOGINUID_SET: result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val); break; case AUDIT_SADDR_FAM: if (ctx && ctx->sockaddr) result = audit_comparator(ctx->sockaddr->ss_family, f->op, f->val); break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: /* NOTE: this may return negative values indicating a temporary error. We simply treat this as a match for now to avoid losing information that may be wanted. An error message will also be logged upon error */ if (f->lsm_rule) { if (need_sid) { /* @tsk should always be equal to * @current with the exception of * fork()/copy_process() in which case * the new @tsk creds are still a dup * of @current's creds so we can still * use * security_current_getlsmprop_subj() * here even though it always refs * @current's creds */ security_current_getlsmprop_subj(&prop); need_sid = 0; } result = security_audit_rule_match(&prop, f->type, f->op, f->lsm_rule); } break; case AUDIT_OBJ_USER: case AUDIT_OBJ_ROLE: case AUDIT_OBJ_TYPE: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: /* The above note for AUDIT_SUBJ_USER...AUDIT_SUBJ_CLR also applies here */ if (f->lsm_rule) { /* Find files that match */ if (name) { result = security_audit_rule_match( &name->oprop, f->type, f->op, f->lsm_rule); } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { if (security_audit_rule_match( &n->oprop, f->type, f->op, f->lsm_rule)) { ++result; break; } } } /* Find ipc objects that match */ if (!ctx || ctx->type != AUDIT_IPC) break; if (security_audit_rule_match(&ctx->ipc.oprop, f->type, f->op, f->lsm_rule)) ++result; } break; case AUDIT_ARG0: case AUDIT_ARG1: case AUDIT_ARG2: case AUDIT_ARG3: if (ctx) result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val); break; case AUDIT_FILTERKEY: /* ignore this field for filtering */ result = 1; break; case AUDIT_PERM: result = audit_match_perm(ctx, f->val); if (f->op == Audit_not_equal) result = !result; break; case AUDIT_FILETYPE: result = audit_match_filetype(ctx, f->val); if (f->op == Audit_not_equal) result = !result; break; case AUDIT_FIELD_COMPARE: result = audit_field_compare(tsk, cred, f, ctx, name); break; } if (!result) return 0; } if (ctx) { if (rule->filterkey) { kfree(ctx->filterkey); ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); } ctx->prio = rule->prio; } switch (rule->action) { case AUDIT_NEVER: *state = AUDIT_STATE_DISABLED; break; case AUDIT_ALWAYS: *state = AUDIT_STATE_RECORD; break; } return 1; } /* At process creation time, we can determine if system-call auditing is * completely disabled for this task. 
Since we only have the task * structure at this point, we can only check uid and gid. */ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key) { struct audit_entry *e; enum audit_state state; rcu_read_lock(); list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state, true)) { if (state == AUDIT_STATE_RECORD) *key = kstrdup(e->rule.filterkey, GFP_ATOMIC); rcu_read_unlock(); return state; } } rcu_read_unlock(); return AUDIT_STATE_BUILD; } static int audit_in_mask(const struct audit_krule *rule, unsigned long val) { int word, bit; if (val > 0xffffffff) return false; word = AUDIT_WORD(val); if (word >= AUDIT_BITMASK_SIZE) return false; bit = AUDIT_BIT(val); return rule->mask[word] & bit; } /** * __audit_filter_op - common filter helper for operations (syscall/uring/etc) * @tsk: associated task * @ctx: audit context * @list: audit filter list * @name: audit_name (can be NULL) * @op: current syscall/uring_op * * Run the audit filters specified in @list against @tsk using @ctx, * @name, and @op, as necessary; the caller is responsible for ensuring * that the call is made while the RCU read lock is held. The @name * parameter can be NULL, but all others must be specified. * Returns 1/true if the filter finds a match, 0/false if none are found. */ static int __audit_filter_op(struct task_struct *tsk, struct audit_context *ctx, struct list_head *list, struct audit_names *name, unsigned long op) { struct audit_entry *e; enum audit_state state; list_for_each_entry_rcu(e, list, list) { if (audit_in_mask(&e->rule, op) && audit_filter_rules(tsk, &e->rule, ctx, name, &state, false)) { ctx->current_state = state; return 1; } } return 0; } /** * audit_filter_uring - apply filters to an io_uring operation * @tsk: associated task * @ctx: audit context */ static void audit_filter_uring(struct task_struct *tsk, struct audit_context *ctx) { if (auditd_test_task(tsk)) return; rcu_read_lock(); __audit_filter_op(tsk, ctx, &audit_filter_list[AUDIT_FILTER_URING_EXIT], NULL, ctx->uring_op); rcu_read_unlock(); } /* At syscall exit time, this filter is called if the audit_state is * not low enough that auditing cannot take place, but is also not * high enough that we already know we have to write an audit record * (i.e., the state is AUDIT_STATE_BUILD). */ static void audit_filter_syscall(struct task_struct *tsk, struct audit_context *ctx) { if (auditd_test_task(tsk)) return; rcu_read_lock(); __audit_filter_op(tsk, ctx, &audit_filter_list[AUDIT_FILTER_EXIT], NULL, ctx->major); rcu_read_unlock(); } /* * Given an audit_name, check the inode hash table to see if it matches. * Called holding the rcu read lock to protect the use of audit_inode_hash */ static int audit_filter_inode_name(struct task_struct *tsk, struct audit_names *n, struct audit_context *ctx) { int h = audit_hash_ino((u32)n->ino); struct list_head *list = &audit_inode_hash[h]; return __audit_filter_op(tsk, ctx, list, n, ctx->major); } /* At syscall exit time, this filter is called if any audit_names have been * collected during syscall processing. We only check rules in sublists at hash * buckets applicable to the inode numbers in audit_names. * Regarding audit_state, same rules apply as for audit_filter_syscall(). 
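 *
 * For example, a watch added with "auditctl -w /etc/shadow -p wa"
 * (illustrative userspace syntax) is filed under the
 * audit_inode_hash[] bucket for the watched inode, so only that one
 * bucket needs to be scanned here rather than every loaded rule.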
*/ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) { struct audit_names *n; if (auditd_test_task(tsk)) return; rcu_read_lock(); list_for_each_entry(n, &ctx->names_list, list) { if (audit_filter_inode_name(tsk, n, ctx)) break; } rcu_read_unlock(); } static inline void audit_proctitle_free(struct audit_context *context) { kfree(context->proctitle.value); context->proctitle.value = NULL; context->proctitle.len = 0; } static inline void audit_free_module(struct audit_context *context) { if (context->type == AUDIT_KERN_MODULE) { kfree(context->module.name); context->module.name = NULL; } } static inline void audit_free_names(struct audit_context *context) { struct audit_names *n, *next; list_for_each_entry_safe(n, next, &context->names_list, list) { list_del(&n->list); if (n->name) putname(n->name); if (n->should_free) kfree(n); } context->name_count = 0; path_put(&context->pwd); context->pwd.dentry = NULL; context->pwd.mnt = NULL; } static inline void audit_free_aux(struct audit_context *context) { struct audit_aux_data *aux; while ((aux = context->aux)) { context->aux = aux->next; kfree(aux); } context->aux = NULL; while ((aux = context->aux_pids)) { context->aux_pids = aux->next; kfree(aux); } context->aux_pids = NULL; } /** * audit_reset_context - reset an audit_context structure * @ctx: the audit_context to reset * * All fields in the audit_context will be reset to an initial state, all * references held by fields will be dropped, and private memory will be * released. When this function returns the audit_context will be suitable * for reuse, so long as the passed context is not NULL or a dummy context. */ static void audit_reset_context(struct audit_context *ctx) { if (!ctx) return; /* if ctx is non-null, reset the "ctx->context" regardless */ ctx->context = AUDIT_CTX_UNUSED; if (ctx->dummy) return; /* * NOTE: It shouldn't matter in what order we release the fields, so * release them in the order in which they appear in the struct; * this gives us some hope of quickly making sure we are * resetting the audit_context properly. * * Other things worth mentioning: * - we don't reset "dummy" * - we don't reset "state", we do reset "current_state" * - we preserve "filterkey" if "state" is AUDIT_STATE_RECORD * - much of this is likely overkill, but play it safe for now * - we really need to work on improving the audit_context struct */ ctx->current_state = ctx->state; ctx->serial = 0; ctx->major = 0; ctx->uring_op = 0; ctx->ctime = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 }; memset(ctx->argv, 0, sizeof(ctx->argv)); ctx->return_code = 0; ctx->prio = (ctx->state == AUDIT_STATE_RECORD ? 
~0ULL : 0); ctx->return_valid = AUDITSC_INVALID; audit_free_names(ctx); if (ctx->state != AUDIT_STATE_RECORD) { kfree(ctx->filterkey); ctx->filterkey = NULL; } audit_free_aux(ctx); kfree(ctx->sockaddr); ctx->sockaddr = NULL; ctx->sockaddr_len = 0; ctx->ppid = 0; ctx->uid = ctx->euid = ctx->suid = ctx->fsuid = KUIDT_INIT(0); ctx->gid = ctx->egid = ctx->sgid = ctx->fsgid = KGIDT_INIT(0); ctx->personality = 0; ctx->arch = 0; ctx->target_pid = 0; ctx->target_auid = ctx->target_uid = KUIDT_INIT(0); ctx->target_sessionid = 0; lsmprop_init(&ctx->target_ref); ctx->target_comm[0] = '\0'; unroll_tree_refs(ctx, NULL, 0); WARN_ON(!list_empty(&ctx->killed_trees)); audit_free_module(ctx); ctx->fds[0] = -1; ctx->type = 0; /* reset last for audit_free_*() */ } static inline struct audit_context *audit_alloc_context(enum audit_state state) { struct audit_context *context; context = kzalloc(sizeof(*context), GFP_KERNEL); if (!context) return NULL; context->context = AUDIT_CTX_UNUSED; context->state = state; context->prio = state == AUDIT_STATE_RECORD ? ~0ULL : 0; INIT_LIST_HEAD(&context->killed_trees); INIT_LIST_HEAD(&context->names_list); context->fds[0] = -1; context->return_valid = AUDITSC_INVALID; return context; } /** * audit_alloc - allocate an audit context block for a task * @tsk: task * * Filter on the task information and allocate a per-task audit context * if necessary. Doing so turns on system call auditing for the * specified task. This is called from copy_process, so no lock is * needed. */ int audit_alloc(struct task_struct *tsk) { struct audit_context *context; enum audit_state state; char *key = NULL; if (likely(!audit_ever_enabled)) return 0; state = audit_filter_task(tsk, &key); if (state == AUDIT_STATE_DISABLED) { clear_task_syscall_work(tsk, SYSCALL_AUDIT); return 0; } context = audit_alloc_context(state); if (!context) { kfree(key); audit_log_lost("out of memory in audit_alloc"); return -ENOMEM; } context->filterkey = key; audit_set_context(tsk, context); set_task_syscall_work(tsk, SYSCALL_AUDIT); return 0; } static inline void audit_free_context(struct audit_context *context) { /* resetting is extra work, but it is likely just noise */ audit_reset_context(context); audit_proctitle_free(context); free_tree_refs(context); kfree(context->filterkey); kfree(context); } static int audit_log_pid_context(struct audit_context *context, pid_t pid, kuid_t auid, kuid_t uid, unsigned int sessionid, struct lsm_prop *prop, char *comm) { struct audit_buffer *ab; struct lsm_context ctx; int rc = 0; ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID); if (!ab) return rc; audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, from_kuid(&init_user_ns, auid), from_kuid(&init_user_ns, uid), sessionid); if (lsmprop_is_set(prop)) { if (security_lsmprop_to_secctx(prop, &ctx) < 0) { audit_log_format(ab, " obj=(none)"); rc = 1; } else { audit_log_format(ab, " obj=%s", ctx.context); security_release_secctx(&ctx); } } audit_log_format(ab, " ocomm="); audit_log_untrustedstring(ab, comm); audit_log_end(ab); return rc; } static void audit_log_execve_info(struct audit_context *context, struct audit_buffer **ab) { long len_max; long len_rem; long len_full; long len_buf; long len_abuf = 0; long len_tmp; bool require_data; bool encode; unsigned int iter; unsigned int arg; char *buf_head; char *buf; const char __user *p = (const char __user *)current->mm->arg_start; /* NOTE: this buffer needs to be large enough to hold all the non-arg * data we put in the audit record for this argument (see the * code below) 
... at this point in time 96 is plenty */ char abuf[96]; /* NOTE: we set MAX_EXECVE_AUDIT_LEN to a rather arbitrary limit, the * current value of 7500 is not as important as the fact that it * is less than 8k, a setting of 7500 gives us plenty of wiggle * room if we go over a little bit in the logging below */ WARN_ON_ONCE(MAX_EXECVE_AUDIT_LEN > 7500); len_max = MAX_EXECVE_AUDIT_LEN; /* scratch buffer to hold the userspace args */ buf_head = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); if (!buf_head) { audit_panic("out of memory for argv string"); return; } buf = buf_head; audit_log_format(*ab, "argc=%d", context->execve.argc); len_rem = len_max; len_buf = 0; len_full = 0; require_data = true; encode = false; iter = 0; arg = 0; do { /* NOTE: we don't ever want to trust this value for anything * serious, but the audit record format insists we * provide an argument length for really long arguments, * e.g. > MAX_EXECVE_AUDIT_LEN, so we have no choice but * to use strncpy_from_user() to obtain this value for * recording in the log, although we don't use it * anywhere here to avoid a double-fetch problem */ if (len_full == 0) len_full = strnlen_user(p, MAX_ARG_STRLEN) - 1; /* read more data from userspace */ if (require_data) { /* can we make more room in the buffer? */ if (buf != buf_head) { memmove(buf_head, buf, len_buf); buf = buf_head; } /* fetch as much as we can of the argument */ len_tmp = strncpy_from_user(&buf_head[len_buf], p, len_max - len_buf); if (len_tmp == -EFAULT) { /* unable to copy from userspace */ send_sig(SIGKILL, current, 0); goto out; } else if (len_tmp == (len_max - len_buf)) { /* buffer is not large enough */ require_data = true; /* NOTE: if we are going to span multiple * buffers force the encoding so we stand * a chance at a sane len_full value and * consistent record encoding */ encode = true; len_full = len_full * 2; p += len_tmp; } else { require_data = false; if (!encode) encode = audit_string_contains_control( buf, len_tmp); /* try to use a trusted value for len_full */ if (len_full < len_max) len_full = (encode ? len_tmp * 2 : len_tmp); p += len_tmp + 1; } len_buf += len_tmp; buf_head[len_buf] = '\0'; /* length of the buffer in the audit record? */ len_abuf = (encode ? 
len_buf * 2 : len_buf + 2); } /* write as much as we can to the audit log */ if (len_buf >= 0) { /* NOTE: some magic numbers here - basically if we * can't fit a reasonable amount of data into the * existing audit buffer, flush it and start with * a new buffer */ if ((sizeof(abuf) + 8) > len_rem) { len_rem = len_max; audit_log_end(*ab); *ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE); if (!*ab) goto out; } /* create the non-arg portion of the arg record */ len_tmp = 0; if (require_data || (iter > 0) || ((len_abuf + sizeof(abuf)) > len_rem)) { if (iter == 0) { len_tmp += snprintf(&abuf[len_tmp], sizeof(abuf) - len_tmp, " a%d_len=%lu", arg, len_full); } len_tmp += snprintf(&abuf[len_tmp], sizeof(abuf) - len_tmp, " a%d[%d]=", arg, iter++); } else len_tmp += snprintf(&abuf[len_tmp], sizeof(abuf) - len_tmp, " a%d=", arg); WARN_ON(len_tmp >= sizeof(abuf)); abuf[sizeof(abuf) - 1] = '\0'; /* log the arg in the audit record */ audit_log_format(*ab, "%s", abuf); len_rem -= len_tmp; len_tmp = len_buf; if (encode) { if (len_abuf > len_rem) len_tmp = len_rem / 2; /* encoding */ audit_log_n_hex(*ab, buf, len_tmp); len_rem -= len_tmp * 2; len_abuf -= len_tmp * 2; } else { if (len_abuf > len_rem) len_tmp = len_rem - 2; /* quotes */ audit_log_n_string(*ab, buf, len_tmp); len_rem -= len_tmp + 2; /* don't subtract the "2" because we still need * to add quotes to the remaining string */ len_abuf -= len_tmp; } len_buf -= len_tmp; buf += len_tmp; } /* ready to move to the next argument? */ if ((len_buf == 0) && !require_data) { arg++; iter = 0; len_full = 0; require_data = true; encode = false; } } while (arg < context->execve.argc); /* NOTE: the caller handles the final audit_log_end() call */ out: kfree(buf_head); } static void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) { if (cap_isclear(*cap)) { audit_log_format(ab, " %s=0", prefix); return; } audit_log_format(ab, " %s=%016llx", prefix, cap->val); } static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) { if (name->fcap_ver == -1) { audit_log_format(ab, " cap_fe=? cap_fver=? cap_fp=? 
cap_fi=?"); return; } audit_log_cap(ab, "cap_fp", &name->fcap.permitted); audit_log_cap(ab, "cap_fi", &name->fcap.inheritable); audit_log_format(ab, " cap_fe=%d cap_fver=%x cap_frootid=%d", name->fcap.fE, name->fcap_ver, from_kuid(&init_user_ns, name->fcap.rootid)); } static void audit_log_time(struct audit_context *context, struct audit_buffer **ab) { const struct audit_ntp_data *ntp = &context->time.ntp_data; const struct timespec64 *tk = &context->time.tk_injoffset; static const char * const ntp_name[] = { "offset", "freq", "status", "tai", "tick", "adjust", }; int type; if (context->type == AUDIT_TIME_ADJNTPVAL) { for (type = 0; type < AUDIT_NTP_NVALS; type++) { if (ntp->vals[type].newval != ntp->vals[type].oldval) { if (!*ab) { *ab = audit_log_start(context, GFP_KERNEL, AUDIT_TIME_ADJNTPVAL); if (!*ab) return; } audit_log_format(*ab, "op=%s old=%lli new=%lli", ntp_name[type], ntp->vals[type].oldval, ntp->vals[type].newval); audit_log_end(*ab); *ab = NULL; } } } if (tk->tv_sec != 0 || tk->tv_nsec != 0) { if (!*ab) { *ab = audit_log_start(context, GFP_KERNEL, AUDIT_TIME_INJOFFSET); if (!*ab) return; } audit_log_format(*ab, "sec=%lli nsec=%li", (long long)tk->tv_sec, tk->tv_nsec); audit_log_end(*ab); *ab = NULL; } } static void show_special(struct audit_context *context, int *call_panic) { struct audit_buffer *ab; int i; ab = audit_log_start(context, GFP_KERNEL, context->type); if (!ab) return; switch (context->type) { case AUDIT_SOCKETCALL: { int nargs = context->socketcall.nargs; audit_log_format(ab, "nargs=%d", nargs); for (i = 0; i < nargs; i++) audit_log_format(ab, " a%d=%lx", i, context->socketcall.args[i]); break; } case AUDIT_IPC: audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho", from_kuid(&init_user_ns, context->ipc.uid), from_kgid(&init_user_ns, context->ipc.gid), context->ipc.mode); if (lsmprop_is_set(&context->ipc.oprop)) { struct lsm_context lsmctx; if (security_lsmprop_to_secctx(&context->ipc.oprop, &lsmctx) < 0) { *call_panic = 1; } else { audit_log_format(ab, " obj=%s", lsmctx.context); security_release_secctx(&lsmctx); } } if (context->ipc.has_perm) { audit_log_end(ab); ab = audit_log_start(context, GFP_KERNEL, AUDIT_IPC_SET_PERM); if (unlikely(!ab)) return; audit_log_format(ab, "qbytes=%lx ouid=%u ogid=%u mode=%#ho", context->ipc.qbytes, context->ipc.perm_uid, context->ipc.perm_gid, context->ipc.perm_mode); } break; case AUDIT_MQ_OPEN: audit_log_format(ab, "oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld " "mq_msgsize=%ld mq_curmsgs=%ld", context->mq_open.oflag, context->mq_open.mode, context->mq_open.attr.mq_flags, context->mq_open.attr.mq_maxmsg, context->mq_open.attr.mq_msgsize, context->mq_open.attr.mq_curmsgs); break; case AUDIT_MQ_SENDRECV: audit_log_format(ab, "mqdes=%d msg_len=%zd msg_prio=%u " "abs_timeout_sec=%lld abs_timeout_nsec=%ld", context->mq_sendrecv.mqdes, context->mq_sendrecv.msg_len, context->mq_sendrecv.msg_prio, (long long) context->mq_sendrecv.abs_timeout.tv_sec, context->mq_sendrecv.abs_timeout.tv_nsec); break; case AUDIT_MQ_NOTIFY: audit_log_format(ab, "mqdes=%d sigev_signo=%d", context->mq_notify.mqdes, context->mq_notify.sigev_signo); break; case AUDIT_MQ_GETSETATTR: { struct mq_attr *attr = &context->mq_getsetattr.mqstat; audit_log_format(ab, "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld " "mq_curmsgs=%ld ", context->mq_getsetattr.mqdes, attr->mq_flags, attr->mq_maxmsg, attr->mq_msgsize, attr->mq_curmsgs); break; } case AUDIT_CAPSET: audit_log_format(ab, "pid=%d", context->capset.pid); audit_log_cap(ab, "cap_pi", 
&context->capset.cap.inheritable); audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); audit_log_cap(ab, "cap_pa", &context->capset.cap.ambient); break; case AUDIT_MMAP: audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd, context->mmap.flags); break; case AUDIT_OPENAT2: audit_log_format(ab, "oflag=0%llo mode=0%llo resolve=0x%llx", context->openat2.flags, context->openat2.mode, context->openat2.resolve); break; case AUDIT_EXECVE: audit_log_execve_info(context, &ab); break; case AUDIT_KERN_MODULE: audit_log_format(ab, "name="); if (context->module.name) { audit_log_untrustedstring(ab, context->module.name); } else audit_log_format(ab, "(null)"); break; case AUDIT_TIME_ADJNTPVAL: case AUDIT_TIME_INJOFFSET: /* this call deviates from the rest, eating the buffer */ audit_log_time(context, &ab); break; } audit_log_end(ab); } static inline int audit_proctitle_rtrim(char *proctitle, int len) { char *end = proctitle + len - 1; while (end > proctitle && !isprint(*end)) end--; /* catch the case where proctitle is only 1 non-print character */ len = end - proctitle + 1; len -= isprint(proctitle[len-1]) == 0; return len; } /* * audit_log_name - produce AUDIT_PATH record from struct audit_names * @context: audit_context for the task * @n: audit_names structure with reportable details * @path: optional path to report instead of audit_names->name * @record_num: record number to report when handling a list of names * @call_panic: optional pointer to int that will be updated if secid fails */ static void audit_log_name(struct audit_context *context, struct audit_names *n, const struct path *path, int record_num, int *call_panic) { struct audit_buffer *ab; ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); if (!ab) return; audit_log_format(ab, "item=%d", record_num); if (path) audit_log_d_path(ab, " name=", path); else if (n->name) { switch (n->name_len) { case AUDIT_NAME_FULL: /* log the full path */ audit_log_format(ab, " name="); audit_log_untrustedstring(ab, n->name->name); break; case 0: /* name was specified as a relative path and the * directory component is the cwd */ if (context->pwd.dentry && context->pwd.mnt) audit_log_d_path(ab, " name=", &context->pwd); else audit_log_format(ab, " name=(null)"); break; default: /* log the name's directory component */ audit_log_format(ab, " name="); audit_log_n_untrustedstring(ab, n->name->name, n->name_len); } } else audit_log_format(ab, " name=(null)"); if (n->ino != AUDIT_INO_UNSET) audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#ho ouid=%u ogid=%u rdev=%02x:%02x", n->ino, MAJOR(n->dev), MINOR(n->dev), n->mode, from_kuid(&init_user_ns, n->uid), from_kgid(&init_user_ns, n->gid), MAJOR(n->rdev), MINOR(n->rdev)); if (lsmprop_is_set(&n->oprop)) { struct lsm_context ctx; if (security_lsmprop_to_secctx(&n->oprop, &ctx) < 0) { if (call_panic) *call_panic = 2; } else { audit_log_format(ab, " obj=%s", ctx.context); security_release_secctx(&ctx); } } /* log the audit_names record type */ switch (n->type) { case AUDIT_TYPE_NORMAL: audit_log_format(ab, " nametype=NORMAL"); break; case AUDIT_TYPE_PARENT: audit_log_format(ab, " nametype=PARENT"); break; case AUDIT_TYPE_CHILD_DELETE: audit_log_format(ab, " nametype=DELETE"); break; case AUDIT_TYPE_CHILD_CREATE: audit_log_format(ab, " nametype=CREATE"); break; default: audit_log_format(ab, " nametype=UNKNOWN"); break; } audit_log_fcaps(ab, n); audit_log_end(ab); } static void audit_log_proctitle(void) { int res; char *buf; char *msg = 
"(null)"; int len = strlen(msg); struct audit_context *context = audit_context(); struct audit_buffer *ab; ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE); if (!ab) return; /* audit_panic or being filtered */ audit_log_format(ab, "proctitle="); /* Not cached */ if (!context->proctitle.value) { buf = kmalloc(MAX_PROCTITLE_AUDIT_LEN, GFP_KERNEL); if (!buf) goto out; /* Historically called this from procfs naming */ res = get_cmdline(current, buf, MAX_PROCTITLE_AUDIT_LEN); if (res == 0) { kfree(buf); goto out; } res = audit_proctitle_rtrim(buf, res); if (res == 0) { kfree(buf); goto out; } context->proctitle.value = buf; context->proctitle.len = res; } msg = context->proctitle.value; len = context->proctitle.len; out: audit_log_n_untrustedstring(ab, msg, len); audit_log_end(ab); } /** * audit_log_uring - generate a AUDIT_URINGOP record * @ctx: the audit context */ static void audit_log_uring(struct audit_context *ctx) { struct audit_buffer *ab; const struct cred *cred; ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_URINGOP); if (!ab) return; cred = current_cred(); audit_log_format(ab, "uring_op=%d", ctx->uring_op); if (ctx->return_valid != AUDITSC_INVALID) audit_log_format(ab, " success=%s exit=%ld", str_yes_no(ctx->return_valid == AUDITSC_SUCCESS), ctx->return_code); audit_log_format(ab, " items=%d" " ppid=%d pid=%d uid=%u gid=%u euid=%u suid=%u" " fsuid=%u egid=%u sgid=%u fsgid=%u", ctx->name_count, task_ppid_nr(current), task_tgid_nr(current), from_kuid(&init_user_ns, cred->uid), from_kgid(&init_user_ns, cred->gid), from_kuid(&init_user_ns, cred->euid), from_kuid(&init_user_ns, cred->suid), from_kuid(&init_user_ns, cred->fsuid), from_kgid(&init_user_ns, cred->egid), from_kgid(&init_user_ns, cred->sgid), from_kgid(&init_user_ns, cred->fsgid)); audit_log_task_context(ab); audit_log_key(ab, ctx->filterkey); audit_log_end(ab); } static void audit_log_exit(void) { int i, call_panic = 0; struct audit_context *context = audit_context(); struct audit_buffer *ab; struct audit_aux_data *aux; struct audit_names *n; context->personality = current->personality; switch (context->context) { case AUDIT_CTX_SYSCALL: ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); if (!ab) return; audit_log_format(ab, "arch=%x syscall=%d", context->arch, context->major); if (context->personality != PER_LINUX) audit_log_format(ab, " per=%lx", context->personality); if (context->return_valid != AUDITSC_INVALID) audit_log_format(ab, " success=%s exit=%ld", str_yes_no(context->return_valid == AUDITSC_SUCCESS), context->return_code); audit_log_format(ab, " a0=%lx a1=%lx a2=%lx a3=%lx items=%d", context->argv[0], context->argv[1], context->argv[2], context->argv[3], context->name_count); audit_log_task_info(ab); audit_log_key(ab, context->filterkey); audit_log_end(ab); break; case AUDIT_CTX_URING: audit_log_uring(context); break; default: BUG(); break; } for (aux = context->aux; aux; aux = aux->next) { ab = audit_log_start(context, GFP_KERNEL, aux->type); if (!ab) continue; /* audit_panic has been called */ switch (aux->type) { case AUDIT_BPRM_FCAPS: { struct audit_aux_data_bprm_fcaps *axs = (void *)aux; audit_log_format(ab, "fver=%x", axs->fcap_ver); audit_log_cap(ab, "fp", &axs->fcap.permitted); audit_log_cap(ab, "fi", &axs->fcap.inheritable); audit_log_format(ab, " fe=%d", axs->fcap.fE); audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted); audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable); audit_log_cap(ab, "old_pe", &axs->old_pcap.effective); audit_log_cap(ab, "old_pa", &axs->old_pcap.ambient); 
audit_log_cap(ab, "pp", &axs->new_pcap.permitted); audit_log_cap(ab, "pi", &axs->new_pcap.inheritable); audit_log_cap(ab, "pe", &axs->new_pcap.effective); audit_log_cap(ab, "pa", &axs->new_pcap.ambient); audit_log_format(ab, " frootid=%d", from_kuid(&init_user_ns, axs->fcap.rootid)); break; } } audit_log_end(ab); } if (context->type) show_special(context, &call_panic); if (context->fds[0] >= 0) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_FD_PAIR); if (ab) { audit_log_format(ab, "fd0=%d fd1=%d", context->fds[0], context->fds[1]); audit_log_end(ab); } } if (context->sockaddr_len) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_SOCKADDR); if (ab) { audit_log_format(ab, "saddr="); audit_log_n_hex(ab, (void *)context->sockaddr, context->sockaddr_len); audit_log_end(ab); } } for (aux = context->aux_pids; aux; aux = aux->next) { struct audit_aux_data_pids *axs = (void *)aux; for (i = 0; i < axs->pid_count; i++) if (audit_log_pid_context(context, axs->target_pid[i], axs->target_auid[i], axs->target_uid[i], axs->target_sessionid[i], &axs->target_ref[i], axs->target_comm[i])) call_panic = 1; } if (context->target_pid && audit_log_pid_context(context, context->target_pid, context->target_auid, context->target_uid, context->target_sessionid, &context->target_ref, context->target_comm)) call_panic = 1; if (context->pwd.dentry && context->pwd.mnt) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); if (ab) { audit_log_d_path(ab, "cwd=", &context->pwd); audit_log_end(ab); } } i = 0; list_for_each_entry(n, &context->names_list, list) { if (n->hidden) continue; audit_log_name(context, n, NULL, i++, &call_panic); } if (context->context == AUDIT_CTX_SYSCALL) audit_log_proctitle(); /* Send end of event record to help user space know we are finished */ ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); if (ab) audit_log_end(ab); if (call_panic) audit_panic("error in audit_log_exit()"); } /** * __audit_free - free a per-task audit context * @tsk: task whose audit context block to free * * Called from copy_process, do_exit, and the io_uring code */ void __audit_free(struct task_struct *tsk) { struct audit_context *context = tsk->audit_context; if (!context) return; /* this may generate CONFIG_CHANGE records */ if (!list_empty(&context->killed_trees)) audit_kill_trees(context); /* We are called either by do_exit() or the fork() error handling code; * in the former case tsk == current and in the latter tsk is a * random task_struct that doesn't have any meaningful data we * need to log via audit_log_exit(). */ if (tsk == current && !context->dummy) { context->return_valid = AUDITSC_INVALID; context->return_code = 0; if (context->context == AUDIT_CTX_SYSCALL) { audit_filter_syscall(tsk, context); audit_filter_inodes(tsk, context); if (context->current_state == AUDIT_STATE_RECORD) audit_log_exit(); } else if (context->context == AUDIT_CTX_URING) { /* TODO: verify this case is real and valid */ audit_filter_uring(tsk, context); audit_filter_inodes(tsk, context); if (context->current_state == AUDIT_STATE_RECORD) audit_log_uring(context); } } audit_set_context(tsk, NULL); audit_free_context(context); } /** * audit_return_fixup - fixup the return codes in the audit_context * @ctx: the audit_context * @success: true/false value to indicate if the operation succeeded or not * @code: operation return code * * We need to fixup the return code in the audit logs if the actual return * codes are later going to be fixed by the arch specific signal handlers. 
*/ static void audit_return_fixup(struct audit_context *ctx, int success, long code) { /* * This is actually a test for: * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) || * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK) * * but is faster than a bunch of || */ if (unlikely(code <= -ERESTARTSYS) && (code >= -ERESTART_RESTARTBLOCK) && (code != -ENOIOCTLCMD)) ctx->return_code = -EINTR; else ctx->return_code = code; ctx->return_valid = (success ? AUDITSC_SUCCESS : AUDITSC_FAILURE); } /** * __audit_uring_entry - prepare the kernel task's audit context for io_uring * @op: the io_uring opcode * * This is similar to audit_syscall_entry() but is intended for use by io_uring * operations. This function should only ever be called from * audit_uring_entry() as we rely on the audit context checking present in that * function. */ void __audit_uring_entry(u8 op) { struct audit_context *ctx = audit_context(); if (ctx->state == AUDIT_STATE_DISABLED) return; /* * NOTE: It's possible that we can be called from the process' context * before it returns to userspace, and before audit_syscall_exit() * is called. In this case there is not much to do, just record * the io_uring details and return. */ ctx->uring_op = op; if (ctx->context == AUDIT_CTX_SYSCALL) return; ctx->dummy = !audit_n_rules; if (!ctx->dummy && ctx->state == AUDIT_STATE_BUILD) ctx->prio = 0; ctx->context = AUDIT_CTX_URING; ctx->current_state = ctx->state; ktime_get_coarse_real_ts64(&ctx->ctime); } /** * __audit_uring_exit - wrap up the kernel task's audit context after io_uring * @success: true/false value to indicate if the operation succeeded or not * @code: operation return code * * This is similar to audit_syscall_exit() but is intended for use by io_uring * operations. This function should only ever be called from * audit_uring_exit() as we rely on the audit context checking present in that * function. */ void __audit_uring_exit(int success, long code) { struct audit_context *ctx = audit_context(); if (ctx->dummy) { if (ctx->context != AUDIT_CTX_URING) return; goto out; } audit_return_fixup(ctx, success, code); if (ctx->context == AUDIT_CTX_SYSCALL) { /* * NOTE: See the note in __audit_uring_entry() about the case * where we may be called from process context before we * return to userspace via audit_syscall_exit(). In this * case we simply emit a URINGOP record and bail, the * normal syscall exit handling will take care of * everything else. * It is also worth mentioning that when we are called, * the current process creds may differ from the creds * used during the normal syscall processing; keep that * in mind if/when we move the record generation code. */ /* * We need to filter on the syscall info here to decide if we * should emit a URINGOP record. I know it seems odd but this * solves the problem where users have a filter to block *all* * syscall records in the "exit" filter; we want to preserve * the behavior here. 
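 *
 * (Illustration: if the only rule loaded is an "exit" filter rule with
 * the "never" action, the syscall filter leaves current_state at
 * AUDIT_STATE_DISABLED and the URINGOP record is suppressed as well.)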
 */
	audit_filter_syscall(current, ctx);
	if (ctx->current_state != AUDIT_STATE_RECORD)
		audit_filter_uring(current, ctx);
	audit_filter_inodes(current, ctx);
	if (ctx->current_state != AUDIT_STATE_RECORD)
		return;

	audit_log_uring(ctx);
	return;
	}

	/* this may generate CONFIG_CHANGE records */
	if (!list_empty(&ctx->killed_trees))
		audit_kill_trees(ctx);

	/* run through both filters to ensure we set the filterkey properly */
	audit_filter_uring(current, ctx);
	audit_filter_inodes(current, ctx);
	if (ctx->current_state != AUDIT_STATE_RECORD)
		goto out;
	audit_log_exit();

out:
	audit_reset_context(ctx);
}

/**
 * __audit_syscall_entry - fill in an audit record at syscall entry
 * @major: major syscall type (function)
 * @a1: additional syscall register 1
 * @a2: additional syscall register 2
 * @a3: additional syscall register 3
 * @a4: additional syscall register 4
 *
 * Fill in audit context at syscall entry. This only happens if the
 * audit context was created when the task was created and the state or
 * filters demand the audit context be built. If the state from the
 * per-task filter or from the per-syscall filter is AUDIT_STATE_RECORD,
 * then the record will be written at syscall exit time (otherwise, it
 * will only be written if another part of the kernel requests that it
 * be written).
 */
void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
			   unsigned long a3, unsigned long a4)
{
	struct audit_context *context = audit_context();
	enum audit_state state;

	if (!audit_enabled || !context)
		return;

	WARN_ON(context->context != AUDIT_CTX_UNUSED);
	WARN_ON(context->name_count);
	if (context->context != AUDIT_CTX_UNUSED || context->name_count) {
		audit_panic("unrecoverable error in audit_syscall_entry()");
		return;
	}

	state = context->state;
	if (state == AUDIT_STATE_DISABLED)
		return;

	context->dummy = !audit_n_rules;
	if (!context->dummy && state == AUDIT_STATE_BUILD) {
		context->prio = 0;
		if (auditd_test_task(current))
			return;
	}

	context->arch = syscall_get_arch(current);
	context->major = major;
	context->argv[0] = a1;
	context->argv[1] = a2;
	context->argv[2] = a3;
	context->argv[3] = a4;
	context->context = AUDIT_CTX_SYSCALL;
	context->current_state = state;
	ktime_get_coarse_real_ts64(&context->ctime);
}

/**
 * __audit_syscall_exit - deallocate audit context after a system call
 * @success: success value of the syscall
 * @return_code: return value of the syscall
 *
 * Tear down after system call. If the audit context has been marked as
 * auditable (either because of the AUDIT_STATE_RECORD state from
 * filtering, or because some other part of the kernel wrote an audit
 * message), then write out the syscall information. In all cases,
 * free the names stored from getname().
*/ void __audit_syscall_exit(int success, long return_code) { struct audit_context *context = audit_context(); if (!context || context->dummy || context->context != AUDIT_CTX_SYSCALL) goto out; /* this may generate CONFIG_CHANGE records */ if (!list_empty(&context->killed_trees)) audit_kill_trees(context); audit_return_fixup(context, success, return_code); /* run through both filters to ensure we set the filterkey properly */ audit_filter_syscall(current, context); audit_filter_inodes(current, context); if (context->current_state != AUDIT_STATE_RECORD) goto out; audit_log_exit(); out: audit_reset_context(context); } static inline void handle_one(const struct inode *inode) { struct audit_context *context; struct audit_tree_refs *p; struct audit_chunk *chunk; int count; if (likely(!inode->i_fsnotify_marks)) return; context = audit_context(); p = context->trees; count = context->tree_count; rcu_read_lock(); chunk = audit_tree_lookup(inode); rcu_read_unlock(); if (!chunk) return; if (likely(put_tree_ref(context, chunk))) return; if (unlikely(!grow_tree_refs(context))) { pr_warn("out of memory, audit has lost a tree reference\n"); audit_set_auditable(context); audit_put_chunk(chunk); unroll_tree_refs(context, p, count); return; } put_tree_ref(context, chunk); } static void handle_path(const struct dentry *dentry) { struct audit_context *context; struct audit_tree_refs *p; const struct dentry *d, *parent; struct audit_chunk *drop; unsigned long seq; int count; context = audit_context(); p = context->trees; count = context->tree_count; retry: drop = NULL; d = dentry; rcu_read_lock(); seq = read_seqbegin(&rename_lock); for (;;) { struct inode *inode = d_backing_inode(d); if (inode && unlikely(inode->i_fsnotify_marks)) { struct audit_chunk *chunk; chunk = audit_tree_lookup(inode); if (chunk) { if (unlikely(!put_tree_ref(context, chunk))) { drop = chunk; break; } } } parent = d->d_parent; if (parent == d) break; d = parent; } if (unlikely(read_seqretry(&rename_lock, seq) || drop)) { /* in this order */ rcu_read_unlock(); if (!drop) { /* just a race with rename */ unroll_tree_refs(context, p, count); goto retry; } audit_put_chunk(drop); if (grow_tree_refs(context)) { /* OK, got more space */ unroll_tree_refs(context, p, count); goto retry; } /* too bad */ pr_warn("out of memory, audit has lost a tree reference\n"); unroll_tree_refs(context, p, count); audit_set_auditable(context); return; } rcu_read_unlock(); } static struct audit_names *audit_alloc_name(struct audit_context *context, unsigned char type) { struct audit_names *aname; if (context->name_count < AUDIT_NAMES) { aname = &context->preallocated_names[context->name_count]; memset(aname, 0, sizeof(*aname)); } else { aname = kzalloc(sizeof(*aname), GFP_NOFS); if (!aname) return NULL; aname->should_free = true; } aname->ino = AUDIT_INO_UNSET; aname->type = type; list_add_tail(&aname->list, &context->names_list); context->name_count++; if (!context->pwd.dentry) get_fs_pwd(current->fs, &context->pwd); return aname; } /** * __audit_reusename - fill out filename with info from existing entry * @uptr: userland ptr to pathname * * Search the audit_names list for the current audit context. If there is an * existing entry with a matching "uptr" then return the filename * associated with that audit_name. If not, return NULL. 
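 *
 * Illustrative caller pattern (a simplified sketch, not a verbatim quote of
 * fs/namei.c): getname_flags() tries audit_reusename(uptr) first and only
 * allocates a fresh struct filename when it returns NULL, so repeated
 * lookups of the same user pointer within one syscall share one refcounted
 * entry.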
*/ struct filename * __audit_reusename(const __user char *uptr) { struct audit_context *context = audit_context(); struct audit_names *n; list_for_each_entry(n, &context->names_list, list) { if (!n->name) continue; if (n->name->uptr == uptr) return refname(n->name); } return NULL; } /** * __audit_getname - add a name to the list * @name: name to add * * Add a name to the list of audit names for this context. * Called from fs/namei.c:getname(). */ void __audit_getname(struct filename *name) { struct audit_context *context = audit_context(); struct audit_names *n; if (context->context == AUDIT_CTX_UNUSED) return; n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); if (!n) return; n->name = name; n->name_len = AUDIT_NAME_FULL; name->aname = n; refname(name); } static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry) { struct cpu_vfs_cap_data caps; int rc; if (!dentry) return 0; rc = get_vfs_caps_from_disk(&nop_mnt_idmap, dentry, &caps); if (rc) return rc; name->fcap.permitted = caps.permitted; name->fcap.inheritable = caps.inheritable; name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); name->fcap.rootid = caps.rootid; name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT; return 0; } /* Copy inode data into an audit_names. */ static void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, struct inode *inode, unsigned int flags) { name->ino = inode->i_ino; name->dev = inode->i_sb->s_dev; name->mode = inode->i_mode; name->uid = inode->i_uid; name->gid = inode->i_gid; name->rdev = inode->i_rdev; security_inode_getlsmprop(inode, &name->oprop); if (flags & AUDIT_INODE_NOEVAL) { name->fcap_ver = -1; return; } audit_copy_fcaps(name, dentry); } /** * __audit_inode - store the inode and device from a lookup * @name: name being audited * @dentry: dentry being audited * @flags: attributes for this particular entry */ void __audit_inode(struct filename *name, const struct dentry *dentry, unsigned int flags) { struct audit_context *context = audit_context(); struct inode *inode = d_backing_inode(dentry); struct audit_names *n; bool parent = flags & AUDIT_INODE_PARENT; struct audit_entry *e; struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS]; int i; if (context->context == AUDIT_CTX_UNUSED) return; rcu_read_lock(); list_for_each_entry_rcu(e, list, list) { for (i = 0; i < e->rule.field_count; i++) { struct audit_field *f = &e->rule.fields[i]; if (f->type == AUDIT_FSTYPE && audit_comparator(inode->i_sb->s_magic, f->op, f->val) && e->rule.action == AUDIT_NEVER) { rcu_read_unlock(); return; } } } rcu_read_unlock(); if (!name) goto out_alloc; /* * If we have a pointer to an audit_names entry already, then we can * just use it directly if the type is correct. */ n = name->aname; if (n) { if (parent) { if (n->type == AUDIT_TYPE_PARENT || n->type == AUDIT_TYPE_UNKNOWN) goto out; } else { if (n->type != AUDIT_TYPE_PARENT) goto out; } } list_for_each_entry_reverse(n, &context->names_list, list) { if (n->ino) { /* valid inode number, use that for the comparison */ if (n->ino != inode->i_ino || n->dev != inode->i_sb->s_dev) continue; } else if (n->name) { /* inode number has not been set, check the name */ if (strcmp(n->name->name, name->name)) continue; } else /* no inode and no name (?!) ... this is odd ... 
*/ continue; /* match the correct record type */ if (parent) { if (n->type == AUDIT_TYPE_PARENT || n->type == AUDIT_TYPE_UNKNOWN) goto out; } else { if (n->type != AUDIT_TYPE_PARENT) goto out; } } out_alloc: /* unable to find an entry with both a matching name and type */ n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); if (!n) return; if (name) { n->name = name; refname(name); } out: if (parent) { n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; n->type = AUDIT_TYPE_PARENT; if (flags & AUDIT_INODE_HIDDEN) n->hidden = true; } else { n->name_len = AUDIT_NAME_FULL; n->type = AUDIT_TYPE_NORMAL; } handle_path(dentry); audit_copy_inode(n, dentry, inode, flags & AUDIT_INODE_NOEVAL); } void __audit_file(const struct file *file) { __audit_inode(NULL, file->f_path.dentry, 0); } /** * __audit_inode_child - collect inode info for created/removed objects * @parent: inode of dentry parent * @dentry: dentry being audited * @type: AUDIT_TYPE_* value that we're looking for * * For syscalls that create or remove filesystem objects, audit_inode * can only collect information for the filesystem object's parent. * This call updates the audit context with the child's information. * Syscalls that create a new filesystem object must be hooked after * the object is created. Syscalls that remove a filesystem object * must be hooked prior, in order to capture the target inode during * unsuccessful attempts. */ void __audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type) { struct audit_context *context = audit_context(); struct inode *inode = d_backing_inode(dentry); const struct qstr *dname = &dentry->d_name; struct audit_names *n, *found_parent = NULL, *found_child = NULL; struct audit_entry *e; struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS]; int i; if (context->context == AUDIT_CTX_UNUSED) return; rcu_read_lock(); list_for_each_entry_rcu(e, list, list) { for (i = 0; i < e->rule.field_count; i++) { struct audit_field *f = &e->rule.fields[i]; if (f->type == AUDIT_FSTYPE && audit_comparator(parent->i_sb->s_magic, f->op, f->val) && e->rule.action == AUDIT_NEVER) { rcu_read_unlock(); return; } } } rcu_read_unlock(); if (inode) handle_one(inode); /* look for a parent entry first */ list_for_each_entry(n, &context->names_list, list) { if (!n->name || (n->type != AUDIT_TYPE_PARENT && n->type != AUDIT_TYPE_UNKNOWN)) continue; if (n->ino == parent->i_ino && n->dev == parent->i_sb->s_dev && !audit_compare_dname_path(dname, n->name->name, n->name_len)) { if (n->type == AUDIT_TYPE_UNKNOWN) n->type = AUDIT_TYPE_PARENT; found_parent = n; break; } } cond_resched(); /* is there a matching child entry? */ list_for_each_entry(n, &context->names_list, list) { /* can only match entries that have a name */ if (!n->name || (n->type != type && n->type != AUDIT_TYPE_UNKNOWN)) continue; if (!strcmp(dname->name, n->name->name) || !audit_compare_dname_path(dname, n->name->name, found_parent ? found_parent->name_len : AUDIT_NAME_FULL)) { if (n->type == AUDIT_TYPE_UNKNOWN) n->type = type; found_child = n; break; } } if (!found_parent) { /* create a new, "anonymous" parent record */ n = audit_alloc_name(context, AUDIT_TYPE_PARENT); if (!n) return; audit_copy_inode(n, NULL, parent, 0); } if (!found_child) { found_child = audit_alloc_name(context, type); if (!found_child) return; /* Re-use the name belonging to the slot for a matching parent * directory. 
All names for this context are relinquished in * audit_free_names() */ if (found_parent) { found_child->name = found_parent->name; found_child->name_len = AUDIT_NAME_FULL; refname(found_child->name); } } if (inode) audit_copy_inode(found_child, dentry, inode, 0); else found_child->ino = AUDIT_INO_UNSET; } EXPORT_SYMBOL_GPL(__audit_inode_child); /** * auditsc_get_stamp - get local copies of audit_context values * @ctx: audit_context for the task * @t: timespec64 to store time recorded in the audit_context * @serial: serial value that is recorded in the audit_context * * Also sets the context as auditable. */ int auditsc_get_stamp(struct audit_context *ctx, struct timespec64 *t, unsigned int *serial) { if (ctx->context == AUDIT_CTX_UNUSED) return 0; if (!ctx->serial) ctx->serial = audit_serial(); t->tv_sec = ctx->ctime.tv_sec; t->tv_nsec = ctx->ctime.tv_nsec; *serial = ctx->serial; if (!ctx->prio) { ctx->prio = 1; ctx->current_state = AUDIT_STATE_RECORD; } return 1; } /** * __audit_mq_open - record audit data for a POSIX MQ open * @oflag: open flag * @mode: mode bits * @attr: queue attributes * */ void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) { struct audit_context *context = audit_context(); if (attr) memcpy(&context->mq_open.attr, attr, sizeof(struct mq_attr)); else memset(&context->mq_open.attr, 0, sizeof(struct mq_attr)); context->mq_open.oflag = oflag; context->mq_open.mode = mode; context->type = AUDIT_MQ_OPEN; } /** * __audit_mq_sendrecv - record audit data for a POSIX MQ timed send/receive * @mqdes: MQ descriptor * @msg_len: Message length * @msg_prio: Message priority * @abs_timeout: Message timeout in absolute time * */ void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout) { struct audit_context *context = audit_context(); struct timespec64 *p = &context->mq_sendrecv.abs_timeout; if (abs_timeout) memcpy(p, abs_timeout, sizeof(*p)); else memset(p, 0, sizeof(*p)); context->mq_sendrecv.mqdes = mqdes; context->mq_sendrecv.msg_len = msg_len; context->mq_sendrecv.msg_prio = msg_prio; context->type = AUDIT_MQ_SENDRECV; } /** * __audit_mq_notify - record audit data for a POSIX MQ notify * @mqdes: MQ descriptor * @notification: Notification event * */ void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification) { struct audit_context *context = audit_context(); if (notification) context->mq_notify.sigev_signo = notification->sigev_signo; else context->mq_notify.sigev_signo = 0; context->mq_notify.mqdes = mqdes; context->type = AUDIT_MQ_NOTIFY; } /** * __audit_mq_getsetattr - record audit data for a POSIX MQ get/set attribute * @mqdes: MQ descriptor * @mqstat: MQ flags * */ void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) { struct audit_context *context = audit_context(); context->mq_getsetattr.mqdes = mqdes; context->mq_getsetattr.mqstat = *mqstat; context->type = AUDIT_MQ_GETSETATTR; } /** * __audit_ipc_obj - record audit data for ipc object * @ipcp: ipc permissions * */ void __audit_ipc_obj(struct kern_ipc_perm *ipcp) { struct audit_context *context = audit_context(); context->ipc.uid = ipcp->uid; context->ipc.gid = ipcp->gid; context->ipc.mode = ipcp->mode; context->ipc.has_perm = 0; security_ipc_getlsmprop(ipcp, &context->ipc.oprop); context->type = AUDIT_IPC; } /** * __audit_ipc_set_perm - record audit data for new ipc permissions * @qbytes: msgq bytes * @uid: msgq user id * @gid: msgq group id * @mode: msgq mode (permissions) * * Called only after audit_ipc_obj(). 
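 *
 * A hypothetical msgctl(IPC_SET) call would therefore hit the audit hooks
 * in this order (sketch only):
 *
 *	__audit_ipc_obj(ipcp);				/* old uid/gid/mode */
 *	__audit_ipc_set_perm(qbytes, uid, gid, mode);	/* requested values */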
*/ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode) { struct audit_context *context = audit_context(); context->ipc.qbytes = qbytes; context->ipc.perm_uid = uid; context->ipc.perm_gid = gid; context->ipc.perm_mode = mode; context->ipc.has_perm = 1; } void __audit_bprm(struct linux_binprm *bprm) { struct audit_context *context = audit_context(); context->type = AUDIT_EXECVE; context->execve.argc = bprm->argc; } /** * __audit_socketcall - record audit data for sys_socketcall * @nargs: number of args, which should not be more than AUDITSC_ARGS. * @args: args array * */ int __audit_socketcall(int nargs, unsigned long *args) { struct audit_context *context = audit_context(); if (nargs <= 0 || nargs > AUDITSC_ARGS || !args) return -EINVAL; context->type = AUDIT_SOCKETCALL; context->socketcall.nargs = nargs; memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); return 0; } /** * __audit_fd_pair - record audit data for pipe and socketpair * @fd1: the first file descriptor * @fd2: the second file descriptor * */ void __audit_fd_pair(int fd1, int fd2) { struct audit_context *context = audit_context(); context->fds[0] = fd1; context->fds[1] = fd2; } /** * __audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto * @len: data length in user space * @a: data address in kernel space * * Returns 0 for success or NULL context or < 0 on error. */ int __audit_sockaddr(int len, void *a) { struct audit_context *context = audit_context(); if (!context->sockaddr) { void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL); if (!p) return -ENOMEM; context->sockaddr = p; } context->sockaddr_len = len; memcpy(context->sockaddr, a, len); return 0; } void __audit_ptrace(struct task_struct *t) { struct audit_context *context = audit_context(); context->target_pid = task_tgid_nr(t); context->target_auid = audit_get_loginuid(t); context->target_uid = task_uid(t); context->target_sessionid = audit_get_sessionid(t); strscpy(context->target_comm, t->comm); security_task_getlsmprop_obj(t, &context->target_ref); } /** * audit_signal_info_syscall - record signal info for syscalls * @t: task being signaled * * If the audit subsystem is being terminated, record the task (pid) * and uid that is doing that. 
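 *
 * Note on storage: the first signal target of a syscall is recorded inline
 * in the audit_context below; any further targets spill into
 * audit_aux_data_pids chunks of AUDIT_AUX_PIDS entries each, allocated
 * with GFP_ATOMIC.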
 */
int audit_signal_info_syscall(struct task_struct *t)
{
	struct audit_aux_data_pids *axp;
	struct audit_context *ctx = audit_context();
	kuid_t t_uid = task_uid(t);

	if (!audit_signals || audit_dummy_context())
		return 0;

	/* optimize the common case by putting first signal recipient directly
	 * in audit_context */
	if (!ctx->target_pid) {
		ctx->target_pid = task_tgid_nr(t);
		ctx->target_auid = audit_get_loginuid(t);
		ctx->target_uid = t_uid;
		ctx->target_sessionid = audit_get_sessionid(t);
		strscpy(ctx->target_comm, t->comm);
		security_task_getlsmprop_obj(t, &ctx->target_ref);
		return 0;
	}

	axp = (void *)ctx->aux_pids;
	if (!axp || axp->pid_count == AUDIT_AUX_PIDS) {
		axp = kzalloc(sizeof(*axp), GFP_ATOMIC);
		if (!axp)
			return -ENOMEM;
		axp->d.type = AUDIT_OBJ_PID;
		axp->d.next = ctx->aux_pids;
		ctx->aux_pids = (void *)axp;
	}
	BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS);

	axp->target_pid[axp->pid_count] = task_tgid_nr(t);
	axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
	axp->target_uid[axp->pid_count] = t_uid;
	axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
	security_task_getlsmprop_obj(t, &axp->target_ref[axp->pid_count]);
	strscpy(axp->target_comm[axp->pid_count], t->comm);
	axp->pid_count++;

	return 0;
}

/**
 * __audit_log_bprm_fcaps - store information about a loading bprm and relevant fcaps
 * @bprm: pointer to the bprm being processed
 * @new: the proposed new credentials
 * @old: the old credentials
 *
 * Simply check if the proc already has the caps given by the file and if not
 * store the priv escalation info for later auditing at the end of the syscall
 *
 * -Eric
 */
int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
			   const struct cred *new, const struct cred *old)
{
	struct audit_aux_data_bprm_fcaps *ax;
	struct audit_context *context = audit_context();
	struct cpu_vfs_cap_data vcaps;

	ax = kmalloc(sizeof(*ax), GFP_KERNEL);
	if (!ax)
		return -ENOMEM;

	ax->d.type = AUDIT_BPRM_FCAPS;
	ax->d.next = context->aux;
	context->aux = (void *)ax;

	get_vfs_caps_from_disk(&nop_mnt_idmap,
			       bprm->file->f_path.dentry, &vcaps);

	ax->fcap.permitted = vcaps.permitted;
	ax->fcap.inheritable = vcaps.inheritable;
	ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
	ax->fcap.rootid = vcaps.rootid;
	ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;

	ax->old_pcap.permitted = old->cap_permitted;
	ax->old_pcap.inheritable = old->cap_inheritable;
	ax->old_pcap.effective = old->cap_effective;
	ax->old_pcap.ambient = old->cap_ambient;

	ax->new_pcap.permitted = new->cap_permitted;
	ax->new_pcap.inheritable = new->cap_inheritable;
	ax->new_pcap.effective = new->cap_effective;
	ax->new_pcap.ambient = new->cap_ambient;
	return 0;
}

/**
 * __audit_log_capset - store information about the arguments to the capset syscall
 * @new: the new credentials
 * @old: the old (current) credentials
 *
 * Record the arguments userspace sent to sys_capset for later printing by the
 * audit system if applicable
 */
void __audit_log_capset(const struct cred *new, const struct cred *old)
{
	struct audit_context *context = audit_context();

	context->capset.pid = task_tgid_nr(current);
	context->capset.cap.effective = new->cap_effective;
	context->capset.cap.inheritable = new->cap_inheritable;
	context->capset.cap.permitted = new->cap_permitted;
	context->capset.cap.ambient = new->cap_ambient;
	context->type = AUDIT_CAPSET;
}

void __audit_mmap_fd(int fd, int flags)
{
	struct audit_context *context = audit_context();

	context->mmap.fd = fd;
	context->mmap.flags = flags;
	context->type = AUDIT_MMAP;
}

void __audit_openat2_how(struct open_how
*how) { struct audit_context *context = audit_context(); context->openat2.flags = how->flags; context->openat2.mode = how->mode; context->openat2.resolve = how->resolve; context->type = AUDIT_OPENAT2; } void __audit_log_kern_module(const char *name) { struct audit_context *context = audit_context(); context->module.name = kstrdup(name, GFP_KERNEL); if (!context->module.name) audit_log_lost("out of memory in __audit_log_kern_module"); context->type = AUDIT_KERN_MODULE; } void __audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) { /* {subj,obj}_trust values are {0,1,2}: no,yes,unknown */ switch (friar->hdr.type) { case FAN_RESPONSE_INFO_NONE: audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY, "resp=%u fan_type=%u fan_info=0 subj_trust=2 obj_trust=2", response, FAN_RESPONSE_INFO_NONE); break; case FAN_RESPONSE_INFO_AUDIT_RULE: audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY, "resp=%u fan_type=%u fan_info=%X subj_trust=%u obj_trust=%u", response, friar->hdr.type, friar->rule_number, friar->subj_trust, friar->obj_trust); } } void __audit_tk_injoffset(struct timespec64 offset) { struct audit_context *context = audit_context(); /* only set type if not already set by NTP */ if (!context->type) context->type = AUDIT_TIME_INJOFFSET; memcpy(&context->time.tk_injoffset, &offset, sizeof(offset)); } void __audit_ntp_log(const struct audit_ntp_data *ad) { struct audit_context *context = audit_context(); int type; for (type = 0; type < AUDIT_NTP_NVALS; type++) if (ad->vals[type].newval != ad->vals[type].oldval) { /* unconditionally set type, overwriting TK */ context->type = AUDIT_TIME_ADJNTPVAL; memcpy(&context->time.ntp_data, ad, sizeof(*ad)); break; } } void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, enum audit_nfcfgop op, gfp_t gfp) { struct audit_buffer *ab; char comm[sizeof(current->comm)]; ab = audit_log_start(audit_context(), gfp, AUDIT_NETFILTER_CFG); if (!ab) return; audit_log_format(ab, "table=%s family=%u entries=%u op=%s", name, af, nentries, audit_nfcfgs[op].s); audit_log_format(ab, " pid=%u", task_tgid_nr(current)); audit_log_task_context(ab); /* subj= */ audit_log_format(ab, " comm="); audit_log_untrustedstring(ab, get_task_comm(comm, current)); audit_log_end(ab); } EXPORT_SYMBOL_GPL(__audit_log_nfcfg); static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; kgid_t gid; unsigned int sessionid; char comm[sizeof(current->comm)]; auid = audit_get_loginuid(current); sessionid = audit_get_sessionid(current); current_uid_gid(&uid, &gid); audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", from_kuid(&init_user_ns, auid), from_kuid(&init_user_ns, uid), from_kgid(&init_user_ns, gid), sessionid); audit_log_task_context(ab); audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current)); audit_log_untrustedstring(ab, get_task_comm(comm, current)); audit_log_d_path_exe(ab, current->mm); } /** * audit_core_dumps - record information about processes that end abnormally * @signr: signal value * * If a process ends with a core dump, something fishy is going on and we * should record the event for investigation. 
 */
void audit_core_dumps(long signr)
{
	struct audit_buffer *ab;

	if (!audit_enabled)
		return;

	if (signr == SIGQUIT)	/* don't care for those */
		return;

	ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_ANOM_ABEND);
	if (unlikely(!ab))
		return;
	audit_log_task(ab);
	audit_log_format(ab, " sig=%ld res=1", signr);
	audit_log_end(ab);
}

/**
 * audit_seccomp - record information about a seccomp action
 * @syscall: syscall number
 * @signr: signal value
 * @code: the seccomp action
 *
 * Record the information associated with a seccomp action. Event filtering for
 * seccomp actions that are not to be logged is done in seccomp_log().
 * Therefore, this function forces auditing independent of the audit_enabled
 * and dummy context state because seccomp actions should be logged even when
 * audit is not in use.
 */
void audit_seccomp(unsigned long syscall, long signr, int code)
{
	struct audit_buffer *ab;

	ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_SECCOMP);
	if (unlikely(!ab))
		return;
	audit_log_task(ab);
	audit_log_format(ab,
			 " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x",
			 signr, syscall_get_arch(current), syscall,
			 in_compat_syscall(), KSTK_EIP(current), code);
	audit_log_end(ab);
}

void audit_seccomp_actions_logged(const char *names, const char *old_names,
				  int res)
{
	struct audit_buffer *ab;

	if (!audit_enabled)
		return;

	ab = audit_log_start(audit_context(), GFP_KERNEL,
			     AUDIT_CONFIG_CHANGE);
	if (unlikely(!ab))
		return;

	audit_log_format(ab,
			 "op=seccomp-logging actions=%s old-actions=%s res=%d",
			 names, old_names, res);
	audit_log_end(ab);
}

struct list_head *audit_killed_trees(void)
{
	struct audit_context *ctx = audit_context();

	if (likely(!ctx || ctx->context == AUDIT_CTX_UNUSED))
		return NULL;
	return &ctx->killed_trees;
}
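
/*
 * Illustrative lifecycle (a sketch, not kernel code): for a fully audited
 * syscall, the hooks in this file fire roughly in this order:
 *
 *	__audit_syscall_entry(major, a1, a2, a3, a4);
 *	    ... __audit_getname()/__audit_inode() collect objects ...
 *	__audit_syscall_exit(success, return_code);
 *	    -> audit_return_fixup(), exit/inode filters, audit_log_exit(),
 *	       which emits the SYSCALL, CWD, PATH, ... and EOE records.
 */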
// SPDX-License-Identifier: GPL-1.0+
/* generic HDLC line discipline for Linux
 *
 * Written by Paul Fulghum paulkf@microgate.com
 * for Microgate Corporation
 *
 * Microgate and SyncLink are registered trademarks of Microgate
 * Corporation
 *
 * Adapted from ppp.c, written by Michael Callahan <callahan@maths.ox.ac.uk>,
 * Al Longyear <longyear@netcom.com>,
 * Paul Mackerras <Paul.Mackerras@cs.anu.edu.au>
 *
 * Original release 01/11/99
 *
 * This module implements the tty line discipline N_HDLC for use with
 * tty device drivers that support bit-synchronous HDLC communications.
 *
 * All HDLC data is frame oriented which means:
 *
 * 1. tty write calls represent one complete transmit frame of data
 *    The device driver should accept the complete frame or none of
 *    the frame (busy) in the write method. Each write call should have
 *    a byte count in the range of 2-65535 bytes (2 is min HDLC frame
 *    with 1 addr byte and 1 ctrl byte). The max byte count of 65535
 *    should include any crc bytes required. For example, when using
 *    CCITT CRC32, 4 crc bytes are required, so the maximum size frame
 *    the application may transmit is limited to 65531 bytes. For CCITT
 *    CRC16, the maximum application frame size would be 65533.
 *
 * 2. receive callbacks from the device driver represent
 *    one received frame. The device driver should bypass
 *    the tty flip buffer and call the line discipline receive
 *    callback directly to avoid fragmenting or concatenating
 *    multiple frames into a single receive callback.
 *
 *    The HDLC line discipline queues the receive frames in separate
 *    buffers so complete receive frames can be returned by the
 *    tty read calls.
 *
 * 3. tty read calls return an entire frame of data or nothing.
 *
 * 4. all send and receive data is considered raw. No processing
 *    or translation is performed by the line discipline, regardless
 *    of the tty flags
 *
 * 5. When the line discipline is queried for the amount of receive
 *    data available (FIONREAD), 0 is returned if no data is available,
 *    otherwise the count of the next available frame is returned
 *    (instead of the sum of all received frame counts).
 *
 * These conventions allow the standard tty programming interface
 * to be used for synchronous HDLC applications when used with
 * this line discipline (or another line discipline that is frame
 * oriented such as N_PPP).
 *
 * The SyncLink driver (synclink.c) implements both asynchronous
 * (using standard line discipline N_TTY) and synchronous HDLC
 * (using N_HDLC) communications, with the latter using the above
 * conventions.
 *
 * This implementation is very basic and does not maintain
 * any statistics. The main point is to enforce the raw data
 * and frame orientation of HDLC communications.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
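 *
 * A minimal userspace sketch of these conventions (illustrative only, not
 * part of this driver; the device path is an assumption):
 *
 *	int fd = open("/dev/ttyS0", O_RDWR);	// an HDLC-capable tty
 *	int ldisc = N_HDLC;
 *	ioctl(fd, TIOCSETD, &ldisc);		// attach this line discipline
 *	write(fd, frame, frame_len);		// one write == one tx frame
 *	n = read(fd, buf, sizeof(buf));		// returns one whole rx frame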
 */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/interrupt.h>
#include <linux/ptrace.h>
#include <linux/poll.h>
#include <linux/in.h>
#include <linux/ioctl.h>
#include <linux/slab.h>
#include <linux/tty.h>
#include <linux/errno.h>
#include <linux/string.h>	/* used in new tty drivers */
#include <linux/signal.h>	/* used in new tty drivers */
#include <linux/if.h>
#include <linux/bitops.h>

#include <linux/uaccess.h>
#include "tty.h"

/*
 * Buffers for individual HDLC frames
 */
#define MAX_HDLC_FRAME_SIZE	65535
#define DEFAULT_RX_BUF_COUNT	10
#define MAX_RX_BUF_COUNT	60
#define DEFAULT_TX_BUF_COUNT	3

struct n_hdlc_buf {
	struct list_head  list_item;
	size_t		  count;
	u8		  buf[];
};

struct n_hdlc_buf_list {
	struct list_head  list;
	int		  count;
	spinlock_t	  spinlock;
};

/**
 * struct n_hdlc - per device instance data structure
 * @tbusy: reentrancy flag for tx wakeup code
 * @woke_up: tx wakeup needs to be run again as it was called while @tbusy
 * @tx_buf_list: list of pending transmit frame buffers
 * @rx_buf_list: list of received frame buffers
 * @tx_free_buf_list: list of unused transmit frame buffers
 * @rx_free_buf_list: list of unused received frame buffers
 * @write_work: work queue entry used to defer tx wakeup processing
 * @tty_for_write_work: tty that @write_work operates on
 */
struct n_hdlc {
	bool			tbusy;
	bool			woke_up;
	struct n_hdlc_buf_list	tx_buf_list;
	struct n_hdlc_buf_list	rx_buf_list;
	struct n_hdlc_buf_list	tx_free_buf_list;
	struct n_hdlc_buf_list	rx_free_buf_list;
	struct work_struct	write_work;
	struct tty_struct	*tty_for_write_work;
};

/*
 * HDLC buffer list manipulation functions
 */
static void n_hdlc_buf_return(struct n_hdlc_buf_list *buf_list,
			      struct n_hdlc_buf *buf);
static void n_hdlc_buf_put(struct n_hdlc_buf_list *list,
			   struct n_hdlc_buf *buf);
static struct n_hdlc_buf *n_hdlc_buf_get(struct n_hdlc_buf_list *list);

/* Local functions */

static struct n_hdlc *n_hdlc_alloc(void);
static void n_hdlc_tty_write_work(struct work_struct *work);

/* max frame size for memory allocations */
static int maxframe = 4096;

static void flush_rx_queue(struct tty_struct *tty)
{
	struct n_hdlc *n_hdlc = tty->disc_data;
	struct n_hdlc_buf *buf;

	while ((buf = n_hdlc_buf_get(&n_hdlc->rx_buf_list)))
		n_hdlc_buf_put(&n_hdlc->rx_free_buf_list, buf);
}

static void flush_tx_queue(struct tty_struct *tty)
{
	struct n_hdlc *n_hdlc = tty->disc_data;
	struct n_hdlc_buf *buf;

	while ((buf = n_hdlc_buf_get(&n_hdlc->tx_buf_list)))
		n_hdlc_buf_put(&n_hdlc->tx_free_buf_list, buf);
}

static void n_hdlc_free_buf_list(struct n_hdlc_buf_list *list)
{
	struct n_hdlc_buf *buf;

	do {
		buf = n_hdlc_buf_get(list);
		kfree(buf);
	} while (buf);
}

/**
 * n_hdlc_tty_close - line discipline close
 * @tty: pointer to tty info structure
 *
 * Called when the line discipline is changed to something
 * else, the tty is closed, or the tty detects a hangup.
 */
static void n_hdlc_tty_close(struct tty_struct *tty)
{
	struct n_hdlc *n_hdlc = tty->disc_data;

#if defined(TTY_NO_WRITE_SPLIT)
	clear_bit(TTY_NO_WRITE_SPLIT, &tty->flags);
#endif
	tty->disc_data = NULL;

	/* Ensure that the n_hdlcd process is not hanging on select()/poll() */
	wake_up_interruptible(&tty->read_wait);
	wake_up_interruptible(&tty->write_wait);

	cancel_work_sync(&n_hdlc->write_work);

	n_hdlc_free_buf_list(&n_hdlc->rx_free_buf_list);
	n_hdlc_free_buf_list(&n_hdlc->tx_free_buf_list);
	n_hdlc_free_buf_list(&n_hdlc->rx_buf_list);
	n_hdlc_free_buf_list(&n_hdlc->tx_buf_list);
	kfree(n_hdlc);
}	/* end of n_hdlc_tty_close() */

/**
 * n_hdlc_tty_open - called when line discipline changed to n_hdlc
 * @tty: pointer to tty info structure
 *
 * Returns 0 if success, otherwise error code
 */
static int n_hdlc_tty_open(struct tty_struct *tty)
{
	struct n_hdlc *n_hdlc = tty->disc_data;

	pr_debug("%s() called (device=%s)\n", __func__, tty->name);

	/* There should not be an existing table for this slot. */
	if (n_hdlc) {
		pr_err("%s: tty already associated!\n", __func__);
		return -EEXIST;
	}

	n_hdlc = n_hdlc_alloc();
	if (!n_hdlc) {
		pr_err("%s: n_hdlc_alloc failed\n", __func__);
		return -ENFILE;
	}

	INIT_WORK(&n_hdlc->write_work, n_hdlc_tty_write_work);
	n_hdlc->tty_for_write_work = tty;
	tty->disc_data = n_hdlc;
	tty->receive_room = 65536;

	/* change tty_io write() to not split large writes into 8K chunks */
	set_bit(TTY_NO_WRITE_SPLIT, &tty->flags);

	/* flush receive data from driver */
	tty_driver_flush_buffer(tty);

	return 0;
}	/* end of n_hdlc_tty_open() */

/**
 * n_hdlc_send_frames - send frames on pending send buffer list
 * @n_hdlc: pointer to ldisc instance data
 * @tty: pointer to tty instance data
 *
 * Send frames on pending send buffer list until the driver does not accept a
 * frame (busy). This function is called after adding a frame to the send
 * buffer list and by the tty wakeup callback.
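 *
 * Reentrancy is handled with the @tbusy/@woke_up pair: a caller that finds
 * @tbusy set merely records @woke_up and returns, while the active sender
 * loops back (check_again) after clearing @tbusy, so no wakeup is lost.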
 */
static void n_hdlc_send_frames(struct n_hdlc *n_hdlc, struct tty_struct *tty)
{
	unsigned long flags;
	struct n_hdlc_buf *tbuf;
	ssize_t actual;

check_again:

	spin_lock_irqsave(&n_hdlc->tx_buf_list.spinlock, flags);
	if (n_hdlc->tbusy) {
		n_hdlc->woke_up = true;
		spin_unlock_irqrestore(&n_hdlc->tx_buf_list.spinlock, flags);
		return;
	}
	n_hdlc->tbusy = true;
	n_hdlc->woke_up = false;
	spin_unlock_irqrestore(&n_hdlc->tx_buf_list.spinlock, flags);

	tbuf = n_hdlc_buf_get(&n_hdlc->tx_buf_list);
	while (tbuf) {
		pr_debug("sending frame %p, count=%zu\n", tbuf, tbuf->count);

		/* Send the next block of data to device */
		set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags);
		actual = tty->ops->write(tty, tbuf->buf, tbuf->count);

		/* rollback was possible and has been done */
		if (actual == -ERESTARTSYS) {
			n_hdlc_buf_return(&n_hdlc->tx_buf_list, tbuf);
			break;
		}
		/* if transmit error, throw frame away by */
		/* pretending it was accepted by driver */
		if (actual < 0)
			actual = tbuf->count;

		if (actual == tbuf->count) {
			pr_debug("frame %p completed\n", tbuf);

			/* free current transmit buffer */
			n_hdlc_buf_put(&n_hdlc->tx_free_buf_list, tbuf);

			/* wake up sleeping writers */
			wake_up_interruptible(&tty->write_wait);

			/* get next pending transmit buffer */
			tbuf = n_hdlc_buf_get(&n_hdlc->tx_buf_list);
		} else {
			pr_debug("frame %p pending\n", tbuf);

			/*
			 * the buffer was not accepted by driver,
			 * return it back into tx queue
			 */
			n_hdlc_buf_return(&n_hdlc->tx_buf_list, tbuf);
			break;
		}
	}

	if (!tbuf)
		clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags);

	/* Clear the re-entry flag */
	spin_lock_irqsave(&n_hdlc->tx_buf_list.spinlock, flags);
	n_hdlc->tbusy = false;
	spin_unlock_irqrestore(&n_hdlc->tx_buf_list.spinlock, flags);

	if (n_hdlc->woke_up)
		goto check_again;
}	/* end of n_hdlc_send_frames() */

/**
 * n_hdlc_tty_write_work - Asynchronous callback for transmit wakeup
 * @work: pointer to work_struct
 *
 * Called when the low level device driver can accept more send data.
 */
static void n_hdlc_tty_write_work(struct work_struct *work)
{
	struct n_hdlc *n_hdlc = container_of(work, struct n_hdlc, write_work);
	struct tty_struct *tty = n_hdlc->tty_for_write_work;

	n_hdlc_send_frames(n_hdlc, tty);
}	/* end of n_hdlc_tty_write_work() */

/**
 * n_hdlc_tty_wakeup - Callback for transmit wakeup
 * @tty: pointer to associated tty instance data
 *
 * Called when the low level device driver can accept more send data.
 */
static void n_hdlc_tty_wakeup(struct tty_struct *tty)
{
	struct n_hdlc *n_hdlc = tty->disc_data;

	schedule_work(&n_hdlc->write_work);
}	/* end of n_hdlc_tty_wakeup() */

/**
 * n_hdlc_tty_receive - Called by tty driver when receive data is available
 * @tty: pointer to tty instance data
 * @data: pointer to received data
 * @flags: pointer to flags for data
 * @count: count of received data in bytes
 *
 * Called by tty low level driver when receive data is available. Data is
 * interpreted as one HDLC frame.
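 *
 * Buffer strategy below: a buffer is taken from the rx free list; if that
 * list is empty, a new buffer is allocated with GFP_ATOMIC as long as
 * fewer than MAX_RX_BUF_COUNT rx buffers exist, otherwise the frame is
 * silently discarded.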
*/ static void n_hdlc_tty_receive(struct tty_struct *tty, const u8 *data, const u8 *flags, size_t count) { register struct n_hdlc *n_hdlc = tty->disc_data; register struct n_hdlc_buf *buf; pr_debug("%s() called count=%zu\n", __func__, count); if (count > maxframe) { pr_debug("rx count>maxframesize, data discarded\n"); return; } /* get a free HDLC buffer */ buf = n_hdlc_buf_get(&n_hdlc->rx_free_buf_list); if (!buf) { /* * no buffers in free list, attempt to allocate another rx * buffer unless the maximum count has been reached */ if (n_hdlc->rx_buf_list.count < MAX_RX_BUF_COUNT) buf = kmalloc(struct_size(buf, buf, maxframe), GFP_ATOMIC); } if (!buf) { pr_debug("no more rx buffers, data discarded\n"); return; } /* copy received data to HDLC buffer */ memcpy(buf->buf, data, count); buf->count = count; /* add HDLC buffer to list of received frames */ n_hdlc_buf_put(&n_hdlc->rx_buf_list, buf); /* wake up any blocked reads and perform async signalling */ wake_up_interruptible(&tty->read_wait); if (tty->fasync != NULL) kill_fasync(&tty->fasync, SIGIO, POLL_IN); } /* end of n_hdlc_tty_receive() */ /** * n_hdlc_tty_read - Called to retrieve one frame of data (if available) * @tty: pointer to tty instance data * @file: pointer to open file object * @kbuf: pointer to returned data buffer * @nr: size of returned data buffer * @cookie: stored rbuf from previous run * @offset: offset into the data buffer * * Returns the number of bytes returned or error code. */ static ssize_t n_hdlc_tty_read(struct tty_struct *tty, struct file *file, u8 *kbuf, size_t nr, void **cookie, unsigned long offset) { struct n_hdlc *n_hdlc = tty->disc_data; int ret = 0; struct n_hdlc_buf *rbuf; DECLARE_WAITQUEUE(wait, current); /* Is this a repeated call for an rbuf we already found earlier? */ rbuf = *cookie; if (rbuf) goto have_rbuf; add_wait_queue(&tty->read_wait, &wait); for (;;) { if (test_bit(TTY_OTHER_CLOSED, &tty->flags)) { ret = -EIO; break; } if (tty_hung_up_p(file)) break; set_current_state(TASK_INTERRUPTIBLE); rbuf = n_hdlc_buf_get(&n_hdlc->rx_buf_list); if (rbuf) break; /* no data */ if (tty_io_nonblock(tty, file)) { ret = -EAGAIN; break; } schedule(); if (signal_pending(current)) { ret = -EINTR; break; } } remove_wait_queue(&tty->read_wait, &wait); __set_current_state(TASK_RUNNING); if (!rbuf) return ret; *cookie = rbuf; have_rbuf: /* Have we used it up entirely? */ if (offset >= rbuf->count) goto done_with_rbuf; /* More data to go, but can't copy any more? EOVERFLOW */ ret = -EOVERFLOW; if (!nr) goto done_with_rbuf; /* Copy as much data as possible */ ret = rbuf->count - offset; if (ret > nr) ret = nr; memcpy(kbuf, rbuf->buf+offset, ret); offset += ret; /* If we still have data left, we leave the rbuf in the cookie */ if (offset < rbuf->count) return ret; done_with_rbuf: *cookie = NULL; if (n_hdlc->rx_free_buf_list.count > DEFAULT_RX_BUF_COUNT) kfree(rbuf); else n_hdlc_buf_put(&n_hdlc->rx_free_buf_list, rbuf); return ret; } /* end of n_hdlc_tty_read() */ /** * n_hdlc_tty_write - write a single frame of data to device * @tty: pointer to associated tty device instance data * @file: pointer to file object data * @data: pointer to transmit data (one frame) * @count: size of transmit frame in bytes * * Returns the number of bytes written (or error code). 
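 *
 * Note the all-or-nothing frame semantics: the call blocks (unless the
 * file is non-blocking) until a free tx buffer is available, and a frame
 * larger than maxframe is truncated rather than split across writes.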
*/ static ssize_t n_hdlc_tty_write(struct tty_struct *tty, struct file *file, const u8 *data, size_t count) { struct n_hdlc *n_hdlc = tty->disc_data; DECLARE_WAITQUEUE(wait, current); struct n_hdlc_buf *tbuf; ssize_t error = 0; pr_debug("%s() called count=%zd\n", __func__, count); /* verify frame size */ if (count > maxframe) { pr_debug("%s: truncating user packet from %zu to %d\n", __func__, count, maxframe); count = maxframe; } add_wait_queue(&tty->write_wait, &wait); for (;;) { set_current_state(TASK_INTERRUPTIBLE); tbuf = n_hdlc_buf_get(&n_hdlc->tx_free_buf_list); if (tbuf) break; if (tty_io_nonblock(tty, file)) { error = -EAGAIN; break; } schedule(); if (signal_pending(current)) { error = -EINTR; break; } } __set_current_state(TASK_RUNNING); remove_wait_queue(&tty->write_wait, &wait); if (!error) { /* Retrieve the user's buffer */ memcpy(tbuf->buf, data, count); /* Send the data */ tbuf->count = error = count; n_hdlc_buf_put(&n_hdlc->tx_buf_list, tbuf); n_hdlc_send_frames(n_hdlc, tty); } return error; } /* end of n_hdlc_tty_write() */ /** * n_hdlc_tty_ioctl - process IOCTL system call for the tty device. * @tty: pointer to tty instance data * @cmd: IOCTL command code * @arg: argument for IOCTL call (cmd dependent) * * Returns command dependent result. */ static int n_hdlc_tty_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { struct n_hdlc *n_hdlc = tty->disc_data; int error = 0; int count; unsigned long flags; struct n_hdlc_buf *buf = NULL; pr_debug("%s() called %d\n", __func__, cmd); switch (cmd) { case FIONREAD: /* report count of read data available */ /* in next available frame (if any) */ spin_lock_irqsave(&n_hdlc->rx_buf_list.spinlock, flags); buf = list_first_entry_or_null(&n_hdlc->rx_buf_list.list, struct n_hdlc_buf, list_item); if (buf) count = buf->count; else count = 0; spin_unlock_irqrestore(&n_hdlc->rx_buf_list.spinlock, flags); error = put_user(count, (int __user *)arg); break; case TIOCOUTQ: /* get the pending tx byte count in the driver */ count = tty_chars_in_buffer(tty); /* add size of next output frame in queue */ spin_lock_irqsave(&n_hdlc->tx_buf_list.spinlock, flags); buf = list_first_entry_or_null(&n_hdlc->tx_buf_list.list, struct n_hdlc_buf, list_item); if (buf) count += buf->count; spin_unlock_irqrestore(&n_hdlc->tx_buf_list.spinlock, flags); error = put_user(count, (int __user *)arg); break; case TCFLSH: switch (arg) { case TCIOFLUSH: case TCOFLUSH: flush_tx_queue(tty); } fallthrough; /* to default */ default: error = n_tty_ioctl_helper(tty, cmd, arg); break; } return error; } /* end of n_hdlc_tty_ioctl() */ /** * n_hdlc_tty_poll - TTY callback for poll system call * @tty: pointer to tty instance data * @filp: pointer to open file object for device * @wait: wait queue for operations * * Determine which operations (read/write) will not block and return info * to caller. * Returns a bit mask containing info on which ops will not block. 
*/ static __poll_t n_hdlc_tty_poll(struct tty_struct *tty, struct file *filp, poll_table *wait) { struct n_hdlc *n_hdlc = tty->disc_data; __poll_t mask = 0; /* * queue the current process into any wait queue that may awaken in the * future (read and write) */ poll_wait(filp, &tty->read_wait, wait); poll_wait(filp, &tty->write_wait, wait); /* set bits for operations that won't block */ if (!list_empty(&n_hdlc->rx_buf_list.list)) mask |= EPOLLIN | EPOLLRDNORM; /* readable */ if (test_bit(TTY_OTHER_CLOSED, &tty->flags)) mask |= EPOLLHUP; if (tty_hung_up_p(filp)) mask |= EPOLLHUP; if (!tty_is_writelocked(tty) && !list_empty(&n_hdlc->tx_free_buf_list.list)) mask |= EPOLLOUT | EPOLLWRNORM; /* writable */ return mask; } /* end of n_hdlc_tty_poll() */ static void n_hdlc_alloc_buf(struct n_hdlc_buf_list *list, unsigned int count, const char *name) { struct n_hdlc_buf *buf; unsigned int i; for (i = 0; i < count; i++) { buf = kmalloc(struct_size(buf, buf, maxframe), GFP_KERNEL); if (!buf) { pr_debug("%s(), kmalloc() failed for %s buffer %u\n", __func__, name, i); return; } n_hdlc_buf_put(list, buf); } } /** * n_hdlc_alloc - allocate an n_hdlc instance data structure * * Returns a pointer to newly created structure if success, otherwise %NULL */ static struct n_hdlc *n_hdlc_alloc(void) { struct n_hdlc *n_hdlc = kzalloc(sizeof(*n_hdlc), GFP_KERNEL); if (!n_hdlc) return NULL; spin_lock_init(&n_hdlc->rx_free_buf_list.spinlock); spin_lock_init(&n_hdlc->tx_free_buf_list.spinlock); spin_lock_init(&n_hdlc->rx_buf_list.spinlock); spin_lock_init(&n_hdlc->tx_buf_list.spinlock); INIT_LIST_HEAD(&n_hdlc->rx_free_buf_list.list); INIT_LIST_HEAD(&n_hdlc->tx_free_buf_list.list); INIT_LIST_HEAD(&n_hdlc->rx_buf_list.list); INIT_LIST_HEAD(&n_hdlc->tx_buf_list.list); n_hdlc_alloc_buf(&n_hdlc->rx_free_buf_list, DEFAULT_RX_BUF_COUNT, "rx"); n_hdlc_alloc_buf(&n_hdlc->tx_free_buf_list, DEFAULT_TX_BUF_COUNT, "tx"); return n_hdlc; } /* end of n_hdlc_alloc() */ /** * n_hdlc_buf_return - put the HDLC buffer after the head of the specified list * @buf_list: pointer to the buffer list * @buf: pointer to the buffer */ static void n_hdlc_buf_return(struct n_hdlc_buf_list *buf_list, struct n_hdlc_buf *buf) { unsigned long flags; spin_lock_irqsave(&buf_list->spinlock, flags); list_add(&buf->list_item, &buf_list->list); buf_list->count++; spin_unlock_irqrestore(&buf_list->spinlock, flags); } /** * n_hdlc_buf_put - add specified HDLC buffer to tail of specified list * @buf_list: pointer to buffer list * @buf: pointer to buffer */ static void n_hdlc_buf_put(struct n_hdlc_buf_list *buf_list, struct n_hdlc_buf *buf) { unsigned long flags; spin_lock_irqsave(&buf_list->spinlock, flags); list_add_tail(&buf->list_item, &buf_list->list); buf_list->count++; spin_unlock_irqrestore(&buf_list->spinlock, flags); } /* end of n_hdlc_buf_put() */ /** * n_hdlc_buf_get - remove and return an HDLC buffer from list * @buf_list: pointer to HDLC buffer list * * Remove and return an HDLC buffer from the head of the specified HDLC buffer * list. * Returns a pointer to HDLC buffer if available, otherwise %NULL. 
 */
static struct n_hdlc_buf *n_hdlc_buf_get(struct n_hdlc_buf_list *buf_list)
{
	unsigned long flags;
	struct n_hdlc_buf *buf;

	spin_lock_irqsave(&buf_list->spinlock, flags);

	buf = list_first_entry_or_null(&buf_list->list,
				       struct n_hdlc_buf, list_item);
	if (buf) {
		list_del(&buf->list_item);
		buf_list->count--;
	}

	spin_unlock_irqrestore(&buf_list->spinlock, flags);
	return buf;
}	/* end of n_hdlc_buf_get() */

static struct tty_ldisc_ops n_hdlc_ldisc = {
	.owner		= THIS_MODULE,
	.num		= N_HDLC,
	.name		= "hdlc",
	.open		= n_hdlc_tty_open,
	.close		= n_hdlc_tty_close,
	.read		= n_hdlc_tty_read,
	.write		= n_hdlc_tty_write,
	.ioctl		= n_hdlc_tty_ioctl,
	.poll		= n_hdlc_tty_poll,
	.receive_buf	= n_hdlc_tty_receive,
	.write_wakeup	= n_hdlc_tty_wakeup,
	.flush_buffer	= flush_rx_queue,
};

static int __init n_hdlc_init(void)
{
	int status;

	/* range check maxframe arg */
	maxframe = clamp(maxframe, 4096, MAX_HDLC_FRAME_SIZE);

	status = tty_register_ldisc(&n_hdlc_ldisc);
	if (!status)
		pr_info("N_HDLC line discipline registered with maxframe=%d\n",
			maxframe);
	else
		pr_err("N_HDLC: error registering line discipline: %d\n",
		       status);

	return status;
}	/* end of init_module() */

static void __exit n_hdlc_exit(void)
{
	tty_unregister_ldisc(&n_hdlc_ldisc);
}

module_init(n_hdlc_init);
module_exit(n_hdlc_exit);

MODULE_DESCRIPTION("HDLC line discipline support");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul Fulghum paulkf@microgate.com");
module_param(maxframe, int, 0);
MODULE_ALIAS_LDISC(N_HDLC);
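
/*
 * Example (illustrative): the frame size limit can be raised at load time
 * via the maxframe module parameter declared above, e.g.:
 *
 *	modprobe n_hdlc maxframe=65535
 */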
// SPDX-License-Identifier: GPL-2.0
/*
 * MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler,
 * for the blk-mq scheduling framework
 *
 * Copyright (C) 2016 Jens Axboe <axboe@kernel.dk>
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/rbtree.h>
#include <linux/sbitmap.h>

#include <trace/events/block.h>

#include "elevator.h"
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"

/*
 * See Documentation/block/deadline-iosched.rst
 */
static const int read_expire = HZ / 2;	/* max time before a read is submitted. */
static const int write_expire = 5 * HZ;	/* ditto for writes, these limits are SOFT! */
/*
 * Time after which to dispatch lower priority requests even if higher
 * priority requests are pending.
 */
static const int prio_aging_expire = 10 * HZ;
static const int writes_starved = 2;	/* max times reads can starve a write */
static const int fifo_batch = 16;	/* # of sequential requests treated as one
					   by the above parameters. For throughput. */

enum dd_data_dir {
	DD_READ		= READ,
	DD_WRITE	= WRITE,
};

enum { DD_DIR_COUNT = 2 };

enum dd_prio {
	DD_RT_PRIO	= 0,
	DD_BE_PRIO	= 1,
	DD_IDLE_PRIO	= 2,
	DD_PRIO_MAX	= 2,
};

enum { DD_PRIO_COUNT = 3 };

/*
 * I/O statistics per I/O priority. It is fine if these counters overflow.
 * What matters is that these counters are at least as wide as
 * log2(max_outstanding_requests).
 */
struct io_stats_per_prio {
	uint32_t inserted;
	uint32_t merged;
	uint32_t dispatched;
	atomic_t completed;
};

/*
 * Deadline scheduler data per I/O priority (enum dd_prio). Requests are
 * present on both sort_list[] and fifo_list[].
 */
struct dd_per_prio {
	struct list_head dispatch;
	struct rb_root sort_list[DD_DIR_COUNT];
	struct list_head fifo_list[DD_DIR_COUNT];
	/* Position of the most recently dispatched request. */
	sector_t latest_pos[DD_DIR_COUNT];
	struct io_stats_per_prio stats;
};

struct deadline_data {
	/*
	 * run time data
	 */

	struct dd_per_prio per_prio[DD_PRIO_COUNT];

	/* Data direction of latest dispatched request.
struct deadline_data {
	/*
	 * run time data
	 */
	struct dd_per_prio per_prio[DD_PRIO_COUNT];

	/* Data direction of latest dispatched request. */
	enum dd_data_dir last_dir;
	unsigned int batching;		/* number of sequential requests made */
	unsigned int starved;		/* times reads have starved writes */

	/*
	 * settings that change how the i/o scheduler behaves
	 */
	int fifo_expire[DD_DIR_COUNT];
	int fifo_batch;
	int writes_starved;
	int front_merges;
	u32 async_depth;
	int prio_aging_expire;

	spinlock_t lock;
};

/* Maps an I/O priority class to a deadline scheduler priority. */
static const enum dd_prio ioprio_class_to_prio[] = {
	[IOPRIO_CLASS_NONE]	= DD_BE_PRIO,
	[IOPRIO_CLASS_RT]	= DD_RT_PRIO,
	[IOPRIO_CLASS_BE]	= DD_BE_PRIO,
	[IOPRIO_CLASS_IDLE]	= DD_IDLE_PRIO,
};

static inline struct rb_root *
deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq)
{
	return &per_prio->sort_list[rq_data_dir(rq)];
}

/*
 * Returns the I/O priority class (IOPRIO_CLASS_*) that has been assigned to a
 * request.
 */
static u8 dd_rq_ioclass(struct request *rq)
{
	return IOPRIO_PRIO_CLASS(req_get_ioprio(rq));
}

/*
 * Return the first request for which blk_rq_pos() >= @pos.
 */
static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio,
				enum dd_data_dir data_dir, sector_t pos)
{
	struct rb_node *node = per_prio->sort_list[data_dir].rb_node;
	struct request *rq, *res = NULL;

	if (!node)
		return NULL;

	rq = rb_entry_rq(node);
	while (node) {
		rq = rb_entry_rq(node);
		if (blk_rq_pos(rq) >= pos) {
			res = rq;
			node = node->rb_left;
		} else {
			node = node->rb_right;
		}
	}
	return res;
}

static void
deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq)
{
	struct rb_root *root = deadline_rb_root(per_prio, rq);

	elv_rb_add(root, rq);
}

static inline void
deadline_del_rq_rb(struct dd_per_prio *per_prio, struct request *rq)
{
	elv_rb_del(deadline_rb_root(per_prio, rq), rq);
}

/*
 * remove rq from rbtree and fifo.
 */
static void deadline_remove_request(struct request_queue *q,
				    struct dd_per_prio *per_prio,
				    struct request *rq)
{
	list_del_init(&rq->queuelist);

	/*
	 * We might not be on the rbtree, if we are doing an insert merge
	 */
	if (!RB_EMPTY_NODE(&rq->rb_node))
		deadline_del_rq_rb(per_prio, rq);

	elv_rqhash_del(q, rq);
	if (q->last_merge == rq)
		q->last_merge = NULL;
}

static void dd_request_merged(struct request_queue *q, struct request *req,
			      enum elv_merge type)
{
	struct deadline_data *dd = q->elevator->elevator_data;
	const u8 ioprio_class = dd_rq_ioclass(req);
	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
	struct dd_per_prio *per_prio = &dd->per_prio[prio];

	/*
	 * if the merge was a front merge, we need to reposition request
	 */
	if (type == ELEVATOR_FRONT_MERGE) {
		elv_rb_del(deadline_rb_root(per_prio, req), req);
		deadline_add_rq_rb(per_prio, req);
	}
}

/*
 * Callback function that is invoked after @next has been merged into @req.
 */
static void dd_merged_requests(struct request_queue *q, struct request *req,
			       struct request *next)
{
	struct deadline_data *dd = q->elevator->elevator_data;
	const u8 ioprio_class = dd_rq_ioclass(next);
	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];

	lockdep_assert_held(&dd->lock);

	dd->per_prio[prio].stats.merged++;

	/*
	 * if next expires before rq, assign its expire time to rq
	 * and move into next position (next will be deleted) in fifo
	 */
	if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
		if (time_before((unsigned long)next->fifo_time,
				(unsigned long)req->fifo_time)) {
			list_move(&req->queuelist, &next->queuelist);
			req->fifo_time = next->fifo_time;
		}
	}

	/*
	 * kill knowledge of next, this one is a goner
	 */
	deadline_remove_request(q, &dd->per_prio[prio], next);
}

/*
 * move an entry to dispatch queue
 */
static void
deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
		      struct request *rq)
{
	/*
	 * take it off the sort and fifo list
	 */
	deadline_remove_request(rq->q, per_prio, rq);
}

/* Number of requests queued for a given priority level. */
static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
{
	const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;

	lockdep_assert_held(&dd->lock);

	return stats->inserted - atomic_read(&stats->completed);
}

/*
 * deadline_check_fifo returns true if and only if there are expired requests
 * in the FIFO list. Requires !list_empty(&dd->fifo_list[data_dir]).
 */
static inline bool deadline_check_fifo(struct dd_per_prio *per_prio,
				       enum dd_data_dir data_dir)
{
	struct request *rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next);

	return time_is_before_eq_jiffies((unsigned long)rq->fifo_time);
}

/*
 * For the specified data direction, return the next request to
 * dispatch using arrival ordered lists.
 */
static struct request *
deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
		      enum dd_data_dir data_dir)
{
	if (list_empty(&per_prio->fifo_list[data_dir]))
		return NULL;

	return rq_entry_fifo(per_prio->fifo_list[data_dir].next);
}

/*
 * For the specified data direction, return the next request to
 * dispatch using sector position sorted lists.
 */
static struct request *
deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
		      enum dd_data_dir data_dir)
{
	return deadline_from_pos(per_prio, data_dir,
				 per_prio->latest_pos[data_dir]);
}

/*
 * Returns true if and only if @rq started after @latest_start where
 * @latest_start is in jiffies.
 */
static bool started_after(struct deadline_data *dd, struct request *rq,
			  unsigned long latest_start)
{
	unsigned long start_time = (unsigned long)rq->fifo_time;

	start_time -= dd->fifo_expire[rq_data_dir(rq)];

	return time_after(start_time, latest_start);
}
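/*
 * A small worked example of the batching/starvation policy implemented by
 * __dd_dispatch_request() below, assuming the defaults fifo_batch = 16 and
 * writes_starved = 2: up to 16 sector-sequential requests are dispatched as
 * one batch. While reads keep being chosen, the starved counter increments
 * each time a write was also pending; reads may win twice (starved 0 -> 1,
 * then 1 -> 2), and on the third attempt starved++ >= writes_starved holds,
 * so dispatch switches to writes and the counter resets to 0.
 */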
/*
 * deadline_dispatch_requests selects the best request according to
 * read/write expire, fifo_batch, etc and with a start time <= @latest_start.
 */
static struct request *__dd_dispatch_request(struct deadline_data *dd,
					     struct dd_per_prio *per_prio,
					     unsigned long latest_start)
{
	struct request *rq, *next_rq;
	enum dd_data_dir data_dir;
	enum dd_prio prio;
	u8 ioprio_class;

	lockdep_assert_held(&dd->lock);

	if (!list_empty(&per_prio->dispatch)) {
		rq = list_first_entry(&per_prio->dispatch, struct request,
				      queuelist);
		if (started_after(dd, rq, latest_start))
			return NULL;
		list_del_init(&rq->queuelist);
		data_dir = rq_data_dir(rq);
		goto done;
	}

	/*
	 * batches are currently reads XOR writes
	 */
	rq = deadline_next_request(dd, per_prio, dd->last_dir);
	if (rq && dd->batching < dd->fifo_batch) {
		/* we have a next request and are still entitled to batch */
		data_dir = rq_data_dir(rq);
		goto dispatch_request;
	}

	/*
	 * at this point we are not running a batch. select the appropriate
	 * data direction (read / write)
	 */

	if (!list_empty(&per_prio->fifo_list[DD_READ])) {
		BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_READ]));

		if (deadline_fifo_request(dd, per_prio, DD_WRITE) &&
		    (dd->starved++ >= dd->writes_starved))
			goto dispatch_writes;

		data_dir = DD_READ;

		goto dispatch_find_request;
	}

	/*
	 * there are either no reads or writes have been starved
	 */

	if (!list_empty(&per_prio->fifo_list[DD_WRITE])) {
dispatch_writes:
		BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_WRITE]));

		dd->starved = 0;

		data_dir = DD_WRITE;

		goto dispatch_find_request;
	}

	return NULL;

dispatch_find_request:
	/*
	 * we are not running a batch, find best request for selected data_dir
	 */
	next_rq = deadline_next_request(dd, per_prio, data_dir);
	if (deadline_check_fifo(per_prio, data_dir) || !next_rq) {
		/*
		 * A deadline has expired, the last request was in the other
		 * direction, or we have run out of higher-sectored requests.
		 * Start again from the request with the earliest expiry time.
		 */
		rq = deadline_fifo_request(dd, per_prio, data_dir);
	} else {
		/*
		 * The last req was the same dir and we have a next request in
		 * sort order. No expired requests so continue on from here.
		 */
		rq = next_rq;
	}

	if (!rq)
		return NULL;

	dd->last_dir = data_dir;
	dd->batching = 0;

dispatch_request:
	if (started_after(dd, rq, latest_start))
		return NULL;

	/*
	 * rq is the selected appropriate request.
	 */
	dd->batching++;
	deadline_move_request(dd, per_prio, rq);
done:
	ioprio_class = dd_rq_ioclass(rq);
	prio = ioprio_class_to_prio[ioprio_class];
	dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq);
	dd->per_prio[prio].stats.dispatched++;
	rq->rq_flags |= RQF_STARTED;
	return rq;
}

/*
 * Check whether there are any requests with priority other than DD_RT_PRIO
 * that were inserted more than prio_aging_expire jiffies ago.
 */
static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd,
						      unsigned long now)
{
	struct request *rq;
	enum dd_prio prio;
	int prio_cnt;

	lockdep_assert_held(&dd->lock);

	prio_cnt = !!dd_queued(dd, DD_RT_PRIO) + !!dd_queued(dd, DD_BE_PRIO) +
		   !!dd_queued(dd, DD_IDLE_PRIO);
	if (prio_cnt < 2)
		return NULL;

	for (prio = DD_BE_PRIO; prio <= DD_PRIO_MAX; prio++) {
		rq = __dd_dispatch_request(dd, &dd->per_prio[prio],
					   now - dd->prio_aging_expire);
		if (rq)
			return rq;
	}

	return NULL;
}

/*
 * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests().
 *
 * One confusing aspect here is that we get called for a specific
 * hardware queue, but we may return a request that is for a
 * different hardware queue. This is because mq-deadline has shared
 * state for all hardware queues, in terms of sorting, FIFOs, etc.
 */
static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
	const unsigned long now = jiffies;
	struct request *rq;
	enum dd_prio prio;

	spin_lock(&dd->lock);
	rq = dd_dispatch_prio_aged_requests(dd, now);
	if (rq)
		goto unlock;

	/*
	 * Next, dispatch requests in priority order. Ignore lower priority
	 * requests if any higher priority requests are pending.
	 */
	for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
		rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now);
		if (rq || dd_queued(dd, prio))
			break;
	}

unlock:
	spin_unlock(&dd->lock);

	return rq;
}

/*
 * Called by __blk_mq_alloc_request(). The shallow_depth value set by this
 * function is used by __blk_mq_get_tag().
 */
static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
{
	struct deadline_data *dd = data->q->elevator->elevator_data;

	/* Do not throttle synchronous reads. */
	if (op_is_sync(opf) && !op_is_write(opf))
		return;

	/*
	 * Throttle asynchronous requests and writes such that these requests
	 * do not block the allocation of synchronous requests.
	 */
	data->shallow_depth = dd->async_depth;
}

/* Called by blk_mq_update_nr_requests(). */
static void dd_depth_updated(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct deadline_data *dd = q->elevator->elevator_data;
	struct blk_mq_tags *tags = hctx->sched_tags;

	dd->async_depth = q->nr_requests;

	sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1);
}

/* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */
static int dd_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	dd_depth_updated(hctx);
	return 0;
}

static void dd_exit_sched(struct elevator_queue *e)
{
	struct deadline_data *dd = e->elevator_data;
	enum dd_prio prio;

	for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
		struct dd_per_prio *per_prio = &dd->per_prio[prio];
		const struct io_stats_per_prio *stats = &per_prio->stats;
		uint32_t queued;

		WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ]));
		WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE]));

		spin_lock(&dd->lock);
		queued = dd_queued(dd, prio);
		spin_unlock(&dd->lock);

		WARN_ONCE(queued != 0,
			  "statistics for priority %d: i %u m %u d %u c %u\n",
			  prio, stats->inserted, stats->merged,
			  stats->dispatched, atomic_read(&stats->completed));
	}

	kfree(dd);
}

/*
 * initialize elevator private data (deadline_data).
 */
static int dd_init_sched(struct request_queue *q, struct elevator_queue *eq)
{
	struct deadline_data *dd;
	enum dd_prio prio;

	dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
	if (!dd)
		return -ENOMEM;

	eq->elevator_data = dd;

	for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
		struct dd_per_prio *per_prio = &dd->per_prio[prio];

		INIT_LIST_HEAD(&per_prio->dispatch);
		INIT_LIST_HEAD(&per_prio->fifo_list[DD_READ]);
		INIT_LIST_HEAD(&per_prio->fifo_list[DD_WRITE]);
		per_prio->sort_list[DD_READ] = RB_ROOT;
		per_prio->sort_list[DD_WRITE] = RB_ROOT;
	}

	dd->fifo_expire[DD_READ] = read_expire;
	dd->fifo_expire[DD_WRITE] = write_expire;
	dd->writes_starved = writes_starved;
	dd->front_merges = 1;
	dd->last_dir = DD_WRITE;
	dd->fifo_batch = fifo_batch;
	dd->prio_aging_expire = prio_aging_expire;
	spin_lock_init(&dd->lock);

	/* We dispatch from request queue wide instead of hw queue */
	blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);

	q->elevator = eq;
	return 0;
}
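/*
 * A sketch of the depth throttling above, under the defaults set by
 * dd_depth_updated(): async_depth == nr_requests, so asynchronous requests
 * and writes are effectively not throttled until a smaller value is written
 * to the async_depth sysfs attribute. For instance (hypothetical numbers),
 * with nr_requests = 256 and async_depth = 64, at most 64 scheduler tags may
 * be held by async requests at once, leaving headroom for synchronous reads.
 */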
/*
 * Try to merge @bio into an existing request. If @bio has been merged into
 * an existing request, store the pointer to that request into *@rq.
 */
static int dd_request_merge(struct request_queue *q, struct request **rq,
			    struct bio *bio)
{
	struct deadline_data *dd = q->elevator->elevator_data;
	const u8 ioprio_class = IOPRIO_PRIO_CLASS(bio->bi_ioprio);
	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
	struct dd_per_prio *per_prio = &dd->per_prio[prio];
	sector_t sector = bio_end_sector(bio);
	struct request *__rq;

	if (!dd->front_merges)
		return ELEVATOR_NO_MERGE;

	__rq = elv_rb_find(&per_prio->sort_list[bio_data_dir(bio)], sector);
	if (__rq) {
		BUG_ON(sector != blk_rq_pos(__rq));

		if (elv_bio_merge_ok(__rq, bio)) {
			*rq = __rq;
			if (blk_discard_mergable(__rq))
				return ELEVATOR_DISCARD_MERGE;
			return ELEVATOR_FRONT_MERGE;
		}
	}

	return ELEVATOR_NO_MERGE;
}

/*
 * Attempt to merge a bio into an existing request. This function is called
 * before @bio is associated with a request.
 */
static bool dd_bio_merge(struct request_queue *q, struct bio *bio,
			 unsigned int nr_segs)
{
	struct deadline_data *dd = q->elevator->elevator_data;
	struct request *free = NULL;
	bool ret;

	spin_lock(&dd->lock);
	ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
	spin_unlock(&dd->lock);

	if (free)
		blk_mq_free_request(free);

	return ret;
}

/*
 * add rq to rbtree and fifo
 */
static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
			      blk_insert_t flags, struct list_head *free)
{
	struct request_queue *q = hctx->queue;
	struct deadline_data *dd = q->elevator->elevator_data;
	const enum dd_data_dir data_dir = rq_data_dir(rq);
	u16 ioprio = req_get_ioprio(rq);
	u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio);
	struct dd_per_prio *per_prio;
	enum dd_prio prio;

	lockdep_assert_held(&dd->lock);

	prio = ioprio_class_to_prio[ioprio_class];
	per_prio = &dd->per_prio[prio];
	if (!rq->elv.priv[0])
		per_prio->stats.inserted++;
	rq->elv.priv[0] = per_prio;

	if (blk_mq_sched_try_insert_merge(q, rq, free))
		return;

	trace_block_rq_insert(rq);

	if (flags & BLK_MQ_INSERT_AT_HEAD) {
		list_add(&rq->queuelist, &per_prio->dispatch);
		rq->fifo_time = jiffies;
	} else {
		deadline_add_rq_rb(per_prio, rq);

		if (rq_mergeable(rq)) {
			elv_rqhash_add(q, rq);
			if (!q->last_merge)
				q->last_merge = rq;
		}

		/*
		 * set expire time and add to fifo list
		 */
		rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
		list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]);
	}
}

/*
 * Called from blk_mq_insert_request() or blk_mq_dispatch_list().
 */
static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
			       struct list_head *list,
			       blk_insert_t flags)
{
	struct request_queue *q = hctx->queue;
	struct deadline_data *dd = q->elevator->elevator_data;
	LIST_HEAD(free);

	spin_lock(&dd->lock);
	while (!list_empty(list)) {
		struct request *rq;

		rq = list_first_entry(list, struct request, queuelist);
		list_del_init(&rq->queuelist);
		dd_insert_request(hctx, rq, flags, &free);
	}
	spin_unlock(&dd->lock);

	blk_mq_free_requests(&free);
}

/* Callback from inside blk_mq_rq_ctx_init(). */
static void dd_prepare_request(struct request *rq)
{
	rq->elv.priv[0] = NULL;
}
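/*
 * Worked example of the FIFO expiry bookkeeping above: a read inserted at
 * jiffies = 10000 with the default read_expire = HZ / 2 (500 jiffies at
 * HZ = 1000) gets rq->fifo_time = 10500, and deadline_check_fifo() starts
 * reporting it as expired once jiffies >= 10500.
 */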
/*
 * Callback from inside blk_mq_free_request().
 */
static void dd_finish_request(struct request *rq)
{
	struct dd_per_prio *per_prio = rq->elv.priv[0];

	/*
	 * The block layer core may call dd_finish_request() without having
	 * called dd_insert_requests(). Skip requests that bypassed I/O
	 * scheduling. See also blk_mq_request_bypass_insert().
	 */
	if (per_prio)
		atomic_inc(&per_prio->stats.completed);
}

static bool dd_has_work_for_prio(struct dd_per_prio *per_prio)
{
	return !list_empty_careful(&per_prio->dispatch) ||
		!list_empty_careful(&per_prio->fifo_list[DD_READ]) ||
		!list_empty_careful(&per_prio->fifo_list[DD_WRITE]);
}

static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
{
	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
	enum dd_prio prio;

	for (prio = 0; prio <= DD_PRIO_MAX; prio++)
		if (dd_has_work_for_prio(&dd->per_prio[prio]))
			return true;

	return false;
}

/*
 * sysfs parts below
 */
#define SHOW_INT(__FUNC, __VAR)						\
static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
{									\
	struct deadline_data *dd = e->elevator_data;			\
									\
	return sysfs_emit(page, "%d\n", __VAR);				\
}

#define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR))

SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]);
SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]);
SHOW_JIFFIES(deadline_prio_aging_expire_show, dd->prio_aging_expire);
SHOW_INT(deadline_writes_starved_show, dd->writes_starved);
SHOW_INT(deadline_front_merges_show, dd->front_merges);
SHOW_INT(deadline_async_depth_show, dd->async_depth);
SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch);
#undef SHOW_INT
#undef SHOW_JIFFIES

#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
{									\
	struct deadline_data *dd = e->elevator_data;			\
	int __data, __ret;						\
									\
	__ret = kstrtoint(page, 0, &__data);				\
	if (__ret < 0)							\
		return __ret;						\
	if (__data < (MIN))						\
		__data = (MIN);						\
	else if (__data > (MAX))					\
		__data = (MAX);						\
	*(__PTR) = __CONV(__data);					\
	return count;							\
}
#define STORE_INT(__FUNC, __PTR, MIN, MAX)				\
	STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, )
#define STORE_JIFFIES(__FUNC, __PTR, MIN, MAX)				\
	STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies)
STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX);
STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX);
STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX);
STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX);
STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1);
STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX);
STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX);
#undef STORE_FUNCTION
#undef STORE_INT
#undef STORE_JIFFIES

#define DD_ATTR(name) \
	__ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store)

static const struct elv_fs_entry deadline_attrs[] = {
	DD_ATTR(read_expire),
	DD_ATTR(write_expire),
	DD_ATTR(writes_starved),
	DD_ATTR(front_merges),
	DD_ATTR(async_depth),
	DD_ATTR(fifo_batch),
	DD_ATTR(prio_aging_expire),
	__ATTR_NULL
};

#ifdef CONFIG_BLK_DEBUG_FS
#define DEADLINE_DEBUGFS_DDIR_ATTRS(prio, data_dir, name)		\
static void *deadline_##name##_fifo_start(struct seq_file *m,		\
					  loff_t *pos)			\
	__acquires(&dd->lock)						\
{									\
	struct request_queue *q = m->private;				\
	struct deadline_data *dd = q->elevator->elevator_data;		\
	struct dd_per_prio *per_prio = &dd->per_prio[prio];		\
									\
	spin_lock(&dd->lock);						\
	return seq_list_start(&per_prio->fifo_list[data_dir], *pos);	\
}									\
									\
static void *deadline_##name##_fifo_next(struct seq_file *m, void *v,	\
					 loff_t *pos)			\
{									\
	struct request_queue *q = m->private;				\
	struct deadline_data *dd = q->elevator->elevator_data;		\
	struct dd_per_prio *per_prio = &dd->per_prio[prio];		\
									\
	return seq_list_next(v, &per_prio->fifo_list[data_dir], pos);	\
}									\
									\
static void deadline_##name##_fifo_stop(struct seq_file *m, void *v)	\
	__releases(&dd->lock)						\
{									\
	struct request_queue *q = m->private;				\
	struct deadline_data *dd = q->elevator->elevator_data;		\
									\
	spin_unlock(&dd->lock);						\
}									\
									\
static const struct seq_operations deadline_##name##_fifo_seq_ops = {	\
	.start	= deadline_##name##_fifo_start,				\
	.next	= deadline_##name##_fifo_next,				\
	.stop	= deadline_##name##_fifo_stop,				\
	.show	= blk_mq_debugfs_rq_show,				\
};									\
									\
static int deadline_##name##_next_rq_show(void *data,			\
					  struct seq_file *m)		\
{									\
	struct request_queue *q = data;					\
	struct deadline_data *dd = q->elevator->elevator_data;		\
	struct dd_per_prio *per_prio = &dd->per_prio[prio];		\
	struct request *rq;						\
									\
	rq = deadline_from_pos(per_prio, data_dir,			\
			       per_prio->latest_pos[data_dir]);		\
	if (rq)								\
		__blk_mq_debugfs_rq_show(m, rq);			\
	return 0;							\
}

DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_READ, read0);
DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_WRITE, write0);
DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_READ, read1);
DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_WRITE, write1);
DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_READ, read2);
DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_WRITE, write2);
#undef DEADLINE_DEBUGFS_DDIR_ATTRS

static int deadline_batching_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	struct deadline_data *dd = q->elevator->elevator_data;

	seq_printf(m, "%u\n", dd->batching);
	return 0;
}

static int deadline_starved_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	struct deadline_data *dd = q->elevator->elevator_data;

	seq_printf(m, "%u\n", dd->starved);
	return 0;
}

static int dd_async_depth_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	struct deadline_data *dd = q->elevator->elevator_data;

	seq_printf(m, "%u\n", dd->async_depth);
	return 0;
}

static int dd_queued_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	struct deadline_data *dd = q->elevator->elevator_data;
	u32 rt, be, idle;

	spin_lock(&dd->lock);
	rt = dd_queued(dd, DD_RT_PRIO);
	be = dd_queued(dd, DD_BE_PRIO);
	idle = dd_queued(dd, DD_IDLE_PRIO);
	spin_unlock(&dd->lock);

	seq_printf(m, "%u %u %u\n", rt, be, idle);

	return 0;
}
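/*
 * For orientation (an assumption about the standard debugfs layout, not part
 * of this file): the attributes registered in deadline_queue_debugfs_attrs[]
 * below typically appear under /sys/kernel/debug/block/<dev>/sched/ when
 * CONFIG_BLK_DEBUG_FS is enabled, e.g. read0_fifo_list, write2_next_rq,
 * dispatch1, owned_by_driver and queued.
 */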
/* Number of requests owned by the block driver for a given priority. */
static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio)
{
	const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;

	lockdep_assert_held(&dd->lock);

	return stats->dispatched + stats->merged -
		atomic_read(&stats->completed);
}

static int dd_owned_by_driver_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	struct deadline_data *dd = q->elevator->elevator_data;
	u32 rt, be, idle;

	spin_lock(&dd->lock);
	rt = dd_owned_by_driver(dd, DD_RT_PRIO);
	be = dd_owned_by_driver(dd, DD_BE_PRIO);
	idle = dd_owned_by_driver(dd, DD_IDLE_PRIO);
	spin_unlock(&dd->lock);

	seq_printf(m, "%u %u %u\n", rt, be, idle);

	return 0;
}

#define DEADLINE_DISPATCH_ATTR(prio)					\
static void *deadline_dispatch##prio##_start(struct seq_file *m,	\
					     loff_t *pos)		\
	__acquires(&dd->lock)						\
{									\
	struct request_queue *q = m->private;				\
	struct deadline_data *dd = q->elevator->elevator_data;		\
	struct dd_per_prio *per_prio = &dd->per_prio[prio];		\
									\
	spin_lock(&dd->lock);						\
	return seq_list_start(&per_prio->dispatch, *pos);		\
}									\
									\
static void *deadline_dispatch##prio##_next(struct seq_file *m,		\
					    void *v, loff_t *pos)	\
{									\
	struct request_queue *q = m->private;				\
	struct deadline_data *dd = q->elevator->elevator_data;		\
	struct dd_per_prio *per_prio = &dd->per_prio[prio];		\
									\
	return seq_list_next(v, &per_prio->dispatch, pos);		\
}									\
									\
static void deadline_dispatch##prio##_stop(struct seq_file *m, void *v)	\
	__releases(&dd->lock)						\
{									\
	struct request_queue *q = m->private;				\
	struct deadline_data *dd = q->elevator->elevator_data;		\
									\
	spin_unlock(&dd->lock);						\
}									\
									\
static const struct seq_operations deadline_dispatch##prio##_seq_ops = { \
	.start	= deadline_dispatch##prio##_start,			\
	.next	= deadline_dispatch##prio##_next,			\
	.stop	= deadline_dispatch##prio##_stop,			\
	.show	= blk_mq_debugfs_rq_show,				\
}

DEADLINE_DISPATCH_ATTR(0);
DEADLINE_DISPATCH_ATTR(1);
DEADLINE_DISPATCH_ATTR(2);
#undef DEADLINE_DISPATCH_ATTR

#define DEADLINE_QUEUE_DDIR_ATTRS(name)					\
	{#name "_fifo_list", 0400,					\
	 .seq_ops = &deadline_##name##_fifo_seq_ops}
#define DEADLINE_NEXT_RQ_ATTR(name)					\
	{#name "_next_rq", 0400, deadline_##name##_next_rq_show}
static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = {
	DEADLINE_QUEUE_DDIR_ATTRS(read0),
	DEADLINE_QUEUE_DDIR_ATTRS(write0),
	DEADLINE_QUEUE_DDIR_ATTRS(read1),
	DEADLINE_QUEUE_DDIR_ATTRS(write1),
	DEADLINE_QUEUE_DDIR_ATTRS(read2),
	DEADLINE_QUEUE_DDIR_ATTRS(write2),
	DEADLINE_NEXT_RQ_ATTR(read0),
	DEADLINE_NEXT_RQ_ATTR(write0),
	DEADLINE_NEXT_RQ_ATTR(read1),
	DEADLINE_NEXT_RQ_ATTR(write1),
	DEADLINE_NEXT_RQ_ATTR(read2),
	DEADLINE_NEXT_RQ_ATTR(write2),
	{"batching", 0400, deadline_batching_show},
	{"starved", 0400, deadline_starved_show},
	{"async_depth", 0400, dd_async_depth_show},
	{"dispatch0", 0400, .seq_ops = &deadline_dispatch0_seq_ops},
	{"dispatch1", 0400, .seq_ops = &deadline_dispatch1_seq_ops},
	{"dispatch2", 0400, .seq_ops = &deadline_dispatch2_seq_ops},
	{"owned_by_driver", 0400, dd_owned_by_driver_show},
	{"queued", 0400, dd_queued_show},
	{},
};
#undef DEADLINE_QUEUE_DDIR_ATTRS
#endif

static struct elevator_type mq_deadline = {
	.ops = {
		.depth_updated		= dd_depth_updated,
		.limit_depth		= dd_limit_depth,
		.insert_requests	= dd_insert_requests,
		.dispatch_request	= dd_dispatch_request,
		.prepare_request	= dd_prepare_request,
		.finish_request		= dd_finish_request,
		.next_request		= elv_rb_latter_request,
		.former_request		= elv_rb_former_request,
		.bio_merge		= dd_bio_merge,
		.request_merge		= dd_request_merge,
		.requests_merged	= dd_merged_requests,
		.request_merged		= dd_request_merged,
		.has_work		= dd_has_work,
		.init_sched		= dd_init_sched,
		.exit_sched		= dd_exit_sched,
		.init_hctx		= dd_init_hctx,
	},
#ifdef CONFIG_BLK_DEBUG_FS
	.queue_debugfs_attrs = deadline_queue_debugfs_attrs,
#endif
	.elevator_attrs = deadline_attrs,
	.elevator_name = "mq-deadline",
	.elevator_alias = "deadline",
	.elevator_owner = THIS_MODULE,
};
MODULE_ALIAS("mq-deadline-iosched");

static int __init deadline_init(void)
{
	return elv_register(&mq_deadline);
}

static void __exit deadline_exit(void)
{
	elv_unregister(&mq_deadline);
}

module_init(deadline_init);
module_exit(deadline_exit);

MODULE_AUTHOR("Jens Axboe, Damien Le Moal and Bart Van Assche");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MQ deadline IO scheduler");
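The tunables registered through DD_ATTR() above are exposed via sysfs. As a rough userspace illustration, not part of the kernel sources and assuming a device named /dev/sda with mq-deadline selected as its scheduler, a program might lower read_expire like this:

/*
 * Hypothetical userspace sketch: set mq-deadline's read_expire to 250 ms
 * for sda. The attribute names match the DD_ATTR() table above; values are
 * in milliseconds and converted to jiffies by STORE_JIFFIES().
 */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/block/sda/queue/iosched/read_expire";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fprintf(f, "%d\n", 250);	/* msecs */
	fclose(f);
	return 0;
}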
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * zswap.c - zswap driver file
 *
 * zswap is a cache that takes pages that are in the process
 * of being swapped out and attempts to compress and store them in a
 * RAM-based memory pool.  This can result in a significant I/O reduction on
 * the swap device and, in the case where decompressing from RAM is faster
 * than reading from the swap device, can also improve workload performance.
 *
 * Copyright (C) 2012  Seth Jennings <sjenning@linux.vnet.ibm.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/swap.h>
#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <linux/mempolicy.h>
#include <linux/mempool.h>
#include <linux/zpool.h>
#include <crypto/acompress.h>
#include <linux/zswap.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/swapops.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/workqueue.h>
#include <linux/list_lru.h>

#include "swap.h"
#include "internal.h"

/*********************************
* statistics
**********************************/
/* The number of compressed pages currently stored in zswap */
atomic_long_t zswap_stored_pages = ATOMIC_LONG_INIT(0);

/*
 * The statistics below are not protected from concurrent access for
 * performance reasons so they may not be 100% accurate. However,
 * they do provide useful information on roughly how many times a
 * certain event is occurring.
 */
/* Pool limit was hit (see zswap_max_pool_percent) */
static u64 zswap_pool_limit_hit;
/* Pages written back when pool limit was reached */
static u64 zswap_written_back_pages;
/* Store failed due to a reclaim failure after pool limit was reached */
static u64 zswap_reject_reclaim_fail;
/* Store failed due to compression algorithm failure */
static u64 zswap_reject_compress_fail;
/* Compressed page was too big for the allocator to (optimally) store */
static u64 zswap_reject_compress_poor;
/* Load or writeback failed due to decompression failure */
static u64 zswap_decompress_fail;
/* Store failed because underlying allocator could not get memory */
static u64 zswap_reject_alloc_fail;
/* Store failed because the entry metadata could not be allocated (rare) */
static u64 zswap_reject_kmemcache_fail;

/* Shrinker work queue */
static struct workqueue_struct *shrink_wq;
/* Pool limit was hit, we need to calm down */
static bool zswap_pool_reached_full;

/*********************************
* tunables
**********************************/

#define ZSWAP_PARAM_UNSET ""

static int zswap_setup(void);

/* Enable/disable zswap */
static DEFINE_STATIC_KEY_MAYBE(CONFIG_ZSWAP_DEFAULT_ON, zswap_ever_enabled);
static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON);
static int zswap_enabled_param_set(const char *, const struct kernel_param *);
static const struct kernel_param_ops zswap_enabled_param_ops = {
	.set =		zswap_enabled_param_set,
	.get =		param_get_bool,
};
module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644);

/* Crypto compressor to use */
static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
static int zswap_compressor_param_set(const char *,
				      const struct kernel_param *);
static const struct kernel_param_ops zswap_compressor_param_ops = {
	.set =		zswap_compressor_param_set,
	.get =		param_get_charp,
	.free =		param_free_charp,
};
module_param_cb(compressor, &zswap_compressor_param_ops,
		&zswap_compressor, 0644);

/* Compressed storage zpool to use */
static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
static int zswap_zpool_param_set(const char *, const struct kernel_param *);
static const struct kernel_param_ops zswap_zpool_param_ops = {
	.set =		zswap_zpool_param_set,
	.get =		param_get_charp,
	.free =		param_free_charp,
};
module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);

/* The maximum percentage of memory that the compressed pool can occupy */
static unsigned int zswap_max_pool_percent = 20;
module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);

/* The threshold for accepting new pages after the max_pool_percent was hit */
static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */
module_param_named(accept_threshold_percent, zswap_accept_thr_percent,
		   uint, 0644);

/* Enable/disable memory pressure-based shrinker. */
static bool zswap_shrinker_enabled = IS_ENABLED(
		CONFIG_ZSWAP_SHRINKER_DEFAULT_ON);
module_param_named(shrinker_enabled, zswap_shrinker_enabled, bool, 0644);
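/*
 * Usage note (an assumption about the standard module-parameter layout, not
 * part of this file): since these parameters are registered with mode 0644,
 * they are normally writable at runtime under
 * /sys/module/zswap/parameters/, e.g. .../enabled, .../compressor,
 * .../zpool, .../max_pool_percent and .../accept_threshold_percent.
 */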
bool zswap_is_enabled(void)
{
	return zswap_enabled;
}

bool zswap_never_enabled(void)
{
	return !static_branch_maybe(CONFIG_ZSWAP_DEFAULT_ON, &zswap_ever_enabled);
}

/*********************************
* data structures
**********************************/

struct crypto_acomp_ctx {
	struct crypto_acomp *acomp;
	struct acomp_req *req;
	struct crypto_wait wait;
	u8 *buffer;
	struct mutex mutex;
	bool is_sleepable;
};

/*
 * The lock ordering is zswap_tree.lock -> zswap_pool.lru_lock.
 * The only case where lru_lock is not acquired while holding tree.lock is
 * when a zswap_entry is taken off the lru for writeback, in that case it
 * needs to be verified that it's still valid in the tree.
 */
struct zswap_pool {
	struct zpool *zpool;
	struct crypto_acomp_ctx __percpu *acomp_ctx;
	struct percpu_ref ref;
	struct list_head list;
	struct work_struct release_work;
	struct hlist_node node;
	char tfm_name[CRYPTO_MAX_ALG_NAME];
};

/* Global LRU lists shared by all zswap pools. */
static struct list_lru zswap_list_lru;

/* The lock protects zswap_next_shrink updates. */
static DEFINE_SPINLOCK(zswap_shrink_lock);
static struct mem_cgroup *zswap_next_shrink;
static struct work_struct zswap_shrink_work;
static struct shrinker *zswap_shrinker;

/*
 * struct zswap_entry
 *
 * This structure contains the metadata for tracking a single compressed
 * page within zswap.
 *
 * swpentry - associated swap entry, the offset indexes into the red-black
 *            tree
 * length - the length in bytes of the compressed page data.  Needed during
 *          decompression.
 * referenced - true if the entry recently entered the zswap pool. Unset by
 *              the writeback logic. The entry is only reclaimed by the
 *              writeback logic if referenced is unset. See comments in the
 *              shrinker section for context.
 * pool - the zswap_pool the entry's data is in
 * handle - zpool allocation handle that stores the compressed page data
 * objcg - the obj_cgroup that the compressed memory is charged to
 * lru - handle to the pool's lru used to evict pages.
 */
struct zswap_entry {
	swp_entry_t swpentry;
	unsigned int length;
	bool referenced;
	struct zswap_pool *pool;
	unsigned long handle;
	struct obj_cgroup *objcg;
	struct list_head lru;
};

static struct xarray *zswap_trees[MAX_SWAPFILES];
static unsigned int nr_zswap_trees[MAX_SWAPFILES];

/* RCU-protected iteration */
static LIST_HEAD(zswap_pools);
/* protects zswap_pools list modification */
static DEFINE_SPINLOCK(zswap_pools_lock);
/* pool counter to provide unique names to zpool */
static atomic_t zswap_pools_count = ATOMIC_INIT(0);

enum zswap_init_type {
	ZSWAP_UNINIT,
	ZSWAP_INIT_SUCCEED,
	ZSWAP_INIT_FAILED
};

static enum zswap_init_type zswap_init_state;

/* used to ensure the integrity of initialization */
static DEFINE_MUTEX(zswap_init_lock);

/* init completed, but couldn't create the initial pool */
static bool zswap_has_pool;

/*********************************
* helpers and fwd declarations
**********************************/

static inline struct xarray *swap_zswap_tree(swp_entry_t swp)
{
	return &zswap_trees[swp_type(swp)][swp_offset(swp)
		>> SWAP_ADDRESS_SPACE_SHIFT];
}

#define zswap_pool_debug(msg, p)			\
	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,	\
		 zpool_get_type((p)->zpool))

/*********************************
* pool functions
**********************************/
static void __zswap_pool_empty(struct percpu_ref *ref);

static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
{
	struct zswap_pool *pool;
	char name[38]; /* 'zswap' + 32 char (max) num + \0 */
	gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
	int ret, cpu;

	if (!zswap_has_pool) {
		/* if either are unset, pool initialization failed, and we
		 * need both params to be set correctly before trying to
		 * create a pool.
		 */
		if (!strcmp(type, ZSWAP_PARAM_UNSET))
			return NULL;
		if (!strcmp(compressor, ZSWAP_PARAM_UNSET))
			return NULL;
	}

	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
	if (!pool)
		return NULL;

	/* unique name for each pool specifically required by zsmalloc */
	snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
	pool->zpool = zpool_create_pool(type, name, gfp);
	if (!pool->zpool) {
		pr_err("%s zpool not available\n", type);
		goto error;
	}
	pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));

	strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));

	pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx);
	if (!pool->acomp_ctx) {
		pr_err("percpu alloc failed\n");
		goto error;
	}

	for_each_possible_cpu(cpu)
		mutex_init(&per_cpu_ptr(pool->acomp_ctx, cpu)->mutex);

	ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
				       &pool->node);
	if (ret)
		goto error;

	/* being the current pool takes 1 ref; this func expects the
	 * caller to always add the new pool as the current pool
	 */
	ret = percpu_ref_init(&pool->ref, __zswap_pool_empty,
			      PERCPU_REF_ALLOW_REINIT, GFP_KERNEL);
	if (ret)
		goto ref_fail;
	INIT_LIST_HEAD(&pool->list);

	zswap_pool_debug("created", pool);

	return pool;

ref_fail:
	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
error:
	if (pool->acomp_ctx)
		free_percpu(pool->acomp_ctx);
	if (pool->zpool)
		zpool_destroy_pool(pool->zpool);
	kfree(pool);
	return NULL;
}

static struct zswap_pool *__zswap_pool_create_fallback(void)
{
	bool has_comp, has_zpool;

	has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
	if (!has_comp && strcmp(zswap_compressor,
				CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) {
		pr_err("compressor %s not available, using default %s\n",
		       zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT);
		param_free_charp(&zswap_compressor);
		zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
		has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
	}
	if (!has_comp) {
		pr_err("default compressor %s not available\n",
		       zswap_compressor);
		param_free_charp(&zswap_compressor);
		zswap_compressor = ZSWAP_PARAM_UNSET;
	}

	has_zpool = zpool_has_pool(zswap_zpool_type);
	if (!has_zpool && strcmp(zswap_zpool_type,
				 CONFIG_ZSWAP_ZPOOL_DEFAULT)) {
		pr_err("zpool %s not available, using default %s\n",
		       zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT);
		param_free_charp(&zswap_zpool_type);
		zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
		has_zpool = zpool_has_pool(zswap_zpool_type);
	}
	if (!has_zpool) {
		pr_err("default zpool %s not available\n", zswap_zpool_type);
		param_free_charp(&zswap_zpool_type);
		zswap_zpool_type = ZSWAP_PARAM_UNSET;
	}

	if (!has_comp || !has_zpool)
		return NULL;

	return zswap_pool_create(zswap_zpool_type, zswap_compressor);
}

static void zswap_pool_destroy(struct zswap_pool *pool)
{
	zswap_pool_debug("destroying", pool);

	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
	free_percpu(pool->acomp_ctx);

	zpool_destroy_pool(pool->zpool);
	kfree(pool);
}

static void __zswap_pool_release(struct work_struct *work)
{
	struct zswap_pool *pool = container_of(work, typeof(*pool),
					       release_work);

	synchronize_rcu();

	/* nobody should have been able to get a ref... */
	WARN_ON(!percpu_ref_is_zero(&pool->ref));
	percpu_ref_exit(&pool->ref);

	/* pool is now off zswap_pools list and has no references. */
	zswap_pool_destroy(pool);
}
static struct zswap_pool *zswap_pool_current(void);

static void __zswap_pool_empty(struct percpu_ref *ref)
{
	struct zswap_pool *pool;

	pool = container_of(ref, typeof(*pool), ref);

	spin_lock_bh(&zswap_pools_lock);

	WARN_ON(pool == zswap_pool_current());

	list_del_rcu(&pool->list);

	INIT_WORK(&pool->release_work, __zswap_pool_release);
	schedule_work(&pool->release_work);

	spin_unlock_bh(&zswap_pools_lock);
}

static int __must_check zswap_pool_tryget(struct zswap_pool *pool)
{
	if (!pool)
		return 0;

	return percpu_ref_tryget(&pool->ref);
}

/* The caller must already have a reference. */
static void zswap_pool_get(struct zswap_pool *pool)
{
	percpu_ref_get(&pool->ref);
}

static void zswap_pool_put(struct zswap_pool *pool)
{
	percpu_ref_put(&pool->ref);
}

static struct zswap_pool *__zswap_pool_current(void)
{
	struct zswap_pool *pool;

	pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
	WARN_ONCE(!pool && zswap_has_pool,
		  "%s: no page storage pool!\n", __func__);

	return pool;
}

static struct zswap_pool *zswap_pool_current(void)
{
	assert_spin_locked(&zswap_pools_lock);

	return __zswap_pool_current();
}

static struct zswap_pool *zswap_pool_current_get(void)
{
	struct zswap_pool *pool;

	rcu_read_lock();

	pool = __zswap_pool_current();
	if (!zswap_pool_tryget(pool))
		pool = NULL;

	rcu_read_unlock();

	return pool;
}

/* type and compressor must be null-terminated */
static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
{
	struct zswap_pool *pool;

	assert_spin_locked(&zswap_pools_lock);

	list_for_each_entry_rcu(pool, &zswap_pools, list) {
		if (strcmp(pool->tfm_name, compressor))
			continue;
		if (strcmp(zpool_get_type(pool->zpool), type))
			continue;
		/* if we can't get it, it's about to be destroyed */
		if (!zswap_pool_tryget(pool))
			continue;
		return pool;
	}

	return NULL;
}

static unsigned long zswap_max_pages(void)
{
	return totalram_pages() * zswap_max_pool_percent / 100;
}

static unsigned long zswap_accept_thr_pages(void)
{
	return zswap_max_pages() * zswap_accept_thr_percent / 100;
}

unsigned long zswap_total_pages(void)
{
	struct zswap_pool *pool;
	unsigned long total = 0;

	rcu_read_lock();
	list_for_each_entry_rcu(pool, &zswap_pools, list)
		total += zpool_get_total_pages(pool->zpool);
	rcu_read_unlock();

	return total;
}

static bool zswap_check_limits(void)
{
	unsigned long cur_pages = zswap_total_pages();
	unsigned long max_pages = zswap_max_pages();

	if (cur_pages >= max_pages) {
		zswap_pool_limit_hit++;
		zswap_pool_reached_full = true;
	} else if (zswap_pool_reached_full &&
		   cur_pages <= zswap_accept_thr_pages()) {
		zswap_pool_reached_full = false;
	}
	return zswap_pool_reached_full;
}
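/*
 * A worked example of the limit checks above, assuming a machine with 8 GiB
 * of RAM (totalram_pages() ~= 2097152 4-KiB pages) and the defaults
 * max_pool_percent = 20 and accept_thr_percent = 90:
 * zswap_max_pages() ~= 419430 pages (~1.6 GiB of compressed data) and
 * zswap_accept_thr_pages() ~= 377487 pages (~1.44 GiB). Once the pool
 * reaches the maximum, zswap_pool_reached_full stays true until usage
 * falls back below the acceptance threshold.
 */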
/*********************************
* param callbacks
**********************************/

static bool zswap_pool_changed(const char *s, const struct kernel_param *kp)
{
	/* no change required */
	if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool)
		return false;
	return true;
}

/* val must be a null-terminated string */
static int __zswap_param_set(const char *val, const struct kernel_param *kp,
			     char *type, char *compressor)
{
	struct zswap_pool *pool, *put_pool = NULL;
	char *s = strstrip((char *)val);
	int ret = 0;
	bool new_pool = false;

	mutex_lock(&zswap_init_lock);
	switch (zswap_init_state) {
	case ZSWAP_UNINIT:
		/* if this is load-time (pre-init) param setting,
		 * don't create a pool; that's done during init.
		 */
		ret = param_set_charp(s, kp);
		break;
	case ZSWAP_INIT_SUCCEED:
		new_pool = zswap_pool_changed(s, kp);
		break;
	case ZSWAP_INIT_FAILED:
		pr_err("can't set param, initialization failed\n");
		ret = -ENODEV;
	}
	mutex_unlock(&zswap_init_lock);

	/* no need to create a new pool, return directly */
	if (!new_pool)
		return ret;

	if (!type) {
		if (!zpool_has_pool(s)) {
			pr_err("zpool %s not available\n", s);
			return -ENOENT;
		}
		type = s;
	} else if (!compressor) {
		if (!crypto_has_acomp(s, 0, 0)) {
			pr_err("compressor %s not available\n", s);
			return -ENOENT;
		}
		compressor = s;
	} else {
		WARN_ON(1);
		return -EINVAL;
	}

	spin_lock_bh(&zswap_pools_lock);

	pool = zswap_pool_find_get(type, compressor);
	if (pool) {
		zswap_pool_debug("using existing", pool);
		WARN_ON(pool == zswap_pool_current());
		list_del_rcu(&pool->list);
	}

	spin_unlock_bh(&zswap_pools_lock);

	if (!pool)
		pool = zswap_pool_create(type, compressor);
	else {
		/*
		 * Restore the initial ref dropped by percpu_ref_kill()
		 * when the pool was decommissioned and switch it again
		 * to percpu mode.
		 */
		percpu_ref_resurrect(&pool->ref);

		/* Drop the ref from zswap_pool_find_get(). */
		zswap_pool_put(pool);
	}

	if (pool)
		ret = param_set_charp(s, kp);
	else
		ret = -EINVAL;

	spin_lock_bh(&zswap_pools_lock);

	if (!ret) {
		put_pool = zswap_pool_current();
		list_add_rcu(&pool->list, &zswap_pools);
		zswap_has_pool = true;
	} else if (pool) {
		/* add the possibly pre-existing pool to the end of the pools
		 * list; if it's new (and empty) then it'll be removed and
		 * destroyed by the put after we drop the lock
		 */
		list_add_tail_rcu(&pool->list, &zswap_pools);
		put_pool = pool;
	}

	spin_unlock_bh(&zswap_pools_lock);

	if (!zswap_has_pool && !pool) {
		/* if initial pool creation failed, and this pool creation also
		 * failed, maybe both compressor and zpool params were bad.
		 * Allow changing this param, so pool creation will succeed
		 * when the other param is changed. We already verified this
		 * param is ok in the zpool_has_pool() or crypto_has_acomp()
		 * checks above.
		 */
		ret = param_set_charp(s, kp);
	}

	/* drop the ref from either the old current pool,
	 * or the new pool we failed to add
	 */
	if (put_pool)
		percpu_ref_kill(&put_pool->ref);

	return ret;
}

static int zswap_compressor_param_set(const char *val,
				      const struct kernel_param *kp)
{
	return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
}

static int zswap_zpool_param_set(const char *val,
				 const struct kernel_param *kp)
{
	return __zswap_param_set(val, kp, NULL, zswap_compressor);
}

static int zswap_enabled_param_set(const char *val,
				   const struct kernel_param *kp)
{
	int ret = -ENODEV;

	/* if this is load-time (pre-init) param setting, only set param. */
	if (system_state != SYSTEM_RUNNING)
		return param_set_bool(val, kp);

	mutex_lock(&zswap_init_lock);
	switch (zswap_init_state) {
	case ZSWAP_UNINIT:
		if (zswap_setup())
			break;
		fallthrough;
	case ZSWAP_INIT_SUCCEED:
		if (!zswap_has_pool)
			pr_err("can't enable, no pool configured\n");
		else
			ret = param_set_bool(val, kp);
		break;
	case ZSWAP_INIT_FAILED:
		pr_err("can't enable, initialization failed\n");
	}
	mutex_unlock(&zswap_init_lock);

	return ret;
}

/*********************************
* lru functions
**********************************/

/* should be called under RCU */
#ifdef CONFIG_MEMCG
static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
{
	return entry->objcg ? obj_cgroup_memcg(entry->objcg) : NULL;
}
#else
static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
{
	return NULL;
}
#endif

static inline int entry_to_nid(struct zswap_entry *entry)
{
	return page_to_nid(virt_to_page(entry));
}

static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
{
	int nid = entry_to_nid(entry);
	struct mem_cgroup *memcg;

	/*
	 * Note that it is safe to use rcu_read_lock() here, even in the face
	 * of concurrent memcg offlining:
	 *
	 * 1. list_lru_add() is called before list_lru_one is dead. The
	 *    new entry will be reparented to memcg's parent's list_lru.
	 * 2. list_lru_add() is called after list_lru_one is dead. The
	 *    new entry will be added directly to memcg's parent's list_lru.
	 *
	 * Similar reasoning holds for list_lru_del().
	 */
	rcu_read_lock();
	memcg = mem_cgroup_from_entry(entry);
	/* will always succeed */
	list_lru_add(list_lru, &entry->lru, nid, memcg);
	rcu_read_unlock();
}

static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry)
{
	int nid = entry_to_nid(entry);
	struct mem_cgroup *memcg;

	rcu_read_lock();
	memcg = mem_cgroup_from_entry(entry);
	/* will always succeed */
	list_lru_del(list_lru, &entry->lru, nid, memcg);
	rcu_read_unlock();
}

void zswap_lruvec_state_init(struct lruvec *lruvec)
{
	atomic_long_set(&lruvec->zswap_lruvec_state.nr_disk_swapins, 0);
}

void zswap_folio_swapin(struct folio *folio)
{
	struct lruvec *lruvec;

	if (folio) {
		lruvec = folio_lruvec(folio);
		atomic_long_inc(&lruvec->zswap_lruvec_state.nr_disk_swapins);
	}
}

/*
 * This function should be called when a memcg is being offlined.
 *
 * Since the global shrinker shrink_worker() may hold a reference
 * of the memcg, we must check and release the reference in
 * zswap_next_shrink.
 *
 * shrink_worker() must handle the case where this function releases
 * the reference of memcg being shrunk.
 */
void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
{
	/* lock out zswap shrinker walking memcg tree */
	spin_lock(&zswap_shrink_lock);
	if (zswap_next_shrink == memcg) {
		do {
			zswap_next_shrink = mem_cgroup_iter(NULL,
					zswap_next_shrink, NULL);
		} while (zswap_next_shrink &&
			 !mem_cgroup_online(zswap_next_shrink));
	}
	spin_unlock(&zswap_shrink_lock);
}

/*********************************
* zswap entry functions
**********************************/
static struct kmem_cache *zswap_entry_cache;

static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid)
{
	struct zswap_entry *entry;

	entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid);
	if (!entry)
		return NULL;
	return entry;
}

static void zswap_entry_cache_free(struct zswap_entry *entry)
{
	kmem_cache_free(zswap_entry_cache, entry);
}

/*
 * Carries out the common pattern of freeing an entry's zpool allocation,
 * freeing the entry itself, and decrementing the number of stored pages.
 */
static void zswap_entry_free(struct zswap_entry *entry)
{
	zswap_lru_del(&zswap_list_lru, entry);
	zpool_free(entry->pool->zpool, entry->handle);
	zswap_pool_put(entry->pool);
	if (entry->objcg) {
		obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
		obj_cgroup_put(entry->objcg);
	}
	zswap_entry_cache_free(entry);
	atomic_long_dec(&zswap_stored_pages);
}

/*********************************
* compressed storage functions
**********************************/
static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
{
	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
	struct crypto_acomp *acomp = NULL;
	struct acomp_req *req = NULL;
	u8 *buffer = NULL;
	int ret;

	buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
	if (!buffer) {
		ret = -ENOMEM;
		goto fail;
	}

	acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
	if (IS_ERR(acomp)) {
		pr_err("could not alloc crypto acomp %s : %ld\n",
		       pool->tfm_name, PTR_ERR(acomp));
		ret = PTR_ERR(acomp);
		goto fail;
	}

	req = acomp_request_alloc(acomp);
	if (!req) {
		pr_err("could not alloc crypto acomp_request %s\n",
		       pool->tfm_name);
		ret = -ENOMEM;
		goto fail;
	}

	/*
	 * Only hold the mutex after completing allocations, otherwise we may
	 * recurse into zswap through reclaim and attempt to hold the mutex
	 * again resulting in a deadlock.
	 */
	mutex_lock(&acomp_ctx->mutex);
	crypto_init_wait(&acomp_ctx->wait);

	/*
	 * if the backend of acomp is async zip, crypto_req_done() will wakeup
	 * crypto_wait_req(); if the backend of acomp is scomp, the callback
	 * won't be called, crypto_wait_req() will return without blocking.
	 */
	acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
				   crypto_req_done, &acomp_ctx->wait);

	acomp_ctx->buffer = buffer;
	acomp_ctx->acomp = acomp;
	acomp_ctx->is_sleepable = acomp_is_async(acomp);
	acomp_ctx->req = req;
	mutex_unlock(&acomp_ctx->mutex);
	return 0;

fail:
	if (acomp)
		crypto_free_acomp(acomp);
	kfree(buffer);
	return ret;
}

static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
{
	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
	struct acomp_req *req;
	struct crypto_acomp *acomp;
	u8 *buffer;

	if (IS_ERR_OR_NULL(acomp_ctx))
		return 0;

	mutex_lock(&acomp_ctx->mutex);
	req = acomp_ctx->req;
	acomp = acomp_ctx->acomp;
	buffer = acomp_ctx->buffer;
	acomp_ctx->req = NULL;
	acomp_ctx->acomp = NULL;
	acomp_ctx->buffer = NULL;
	mutex_unlock(&acomp_ctx->mutex);

	/*
	 * Do the actual freeing after releasing the mutex to avoid subtle
	 * locking dependencies causing deadlocks.
	 */
	if (!IS_ERR_OR_NULL(req))
		acomp_request_free(req);
	if (!IS_ERR_OR_NULL(acomp))
		crypto_free_acomp(acomp);
	kfree(buffer);

	return 0;
}

static struct crypto_acomp_ctx *acomp_ctx_get_cpu_lock(struct zswap_pool *pool)
{
	struct crypto_acomp_ctx *acomp_ctx;

	for (;;) {
		acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
		mutex_lock(&acomp_ctx->mutex);
		if (likely(acomp_ctx->req))
			return acomp_ctx;

		/*
		 * It is possible that we were migrated to a different CPU
		 * after getting the per-CPU ctx but before the mutex was
		 * acquired. If the old CPU got offlined, zswap_cpu_comp_dead()
		 * could have already freed ctx->req (among other things) and
		 * set it to NULL. Just try again on the new CPU that we ended
		 * up on.
		 */
		mutex_unlock(&acomp_ctx->mutex);
	}
}
		 */
		mutex_unlock(&acomp_ctx->mutex);
	}
}

static void acomp_ctx_put_unlock(struct crypto_acomp_ctx *acomp_ctx)
{
	mutex_unlock(&acomp_ctx->mutex);
}

static bool zswap_compress(struct page *page, struct zswap_entry *entry,
			   struct zswap_pool *pool)
{
	struct crypto_acomp_ctx *acomp_ctx;
	struct scatterlist input, output;
	int comp_ret = 0, alloc_ret = 0;
	unsigned int dlen = PAGE_SIZE;
	unsigned long handle;
	struct zpool *zpool;
	gfp_t gfp;
	u8 *dst;

	acomp_ctx = acomp_ctx_get_cpu_lock(pool);
	dst = acomp_ctx->buffer;
	sg_init_table(&input, 1);
	sg_set_page(&input, page, PAGE_SIZE, 0);

	/*
	 * We need PAGE_SIZE * 2 here, since there may be an over-compression
	 * case, and hardware accelerators may not check the dst buffer size.
	 * Give the dst buffer enough length to avoid a buffer overflow.
	 */
	sg_init_one(&output, dst, PAGE_SIZE * 2);
	acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);

	/*
	 * It may look a little silly that we send an asynchronous request and
	 * then wait for its completion synchronously; this makes the process
	 * synchronous in fact.
	 * Theoretically, acomp lets users submit multiple requests to one
	 * acomp instance and have them completed simultaneously. But in this
	 * case zswap stores and loads page by page, and there is no way to
	 * send the second page before the first page is done in a single
	 * thread doing zswap.
	 * Different threads running on different CPUs use different acomp
	 * instances, however, so multiple threads can do (de)compression in
	 * parallel.
	 */
	comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
	dlen = acomp_ctx->req->dlen;
	if (comp_ret)
		goto unlock;

	zpool = pool->zpool;
	gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE;
	alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle, page_to_nid(page));
	if (alloc_ret)
		goto unlock;

	zpool_obj_write(zpool, handle, dst, dlen);
	entry->handle = handle;
	entry->length = dlen;

unlock:
	if (comp_ret == -ENOSPC || alloc_ret == -ENOSPC)
		zswap_reject_compress_poor++;
	else if (comp_ret)
		zswap_reject_compress_fail++;
	else if (alloc_ret)
		zswap_reject_alloc_fail++;

	acomp_ctx_put_unlock(acomp_ctx);
	return comp_ret == 0 && alloc_ret == 0;
}

static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
{
	struct zpool *zpool = entry->pool->zpool;
	struct scatterlist input, output;
	struct crypto_acomp_ctx *acomp_ctx;
	int decomp_ret, dlen;
	u8 *src, *obj;

	acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool);
	obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffer);

	/*
	 * zpool_obj_read_begin() might return a kmap address of highmem when
	 * acomp_ctx->buffer is not used. However, sg_init_one() does not
	 * handle highmem addresses, so copy the object to acomp_ctx->buffer.
*/ if (virt_addr_valid(obj)) { src = obj; } else { WARN_ON_ONCE(obj == acomp_ctx->buffer); memcpy(acomp_ctx->buffer, obj, entry->length); src = acomp_ctx->buffer; } sg_init_one(&input, src, entry->length); sg_init_table(&output, 1); sg_set_folio(&output, folio, PAGE_SIZE, 0); acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE); decomp_ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); dlen = acomp_ctx->req->dlen; zpool_obj_read_end(zpool, entry->handle, obj); acomp_ctx_put_unlock(acomp_ctx); if (!decomp_ret && dlen == PAGE_SIZE) return true; zswap_decompress_fail++; pr_alert_ratelimited("Decompression error from zswap (%d:%lu %s %u->%d)\n", swp_type(entry->swpentry), swp_offset(entry->swpentry), entry->pool->tfm_name, entry->length, dlen); return false; } /********************************* * writeback code **********************************/ /* * Attempts to free an entry by adding a folio to the swap cache, * decompressing the entry data into the folio, and issuing a * bio write to write the folio back to the swap device. * * This can be thought of as a "resumed writeback" of the folio * to the swap device. We are basically resuming the same swap * writeback path that was intercepted with the zswap_store() * in the first place. After the folio has been decompressed into * the swap cache, the compressed version stored by zswap can be * freed. */ static int zswap_writeback_entry(struct zswap_entry *entry, swp_entry_t swpentry) { struct xarray *tree; pgoff_t offset = swp_offset(swpentry); struct folio *folio; struct mempolicy *mpol; bool folio_was_allocated; struct swap_info_struct *si; int ret = 0; /* try to allocate swap cache folio */ si = get_swap_device(swpentry); if (!si) return -EEXIST; mpol = get_task_policy(current); folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, NO_INTERLEAVE_INDEX, &folio_was_allocated, true); put_swap_device(si); if (!folio) return -ENOMEM; /* * Found an existing folio, we raced with swapin or concurrent * shrinker. We generally writeback cold folios from zswap, and * swapin means the folio just became hot, so skip this folio. * For unlikely concurrent shrinker case, it will be unlinked * and freed when invalidated by the concurrent shrinker anyway. */ if (!folio_was_allocated) { ret = -EEXIST; goto out; } /* * folio is locked, and the swapcache is now secured against * concurrent swapping to and from the slot, and concurrent * swapoff so we can safely dereference the zswap tree here. * Verify that the swap entry hasn't been invalidated and recycled * behind our backs, to avoid overwriting a new swap folio with * old compressed data. Only when this is successful can the entry * be dereferenced. */ tree = swap_zswap_tree(swpentry); if (entry != xa_load(tree, offset)) { ret = -ENOMEM; goto out; } if (!zswap_decompress(entry, folio)) { ret = -EIO; goto out; } xa_erase(tree, offset); count_vm_event(ZSWPWB); if (entry->objcg) count_objcg_events(entry->objcg, ZSWPWB, 1); zswap_entry_free(entry); /* folio is up to date */ folio_mark_uptodate(folio); /* move it to the tail of the inactive list after end_writeback */ folio_set_reclaim(folio); /* start writeback */ __swap_writepage(folio, NULL); out: if (ret && ret != -EEXIST) { delete_from_swap_cache(folio); folio_unlock(folio); } folio_put(folio); return ret; } /********************************* * shrinker functions **********************************/ /* * The dynamic shrinker is modulated by the following factors: * * 1. 
Each zswap entry has a referenced bit, which the shrinker unsets (giving
 *    the entry a second chance) before rotating it in the LRU list. If the
 *    entry is considered again by the shrinker, with its referenced bit unset,
 *    it is written back. The writeback rate as a result is dynamically
 *    adjusted by the pool activities - if the pool is dominated by new entries
 *    (i.e. lots of recent zswapouts), these entries will be protected and
 *    the writeback rate will slow down. On the other hand, if the pool has a
 *    lot of stagnant entries, these entries will be reclaimed immediately,
 *    effectively increasing the writeback rate.
 *
 * 2. Swapins counter: If we observe swapins, it is a sign that we are
 *    overshrinking and should slow down. We maintain a swapins counter, which
 *    is consumed and subtracted from the number of eligible objects on the
 *    LRU in zswap_shrinker_count().
 *
 * 3. Compression ratio. The better the workload compresses, the less gains we
 *    can expect from writeback. We scale down the number of objects available
 *    for reclaim by this ratio.
 */
static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
				       void *arg)
{
	struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
	bool *encountered_page_in_swapcache = (bool *)arg;
	swp_entry_t swpentry;
	enum lru_status ret = LRU_REMOVED_RETRY;
	int writeback_result;

	/*
	 * Second chance algorithm: if the entry has its referenced bit set,
	 * give it a second chance. Only clear the referenced bit and rotate
	 * it in zswap's LRU list.
	 */
	if (entry->referenced) {
		entry->referenced = false;
		return LRU_ROTATE;
	}

	/*
	 * As soon as we drop the LRU lock, the entry can be freed by
	 * a concurrent invalidation. This means the following:
	 *
	 * 1. We extract the swp_entry_t to the stack, allowing
	 *    zswap_writeback_entry() to pin the swap entry and
	 *    then validate the zswap entry against that swap entry's
	 *    tree using pointer value comparison. Only when that
	 *    is successful can the entry be dereferenced.
	 *
	 * 2. Usually, objects are taken off the LRU for reclaim. In
	 *    this case this isn't possible, because if reclaim fails
	 *    for whatever reason, we have no means of knowing if the
	 *    entry is alive to put it back on the LRU.
	 *
	 * So rotate it before dropping the lock. If the entry is
	 * written back or invalidated, the free path will unlink
	 * it. For failures, rotation is the right thing as well.
	 *
	 * Temporary failures, where the same entry should be tried
	 * again immediately, almost never happen for this shrinker.
	 * We don't do any trylocking; -ENOMEM comes closest,
	 * but that's extremely rare and doesn't happen spuriously
	 * either. Don't bother distinguishing this case.
	 */
	list_move_tail(item, &l->list);

	/*
	 * Once the lru lock is dropped, the entry might get freed. The
	 * swpentry is copied to the stack, and entry isn't deref'd again
	 * until the entry is verified to still be alive in the tree.
	 */
	swpentry = entry->swpentry;

	/*
	 * It's safe to drop the lock here because we return either
	 * LRU_REMOVED_RETRY, LRU_RETRY or LRU_STOP.
	 */
	spin_unlock(&l->lock);

	writeback_result = zswap_writeback_entry(entry, swpentry);

	if (writeback_result) {
		zswap_reject_reclaim_fail++;
		ret = LRU_RETRY;

		/*
		 * Encountering a page already in swap cache is a sign that we
		 * are shrinking into the warmer region. We should terminate
		 * shrinking (if we're in the dynamic shrinker context).
		 */
		if (writeback_result == -EEXIST && encountered_page_in_swapcache) {
			ret = LRU_STOP;
			*encountered_page_in_swapcache = true;
		}
	} else {
		zswap_written_back_pages++;
	}

	return ret;
}

static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
		struct shrink_control *sc)
{
	unsigned long shrink_ret;
	bool encountered_page_in_swapcache = false;

	if (!zswap_shrinker_enabled ||
	    !mem_cgroup_zswap_writeback_enabled(sc->memcg)) {
		sc->nr_scanned = 0;
		return SHRINK_STOP;
	}

	shrink_ret = list_lru_shrink_walk(&zswap_list_lru, sc, &shrink_memcg_cb,
		&encountered_page_in_swapcache);

	if (encountered_page_in_swapcache)
		return SHRINK_STOP;

	return shrink_ret ? shrink_ret : SHRINK_STOP;
}

static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
		struct shrink_control *sc)
{
	struct mem_cgroup *memcg = sc->memcg;
	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid));
	atomic_long_t *nr_disk_swapins =
		&lruvec->zswap_lruvec_state.nr_disk_swapins;
	unsigned long nr_backing, nr_stored, nr_freeable, nr_disk_swapins_cur,
		nr_remain;

	if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg))
		return 0;

	/*
	 * The shrinker resumes swap writeback, which will enter block
	 * and may enter fs. XXX: Harmonize with vmscan.c __GFP_FS
	 * rules (may_enter_fs()), which apply on a per-folio basis.
	 */
	if (!gfp_has_io_fs(sc->gfp_mask))
		return 0;

	/*
	 * For memcg, use the cgroup-wide ZSWAP stats since we don't
	 * have them per-node and thus per-lruvec. Careful if memcg is
	 * runtime-disabled: we can get sc->memcg == NULL, which is ok
	 * for the lruvec, but not for memcg_page_state().
	 *
	 * Without memcg, use the zswap pool-wide metrics.
	 */
	if (!mem_cgroup_disabled()) {
		mem_cgroup_flush_stats(memcg);
		nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT;
		nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
	} else {
		nr_backing = zswap_total_pages();
		nr_stored = atomic_long_read(&zswap_stored_pages);
	}

	if (!nr_stored)
		return 0;

	nr_freeable = list_lru_shrink_count(&zswap_list_lru, sc);
	if (!nr_freeable)
		return 0;

	/*
	 * Subtract from the LRU size the number of pages that were recently
	 * swapped in from disk. The idea is that had we protected zswap's LRU
	 * by this number of pages, these disk swapins would not have happened.
	 */
	nr_disk_swapins_cur = atomic_long_read(nr_disk_swapins);
	do {
		if (nr_freeable >= nr_disk_swapins_cur)
			nr_remain = 0;
		else
			nr_remain = nr_disk_swapins_cur - nr_freeable;
	} while (!atomic_long_try_cmpxchg(
		nr_disk_swapins, &nr_disk_swapins_cur, nr_remain));

	nr_freeable -= nr_disk_swapins_cur - nr_remain;
	if (!nr_freeable)
		return 0;

	/*
	 * Scale the number of freeable pages by the memory saving factor.
	 * This ensures that the better zswap compresses memory, the fewer
	 * pages we will evict to swap (as it will otherwise incur IO for
	 * relatively small memory saving).
	 */
	return mult_frac(nr_freeable, nr_backing, nr_stored);
}

static struct shrinker *zswap_alloc_shrinker(void)
{
	struct shrinker *shrinker;

	shrinker =
		shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap");
	if (!shrinker)
		return NULL;

	shrinker->scan_objects = zswap_shrinker_scan;
	shrinker->count_objects = zswap_shrinker_count;
	shrinker->batch = 0;
	shrinker->seeks = DEFAULT_SEEKS;
	return shrinker;
}

static int shrink_memcg(struct mem_cgroup *memcg)
{
	int nid, shrunk = 0, scanned = 0;

	if (!mem_cgroup_zswap_writeback_enabled(memcg))
		return -ENOENT;

	/*
	 * Skip zombies because their LRUs are reparented and we would be
	 * reclaiming from the parent instead of the dead memcg.
*/ if (memcg && !mem_cgroup_online(memcg)) return -ENOENT; for_each_node_state(nid, N_NORMAL_MEMORY) { unsigned long nr_to_walk = 1; shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg, &shrink_memcg_cb, NULL, &nr_to_walk); scanned += 1 - nr_to_walk; } if (!scanned) return -ENOENT; return shrunk ? 0 : -EAGAIN; } static void shrink_worker(struct work_struct *w) { struct mem_cgroup *memcg; int ret, failures = 0, attempts = 0; unsigned long thr; /* Reclaim down to the accept threshold */ thr = zswap_accept_thr_pages(); /* * Global reclaim will select cgroup in a round-robin fashion from all * online memcgs, but memcgs that have no pages in zswap and * writeback-disabled memcgs (memory.zswap.writeback=0) are not * candidates for shrinking. * * Shrinking will be aborted if we encounter the following * MAX_RECLAIM_RETRIES times: * - No writeback-candidate memcgs found in a memcg tree walk. * - Shrinking a writeback-candidate memcg failed. * * We save iteration cursor memcg into zswap_next_shrink, * which can be modified by the offline memcg cleaner * zswap_memcg_offline_cleanup(). * * Since the offline cleaner is called only once, we cannot leave an * offline memcg reference in zswap_next_shrink. * We can rely on the cleaner only if we get online memcg under lock. * * If we get an offline memcg, we cannot determine if the cleaner has * already been called or will be called later. We must put back the * reference before returning from this function. Otherwise, the * offline memcg left in zswap_next_shrink will hold the reference * until the next run of shrink_worker(). */ do { /* * Start shrinking from the next memcg after zswap_next_shrink. * When the offline cleaner has already advanced the cursor, * advancing the cursor here overlooks one memcg, but this * should be negligibly rare. * * If we get an online memcg, keep the extra reference in case * the original one obtained by mem_cgroup_iter() is dropped by * zswap_memcg_offline_cleanup() while we are shrinking the * memcg. */ spin_lock(&zswap_shrink_lock); do { memcg = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); zswap_next_shrink = memcg; } while (memcg && !mem_cgroup_tryget_online(memcg)); spin_unlock(&zswap_shrink_lock); if (!memcg) { /* * Continue shrinking without incrementing failures if * we found candidate memcgs in the last tree walk. */ if (!attempts && ++failures == MAX_RECLAIM_RETRIES) break; attempts = 0; goto resched; } ret = shrink_memcg(memcg); /* drop the extra reference */ mem_cgroup_put(memcg); /* * There are no writeback-candidate pages in the memcg. * This is not an issue as long as we can find another memcg * with pages in zswap. Skip this without incrementing attempts * and failures. 
*/ if (ret == -ENOENT) continue; ++attempts; if (ret && ++failures == MAX_RECLAIM_RETRIES) break; resched: cond_resched(); } while (zswap_total_pages() > thr); } /********************************* * main API **********************************/ static bool zswap_store_page(struct page *page, struct obj_cgroup *objcg, struct zswap_pool *pool) { swp_entry_t page_swpentry = page_swap_entry(page); struct zswap_entry *entry, *old; /* allocate entry */ entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page)); if (!entry) { zswap_reject_kmemcache_fail++; return false; } if (!zswap_compress(page, entry, pool)) goto compress_failed; old = xa_store(swap_zswap_tree(page_swpentry), swp_offset(page_swpentry), entry, GFP_KERNEL); if (xa_is_err(old)) { int err = xa_err(old); WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err); zswap_reject_alloc_fail++; goto store_failed; } /* * We may have had an existing entry that became stale when * the folio was redirtied and now the new version is being * swapped out. Get rid of the old. */ if (old) zswap_entry_free(old); /* * The entry is successfully compressed and stored in the tree, there is * no further possibility of failure. Grab refs to the pool and objcg, * charge zswap memory, and increment zswap_stored_pages. * The opposite actions will be performed by zswap_entry_free() * when the entry is removed from the tree. */ zswap_pool_get(pool); if (objcg) { obj_cgroup_get(objcg); obj_cgroup_charge_zswap(objcg, entry->length); } atomic_long_inc(&zswap_stored_pages); /* * We finish initializing the entry while it's already in xarray. * This is safe because: * * 1. Concurrent stores and invalidations are excluded by folio lock. * * 2. Writeback is excluded by the entry not being on the LRU yet. * The publishing order matters to prevent writeback from seeing * an incoherent entry. 
*/ entry->pool = pool; entry->swpentry = page_swpentry; entry->objcg = objcg; entry->referenced = true; if (entry->length) { INIT_LIST_HEAD(&entry->lru); zswap_lru_add(&zswap_list_lru, entry); } return true; store_failed: zpool_free(pool->zpool, entry->handle); compress_failed: zswap_entry_cache_free(entry); return false; } bool zswap_store(struct folio *folio) { long nr_pages = folio_nr_pages(folio); swp_entry_t swp = folio->swap; struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; struct zswap_pool *pool; bool ret = false; long index; VM_WARN_ON_ONCE(!folio_test_locked(folio)); VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); if (!zswap_enabled) goto check_old; objcg = get_obj_cgroup_from_folio(folio); if (objcg && !obj_cgroup_may_zswap(objcg)) { memcg = get_mem_cgroup_from_objcg(objcg); if (shrink_memcg(memcg)) { mem_cgroup_put(memcg); goto put_objcg; } mem_cgroup_put(memcg); } if (zswap_check_limits()) goto put_objcg; pool = zswap_pool_current_get(); if (!pool) goto put_objcg; if (objcg) { memcg = get_mem_cgroup_from_objcg(objcg); if (memcg_list_lru_alloc(memcg, &zswap_list_lru, GFP_KERNEL)) { mem_cgroup_put(memcg); goto put_pool; } mem_cgroup_put(memcg); } for (index = 0; index < nr_pages; ++index) { struct page *page = folio_page(folio, index); if (!zswap_store_page(page, objcg, pool)) goto put_pool; } if (objcg) count_objcg_events(objcg, ZSWPOUT, nr_pages); count_vm_events(ZSWPOUT, nr_pages); ret = true; put_pool: zswap_pool_put(pool); put_objcg: obj_cgroup_put(objcg); if (!ret && zswap_pool_reached_full) queue_work(shrink_wq, &zswap_shrink_work); check_old: /* * If the zswap store fails or zswap is disabled, we must invalidate * the possibly stale entries which were previously stored at the * offsets corresponding to each page of the folio. Otherwise, * writeback could overwrite the new data in the swapfile. */ if (!ret) { unsigned type = swp_type(swp); pgoff_t offset = swp_offset(swp); struct zswap_entry *entry; struct xarray *tree; for (index = 0; index < nr_pages; ++index) { tree = swap_zswap_tree(swp_entry(type, offset + index)); entry = xa_erase(tree, offset + index); if (entry) zswap_entry_free(entry); } } return ret; } /** * zswap_load() - load a folio from zswap * @folio: folio to load * * Return: 0 on success, with the folio unlocked and marked up-to-date, or one * of the following error codes: * * -EIO: if the swapped out content was in zswap, but could not be loaded * into the page due to a decompression failure. The folio is unlocked, but * NOT marked up-to-date, so that an IO error is emitted (e.g. do_swap_page() * will SIGBUS). * * -EINVAL: if the swapped out content was in zswap, but the page belongs * to a large folio, which is not supported by zswap. The folio is unlocked, * but NOT marked up-to-date, so that an IO error is emitted (e.g. * do_swap_page() will SIGBUS). * * -ENOENT: if the swapped out content was not in zswap. The folio remains * locked on return. */ int zswap_load(struct folio *folio) { swp_entry_t swp = folio->swap; pgoff_t offset = swp_offset(swp); bool swapcache = folio_test_swapcache(folio); struct xarray *tree = swap_zswap_tree(swp); struct zswap_entry *entry; VM_WARN_ON_ONCE(!folio_test_locked(folio)); if (zswap_never_enabled()) return -ENOENT; /* * Large folios should not be swapped in while zswap is being used, as * they are not properly handled. Zswap does not properly load large * folios, and a large folio may only be partially in zswap. 
*/ if (WARN_ON_ONCE(folio_test_large(folio))) { folio_unlock(folio); return -EINVAL; } entry = xa_load(tree, offset); if (!entry) return -ENOENT; if (!zswap_decompress(entry, folio)) { folio_unlock(folio); return -EIO; } folio_mark_uptodate(folio); count_vm_event(ZSWPIN); if (entry->objcg) count_objcg_events(entry->objcg, ZSWPIN, 1); /* * When reading into the swapcache, invalidate our entry. The * swapcache can be the authoritative owner of the page and * its mappings, and the pressure that results from having two * in-memory copies outweighs any benefits of caching the * compression work. * * (Most swapins go through the swapcache. The notable * exception is the singleton fault on SWP_SYNCHRONOUS_IO * files, which reads into a private page and may free it if * the fault fails. We remain the primary owner of the entry.) */ if (swapcache) { folio_mark_dirty(folio); xa_erase(tree, offset); zswap_entry_free(entry); } folio_unlock(folio); return 0; } void zswap_invalidate(swp_entry_t swp) { pgoff_t offset = swp_offset(swp); struct xarray *tree = swap_zswap_tree(swp); struct zswap_entry *entry; if (xa_empty(tree)) return; entry = xa_erase(tree, offset); if (entry) zswap_entry_free(entry); } int zswap_swapon(int type, unsigned long nr_pages) { struct xarray *trees, *tree; unsigned int nr, i; nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); trees = kvcalloc(nr, sizeof(*tree), GFP_KERNEL); if (!trees) { pr_err("alloc failed, zswap disabled for swap type %d\n", type); return -ENOMEM; } for (i = 0; i < nr; i++) xa_init(trees + i); nr_zswap_trees[type] = nr; zswap_trees[type] = trees; return 0; } void zswap_swapoff(int type) { struct xarray *trees = zswap_trees[type]; unsigned int i; if (!trees) return; /* try_to_unuse() invalidated all the entries already */ for (i = 0; i < nr_zswap_trees[type]; i++) WARN_ON_ONCE(!xa_empty(trees + i)); kvfree(trees); nr_zswap_trees[type] = 0; zswap_trees[type] = NULL; } /********************************* * debugfs functions **********************************/ #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> static struct dentry *zswap_debugfs_root; static int debugfs_get_total_size(void *data, u64 *val) { *val = zswap_total_pages() * PAGE_SIZE; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu\n"); static int debugfs_get_stored_pages(void *data, u64 *val) { *val = atomic_long_read(&zswap_stored_pages); return 0; } DEFINE_DEBUGFS_ATTRIBUTE(stored_pages_fops, debugfs_get_stored_pages, NULL, "%llu\n"); static int zswap_debugfs_init(void) { if (!debugfs_initialized()) return -ENODEV; zswap_debugfs_root = debugfs_create_dir("zswap", NULL); debugfs_create_u64("pool_limit_hit", 0444, zswap_debugfs_root, &zswap_pool_limit_hit); debugfs_create_u64("reject_reclaim_fail", 0444, zswap_debugfs_root, &zswap_reject_reclaim_fail); debugfs_create_u64("reject_alloc_fail", 0444, zswap_debugfs_root, &zswap_reject_alloc_fail); debugfs_create_u64("reject_kmemcache_fail", 0444, zswap_debugfs_root, &zswap_reject_kmemcache_fail); debugfs_create_u64("reject_compress_fail", 0444, zswap_debugfs_root, &zswap_reject_compress_fail); debugfs_create_u64("reject_compress_poor", 0444, zswap_debugfs_root, &zswap_reject_compress_poor); debugfs_create_u64("decompress_fail", 0444, zswap_debugfs_root, &zswap_decompress_fail); debugfs_create_u64("written_back_pages", 0444, zswap_debugfs_root, &zswap_written_back_pages); debugfs_create_file("pool_total_size", 0444, zswap_debugfs_root, NULL, &total_size_fops); debugfs_create_file("stored_pages", 0444, 
			    zswap_debugfs_root, NULL, &stored_pages_fops);

	return 0;
}
#else
static int zswap_debugfs_init(void)
{
	return 0;
}
#endif

/*********************************
 * module init and exit
 **********************************/
static int zswap_setup(void)
{
	struct zswap_pool *pool;
	int ret;

	zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
	if (!zswap_entry_cache) {
		pr_err("entry cache creation failed\n");
		goto cache_fail;
	}

	ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE,
				      "mm/zswap_pool:prepare",
				      zswap_cpu_comp_prepare,
				      zswap_cpu_comp_dead);
	if (ret)
		goto hp_fail;

	shrink_wq = alloc_workqueue("zswap-shrink",
			WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
	if (!shrink_wq)
		goto shrink_wq_fail;

	zswap_shrinker = zswap_alloc_shrinker();
	if (!zswap_shrinker)
		goto shrinker_fail;
	if (list_lru_init_memcg(&zswap_list_lru, zswap_shrinker))
		goto lru_fail;
	shrinker_register(zswap_shrinker);

	INIT_WORK(&zswap_shrink_work, shrink_worker);

	pool = __zswap_pool_create_fallback();
	if (pool) {
		pr_info("loaded using pool %s/%s\n", pool->tfm_name,
			zpool_get_type(pool->zpool));
		list_add(&pool->list, &zswap_pools);
		zswap_has_pool = true;
		static_branch_enable(&zswap_ever_enabled);
	} else {
		pr_err("pool creation failed\n");
		zswap_enabled = false;
	}

	if (zswap_debugfs_init())
		pr_warn("debugfs initialization failed\n");
	zswap_init_state = ZSWAP_INIT_SUCCEED;
	return 0;

lru_fail:
	shrinker_free(zswap_shrinker);
shrinker_fail:
	destroy_workqueue(shrink_wq);
shrink_wq_fail:
	cpuhp_remove_multi_state(CPUHP_MM_ZSWP_POOL_PREPARE);
hp_fail:
	kmem_cache_destroy(zswap_entry_cache);
cache_fail:
	/* if built-in, we aren't unloaded on failure; don't allow use */
	zswap_init_state = ZSWAP_INIT_FAILED;
	zswap_enabled = false;
	return -ENOMEM;
}

static int __init zswap_init(void)
{
	if (!zswap_enabled)
		return 0;
	return zswap_setup();
}
/* must be late so crypto has time to come up */
late_initcall(zswap_init);

MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>");
MODULE_DESCRIPTION("Compressed cache for swap pages");
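/*
 * Illustrative userspace sketch (not part of zswap): how the shrinker above
 * sizes its reclaim target in zswap_shrinker_count(). It first protects as
 * many LRU entries as recent disk swapins were observed, then scales the
 * remainder by the compression ratio nr_backing/nr_stored, mirroring the
 * mult_frac() call. All numbers below are made up; mult_frac() here is a
 * function with the same arithmetic as the kernel macro.
 */
#include <stdio.h>

/* x * num / den, splitting the multiply to limit intermediate overflow,
 * like the kernel's mult_frac() */
static unsigned long mult_frac(unsigned long x, unsigned long num,
			       unsigned long den)
{
	unsigned long quot = x / den;
	unsigned long rem = x % den;

	return quot * num + rem * num / den;
}

int main(void)
{
	unsigned long nr_freeable = 10000;	/* entries on the zswap LRU */
	unsigned long nr_disk_swapins = 3000;	/* recent refaults from disk */
	unsigned long nr_backing = 2500;	/* compressed pool size, in pages */
	unsigned long nr_stored = 10000;	/* stored (uncompressed) pages */

	/* protect as many LRU entries as we saw disk swapins */
	nr_freeable -= (nr_freeable >= nr_disk_swapins) ?
			nr_disk_swapins : nr_freeable;

	/* a 4:1 compression ratio quarters the reclaim target: prints 1750 */
	printf("count = %lu\n", mult_frac(nr_freeable, nr_backing, nr_stored));
	return 0;
}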
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __FIRMWARE_SYSFS_H
#define __FIRMWARE_SYSFS_H

#include <linux/device.h>

#include "firmware.h"

MODULE_IMPORT_NS("FIRMWARE_LOADER_PRIVATE");

extern struct firmware_fallback_config fw_fallback_config;
extern struct device_attribute dev_attr_loading;

#ifdef CONFIG_FW_LOADER_USER_HELPER
/**
 * struct firmware_fallback_config - firmware fallback configuration settings
 *
 * Helps describe and fine tune the fallback mechanism.
 *
 * @force_sysfs_fallback: force the sysfs fallback mechanism to be used
 *	as if one had enabled CONFIG_FW_LOADER_USER_HELPER_FALLBACK=y.
 *	Useful to help debug CONFIG_FW_LOADER_USER_HELPER_FALLBACK=y
 *	functionality on a kernel where that config entry has been disabled.
 * @ignore_sysfs_fallback: force disabling the sysfs fallback mechanism.
 *	This emulates the behaviour as if we had set the kernel
 *	config CONFIG_FW_LOADER_USER_HELPER=n.
 * @old_timeout: for internal use
 * @loading_timeout: the timeout to wait for the fallback mechanism before
 *	giving up, in seconds.
 */
struct firmware_fallback_config {
	unsigned int force_sysfs_fallback;
	unsigned int ignore_sysfs_fallback;
	int old_timeout;
	int loading_timeout;
};

/* These getters are vetted to use int properly */
static inline int __firmware_loading_timeout(void)
{
	return fw_fallback_config.loading_timeout;
}

/* These setters are vetted to use int properly */
static inline void __fw_fallback_set_timeout(int timeout)
{
	fw_fallback_config.loading_timeout = timeout;
}
#endif

#ifdef CONFIG_FW_LOADER_SYSFS
int register_sysfs_loader(void);
void unregister_sysfs_loader(void);
#if defined(CONFIG_FW_LOADER_USER_HELPER) && defined(CONFIG_SYSCTL)
int register_firmware_config_sysctl(void);
void unregister_firmware_config_sysctl(void);
#else
static inline int register_firmware_config_sysctl(void)
{
	return 0;
}

static inline void unregister_firmware_config_sysctl(void) { }
#endif /* CONFIG_FW_LOADER_USER_HELPER && CONFIG_SYSCTL */
#else /* CONFIG_FW_LOADER_SYSFS */
static inline int register_sysfs_loader(void)
{
	return 0;
}

static inline void unregister_sysfs_loader(void)
{
}
#endif /* CONFIG_FW_LOADER_SYSFS */

struct fw_sysfs {
	bool nowait;
	struct device dev;
	struct fw_priv *fw_priv;
	struct firmware *fw;
	void *fw_upload_priv;
};
#define to_fw_sysfs(__dev)	container_of_const(__dev, struct fw_sysfs, dev)

void __fw_load_abort(struct fw_priv *fw_priv);

static inline void fw_load_abort(struct fw_sysfs *fw_sysfs)
{
	struct fw_priv *fw_priv = fw_sysfs->fw_priv;

	__fw_load_abort(fw_priv);
}

struct fw_sysfs *fw_create_instance(struct firmware *firmware,
				    const char *fw_name,
				    struct device *device, u32 opt_flags);

#ifdef CONFIG_FW_UPLOAD
extern struct device_attribute dev_attr_status;
extern struct device_attribute dev_attr_error;
extern struct device_attribute dev_attr_cancel;
extern struct device_attribute dev_attr_remaining_size;

int fw_upload_start(struct fw_sysfs *fw_sysfs);
void fw_upload_free(struct fw_sysfs *fw_sysfs);
umode_t fw_upload_is_visible(struct kobject *kobj, struct attribute *attr, int n);
#else
static inline int fw_upload_start(struct fw_sysfs *fw_sysfs)
{
	return 0;
}

static inline void fw_upload_free(struct fw_sysfs *fw_sysfs)
{
}
#endif

#endif /* __FIRMWARE_SYSFS_H */
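/*
 * Illustrative userspace sketch (not kernel code): the container_of()
 * pattern behind to_fw_sysfs() above. Given a pointer to an embedded
 * member (a stand-in for struct device here), the enclosing structure is
 * recovered by subtracting the member's offset; container_of_const() is
 * the const-preserving variant of the same idea. The fake_* types are
 * hypothetical stand-ins, and this container_of() is simplified (no type
 * checking).
 */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct fake_device {			/* stand-in for struct device */
	int id;
};

struct fake_fw_sysfs {			/* stand-in for struct fw_sysfs */
	int nowait;
	struct fake_device dev;		/* embedded, like fw_sysfs::dev */
};

int main(void)
{
	struct fake_fw_sysfs fws = { .nowait = 1, .dev = { .id = 42 } };
	struct fake_device *dev = &fws.dev;

	/* the equivalent of to_fw_sysfs(dev) */
	struct fake_fw_sysfs *back =
		container_of(dev, struct fake_fw_sysfs, dev);

	printf("nowait = %d (same object: %d)\n", back->nowait, back == &fws);
	return 0;
}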
// SPDX-License-Identifier: GPL-2.0
/*
 * Ldisc rw semaphore
 *
 * The ldisc semaphore is semantically a rw_semaphore, but one which
 * enforces an alternate policy, namely:
 * 1) Supports lock wait timeouts
 * 2) Write waiter has priority
 * 3) Downgrading is not supported
 *
 * Implementation notes:
 * 1) Upper half of semaphore count is a wait count (differs from rwsem
 *    in that rwsem normalizes the upper half to the wait bias)
 * 2) Lacks overflow checking
 *
 * The generic counting was copied and modified from include/asm-generic/rwsem.h
 * by Paul Mackerras <paulus@samba.org>.
 *
 * The scheduling policy was copied and modified from lib/rwsem.c
 * Written by David Howells (dhowells@redhat.com).
 *
 * This implementation incorporates the write lock stealing work of
 * Michel Lespinasse <walken@google.com>.
* * Copyright (C) 2013 Peter Hurley <peter@hurleysoftware.com> */ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/atomic.h> #include <linux/tty.h> #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/sched/task.h> #if BITS_PER_LONG == 64 # define LDSEM_ACTIVE_MASK 0xffffffffL #else # define LDSEM_ACTIVE_MASK 0x0000ffffL #endif #define LDSEM_UNLOCKED 0L #define LDSEM_ACTIVE_BIAS 1L #define LDSEM_WAIT_BIAS (-LDSEM_ACTIVE_MASK-1) #define LDSEM_READ_BIAS LDSEM_ACTIVE_BIAS #define LDSEM_WRITE_BIAS (LDSEM_WAIT_BIAS + LDSEM_ACTIVE_BIAS) struct ldsem_waiter { struct list_head list; struct task_struct *task; }; /* * Initialize an ldsem: */ void __init_ldsem(struct ld_semaphore *sem, const char *name, struct lock_class_key *key) { #ifdef CONFIG_DEBUG_LOCK_ALLOC /* * Make sure we are not reinitializing a held semaphore: */ debug_check_no_locks_freed((void *)sem, sizeof(*sem)); lockdep_init_map(&sem->dep_map, name, key, 0); #endif atomic_long_set(&sem->count, LDSEM_UNLOCKED); sem->wait_readers = 0; raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->read_wait); INIT_LIST_HEAD(&sem->write_wait); } static void __ldsem_wake_readers(struct ld_semaphore *sem) { struct ldsem_waiter *waiter, *next; struct task_struct *tsk; long adjust, count; /* * Try to grant read locks to all readers on the read wait list. * Note the 'active part' of the count is incremented by * the number of readers before waking any processes up. */ adjust = sem->wait_readers * (LDSEM_ACTIVE_BIAS - LDSEM_WAIT_BIAS); count = atomic_long_add_return(adjust, &sem->count); do { if (count > 0) break; if (atomic_long_try_cmpxchg(&sem->count, &count, count - adjust)) return; } while (1); list_for_each_entry_safe(waiter, next, &sem->read_wait, list) { tsk = waiter->task; smp_store_release(&waiter->task, NULL); wake_up_process(tsk); put_task_struct(tsk); } INIT_LIST_HEAD(&sem->read_wait); sem->wait_readers = 0; } static inline int writer_trylock(struct ld_semaphore *sem) { /* * Only wake this writer if the active part of the count can be * transitioned from 0 -> 1 */ long count = atomic_long_add_return(LDSEM_ACTIVE_BIAS, &sem->count); do { if ((count & LDSEM_ACTIVE_MASK) == LDSEM_ACTIVE_BIAS) return 1; if (atomic_long_try_cmpxchg(&sem->count, &count, count - LDSEM_ACTIVE_BIAS)) return 0; } while (1); } static void __ldsem_wake_writer(struct ld_semaphore *sem) { struct ldsem_waiter *waiter; waiter = list_entry(sem->write_wait.next, struct ldsem_waiter, list); wake_up_process(waiter->task); } /* * handle the lock release when processes blocked on it that can now run * - if we come here from up_xxxx(), then: * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) * - the spinlock must be held by the caller * - woken process blocks are discarded from the list after having task zeroed */ static void __ldsem_wake(struct ld_semaphore *sem) { if (!list_empty(&sem->write_wait)) __ldsem_wake_writer(sem); else if (!list_empty(&sem->read_wait)) __ldsem_wake_readers(sem); } static void ldsem_wake(struct ld_semaphore *sem) { unsigned long flags; raw_spin_lock_irqsave(&sem->wait_lock, flags); __ldsem_wake(sem); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); } /* * wait for the read lock to be granted */ static struct ld_semaphore __sched * down_read_failed(struct ld_semaphore *sem, long count, long timeout) { struct ldsem_waiter waiter; long adjust = -LDSEM_ACTIVE_BIAS + LDSEM_WAIT_BIAS; /* set up my own style of 
waitqueue */
	raw_spin_lock_irq(&sem->wait_lock);

	/*
	 * Try to reverse the lock attempt, but if the count has changed
	 * so that reversing fails, check whether the read lock was in fact
	 * granted anyway (count > 0 means active lockers and no waiters),
	 * and early-out if so.
	 */
	do {
		if (atomic_long_try_cmpxchg(&sem->count, &count, count + adjust)) {
			count += adjust;
			break;
		}
		if (count > 0) {
			raw_spin_unlock_irq(&sem->wait_lock);
			return sem;
		}
	} while (1);

	list_add_tail(&waiter.list, &sem->read_wait);
	sem->wait_readers++;

	waiter.task = current;
	get_task_struct(current);

	/* if there are no active locks, wake the new lock owner(s) */
	if ((count & LDSEM_ACTIVE_MASK) == 0)
		__ldsem_wake(sem);

	raw_spin_unlock_irq(&sem->wait_lock);

	/* wait to be given the lock */
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);

		if (!smp_load_acquire(&waiter.task))
			break;
		if (!timeout)
			break;
		timeout = schedule_timeout(timeout);
	}

	__set_current_state(TASK_RUNNING);

	if (!timeout) {
		/*
		 * Lock timed out but check if this task was just
		 * granted lock ownership - if so, pretend there
		 * was no timeout; otherwise, cleanup lock wait.
		 */
		raw_spin_lock_irq(&sem->wait_lock);
		if (waiter.task) {
			atomic_long_add_return(-LDSEM_WAIT_BIAS, &sem->count);
			sem->wait_readers--;
			list_del(&waiter.list);
			raw_spin_unlock_irq(&sem->wait_lock);
			put_task_struct(waiter.task);
			return NULL;
		}
		raw_spin_unlock_irq(&sem->wait_lock);
	}

	return sem;
}

/*
 * wait for the write lock to be granted
 */
static struct ld_semaphore __sched *
down_write_failed(struct ld_semaphore *sem, long count, long timeout)
{
	struct ldsem_waiter waiter;
	long adjust = -LDSEM_ACTIVE_BIAS;
	int locked = 0;

	/* set up my own style of waitqueue */
	raw_spin_lock_irq(&sem->wait_lock);

	/*
	 * Try to reverse the lock attempt, but if the count has changed
	 * so that reversing fails, check if the lock is now owned,
	 * and early-out if so.
	 */
	do {
		if (atomic_long_try_cmpxchg(&sem->count, &count, count + adjust))
			break;
		if ((count & LDSEM_ACTIVE_MASK) == LDSEM_ACTIVE_BIAS) {
			raw_spin_unlock_irq(&sem->wait_lock);
			return sem;
		}
	} while (1);

	list_add_tail(&waiter.list, &sem->write_wait);

	waiter.task = current;

	set_current_state(TASK_UNINTERRUPTIBLE);
	for (;;) {
		if (!timeout)
			break;
		raw_spin_unlock_irq(&sem->wait_lock);
		timeout = schedule_timeout(timeout);
		raw_spin_lock_irq(&sem->wait_lock);
		set_current_state(TASK_UNINTERRUPTIBLE);
		locked = writer_trylock(sem);
		if (locked)
			break;
	}

	if (!locked)
		atomic_long_add_return(-LDSEM_WAIT_BIAS, &sem->count);
	list_del(&waiter.list);

	/*
	 * In case of timeout, wake up every reader that gave way to the
	 * writer. This prevents the readers from being split into two
	 * groups: one holding the semaphore and another sleeping
	 * (in the case of no contention with a writer).
	 */
	if (!locked && list_empty(&sem->write_wait))
		__ldsem_wake_readers(sem);

	raw_spin_unlock_irq(&sem->wait_lock);

	__set_current_state(TASK_RUNNING);

	/* lock wait may have timed out */
	if (!locked)
		return NULL;
	return sem;
}

static int __ldsem_down_read_nested(struct ld_semaphore *sem,
				    int subclass, long timeout)
{
	long count;

	rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);

	count = atomic_long_add_return(LDSEM_READ_BIAS, &sem->count);
	if (count <= 0) {
		lock_contended(&sem->dep_map, _RET_IP_);
		if (!down_read_failed(sem, count, timeout)) {
			rwsem_release(&sem->dep_map, _RET_IP_);
			return 0;
		}
	}
	lock_acquired(&sem->dep_map, _RET_IP_);
	return 1;
}

static int __ldsem_down_write_nested(struct ld_semaphore *sem,
				     int subclass, long timeout)
{
	long count;

	rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);

	count = atomic_long_add_return(LDSEM_WRITE_BIAS, &sem->count);
	if ((count & LDSEM_ACTIVE_MASK) != LDSEM_ACTIVE_BIAS) {
		lock_contended(&sem->dep_map, _RET_IP_);
		if (!down_write_failed(sem, count, timeout)) {
			rwsem_release(&sem->dep_map, _RET_IP_);
			return 0;
		}
	}
	lock_acquired(&sem->dep_map, _RET_IP_);
	return 1;
}

/*
 * lock for reading -- returns 1 if successful, 0 if timed out
 */
int __sched ldsem_down_read(struct ld_semaphore *sem, long timeout)
{
	might_sleep();
	return __ldsem_down_read_nested(sem, 0, timeout);
}

/*
 * trylock for reading -- returns 1 if successful, 0 if contention
 */
int ldsem_down_read_trylock(struct ld_semaphore *sem)
{
	long count = atomic_long_read(&sem->count);

	while (count >= 0) {
		if (atomic_long_try_cmpxchg(&sem->count, &count, count + LDSEM_READ_BIAS)) {
			rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
			lock_acquired(&sem->dep_map, _RET_IP_);
			return 1;
		}
	}
	return 0;
}

/*
 * lock for writing -- returns 1 if successful, 0 if timed out
 */
int __sched ldsem_down_write(struct ld_semaphore *sem, long timeout)
{
	might_sleep();
	return __ldsem_down_write_nested(sem, 0, timeout);
}

/*
 * release a read lock
 */
void ldsem_up_read(struct ld_semaphore *sem)
{
	long count;

	rwsem_release(&sem->dep_map, _RET_IP_);

	count = atomic_long_add_return(-LDSEM_READ_BIAS, &sem->count);
	if (count < 0 && (count & LDSEM_ACTIVE_MASK) == 0)
		ldsem_wake(sem);
}

/*
 * release a write lock
 */
void ldsem_up_write(struct ld_semaphore *sem)
{
	long count;

	rwsem_release(&sem->dep_map, _RET_IP_);

	count = atomic_long_add_return(-LDSEM_WRITE_BIAS, &sem->count);
	if (count < 0)
		ldsem_wake(sem);
}

#ifdef CONFIG_DEBUG_LOCK_ALLOC

int ldsem_down_read_nested(struct ld_semaphore *sem, int subclass, long timeout)
{
	might_sleep();
	return __ldsem_down_read_nested(sem, subclass, timeout);
}

int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass,
			    long timeout)
{
	might_sleep();
	return __ldsem_down_write_nested(sem, subclass, timeout);
}

#endif
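/*
 * Illustrative userspace sketch (not kernel code): the two-halves count
 * encoding used by the ldisc semaphore above, assuming 64-bit long (the
 * BITS_PER_LONG == 64 branch of the defines). The low half of the count
 * holds the number of active lockers; the high half accumulates one
 * (negative) wait bias per queued waiter. The sequence below replays a
 * writer arriving while two readers hold the lock.
 */
#include <stdio.h>

#define LDSEM_ACTIVE_MASK	0xffffffffL
#define LDSEM_ACTIVE_BIAS	1L
#define LDSEM_WAIT_BIAS		(-LDSEM_ACTIVE_MASK - 1)
#define LDSEM_READ_BIAS		LDSEM_ACTIVE_BIAS
#define LDSEM_WRITE_BIAS	(LDSEM_WAIT_BIAS + LDSEM_ACTIVE_BIAS)

static void show(const char *what, long count)
{
	long active = count & LDSEM_ACTIVE_MASK;
	/* count - active is an exact multiple of 2^32: the wait part */
	long waiters = -((count - active) / (LDSEM_ACTIVE_MASK + 1));

	printf("%-24s count=%#018lx active=%ld waiters=%ld\n",
	       what, (unsigned long)count, active, waiters);
}

int main(void)
{
	long count = 0;				/* LDSEM_UNLOCKED */

	count += LDSEM_READ_BIAS;	show("reader 1 acquires", count);
	count += LDSEM_READ_BIAS;	show("reader 2 acquires", count);

	/* writer attempt adds wait bias + active bias; the active part is
	 * not 1, so down_write_failed() backs out only the active bias */
	count += LDSEM_WRITE_BIAS;	show("writer attempt", count);
	count -= LDSEM_ACTIVE_BIAS;	show("writer queued", count);

	count -= LDSEM_READ_BIAS;	show("reader 1 releases", count);
	count -= LDSEM_READ_BIAS;	show("reader 2 releases", count);

	/* now count < 0 with a zero active part: this is exactly the
	 * condition under which ldsem_up_read() calls ldsem_wake(), and
	 * writer_trylock() can then transition the active part 0 -> 1 */
	return 0;
}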
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_JIFFIES_H
#define _LINUX_JIFFIES_H

#include <linux/cache.h>
#include <linux/limits.h>
#include <linux/math64.h>
#include <linux/minmax.h>
#include <linux/types.h>
#include <linux/time.h>
#include <linux/timex.h>
#include <vdso/jiffies.h>
#include <asm/param.h>			/* for HZ */
#include <generated/timeconst.h>

/*
 * The following defines establish the engineering parameters of the PLL
 * model. The HZ variable establishes the timer interrupt frequency, 100 Hz
 * for the SunOS kernel, 256 Hz for the Ultrix kernel and 1024 Hz for the
 * OSF/1 kernel. The SHIFT_HZ define expresses the same value as the
 * nearest power of two in order to avoid hardware multiply operations.
 */
#if HZ >= 12 && HZ < 24
# define SHIFT_HZ	4
#elif HZ >= 24 && HZ < 48
# define SHIFT_HZ	5
#elif HZ >= 48 && HZ < 96
# define SHIFT_HZ	6
#elif HZ >= 96 && HZ < 192
# define SHIFT_HZ	7
#elif HZ >= 192 && HZ < 384
# define SHIFT_HZ	8
#elif HZ >= 384 && HZ < 768
# define SHIFT_HZ	9
#elif HZ >= 768 && HZ < 1536
# define SHIFT_HZ	10
#elif HZ >= 1536 && HZ < 3072
# define SHIFT_HZ	11
#elif HZ >= 3072 && HZ < 6144
# define SHIFT_HZ	12
#elif HZ >= 6144 && HZ < 12288
# define SHIFT_HZ	13
#else
# error Invalid value of HZ.
#endif

/* Suppose we want to divide two numbers NOM and DEN: NOM/DEN, then we can
 * improve accuracy by shifting LSH bits, hence calculating:
 *     (NOM << LSH) / DEN
 * This however means trouble for large NOM, because (NOM << LSH) may no
 * longer fit in 32 bits. The following way of calculating this gives us
 * some slack, under the following conditions:
 * - (NOM / DEN) fits in (32 - LSH) bits.
 * - (NOM % DEN) fits in (32 - LSH) bits.
 */
#define SH_DIV(NOM,DEN,LSH) (   (((NOM) / (DEN)) << (LSH))		\
			     + ((((NOM) % (DEN)) << (LSH)) + (DEN) / 2) / (DEN))

/* LATCH is used in the interval timer and ftape setup. */
#define LATCH ((CLOCK_TICK_RATE + HZ/2) / HZ)	/* For divider */

extern void register_refined_jiffies(long clock_tick_rate);

/* TICK_USEC is the time between ticks in usec assuming SHIFTED_HZ */
#define TICK_USEC ((USEC_PER_SEC + HZ/2) / HZ)

/* USER_TICK_USEC is the time between ticks in usec assuming fake USER_HZ */
#define USER_TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ)

#ifndef __jiffy_arch_data
#define __jiffy_arch_data
#endif

/*
 * The 64-bit value is not atomic on 32-bit systems - you MUST NOT read it
 * without sampling the sequence number in jiffies_lock.
 * get_jiffies_64() will do this for you as appropriate.
 *
 * jiffies and jiffies_64 are at the same address for little-endian systems
 * and for 64-bit big-endian systems.
 * On 32-bit big-endian systems, jiffies is the lower 32 bits of jiffies_64
 * (i.e., at address @jiffies_64 + 4).
 * See arch/ARCH/kernel/vmlinux.lds.S
 */
extern u64 __cacheline_aligned_in_smp jiffies_64;
extern unsigned long volatile __cacheline_aligned_in_smp __jiffy_arch_data jiffies;

#if (BITS_PER_LONG < 64)
u64 get_jiffies_64(void);
#else
/**
 * get_jiffies_64 - read the 64-bit non-atomic jiffies_64 value
 *
 * When BITS_PER_LONG < 64, this uses sequence number sampling using
 * jiffies_lock to protect the 64-bit read.
 *
 * Return: current 64-bit jiffies value
 */
static inline u64 get_jiffies_64(void)
{
	return (u64)jiffies;
}
#endif

/**
 * DOC: General information about time_* inlines
 *
 * These inlines deal with timer wrapping correctly. You are strongly
 * encouraged to use them:
 *
 * #. Because people otherwise forget
 * #. Because if the timer wrap changes in future you won't have to alter your
 *    driver code.
 */

/**
 * time_after - returns true if the time a is after time b.
 * @a: first comparable as unsigned long
 * @b: second comparable as unsigned long
 *
 * Do this with "<0" and ">=0" to only test the sign of the result. A
 * good compiler would generate better code (and a really good compiler
 * wouldn't care). Gcc is currently neither.
 *
 * Return: %true if time a is after time b, otherwise %false.
 */
#define time_after(a,b)		\
	(typecheck(unsigned long, a) && \
	 typecheck(unsigned long, b) && \
	 ((long)((b) - (a)) < 0))

/**
 * time_before - returns true if the time a is before time b.
 * @a: first comparable as unsigned long
 * @b: second comparable as unsigned long
 *
 * Return: %true if time a is before time b, otherwise %false.
 */
#define time_before(a,b)	time_after(b,a)

/**
 * time_after_eq - returns true if the time a is after or the same as time b.
 * @a: first comparable as unsigned long
 * @b: second comparable as unsigned long
 *
 * Return: %true if time a is after or the same as time b, otherwise %false.
 */
#define time_after_eq(a,b)	\
	(typecheck(unsigned long, a) && \
	 typecheck(unsigned long, b) && \
	 ((long)((a) - (b)) >= 0))

/**
 * time_before_eq - returns true if the time a is before or the same as time b.
 * @a: first comparable as unsigned long
 * @b: second comparable as unsigned long
 *
 * Return: %true if time a is before or the same as time b, otherwise %false.
 */
#define time_before_eq(a,b)	time_after_eq(b,a)

/**
 * time_in_range - Calculate whether a is in the range of [b, c].
 * @a: time to test
 * @b: beginning of the range
 * @c: end of the range
 *
 * Return: %true if time a is in the range [b, c], otherwise %false.
 */
#define time_in_range(a,b,c) \
	(time_after_eq(a,b) && \
	 time_before_eq(a,c))

/**
 * time_in_range_open - Calculate whether a is in the range of [b, c).
 * @a: time to test
 * @b: beginning of the range
 * @c: end of the range
 *
 * Return: %true if time a is in the range [b, c), otherwise %false.
 */
#define time_in_range_open(a,b,c) \
	(time_after_eq(a,b) && \
	 time_before(a,c))

/* Same as above, but does so with platform independent 64bit types.
 * These must be used when utilizing jiffies_64 (i.e. return value of
 * get_jiffies_64()). */

/**
 * time_after64 - returns true if the time a is after time b.
 * @a: first comparable as __u64
 * @b: second comparable as __u64
 *
 * This must be used when utilizing jiffies_64 (i.e. return value of
 * get_jiffies_64()).
 *
 * Return: %true if time a is after time b, otherwise %false.
 */
#define time_after64(a,b)	\
	(typecheck(__u64, a) && \
	 typecheck(__u64, b) && \
	 ((__s64)((b) - (a)) < 0))

/**
 * time_before64 - returns true if the time a is before time b.
 * @a: first comparable as __u64
 * @b: second comparable as __u64
 *
 * This must be used when utilizing jiffies_64 (i.e. return value of
 * get_jiffies_64()).
 *
 * Return: %true if time a is before time b, otherwise %false.
 */
#define time_before64(a,b)	time_after64(b,a)

/**
 * time_after_eq64 - returns true if the time a is after or the same as time b.
 * @a: first comparable as __u64
 * @b: second comparable as __u64
 *
 * This must be used when utilizing jiffies_64 (i.e. return value of
 * get_jiffies_64()).
 *
 * Return: %true if time a is after or the same as time b, otherwise %false.
 */
#define time_after_eq64(a,b)	\
	(typecheck(__u64, a) && \
	 typecheck(__u64, b) && \
	 ((__s64)((a) - (b)) >= 0))

/**
 * time_before_eq64 - returns true if the time a is before or the same as time b.
 * @a: first comparable as __u64
 * @b: second comparable as __u64
 *
 * This must be used when utilizing jiffies_64 (i.e. return value of
 * get_jiffies_64()).
 *
 * Return: %true if time a is before or the same as time b, otherwise %false.
 */
#define time_before_eq64(a,b)	time_after_eq64(b,a)

/**
 * time_in_range64 - Calculate whether a is in the range of [b, c].
 * @a: time to test
 * @b: beginning of the range
 * @c: end of the range
 *
 * Return: %true if time a is in the range [b, c], otherwise %false.
 */
#define time_in_range64(a, b, c) \
	(time_after_eq64(a, b) && \
	 time_before_eq64(a, c))

/*
 * These eight macros compare jiffies[_64] and 'a' for convenience.
 */

/**
 * time_is_before_jiffies - return true if a is before jiffies
 * @a: time (unsigned long) to compare to jiffies
 *
 * Return: %true if time a is before jiffies, otherwise %false.
 */
#define time_is_before_jiffies(a) time_after(jiffies, a)

/**
 * time_is_before_jiffies64 - return true if a is before jiffies_64
 * @a: time (__u64) to compare to jiffies_64
 *
 * Return: %true if time a is before jiffies_64, otherwise %false.
 */
#define time_is_before_jiffies64(a) time_after64(get_jiffies_64(), a)

/**
 * time_is_after_jiffies - return true if a is after jiffies
 * @a: time (unsigned long) to compare to jiffies
 *
 * Return: %true if time a is after jiffies, otherwise %false.
 */
#define time_is_after_jiffies(a) time_before(jiffies, a)

/**
 * time_is_after_jiffies64 - return true if a is after jiffies_64
 * @a: time (__u64) to compare to jiffies_64
 *
 * Return: %true if time a is after jiffies_64, otherwise %false.
 */
#define time_is_after_jiffies64(a) time_before64(get_jiffies_64(), a)

/**
 * time_is_before_eq_jiffies - return true if a is before or equal to jiffies
 * @a: time (unsigned long) to compare to jiffies
 *
 * Return: %true if time a is before or the same as jiffies, otherwise %false.
 */
#define time_is_before_eq_jiffies(a) time_after_eq(jiffies, a)

/**
 * time_is_before_eq_jiffies64 - return true if a is before or equal to jiffies_64
 * @a: time (__u64) to compare to jiffies_64
 *
 * Return: %true if time a is before or the same as jiffies_64, otherwise %false.
 */
#define time_is_before_eq_jiffies64(a) time_after_eq64(get_jiffies_64(), a)

/**
 * time_is_after_eq_jiffies - return true if a is after or equal to jiffies
 * @a: time (unsigned long) to compare to jiffies
 *
 * Return: %true if time a is after or the same as jiffies, otherwise %false.
 */
#define time_is_after_eq_jiffies(a) time_before_eq(jiffies, a)

/**
 * time_is_after_eq_jiffies64 - return true if a is after or equal to jiffies_64
 * @a: time (__u64) to compare to jiffies_64
 *
 * Return: %true if time a is after or the same as jiffies_64, otherwise %false.
 */
#define time_is_after_eq_jiffies64(a) time_before_eq64(get_jiffies_64(), a)

/*
 * Have the 32-bit jiffies value wrap 5 minutes after boot
 * so jiffies wrap bugs show up earlier.
 */
#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ))

/*
 * Change timeval to jiffies, trying to avoid the
 * most obvious overflows..
 *
 * And some not so obvious.
 *
 * Note that we don't want to return LONG_MAX, because
 * for various timeout reasons we often end up having
 * to wait "jiffies+1" in order to guarantee that we wait
 * at _least_ "jiffies" - so "jiffies+1" had better still
 * be positive.
 */
#define MAX_JIFFY_OFFSET ((LONG_MAX >> 1)-1)

extern unsigned long preset_lpj;

/*
 * We want to do realistic conversions of time so we need to use the same
 * values the update wall clock code uses as the jiffies size. This value
 * is: TICK_NSEC (which is defined in timex.h). This
 * is a constant and is in nanoseconds. We will use scaled math
 * with a set of scales defined here as SEC_JIFFIE_SC, USEC_JIFFIE_SC and
 * NSEC_JIFFIE_SC. Note that these defines contain nothing but
 * constants and so are computed at compile time. SHIFT_HZ (computed in
 * timex.h) adjusts the scaling for different HZ values.
 *
 * Scaled math??? What is that?
 *
 * Scaled math is a way to do integer math on values that would,
 * otherwise, either overflow, underflow, or cause undesired div
 * instructions to appear in the execution path. In short, we "scale"
 * up the operands so they take more bits (more precision, less
 * underflow), do the desired operation and then "scale" the result back
 * by the same amount. If we do the scaling by shifting we avoid the
 * costly mpy and the dastardly div instructions.
 * Suppose, for example, we want to convert from seconds to jiffies
 * where jiffies is defined in nanoseconds as NSEC_PER_JIFFIE. The
 * simple math is: jiff = (sec * NSEC_PER_SEC) / NSEC_PER_JIFFIE; We
 * observe that (NSEC_PER_SEC / NSEC_PER_JIFFIE) is a constant which we
 * might calculate at compile time, however, the result will only have
 * about 3-4 bits of precision (less for smaller values of HZ).
 *
 * So, we scale as follows:
 * jiff = (sec) * (NSEC_PER_SEC / NSEC_PER_JIFFIE);
 * jiff = ((sec) * ((NSEC_PER_SEC * SCALE)/ NSEC_PER_JIFFIE)) / SCALE;
 * Then we make SCALE a power of two so:
 * jiff = ((sec) * ((NSEC_PER_SEC << SCALE)/ NSEC_PER_JIFFIE)) >> SCALE;
 * Now we define:
 * #define SEC_CONV = ((NSEC_PER_SEC << SCALE)/ NSEC_PER_JIFFIE))
 * jiff = (sec * SEC_CONV) >> SCALE;
 *
 * Often the math we use will expand beyond 32-bits so we tell C how to
 * do this and pass the 64-bit result of the mpy through the ">> SCALE"
 * which should take the result back to 32-bits. We want this expansion
 * to capture as much precision as possible. At the same time we don't
 * want to overflow so we pick the SCALE to avoid this. In this file,
 * that means using a different scale for each range of HZ values (as
 * defined in timex.h).
 *
 * For those who want to know, gcc will give a 64-bit result from a "*"
 * operator if the result is a long long AND at least one of the
 * operands is cast to long long (usually just prior to the "*" so as
 * not to confuse it into thinking it really has a 64-bit operand,
 * which, by the way, it can do, but it takes more code and at least 2
 * mpys).
 *
 * We also need to be aware that one second in nanoseconds is only a
 * couple of bits away from overflowing a 32-bit word, so we MUST use
 * 64-bits to get the full range time in nanoseconds.
 */

/*
 * Here are the scales we will use. One for seconds, nanoseconds and
 * microseconds.
 *
 * Within the limits of cpp we do a rough cut at the SEC_JIFFIE_SC and
 * check if the sign bit is set. If not, we bump the shift count by 1.
 * (Gets an extra bit of precision where we can use it.)
 * We know it is set for HZ = 1024 and HZ = 100, not for 1000.
 * Haven't tested others.
 * The limits of cpp (for #if expressions) allow only long (no long long),
 * but then we only need the most significant bit.
 */
#define SEC_JIFFIE_SC (31 - SHIFT_HZ)
#if !((((NSEC_PER_SEC << 2) / TICK_NSEC) << (SEC_JIFFIE_SC - 2)) & 0x80000000)
#undef SEC_JIFFIE_SC
#define SEC_JIFFIE_SC (32 - SHIFT_HZ)
#endif
#define NSEC_JIFFIE_SC (SEC_JIFFIE_SC + 29)
#define SEC_CONVERSION ((unsigned long)((((u64)NSEC_PER_SEC << SEC_JIFFIE_SC) +\
				TICK_NSEC -1) / (u64)TICK_NSEC))

#define NSEC_CONVERSION ((unsigned long)((((u64)1 << NSEC_JIFFIE_SC) +\
					TICK_NSEC -1) / (u64)TICK_NSEC))
/*
 * The maximum jiffy value is (MAX_INT >> 1). Here we translate that
 * into seconds. The 64-bit case will overflow if we are not careful,
 * so use the messy SH_DIV macro to do it. Still all constants.
 */
#if BITS_PER_LONG < 64
# define MAX_SEC_IN_JIFFIES \
	(long)((u64)((u64)MAX_JIFFY_OFFSET * TICK_NSEC) / NSEC_PER_SEC)
#else	/* take care of overflow on 64-bit machines */
# define MAX_SEC_IN_JIFFIES \
	(SH_DIV((MAX_JIFFY_OFFSET >> SEC_JIFFIE_SC) * TICK_NSEC, NSEC_PER_SEC, 1) - 1)
#endif

/*
 * Convert various time units to each other:
 */
extern unsigned int jiffies_to_msecs(const unsigned long j);
extern unsigned int jiffies_to_usecs(const unsigned long j);

/**
 * jiffies_to_nsecs - Convert jiffies to nanoseconds
 * @j: jiffies value
 *
 * Return: nanoseconds value
 */
static inline u64 jiffies_to_nsecs(const unsigned long j)
{
	return (u64)jiffies_to_usecs(j) * NSEC_PER_USEC;
}

extern u64 jiffies64_to_nsecs(u64 j);
extern u64 jiffies64_to_msecs(u64 j);

extern unsigned long __msecs_to_jiffies(const unsigned int m);
#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
/*
 * HZ is equal to or smaller than 1000, and 1000 is a nice round
 * multiple of HZ, divide with the factor between them, but round
 * upwards:
 */
static inline unsigned long _msecs_to_jiffies(const unsigned int m)
{
	return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
}
#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
/*
 * HZ is larger than 1000, and HZ is a nice round multiple of 1000 -
 * simply multiply with the factor between them.
 *
 * But first make sure the multiplication result cannot overflow:
 */
static inline unsigned long _msecs_to_jiffies(const unsigned int m)
{
	if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
		return MAX_JIFFY_OFFSET;
	return m * (HZ / MSEC_PER_SEC);
}
#else
/*
 * Generic case - multiply, round and divide. But first, if we are
 * doing a net multiplication, check that we would not overflow:
 */
static inline unsigned long _msecs_to_jiffies(const unsigned int m)
{
	if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
		return MAX_JIFFY_OFFSET;

	return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) >> MSEC_TO_HZ_SHR32;
}
#endif
/**
 * msecs_to_jiffies: - convert milliseconds to jiffies
 * @m:	time in milliseconds
 *
 * conversion is done as follows:
 *
 * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
 *
 * - 'too large' values [that would result in larger than
 *   MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
 *
 * - all other values are converted to jiffies by either multiplying
 *   the input value by a factor or dividing it with a factor and
 *   handling any 32-bit overflows.
 *   for the details see _msecs_to_jiffies()
 *
 * msecs_to_jiffies() checks for the passed in value being a constant
 * via __builtin_constant_p() allowing gcc to eliminate most of the
 * code. __msecs_to_jiffies() is called if the value passed does not
 * allow constant folding and the actual conversion must be done at
 * runtime.
 * The HZ range specific helpers _msecs_to_jiffies() are called both
 * directly here and from __msecs_to_jiffies() in the case where
 * constant folding is not possible.
 *
 * Return: jiffies value
 */
static __always_inline unsigned long msecs_to_jiffies(const unsigned int m)
{
	if (__builtin_constant_p(m)) {
		if ((int)m < 0)
			return MAX_JIFFY_OFFSET;
		return _msecs_to_jiffies(m);
	} else {
		return __msecs_to_jiffies(m);
	}
}

/**
 * secs_to_jiffies: - convert seconds to jiffies
 * @_secs: time in seconds
 *
 * Conversion is done by simple multiplication with HZ
 *
 * secs_to_jiffies() is defined as a macro rather than a static inline
 * function so it can be used in static initializers.
 *
 * Return: jiffies value
 */
#define secs_to_jiffies(_secs) (unsigned long)((_secs) * HZ)

extern unsigned long __usecs_to_jiffies(const unsigned int u);
#if !(USEC_PER_SEC % HZ)
static inline unsigned long _usecs_to_jiffies(const unsigned int u)
{
	return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ);
}
#else
static inline unsigned long _usecs_to_jiffies(const unsigned int u)
{
	return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) >> USEC_TO_HZ_SHR32;
}
#endif

/**
 * usecs_to_jiffies: - convert microseconds to jiffies
 * @u:	time in microseconds
 *
 * conversion is done as follows:
 *
 * - 'too large' values [that would result in larger than
 *   MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
 *
 * - all other values are converted to jiffies by either multiplying
 *   the input value by a factor or dividing it with a factor and
 *   handling any 32-bit overflows as for msecs_to_jiffies.
 *
 * usecs_to_jiffies() checks for the passed in value being a constant
 * via __builtin_constant_p() allowing gcc to eliminate most of the
 * code. __usecs_to_jiffies() is called if the value passed does not
 * allow constant folding and the actual conversion must be done at
 * runtime.
 * The HZ range specific helpers _usecs_to_jiffies() are called both
 * directly here and from __usecs_to_jiffies() in the case where
 * constant folding is not possible.
 *
 * Return: jiffies value
 */
static __always_inline unsigned long usecs_to_jiffies(const unsigned int u)
{
	if (__builtin_constant_p(u)) {
		if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
			return MAX_JIFFY_OFFSET;
		return _usecs_to_jiffies(u);
	} else {
		return __usecs_to_jiffies(u);
	}
}

extern unsigned long timespec64_to_jiffies(const struct timespec64 *value);
extern void jiffies_to_timespec64(const unsigned long jiffies,
				  struct timespec64 *value);
extern clock_t jiffies_to_clock_t(unsigned long x);

static inline clock_t jiffies_delta_to_clock_t(long delta)
{
	return jiffies_to_clock_t(max(0L, delta));
}

static inline unsigned int jiffies_delta_to_msecs(long delta)
{
	return jiffies_to_msecs(max(0L, delta));
}

extern unsigned long clock_t_to_jiffies(unsigned long x);
extern u64 jiffies_64_to_clock_t(u64 x);
extern u64 nsec_to_clock_t(u64 x);
extern u64 nsecs_to_jiffies64(u64 n);
extern unsigned long nsecs_to_jiffies(u64 n);

#define TIMESTAMP_SIZE	30

#endif
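To make the rounding behaviour of the "HZ divides 1000" branch above concrete, here is a hedged, standalone sketch: it re-implements the same round-up division in user space with an assumed HZ of 250. The EX_* names and main() harness are invented for the example and are not kernel API.

/*
 * Sketch of the HZ <= 1000 branch of _msecs_to_jiffies(), assuming
 * HZ = 250 (4 ms per tick). Any HZ that divides 1000 behaves the same.
 */
#include <stdio.h>

#define EX_HZ		250u	/* assumed tick rate for the example */
#define EX_MSEC_PER_SEC	1000u

static unsigned long ex_msecs_to_jiffies(unsigned int m)
{
	/* divide by the ms-per-tick factor, rounding upwards */
	return (m + (EX_MSEC_PER_SEC / EX_HZ) - 1) / (EX_MSEC_PER_SEC / EX_HZ);
}

int main(void)
{
	/* prints "1 1 3": 1 ms still costs a full tick, 9 ms rounds up */
	printf("%lu %lu %lu\n", ex_msecs_to_jiffies(1),
	       ex_msecs_to_jiffies(4), ex_msecs_to_jiffies(9));
	return 0;
}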
/* SPDX-License-Identifier: GPL-2.0 */

/*
 * The PHY device list allows maintaining a list of PHY devices that are
 * part of a netdevice's link topology. PHYs can for example be chained,
 * as is the case when using a PHY that exposes an SFP module, on which an
 * SFP transceiver that embeds a PHY is connected.
 *
 * This list can then be used by userspace to leverage individual PHY
 * capabilities.
 */
#ifndef __PHY_LINK_TOPOLOGY_H
#define __PHY_LINK_TOPOLOGY_H

#include <linux/ethtool.h>
#include <linux/netdevice.h>

struct xarray;
struct phy_device;
struct sfp_bus;

struct phy_link_topology {
	struct xarray phys;
	u32 next_phy_index;
};

struct phy_device_node {
	enum phy_upstream upstream_type;

	union {
		struct net_device	*netdev;
		struct phy_device	*phydev;
	} upstream;

	struct sfp_bus *parent_sfp_bus;

	struct phy_device *phy;
};

#if IS_ENABLED(CONFIG_PHYLIB)
int phy_link_topo_add_phy(struct net_device *dev,
			  struct phy_device *phy,
			  enum phy_upstream upt, void *upstream);

void phy_link_topo_del_phy(struct net_device *dev,
			   struct phy_device *phy);

static inline struct phy_device *
phy_link_topo_get_phy(struct net_device *dev, u32 phyindex)
{
	struct phy_link_topology *topo = dev->link_topo;
	struct phy_device_node *pdn;

	if (!topo)
		return NULL;

	pdn = xa_load(&topo->phys, phyindex);
	if (pdn)
		return pdn->phy;

	return NULL;
}
#else
static inline int phy_link_topo_add_phy(struct net_device *dev,
					struct phy_device *phy,
					enum phy_upstream upt, void *upstream)
{
	return 0;
}

static inline void phy_link_topo_del_phy(struct net_device *dev,
					 struct phy_device *phy)
{
}

static inline struct phy_device *
phy_link_topo_get_phy(struct net_device *dev, u32 phyindex)
{
	return NULL;
}
#endif

#endif /* __PHY_LINK_TOPOLOGY_H */
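A brief usage sketch for the lookup helper above. The function name and the phyindex argument are hypothetical, and the snippet assumes a kernel build context (netdev_info(), -ENODEV):

/*
 * Hypothetical caller: look up a PHY by index on a netdevice's link
 * topology, e.g. from an ethtool-style handler.
 */
static int example_report_phy(struct net_device *dev, u32 phyindex)
{
	struct phy_device *phydev = phy_link_topo_get_phy(dev, phyindex);

	if (!phydev)
		return -ENODEV;	/* no topology, or index not present */

	netdev_info(dev, "phy index %u found\n", phyindex);
	return 0;
}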
/*
 * Copyright (C) 2011-2013 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#ifndef DRM_RECT_H
#define DRM_RECT_H

#include <linux/types.h>

/**
 * DOC: rect utils
 *
 * Utility functions to help manage rectangular areas for
 * clipping, scaling, etc. calculations.
 */

/**
 * struct drm_rect - two dimensional rectangle
 * @x1: horizontal starting coordinate (inclusive)
 * @x2: horizontal ending coordinate (exclusive)
 * @y1: vertical starting coordinate (inclusive)
 * @y2: vertical ending coordinate (exclusive)
 *
 * Note that this must match the layout of struct drm_mode_rect or the damage
 * helpers like drm_atomic_helper_damage_iter_init() break.
 */
struct drm_rect {
	int x1, y1, x2, y2;
};

/**
 * DRM_RECT_INIT - initialize a rectangle from x/y/w/h
 * @x: x coordinate
 * @y: y coordinate
 * @w: width
 * @h: height
 *
 * RETURNS:
 * A new rectangle of the specified size.
 */
#define DRM_RECT_INIT(x, y, w, h) ((struct drm_rect){ \
		.x1 = (x), \
		.y1 = (y), \
		.x2 = (x) + (w), \
		.y2 = (y) + (h) })

/**
 * DRM_RECT_FMT - printf string for &struct drm_rect
 */
#define DRM_RECT_FMT	"%dx%d%+d%+d"
/**
 * DRM_RECT_ARG - printf arguments for &struct drm_rect
 * @r: rectangle struct
 */
#define DRM_RECT_ARG(r) drm_rect_width(r), drm_rect_height(r), (r)->x1, (r)->y1

/**
 * DRM_RECT_FP_FMT - printf string for &struct drm_rect in 16.16 fixed point
 */
#define DRM_RECT_FP_FMT "%d.%06ux%d.%06u%+d.%06u%+d.%06u"
/**
 * DRM_RECT_FP_ARG - printf arguments for &struct drm_rect in 16.16 fixed point
 * @r: rectangle struct
 *
 * This is useful for e.g. printing plane source rectangles, which are in 16.16
 * fixed point.
 */
#define DRM_RECT_FP_ARG(r) \
		drm_rect_width(r) >> 16, ((drm_rect_width(r) & 0xffff) * 15625) >> 10, \
		drm_rect_height(r) >> 16, ((drm_rect_height(r) & 0xffff) * 15625) >> 10, \
		(r)->x1 >> 16, (((r)->x1 & 0xffff) * 15625) >> 10, \
		(r)->y1 >> 16, (((r)->y1 & 0xffff) * 15625) >> 10

/**
 * drm_rect_init - initialize the rectangle from x/y/w/h
 * @r: rectangle
 * @x: x coordinate
 * @y: y coordinate
 * @width: width
 * @height: height
 */
static inline void drm_rect_init(struct drm_rect *r, int x, int y,
				 int width, int height)
{
	r->x1 = x;
	r->y1 = y;
	r->x2 = x + width;
	r->y2 = y + height;
}

/**
 * drm_rect_adjust_size - adjust the size of the rectangle
 * @r: rectangle to be adjusted
 * @dw: horizontal adjustment
 * @dh: vertical adjustment
 *
 * Change the size of rectangle @r by @dw in the horizontal direction,
 * and by @dh in the vertical direction, while keeping the center
 * of @r stationary.
 *
 * Positive @dw and @dh increase the size, negative values decrease it.
 */
static inline void drm_rect_adjust_size(struct drm_rect *r, int dw, int dh)
{
	r->x1 -= dw >> 1;
	r->y1 -= dh >> 1;
	r->x2 += (dw + 1) >> 1;
	r->y2 += (dh + 1) >> 1;
}

/**
 * drm_rect_translate - translate the rectangle
 * @r: rectangle to be translated
 * @dx: horizontal translation
 * @dy: vertical translation
 *
 * Move rectangle @r by @dx in the horizontal direction,
 * and by @dy in the vertical direction.
 */
static inline void drm_rect_translate(struct drm_rect *r, int dx, int dy)
{
	r->x1 += dx;
	r->y1 += dy;
	r->x2 += dx;
	r->y2 += dy;
}

/**
 * drm_rect_translate_to - translate the rectangle to an absolute position
 * @r: rectangle to be translated
 * @x: horizontal position
 * @y: vertical position
 *
 * Move rectangle @r to @x in the horizontal direction,
 * and to @y in the vertical direction.
 */
static inline void drm_rect_translate_to(struct drm_rect *r, int x, int y)
{
	drm_rect_translate(r, x - r->x1, y - r->y1);
}

/**
 * drm_rect_downscale - downscale a rectangle
 * @r: rectangle to be downscaled
 * @horz: horizontal downscale factor
 * @vert: vertical downscale factor
 *
 * Divide the coordinates of rectangle @r by @horz and @vert.
 */
static inline void drm_rect_downscale(struct drm_rect *r, int horz, int vert)
{
	r->x1 /= horz;
	r->y1 /= vert;
	r->x2 /= horz;
	r->y2 /= vert;
}

/**
 * drm_rect_width - determine the rectangle width
 * @r: rectangle whose width is returned
 *
 * RETURNS:
 * The width of the rectangle.
 */
static inline int drm_rect_width(const struct drm_rect *r)
{
	return r->x2 - r->x1;
}

/**
 * drm_rect_height - determine the rectangle height
 * @r: rectangle whose height is returned
 *
 * RETURNS:
 * The height of the rectangle.
 */
static inline int drm_rect_height(const struct drm_rect *r)
{
	return r->y2 - r->y1;
}

/**
 * drm_rect_visible - determine if the rectangle is visible
 * @r: rectangle whose visibility is returned
 *
 * RETURNS:
 * %true if the rectangle is visible, %false otherwise.
 */
static inline bool drm_rect_visible(const struct drm_rect *r)
{
	return drm_rect_width(r) > 0 && drm_rect_height(r) > 0;
}

/**
 * drm_rect_equals - determine if two rectangles are equal
 * @r1: first rectangle
 * @r2: second rectangle
 *
 * RETURNS:
 * %true if the rectangles are equal, %false otherwise.
 */
static inline bool drm_rect_equals(const struct drm_rect *r1,
				   const struct drm_rect *r2)
{
	return r1->x1 == r2->x1 && r1->x2 == r2->x2 &&
		r1->y1 == r2->y1 && r1->y2 == r2->y2;
}

/**
 * drm_rect_fp_to_int - Convert a rect in 16.16 fixed point form to int form.
 * @dst: rect to store the converted value in
 * @src: rect in 16.16 fixed point form
 */
static inline void drm_rect_fp_to_int(struct drm_rect *dst,
				      const struct drm_rect *src)
{
	drm_rect_init(dst, src->x1 >> 16, src->y1 >> 16,
		      drm_rect_width(src) >> 16,
		      drm_rect_height(src) >> 16);
}

/**
 * drm_rect_overlap - Check if two rectangles overlap
 * @a: first rectangle
 * @b: second rectangle
 *
 * RETURNS:
 * %true if the rectangles overlap, %false otherwise.
 */
static inline bool drm_rect_overlap(const struct drm_rect *a,
				    const struct drm_rect *b)
{
	return (a->x2 > b->x1 && b->x2 > a->x1 &&
		a->y2 > b->y1 && b->y2 > a->y1);
}

bool drm_rect_intersect(struct drm_rect *r, const struct drm_rect *clip);
bool drm_rect_clip_scaled(struct drm_rect *src, struct drm_rect *dst,
			  const struct drm_rect *clip);
int drm_rect_calc_hscale(const struct drm_rect *src,
			 const struct drm_rect *dst,
			 int min_hscale, int max_hscale);
int drm_rect_calc_vscale(const struct drm_rect *src,
			 const struct drm_rect *dst,
			 int min_vscale, int max_vscale);
void drm_rect_debug_print(const char *prefix,
			  const struct drm_rect *r, bool fixed_point);

void drm_rect_rotate(struct drm_rect *r,
		     int width, int height,
		     unsigned int rotation);
void drm_rect_rotate_inv(struct drm_rect *r,
			 int width, int height,
			 unsigned int rotation);

#endif
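To show how the helpers above combine, here is a hedged usage sketch (kernel build context assumed; the function name and the rectangle sizes are invented for illustration): build a plane rectangle, translate it, clip it in place against a CRTC rectangle with drm_rect_intersect(), and print the visible remainder.

/*
 * Hypothetical caller: clip an 800x600 plane at (500,300) against a
 * 1024x768 CRTC after moving it 100 pixels to the left.
 */
static void example_clip_plane(void)
{
	struct drm_rect plane = DRM_RECT_INIT(500, 300, 800, 600);
	const struct drm_rect crtc = DRM_RECT_INIT(0, 0, 1024, 768);

	drm_rect_translate(&plane, -100, 0);	/* now spans x = 400..1200 */
	if (drm_rect_intersect(&plane, &crtc))	/* clips to x = 400..1024 */
		pr_info("visible: " DRM_RECT_FMT "\n", DRM_RECT_ARG(&plane));
}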
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * ALSA sequencer Timer
 * Copyright (c) 1998-1999 by Frank van de Pol <fvdpol@coil.demon.nl>
 *                         Jaroslav Kysela <perex@perex.cz>
 */

#include <sound/core.h>
#include <linux/slab.h>
#include "seq_timer.h"
#include "seq_queue.h"
#include "seq_info.h"

/* allowed sequencer timer frequencies, in Hz */
#define MIN_FREQUENCY		10
#define MAX_FREQUENCY		6250
#define DEFAULT_FREQUENCY	1000

#define SKEW_BASE	0x10000	/* 16bit shift */

static void snd_seq_timer_set_tick_resolution(struct snd_seq_timer *tmr)
{
	unsigned int threshold = tmr->tempo_base == 1000 ? 1000000 : 10000;

	if (tmr->tempo < threshold)
		tmr->tick.resolution = (tmr->tempo * tmr->tempo_base) / tmr->ppq;
	else {
		/* might overflow.. */
		unsigned int s;

		s = tmr->tempo % tmr->ppq;
		s = (s * tmr->tempo_base) / tmr->ppq;
		tmr->tick.resolution = (tmr->tempo / tmr->ppq) * tmr->tempo_base;
		tmr->tick.resolution += s;
	}
	if (tmr->tick.resolution <= 0)
		tmr->tick.resolution = 1;
	snd_seq_timer_update_tick(&tmr->tick, 0);
}

/* create new timer (constructor) */
struct snd_seq_timer *snd_seq_timer_new(void)
{
	struct snd_seq_timer *tmr;

	tmr = kzalloc(sizeof(*tmr), GFP_KERNEL);
	if (!tmr)
		return NULL;
	spin_lock_init(&tmr->lock);

	/* reset setup to defaults */
	snd_seq_timer_defaults(tmr);

	/* reset time */
	snd_seq_timer_reset(tmr);

	return tmr;
}

/* delete timer (destructor) */
void snd_seq_timer_delete(struct snd_seq_timer **tmr)
{
	struct snd_seq_timer *t = *tmr;

	*tmr = NULL;
	if (t == NULL) {
		pr_debug("ALSA: seq: snd_seq_timer_delete() called with NULL timer\n");
		return;
	}
	t->running = 0;

	/* reset time */
	snd_seq_timer_stop(t);
	snd_seq_timer_reset(t);

	kfree(t);
}

void snd_seq_timer_defaults(struct snd_seq_timer *tmr)
{
	guard(spinlock_irqsave)(&tmr->lock);
	/* setup defaults */
	tmr->ppq = 96;		/* 96 PPQ */
	tmr->tempo = 500000;	/* 120 BPM */
	tmr->tempo_base = 1000;	/* 1us */
	snd_seq_timer_set_tick_resolution(tmr);
	tmr->running = 0;

	tmr->type = SNDRV_SEQ_TIMER_ALSA;
	tmr->alsa_id.dev_class = seq_default_timer_class;
	tmr->alsa_id.dev_sclass = seq_default_timer_sclass;
	tmr->alsa_id.card = seq_default_timer_card;
	tmr->alsa_id.device = seq_default_timer_device;
	tmr->alsa_id.subdevice = seq_default_timer_subdevice;
	tmr->preferred_resolution = seq_default_timer_resolution;

	tmr->skew = tmr->skew_base = SKEW_BASE;
}

static void seq_timer_reset(struct snd_seq_timer *tmr)
{
	/* reset time & songposition */
	tmr->cur_time.tv_sec = 0;
	tmr->cur_time.tv_nsec = 0;
	tmr->tick.cur_tick = 0;
	tmr->tick.fraction = 0;
}

void snd_seq_timer_reset(struct snd_seq_timer *tmr)
{
	guard(spinlock_irqsave)(&tmr->lock);
	seq_timer_reset(tmr);
}

/* called by timer interrupt routine.
 * the period time since the previous invocation is passed */
static void snd_seq_timer_interrupt(struct snd_timer_instance *timeri,
				    unsigned long resolution,
				    unsigned long ticks)
{
	struct snd_seq_queue *q = timeri->callback_data;
	struct snd_seq_timer *tmr;

	if (q == NULL)
		return;
	tmr = q->timer;
	if (tmr == NULL)
		return;
	scoped_guard(spinlock_irqsave, &tmr->lock) {
		if (!tmr->running)
			return;

		resolution *= ticks;
		if (tmr->skew != tmr->skew_base) {
			/* FIXME: assuming skew_base = 0x10000 */
			resolution = (resolution >> 16) * tmr->skew +
				(((resolution & 0xffff) * tmr->skew) >> 16);
		}

		/* update timer */
		snd_seq_inc_time_nsec(&tmr->cur_time, resolution);

		/* calculate current tick */
		snd_seq_timer_update_tick(&tmr->tick, resolution);

		/* register actual time of this timer update */
		ktime_get_ts64(&tmr->last_update);
	}

	/* check queues and dispatch events */
	snd_seq_check_queue(q, 1, 0);
}

/* set current tempo */
int snd_seq_timer_set_tempo(struct snd_seq_timer *tmr, int tempo)
{
	if (snd_BUG_ON(!tmr))
		return -EINVAL;
	if (tempo <= 0)
		return -EINVAL;
	guard(spinlock_irqsave)(&tmr->lock);
	if ((unsigned int)tempo != tmr->tempo) {
		tmr->tempo = tempo;
		snd_seq_timer_set_tick_resolution(tmr);
	}
	return 0;
}

/* set current tempo, ppq and tempo base in one shot */
int snd_seq_timer_set_tempo_ppq(struct snd_seq_timer *tmr, int tempo, int ppq,
				unsigned int tempo_base)
{
	int changed;

	if (snd_BUG_ON(!tmr))
		return -EINVAL;
	if (tempo <= 0 || ppq <= 0)
		return -EINVAL;
	/* allow only 10ns or 1us tempo base for now */
	if (tempo_base && tempo_base != 10 && tempo_base != 1000)
		return -EINVAL;
	guard(spinlock_irqsave)(&tmr->lock);
	if (tmr->running && (ppq != tmr->ppq)) {
		/* refuse to change ppq on running timers, because it
		 * will upset the song position (ticks)
		 */
		pr_debug("ALSA: seq: cannot change ppq of a running timer\n");
		return -EBUSY;
	}
	changed = (tempo != tmr->tempo) || (ppq != tmr->ppq);
	tmr->tempo = tempo;
	tmr->ppq = ppq;
	tmr->tempo_base = tempo_base ?
		tempo_base : 1000;
	if (changed)
		snd_seq_timer_set_tick_resolution(tmr);
	return 0;
}

/* set current tick position */
int snd_seq_timer_set_position_tick(struct snd_seq_timer *tmr,
				    snd_seq_tick_time_t position)
{
	if (snd_BUG_ON(!tmr))
		return -EINVAL;
	guard(spinlock_irqsave)(&tmr->lock);
	tmr->tick.cur_tick = position;
	tmr->tick.fraction = 0;
	return 0;
}

/* set current real-time position */
int snd_seq_timer_set_position_time(struct snd_seq_timer *tmr,
				    snd_seq_real_time_t position)
{
	if (snd_BUG_ON(!tmr))
		return -EINVAL;
	snd_seq_sanity_real_time(&position);
	guard(spinlock_irqsave)(&tmr->lock);
	tmr->cur_time = position;
	return 0;
}

/* set timer skew */
int snd_seq_timer_set_skew(struct snd_seq_timer *tmr, unsigned int skew,
			   unsigned int base)
{
	if (snd_BUG_ON(!tmr))
		return -EINVAL;

	/* FIXME */
	if (base != SKEW_BASE) {
		pr_debug("ALSA: seq: invalid skew base 0x%x\n", base);
		return -EINVAL;
	}
	guard(spinlock_irqsave)(&tmr->lock);
	tmr->skew = skew;
	return 0;
}

int snd_seq_timer_open(struct snd_seq_queue *q)
{
	struct snd_timer_instance *t;
	struct snd_seq_timer *tmr;
	char str[32];
	int err;

	tmr = q->timer;
	if (snd_BUG_ON(!tmr))
		return -EINVAL;
	if (tmr->timeri)
		return -EBUSY;
	sprintf(str, "sequencer queue %i", q->queue);
	if (tmr->type != SNDRV_SEQ_TIMER_ALSA)	/* standard ALSA timer */
		return -EINVAL;
	if (tmr->alsa_id.dev_class != SNDRV_TIMER_CLASS_SLAVE)
		tmr->alsa_id.dev_sclass = SNDRV_TIMER_SCLASS_SEQUENCER;
	t = snd_timer_instance_new(str);
	if (!t)
		return -ENOMEM;
	t->callback = snd_seq_timer_interrupt;
	t->callback_data = q;
	t->flags |= SNDRV_TIMER_IFLG_AUTO;
	err = snd_timer_open(t, &tmr->alsa_id, q->queue);
	if (err < 0 && tmr->alsa_id.dev_class != SNDRV_TIMER_CLASS_SLAVE) {
		if (tmr->alsa_id.dev_class != SNDRV_TIMER_CLASS_GLOBAL ||
		    tmr->alsa_id.device != SNDRV_TIMER_GLOBAL_SYSTEM) {
			struct snd_timer_id tid;

			memset(&tid, 0, sizeof(tid));
			tid.dev_class = SNDRV_TIMER_CLASS_GLOBAL;
			tid.dev_sclass = SNDRV_TIMER_SCLASS_SEQUENCER;
			tid.card = -1;
			tid.device = SNDRV_TIMER_GLOBAL_SYSTEM;
			err = snd_timer_open(t, &tid, q->queue);
		}
	}
	if (err < 0) {
		pr_err("ALSA: seq fatal error: cannot create timer (%i)\n", err);
		snd_timer_instance_free(t);
		return err;
	}
	scoped_guard(spinlock_irq, &tmr->lock) {
		if (tmr->timeri)
			err = -EBUSY;
		else
			tmr->timeri = t;
	}
	if (err < 0) {
		snd_timer_close(t);
		snd_timer_instance_free(t);
		return err;
	}
	return 0;
}

int snd_seq_timer_close(struct snd_seq_queue *q)
{
	struct snd_seq_timer *tmr;
	struct snd_timer_instance *t;

	tmr = q->timer;
	if (snd_BUG_ON(!tmr))
		return -EINVAL;
	scoped_guard(spinlock_irq, &tmr->lock) {
		t = tmr->timeri;
		tmr->timeri = NULL;
	}
	if (t) {
		snd_timer_close(t);
		snd_timer_instance_free(t);
	}
	return 0;
}

static int seq_timer_stop(struct snd_seq_timer *tmr)
{
	if (!tmr->timeri)
		return -EINVAL;
	if (!tmr->running)
		return 0;
	tmr->running = 0;
	snd_timer_pause(tmr->timeri);
	return 0;
}

int snd_seq_timer_stop(struct snd_seq_timer *tmr)
{
	guard(spinlock_irqsave)(&tmr->lock);
	return seq_timer_stop(tmr);
}

static int initialize_timer(struct snd_seq_timer *tmr)
{
	struct snd_timer *t;
	unsigned long freq;

	t = tmr->timeri->timer;
	if (!t)
		return -EINVAL;

	freq = tmr->preferred_resolution;
	if (!freq)
		freq = DEFAULT_FREQUENCY;
	else if (freq < MIN_FREQUENCY)
		freq = MIN_FREQUENCY;
	else if (freq > MAX_FREQUENCY)
		freq = MAX_FREQUENCY;

	tmr->ticks = 1;
	if (!(t->hw.flags & SNDRV_TIMER_HW_SLAVE)) {
		unsigned long r = snd_timer_resolution(tmr->timeri);

		if (r) {
			tmr->ticks = (unsigned int)(1000000000uL / (r * freq));
			if (!tmr->ticks)
				tmr->ticks = 1;
		}
	}
	tmr->initialized = 1;
	return 0;
}

static int seq_timer_start(struct snd_seq_timer *tmr)
{
	if (!tmr->timeri)
		return -EINVAL;
	if (tmr->running)
		seq_timer_stop(tmr);
	seq_timer_reset(tmr);
	if (initialize_timer(tmr) < 0)
		return -EINVAL;
	snd_timer_start(tmr->timeri, tmr->ticks);
	tmr->running = 1;
	ktime_get_ts64(&tmr->last_update);
	return 0;
}

int snd_seq_timer_start(struct snd_seq_timer *tmr)
{
	guard(spinlock_irqsave)(&tmr->lock);
	return seq_timer_start(tmr);
}

static int seq_timer_continue(struct snd_seq_timer *tmr)
{
	if (!tmr->timeri)
		return -EINVAL;
	if (tmr->running)
		return -EBUSY;
	if (!tmr->initialized) {
		seq_timer_reset(tmr);
		if (initialize_timer(tmr) < 0)
			return -EINVAL;
	}
	snd_timer_start(tmr->timeri, tmr->ticks);
	tmr->running = 1;
	ktime_get_ts64(&tmr->last_update);
	return 0;
}

int snd_seq_timer_continue(struct snd_seq_timer *tmr)
{
	guard(spinlock_irqsave)(&tmr->lock);
	return seq_timer_continue(tmr);
}

/* return current 'real' time. adjust by the ktime delta since the last
 * timer update to get better granularity.
 */
snd_seq_real_time_t snd_seq_timer_get_cur_time(struct snd_seq_timer *tmr,
					       bool adjust_ktime)
{
	snd_seq_real_time_t cur_time;

	guard(spinlock_irqsave)(&tmr->lock);
	cur_time = tmr->cur_time;
	if (adjust_ktime && tmr->running) {
		struct timespec64 tm;

		ktime_get_ts64(&tm);
		tm = timespec64_sub(tm, tmr->last_update);
		cur_time.tv_nsec += tm.tv_nsec;
		cur_time.tv_sec += tm.tv_sec;
		snd_seq_sanity_real_time(&cur_time);
	}
	return cur_time;
}

/* TODO: use interpolation on tick queue (will only be useful for very
 * high PPQ values)
 */
snd_seq_tick_time_t snd_seq_timer_get_cur_tick(struct snd_seq_timer *tmr)
{
	guard(spinlock_irqsave)(&tmr->lock);
	return tmr->tick.cur_tick;
}

#ifdef CONFIG_SND_PROC_FS
/* exported to seq_info.c */
void snd_seq_info_timer_read(struct snd_info_entry *entry,
			     struct snd_info_buffer *buffer)
{
	int idx;
	struct snd_seq_queue *q;
	struct snd_seq_timer *tmr;
	struct snd_timer_instance *ti;
	unsigned long resolution;

	for (idx = 0; idx < SNDRV_SEQ_MAX_QUEUES; idx++) {
		q = queueptr(idx);
		if (q == NULL)
			continue;
		scoped_guard(mutex, &q->timer_mutex) {
			tmr = q->timer;
			if (!tmr)
				break;
			ti = tmr->timeri;
			if (!ti)
				break;
			snd_iprintf(buffer, "Timer for queue %i : %s\n",
				    q->queue, ti->timer->name);
			resolution = snd_timer_resolution(ti) * tmr->ticks;
			snd_iprintf(buffer, "  Period time : %lu.%09lu\n",
				    resolution / 1000000000,
				    resolution % 1000000000);
			snd_iprintf(buffer, "  Skew : %u / %u\n",
				    tmr->skew, tmr->skew_base);
		}
		queuefree(q);
	}
}
#endif /* CONFIG_SND_PROC_FS */
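As a worked check of the arithmetic in snd_seq_timer_set_tick_resolution(): with the defaults set above (tempo = 500000 us per quarter note, i.e. 120 BPM, tempo_base = 1000 ns, ppq = 96), one MIDI tick lasts tempo * tempo_base / ppq nanoseconds. A standalone sketch, not kernel code; the values are taken from the defaults above:

#include <stdio.h>

int main(void)
{
	unsigned long long tempo = 500000;	/* us per quarter note */
	unsigned long long tempo_base = 1000;	/* ns per tempo unit */
	unsigned long long ppq = 96;		/* ticks per quarter note */

	/* 500000 * 1000 / 96 = 5208333 ns, i.e. ~5.2 ms per tick */
	printf("%llu ns/tick\n", tempo * tempo_base / ppq);
	return 0;
}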
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2016 Thomas Gleixner.
 * Copyright (C) 2016-2017 Christoph Hellwig.
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/sort.h>
#include <linux/group_cpus.h>

#ifdef CONFIG_SMP

static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
				unsigned int cpus_per_grp)
{
	const struct cpumask *siblmsk;
	int cpu, sibl;

	for ( ; cpus_per_grp > 0; ) {
		cpu = cpumask_first(nmsk);

		/* Should not happen, but I'm too lazy to think about it */
		if (cpu >= nr_cpu_ids)
			return;

		cpumask_clear_cpu(cpu, nmsk);
		cpumask_set_cpu(cpu, irqmsk);
		cpus_per_grp--;

		/* If the cpu has siblings, use them first */
		siblmsk = topology_sibling_cpumask(cpu);
		for (sibl = -1; cpus_per_grp > 0; ) {
			sibl = cpumask_next(sibl, siblmsk);
			if (sibl >= nr_cpu_ids)
				break;
			if (!cpumask_test_and_clear_cpu(sibl, nmsk))
				continue;
			cpumask_set_cpu(sibl, irqmsk);
			cpus_per_grp--;
		}
	}
}

static cpumask_var_t *alloc_node_to_cpumask(void)
{
	cpumask_var_t *masks;
	int node;

	masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL);
	if (!masks)
		return NULL;

	for (node = 0; node < nr_node_ids; node++) {
		if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL))
			goto out_unwind;
	}

	return masks;

out_unwind:
	while (--node >= 0)
		free_cpumask_var(masks[node]);
	kfree(masks);
	return NULL;
}

static void free_node_to_cpumask(cpumask_var_t *masks)
{
	int node;

	for (node = 0; node < nr_node_ids; node++)
		free_cpumask_var(masks[node]);
	kfree(masks);
}

static void build_node_to_cpumask(cpumask_var_t *masks)
{
	int cpu;

	for_each_possible_cpu(cpu)
		cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
}

static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
				const struct cpumask *mask, nodemask_t *nodemsk)
{
	int n, nodes = 0;

	/* Calculate the number of nodes in the supplied affinity mask */
	for_each_node(n) {
		if (cpumask_intersects(mask, node_to_cpumask[n])) {
			node_set(n, *nodemsk);
			nodes++;
		}
	}
	return nodes;
}

struct node_groups {
	unsigned id;

	union {
		unsigned ngroups;
		unsigned ncpus;
	};
};

static int ncpus_cmp_func(const void *l, const void *r)
{
	const struct node_groups *ln = l;
	const struct node_groups *rn = r;

	return ln->ncpus - rn->ncpus;
}

/*
 * Allocate group number for each node, so that for each node:
 *
 * 1) the allocated number is >= 1
 *
 * 2) the allocated number is <= active CPU number of this node
 *
 * The actual allocated total groups may be less than @numgrps when
 * active total CPU number is less than @numgrps.
 *
 * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
 * for each node.
 */
static void alloc_nodes_groups(unsigned int numgrps,
			       cpumask_var_t *node_to_cpumask,
			       const struct cpumask *cpu_mask,
			       const nodemask_t nodemsk,
			       struct cpumask *nmsk,
			       struct node_groups *node_groups)
{
	unsigned n, remaining_ncpus = 0;

	for (n = 0; n < nr_node_ids; n++) {
		node_groups[n].id = n;
		node_groups[n].ncpus = UINT_MAX;
	}

	for_each_node_mask(n, nodemsk) {
		unsigned ncpus;

		cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
		ncpus = cpumask_weight(nmsk);

		if (!ncpus)
			continue;
		remaining_ncpus += ncpus;
		node_groups[n].ncpus = ncpus;
	}

	numgrps = min_t(unsigned, remaining_ncpus, numgrps);

	sort(node_groups, nr_node_ids, sizeof(node_groups[0]),
	     ncpus_cmp_func, NULL);

	/*
	 * Allocate groups for each node according to the ratio of this
	 * node's nr_cpus to remaining un-assigned ncpus. 'numgrps' is
	 * bigger than number of active numa nodes. Always start the
	 * allocation from the node with the smallest nr_cpus.
	 *
	 * This way guarantees that each active node gets allocated at
	 * least one group, and the theory is simple: over-allocation
	 * is only done when this node is assigned by one group, so
	 * other nodes will be allocated >= 1 groups, since 'numgrps' is
	 * bigger than number of numa nodes.
	 *
	 * One perfect invariant is that number of allocated groups for
	 * each node is <= CPU count of this node:
	 *
	 * 1) suppose there are two nodes: A and B
	 *	ncpu(X) is CPU count of node X
	 *	grps(X) is the group count allocated to node X via this
	 *	algorithm
	 *
	 *	ncpu(A) <= ncpu(B)
	 *	ncpu(A) + ncpu(B) = N
	 *	grps(A) + grps(B) = G
	 *
	 *	grps(A) = max(1, round_down(G * ncpu(A) / N))
	 *	grps(B) = G - grps(A)
	 *
	 *	both N and G are integer, and 2 <= G <= N, suppose
	 *	G = N - delta, and 0 <= delta <= N - 2
	 *
	 * 2) obviously grps(A) <= ncpu(A) because:
	 *
	 *	if grps(A) is 1, then grps(A) <= ncpu(A) given
	 *	ncpu(A) >= 1
	 *
	 *	otherwise,
	 *		grps(A) <= G * ncpu(A) / N <= ncpu(A), given G <= N
	 *
	 * 3) prove how grps(B) <= ncpu(B):
	 *
	 *	if round_down(G * ncpu(A) / N) == 0, grps(B) won't be
	 *	over-allocated, so grps(B) <= ncpu(B),
	 *
	 *	otherwise:
	 *
	 *	grps(A) =
	 *		round_down(G * ncpu(A) / N) =
	 *		round_down((N - delta) * ncpu(A) / N) =
	 *		round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
	 *		round_down((N * ncpu(A) - delta * N) / N) =
	 *		ncpu(A) - delta
	 *
	 *	then:
	 *
	 *	grps(A) - G >= ncpu(A) - delta - G
	 *	=>
	 *	G - grps(A) <= G + delta - ncpu(A)
	 *	=>
	 *	grps(B) <= N - ncpu(A)
	 *	=>
	 *	grps(B) <= ncpu(B)
	 *
	 * For nodes >= 3, it can be thought of as one node and another big
	 * node, since that is exactly how this algorithm is implemented, and
	 * we always re-calculate 'remaining_ncpus' & 'numgrps', and finally
	 * for each node X: grps(X) <= ncpu(X).
	 *
	 */
	for (n = 0; n < nr_node_ids; n++) {
		unsigned ngroups, ncpus;

		if (node_groups[n].ncpus == UINT_MAX)
			continue;

		WARN_ON_ONCE(numgrps == 0);

		ncpus = node_groups[n].ncpus;
		ngroups = max_t(unsigned, 1,
				numgrps * ncpus / remaining_ncpus);
		WARN_ON_ONCE(ngroups > ncpus);

		node_groups[n].ngroups = ngroups;

		remaining_ncpus -= ncpus;
		numgrps -= ngroups;
	}
}

static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
			       cpumask_var_t *node_to_cpumask,
			       const struct cpumask *cpu_mask,
			       struct cpumask *nmsk, struct cpumask *masks)
{
	unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0;
	unsigned int last_grp = numgrps;
	unsigned int curgrp = startgrp;
	nodemask_t nodemsk = NODE_MASK_NONE;
	struct node_groups *node_groups;

	if (cpumask_empty(cpu_mask))
		return 0;

	nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);

	/*
	 * If the number of nodes in the mask is greater than or equal to the
	 * number of groups we just spread the groups across the nodes.
	 */
	if (numgrps <= nodes) {
		for_each_node_mask(n, nodemsk) {
			/* Ensure that only CPUs which are in both masks are set */
			cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
			cpumask_or(&masks[curgrp], &masks[curgrp], nmsk);
			if (++curgrp == last_grp)
				curgrp = 0;
		}
		return numgrps;
	}

	node_groups = kcalloc(nr_node_ids,
			      sizeof(struct node_groups),
			      GFP_KERNEL);
	if (!node_groups)
		return -ENOMEM;

	/* allocate group number for each node */
	alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask,
			   nodemsk, nmsk, node_groups);

	for (i = 0; i < nr_node_ids; i++) {
		unsigned int ncpus, v;
		struct node_groups *nv = &node_groups[i];

		if (nv->ngroups == UINT_MAX)
			continue;

		/* Get the cpus on this node which are in the mask */
		cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
		ncpus = cpumask_weight(nmsk);
		if (!ncpus)
			continue;

		WARN_ON_ONCE(nv->ngroups > ncpus);

		/* Account for rounding errors */
		extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups);

		/* Spread allocated groups on CPUs of the current node */
		for (v = 0; v < nv->ngroups; v++, curgrp++) {
			cpus_per_grp = ncpus / nv->ngroups;

			/* Account for extra groups to compensate rounding errors */
			if (extra_grps) {
				cpus_per_grp++;
				--extra_grps;
			}

			/*
			 * wrapping has to be considered given 'startgrp'
			 * may start anywhere
			 */
			if (curgrp >= last_grp)
				curgrp = 0;
			grp_spread_init_one(&masks[curgrp], nmsk,
					    cpus_per_grp);
		}
		done += nv->ngroups;
	}
	kfree(node_groups);
	return done;
}

/**
 * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
 * @numgrps: number of groups
 * @nummasks: number of initialized cpumasks
 *
 * Return: cpumask array if successful, NULL otherwise. Each element
 * includes the CPUs assigned to that group. @nummasks contains the
 * number of initialized masks, which can be less than @numgrps.
 *
 * Try to put close CPUs from viewpoint of CPU and NUMA locality into
 * the same group, and run a two-stage grouping:
 * 1) allocate present CPUs on these groups evenly first
 * 2) allocate other possible CPUs on these groups evenly
 *
 * We guarantee that in the resulting grouping all CPUs are covered, and
 * no CPU is assigned to multiple groups.
 */
struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks)
{
	unsigned int curgrp = 0, nr_present = 0, nr_others = 0;
	cpumask_var_t *node_to_cpumask;
	cpumask_var_t nmsk, npresmsk;
	int ret = -ENOMEM;
	struct cpumask *masks = NULL;

	if (numgrps == 0)
		return NULL;

	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
		return NULL;

	if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
		goto fail_nmsk;

	node_to_cpumask = alloc_node_to_cpumask();
	if (!node_to_cpumask)
		goto fail_npresmsk;

	masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
	if (!masks)
		goto fail_node_to_cpumask;

	build_node_to_cpumask(node_to_cpumask);

	/*
	 * Make a local cache of 'cpu_present_mask', so that the two-stage
	 * spread can observe a consistent 'cpu_present_mask' without holding
	 * the CPU hotplug lock; this reduces the deadlock risk with the CPU
	 * hotplug code.
	 *
	 * A CPU hotplug event may happen while 'cpu_present_mask' is being
	 * read, and we can live with that: it only affects whether the
	 * hotplugged CPU is handled in the 1st or the 2nd stage, and either
	 * way is correct from the API user's viewpoint, since the two-stage
	 * spread is merely an optimization.
	 */
	cpumask_copy(npresmsk, data_race(cpu_present_mask));

	/* grouping present CPUs first */
	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
				  npresmsk, nmsk, masks);
	if (ret < 0)
		goto fail_node_to_cpumask;
	nr_present = ret;

	/*
	 * Allocate non present CPUs starting from the next group to be
	 * handled.
	 * If the grouping of present CPUs already exhausted the group
	 * space, wrap around and assign the non present CPUs to the
	 * groups already allocated.
	 */
	if (nr_present >= numgrps)
		curgrp = 0;
	else
		curgrp = nr_present;
	cpumask_andnot(npresmsk, cpu_possible_mask, npresmsk);
	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
				  npresmsk, nmsk, masks);
	if (ret >= 0)
		nr_others = ret;

 fail_node_to_cpumask:
	free_node_to_cpumask(node_to_cpumask);

 fail_npresmsk:
	free_cpumask_var(npresmsk);

 fail_nmsk:
	free_cpumask_var(nmsk);
	if (ret < 0) {
		kfree(masks);
		return NULL;
	}

	*nummasks = min(nr_present + nr_others, numgrps);
	return masks;
}
#else /* CONFIG_SMP */
struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks)
{
	struct cpumask *masks;

	if (numgrps == 0)
		return NULL;

	masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
	if (!masks)
		return NULL;

	/* assign all CPUs (cpu 0) to the 1st group only */
	cpumask_copy(&masks[0], cpu_possible_mask);
	*nummasks = 1;
	return masks;
}
#endif /* CONFIG_SMP */
EXPORT_SYMBOL_GPL(group_cpus_evenly);
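A hedged usage sketch for group_cpus_evenly() (kernel build context assumed; the group count of 4 and the function name are invented for the example). The caller owns the returned array and must kfree() it:

/*
 * Hypothetical caller: spread all CPUs into four groups, e.g. when
 * sizing per-group queues for a driver, and log each group's members.
 */
static int example_spread(void)
{
	unsigned int nummasks, i;
	struct cpumask *masks = group_cpus_evenly(4, &nummasks);

	if (!masks)
		return -ENOMEM;

	/* nummasks may be less than 4 on very small systems */
	for (i = 0; i < nummasks; i++)
		pr_info("group %u: %*pbl\n", i, cpumask_pr_args(&masks[i]));

	kfree(masks);
	return 0;
}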