Total coverage: 180310 (15%)of 1240303
21 190 79 134 7 78 87 12 202 222 319 2 133 70 265 312 289 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FAT_H #define _FAT_H #include <linux/buffer_head.h> #include <linux/nls.h> #include <linux/hash.h> #include <linux/ratelimit.h> #include <linux/msdos_fs.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> /* * vfat shortname flags */ #define VFAT_SFN_DISPLAY_LOWER 0x0001 /* convert to lowercase for display */ #define VFAT_SFN_DISPLAY_WIN95 0x0002 /* emulate win95 rule for display */ #define VFAT_SFN_DISPLAY_WINNT 0x0004 /* emulate winnt rule for display */ #define VFAT_SFN_CREATE_WIN95 0x0100 /* emulate win95 rule for create */ #define VFAT_SFN_CREATE_WINNT 0x0200 /* emulate winnt rule for create */ #define FAT_ERRORS_CONT 1 /* ignore error and continue */ #define FAT_ERRORS_PANIC 2 /* panic on error */ #define FAT_ERRORS_RO 3 /* remount r/o on error */ #define FAT_NFS_STALE_RW 1 /* NFS RW support, can cause ESTALE */ #define FAT_NFS_NOSTALE_RO 2 /* NFS RO support, no ESTALE issue */ struct fat_mount_options { kuid_t fs_uid; kgid_t fs_gid; unsigned short fs_fmask; unsigned short fs_dmask; unsigned short codepage; /* Codepage for shortname conversions */ int time_offset; /* Offset of timestamps from UTC (in minutes) */ char *iocharset; /* Charset used for filename input/display */ unsigned short shortname; /* flags for shortname display/create rule */ unsigned char name_check; /* r = relaxed, n = normal, s = strict */ unsigned char errors; /* On error: continue, panic, remount-ro */ unsigned char nfs; /* NFS support: nostale_ro, stale_rw */ unsigned short allow_utime;/* permission for setting the [am]time */ unsigned quiet:1, /* set = fake successful chmods and chowns */ showexec:1, /* set = only set x bit for com/exe/bat */ sys_immutable:1, /* set = system files are immutable */ dotsOK:1, /* set = hidden and system files are named '.filename' */ isvfat:1, /* 0=no vfat long filename support, 1=vfat support */ utf8:1, /* Use of UTF-8 character set (Default) */ unicode_xlate:1, /* create escape sequences for unhandled Unicode */ numtail:1, /* Does first alias have a numeric '~1' type tail? */ flush:1, /* write things quickly */ nocase:1, /* Does this need case conversion? 0=need case conversion*/ usefree:1, /* Use free_clusters for FAT32 */ tz_set:1, /* Filesystem timestamps' offset set */ rodir:1, /* allow ATTR_RO for directory */ discard:1, /* Issue discard requests on deletions */ dos1xfloppy:1, /* Assume default BPB for DOS 1.x floppies */ debug:1; /* Not currently used */ }; #define FAT_HASH_BITS 8 #define FAT_HASH_SIZE (1UL << FAT_HASH_BITS) /* * MS-DOS file system in-core superblock data */ struct msdos_sb_info { unsigned short sec_per_clus; /* sectors/cluster */ unsigned short cluster_bits; /* log2(cluster_size) */ unsigned int cluster_size; /* cluster size */ unsigned char fats, fat_bits; /* number of FATs, FAT bits (12,16 or 32) */ unsigned short fat_start; unsigned long fat_length; /* FAT start & length (sec.) */ unsigned long dir_start; unsigned short dir_entries; /* root dir start & entries */ unsigned long data_start; /* first data sector */ unsigned long max_cluster; /* maximum cluster number */ unsigned long root_cluster; /* first cluster of the root directory */ unsigned long fsinfo_sector; /* sector number of FAT32 fsinfo */ struct mutex fat_lock; struct mutex nfs_build_inode_lock; struct mutex s_lock; unsigned int prev_free; /* previously allocated cluster number */ unsigned int free_clusters; /* -1 if undefined */ unsigned int free_clus_valid; /* is free_clusters valid? */ struct fat_mount_options options; struct nls_table *nls_disk; /* Codepage used on disk */ struct nls_table *nls_io; /* Charset used for input and display */ const void *dir_ops; /* Opaque; default directory operations */ int dir_per_block; /* dir entries per block */ int dir_per_block_bits; /* log2(dir_per_block) */ unsigned int vol_id; /*volume ID*/ int fatent_shift; const struct fatent_operations *fatent_ops; struct inode *fat_inode; struct inode *fsinfo_inode; struct ratelimit_state ratelimit; spinlock_t inode_hash_lock; struct hlist_head inode_hashtable[FAT_HASH_SIZE]; spinlock_t dir_hash_lock; struct hlist_head dir_hashtable[FAT_HASH_SIZE]; unsigned int dirty; /* fs state before mount */ struct rcu_head rcu; }; #define FAT_CACHE_VALID 0 /* special case for valid cache */ /* * MS-DOS file system inode data in memory */ struct msdos_inode_info { spinlock_t cache_lru_lock; struct list_head cache_lru; int nr_caches; /* for avoiding the race between fat_free() and fat_get_cluster() */ unsigned int cache_valid_id; /* NOTE: mmu_private is 64bits, so must hold ->i_mutex to access */ loff_t mmu_private; /* physically allocated size */ int i_start; /* first cluster or 0 */ int i_logstart; /* logical first cluster */ int i_attrs; /* unused attribute bits */ loff_t i_pos; /* on-disk position of directory entry or 0 */ struct hlist_node i_fat_hash; /* hash by i_location */ struct hlist_node i_dir_hash; /* hash by i_logstart */ struct rw_semaphore truncate_lock; /* protect bmap against truncate */ struct timespec64 i_crtime; /* File creation (birth) time */ struct inode vfs_inode; }; struct fat_slot_info { loff_t i_pos; /* on-disk position of directory entry */ loff_t slot_off; /* offset for slot or de start */ int nr_slots; /* number of slots + 1(de) in filename */ struct msdos_dir_entry *de; struct buffer_head *bh; }; static inline struct msdos_sb_info *MSDOS_SB(struct super_block *sb) { return sb->s_fs_info; } /* * Functions that determine the variant of the FAT file system (i.e., * whether this is FAT12, FAT16 or FAT32. */ static inline bool is_fat12(const struct msdos_sb_info *sbi) { return sbi->fat_bits == 12; } static inline bool is_fat16(const struct msdos_sb_info *sbi) { return sbi->fat_bits == 16; } static inline bool is_fat32(const struct msdos_sb_info *sbi) { return sbi->fat_bits == 32; } /* Maximum number of clusters */ static inline u32 max_fat(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); return is_fat32(sbi) ? MAX_FAT32 : is_fat16(sbi) ? MAX_FAT16 : MAX_FAT12; } static inline struct msdos_inode_info *MSDOS_I(struct inode *inode) { return container_of(inode, struct msdos_inode_info, vfs_inode); } /* * If ->i_mode can't hold S_IWUGO (i.e. ATTR_RO), we use ->i_attrs to * save ATTR_RO instead of ->i_mode. * * If it's directory and !sbi->options.rodir, ATTR_RO isn't read-only * bit, it's just used as flag for app. */ static inline int fat_mode_can_hold_ro(struct inode *inode) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); umode_t mask; if (S_ISDIR(inode->i_mode)) { if (!sbi->options.rodir) return 0; mask = ~sbi->options.fs_dmask; } else mask = ~sbi->options.fs_fmask; if (!(mask & S_IWUGO)) return 0; return 1; } /* Convert attribute bits and a mask to the UNIX mode. */ static inline umode_t fat_make_mode(struct msdos_sb_info *sbi, u8 attrs, umode_t mode) { if (attrs & ATTR_RO && !((attrs & ATTR_DIR) && !sbi->options.rodir)) mode &= ~S_IWUGO; if (attrs & ATTR_DIR) return (mode & ~sbi->options.fs_dmask) | S_IFDIR; else return (mode & ~sbi->options.fs_fmask) | S_IFREG; } /* Return the FAT attribute byte for this inode */ static inline u8 fat_make_attrs(struct inode *inode) { u8 attrs = MSDOS_I(inode)->i_attrs; if (S_ISDIR(inode->i_mode)) attrs |= ATTR_DIR; if (fat_mode_can_hold_ro(inode) && !(inode->i_mode & S_IWUGO)) attrs |= ATTR_RO; return attrs; } static inline void fat_save_attrs(struct inode *inode, u8 attrs) { if (fat_mode_can_hold_ro(inode)) MSDOS_I(inode)->i_attrs = attrs & ATTR_UNUSED; else MSDOS_I(inode)->i_attrs = attrs & (ATTR_UNUSED | ATTR_RO); } static inline unsigned char fat_checksum(const __u8 *name) { unsigned char s = name[0]; s = (s<<7) + (s>>1) + name[1]; s = (s<<7) + (s>>1) + name[2]; s = (s<<7) + (s>>1) + name[3]; s = (s<<7) + (s>>1) + name[4]; s = (s<<7) + (s>>1) + name[5]; s = (s<<7) + (s>>1) + name[6]; s = (s<<7) + (s>>1) + name[7]; s = (s<<7) + (s>>1) + name[8]; s = (s<<7) + (s>>1) + name[9]; s = (s<<7) + (s>>1) + name[10]; return s; } static inline sector_t fat_clus_to_blknr(struct msdos_sb_info *sbi, int clus) { return ((sector_t)clus - FAT_START_ENT) * sbi->sec_per_clus + sbi->data_start; } static inline void fat_get_blknr_offset(struct msdos_sb_info *sbi, loff_t i_pos, sector_t *blknr, int *offset) { *blknr = i_pos >> sbi->dir_per_block_bits; *offset = i_pos & (sbi->dir_per_block - 1); } static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi, struct inode *inode) { loff_t i_pos; #if BITS_PER_LONG == 32 spin_lock(&sbi->inode_hash_lock); #endif i_pos = MSDOS_I(inode)->i_pos; #if BITS_PER_LONG == 32 spin_unlock(&sbi->inode_hash_lock); #endif return i_pos; } static inline void fat16_towchar(wchar_t *dst, const __u8 *src, size_t len) { #ifdef __BIG_ENDIAN while (len--) { *dst++ = src[0] | (src[1] << 8); src += 2; } #else memcpy(dst, src, len * 2); #endif } static inline int fat_get_start(const struct msdos_sb_info *sbi, const struct msdos_dir_entry *de) { int cluster = le16_to_cpu(de->start); if (is_fat32(sbi)) cluster |= (le16_to_cpu(de->starthi) << 16); return cluster; } static inline void fat_set_start(struct msdos_dir_entry *de, int cluster) { de->start = cpu_to_le16(cluster); de->starthi = cpu_to_le16(cluster >> 16); } static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len) { #ifdef __BIG_ENDIAN while (len--) { dst[0] = *src & 0x00FF; dst[1] = (*src & 0xFF00) >> 8; dst += 2; src++; } #else memcpy(dst, src, len * 2); #endif } /* fat/cache.c */ extern void fat_cache_inval_inode(struct inode *inode); extern int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus); extern int fat_get_mapped_cluster(struct inode *inode, sector_t sector, sector_t last_block, unsigned long *mapped_blocks, sector_t *bmap); extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, unsigned long *mapped_blocks, int create, bool from_bmap); /* fat/dir.c */ extern const struct file_operations fat_dir_operations; extern int fat_search_long(struct inode *inode, const unsigned char *name, int name_len, struct fat_slot_info *sinfo); extern int fat_dir_empty(struct inode *dir); extern int fat_subdirs(struct inode *dir); extern int fat_scan(struct inode *dir, const unsigned char *name, struct fat_slot_info *sinfo); extern int fat_scan_logstart(struct inode *dir, int i_logstart, struct fat_slot_info *sinfo); extern int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh, struct msdos_dir_entry **de); extern int fat_alloc_new_dir(struct inode *dir, struct timespec64 *ts); extern int fat_add_entries(struct inode *dir, void *slots, int nr_slots, struct fat_slot_info *sinfo); extern int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo); /* fat/fatent.c */ struct fat_entry { int entry; union { u8 *ent12_p[2]; __le16 *ent16_p; __le32 *ent32_p; } u; int nr_bhs; struct buffer_head *bhs[2]; struct inode *fat_inode; }; static inline void fatent_init(struct fat_entry *fatent) { fatent->nr_bhs = 0; fatent->entry = 0; fatent->u.ent32_p = NULL; fatent->bhs[0] = fatent->bhs[1] = NULL; fatent->fat_inode = NULL; } static inline void fatent_set_entry(struct fat_entry *fatent, int entry) { fatent->entry = entry; fatent->u.ent32_p = NULL; } static inline void fatent_brelse(struct fat_entry *fatent) { int i; fatent->u.ent32_p = NULL; for (i = 0; i < fatent->nr_bhs; i++) brelse(fatent->bhs[i]); fatent->nr_bhs = 0; fatent->bhs[0] = fatent->bhs[1] = NULL; fatent->fat_inode = NULL; } static inline bool fat_valid_entry(struct msdos_sb_info *sbi, int entry) { return FAT_START_ENT <= entry && entry < sbi->max_cluster; } extern void fat_ent_access_init(struct super_block *sb); extern int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry); extern int fat_ent_write(struct inode *inode, struct fat_entry *fatent, int new, int wait); extern int fat_alloc_clusters(struct inode *inode, int *cluster, int nr_cluster); extern int fat_free_clusters(struct inode *inode, int cluster); extern int fat_count_free_clusters(struct super_block *sb); extern int fat_trim_fs(struct inode *inode, struct fstrim_range *range); /* fat/file.c */ extern long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); extern const struct file_operations fat_file_operations; extern const struct inode_operations fat_file_inode_operations; extern int fat_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); extern void fat_truncate_blocks(struct inode *inode, loff_t offset); extern int fat_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); extern int fat_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); /* fat/inode.c */ extern int fat_block_truncate_page(struct inode *inode, loff_t from); extern void fat_attach(struct inode *inode, loff_t i_pos); extern void fat_detach(struct inode *inode); extern struct inode *fat_iget(struct super_block *sb, loff_t i_pos); extern struct inode *fat_build_inode(struct super_block *sb, struct msdos_dir_entry *de, loff_t i_pos); extern int fat_sync_inode(struct inode *inode); extern int fat_fill_super(struct super_block *sb, struct fs_context *fc, void (*setup)(struct super_block *)); extern int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de); extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2); extern const struct fs_parameter_spec fat_param_spec[]; int fat_init_fs_context(struct fs_context *fc, bool is_vfat); void fat_free_fc(struct fs_context *fc); int fat_parse_param(struct fs_context *fc, struct fs_parameter *param, bool is_vfat); int fat_reconfigure(struct fs_context *fc); static inline unsigned long fat_dir_hash(int logstart) { return hash_32(logstart, FAT_HASH_BITS); } extern int fat_add_cluster(struct inode *inode); /* fat/misc.c */ extern __printf(3, 4) __cold void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...); #define fat_fs_error(sb, fmt, args...) \ __fat_fs_error(sb, 1, fmt , ## args) #define fat_fs_error_ratelimit(sb, fmt, args...) \ __fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args) #define FAT_PRINTK_PREFIX "%sFAT-fs (%s): " #define fat_msg(sb, level, fmt, args...) \ do { \ printk_index_subsys_emit(FAT_PRINTK_PREFIX, level, fmt, ##args);\ _fat_msg(sb, level, fmt, ##args); \ } while (0) __printf(3, 4) __cold void _fat_msg(struct super_block *sb, const char *level, const char *fmt, ...); #define fat_msg_ratelimit(sb, level, fmt, args...) \ do { \ if (__ratelimit(&MSDOS_SB(sb)->ratelimit)) \ fat_msg(sb, level, fmt, ## args); \ } while (0) extern int fat_clusters_flush(struct super_block *sb); extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 __time, __le16 __date, u8 time_cs); extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 *time, __le16 *date, u8 *time_cs); extern struct timespec64 fat_truncate_atime(const struct msdos_sb_info *sbi, const struct timespec64 *ts); extern struct timespec64 fat_truncate_mtime(const struct msdos_sb_info *sbi, const struct timespec64 *ts); extern int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags); extern int fat_update_time(struct inode *inode, int flags); extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs); int fat_cache_init(void); void fat_cache_destroy(void); /* fat/nfs.c */ extern const struct export_operations fat_export_ops; extern const struct export_operations fat_export_ops_nostale; /* helper for printk */ typedef unsigned long long llu; #endif /* !_FAT_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_STRING_H_ #define _LINUX_STRING_H_ #include <linux/args.h> #include <linux/array_size.h> #include <linux/cleanup.h> /* for DEFINE_FREE() */ #include <linux/compiler.h> /* for inline */ #include <linux/types.h> /* for size_t */ #include <linux/stddef.h> /* for NULL */ #include <linux/err.h> /* for ERR_PTR() */ #include <linux/errno.h> /* for E2BIG */ #include <linux/overflow.h> /* for check_mul_overflow() */ #include <linux/stdarg.h> #include <uapi/linux/string.h> extern char *strndup_user(const char __user *, long); extern void *memdup_user(const void __user *, size_t) __realloc_size(2); extern void *vmemdup_user(const void __user *, size_t) __realloc_size(2); extern void *memdup_user_nul(const void __user *, size_t); /** * memdup_array_user - duplicate array from user space * @src: source address in user space * @n: number of array members to copy * @size: size of one array member * * Return: an ERR_PTR() on failure. Result is physically * contiguous, to be freed by kfree(). */ static inline __realloc_size(2, 3) void *memdup_array_user(const void __user *src, size_t n, size_t size) { size_t nbytes; if (check_mul_overflow(n, size, &nbytes)) return ERR_PTR(-EOVERFLOW); return memdup_user(src, nbytes); } /** * vmemdup_array_user - duplicate array from user space * @src: source address in user space * @n: number of array members to copy * @size: size of one array member * * Return: an ERR_PTR() on failure. Result may be not * physically contiguous. Use kvfree() to free. */ static inline __realloc_size(2, 3) void *vmemdup_array_user(const void __user *src, size_t n, size_t size) { size_t nbytes; if (check_mul_overflow(n, size, &nbytes)) return ERR_PTR(-EOVERFLOW); return vmemdup_user(src, nbytes); } /* * Include machine specific inline routines */ #include <asm/string.h> #ifndef __HAVE_ARCH_STRCPY extern char * strcpy(char *,const char *); #endif #ifndef __HAVE_ARCH_STRNCPY extern char * strncpy(char *,const char *, __kernel_size_t); #endif ssize_t sized_strscpy(char *, const char *, size_t); /* * The 2 argument style can only be used when dst is an array with a * known size. */ #define __strscpy0(dst, src, ...) \ sized_strscpy(dst, src, sizeof(dst) + __must_be_array(dst) + \ __must_be_cstr(dst) + __must_be_cstr(src)) #define __strscpy1(dst, src, size) \ sized_strscpy(dst, src, size + __must_be_cstr(dst) + __must_be_cstr(src)) #define __strscpy_pad0(dst, src, ...) \ sized_strscpy_pad(dst, src, sizeof(dst) + __must_be_array(dst) + \ __must_be_cstr(dst) + __must_be_cstr(src)) #define __strscpy_pad1(dst, src, size) \ sized_strscpy_pad(dst, src, size + __must_be_cstr(dst) + __must_be_cstr(src)) /** * strscpy - Copy a C-string into a sized buffer * @dst: Where to copy the string to * @src: Where to copy the string from * @...: Size of destination buffer (optional) * * Copy the source string @src, or as much of it as fits, into the * destination @dst buffer. The behavior is undefined if the string * buffers overlap. The destination @dst buffer is always NUL terminated, * unless it's zero-sized. * * The size argument @... is only required when @dst is not an array, or * when the copy needs to be smaller than sizeof(@dst). * * Preferred to strncpy() since it always returns a valid string, and * doesn't unnecessarily force the tail of the destination buffer to be * zero padded. If padding is desired please use strscpy_pad(). * * Returns the number of characters copied in @dst (not including the * trailing %NUL) or -E2BIG if @size is 0 or the copy from @src was * truncated. */ #define strscpy(dst, src, ...) \ CONCATENATE(__strscpy, COUNT_ARGS(__VA_ARGS__))(dst, src, __VA_ARGS__) #define sized_strscpy_pad(dest, src, count) ({ \ char *__dst = (dest); \ const char *__src = (src); \ const size_t __count = (count); \ ssize_t __wrote; \ \ __wrote = sized_strscpy(__dst, __src, __count); \ if (__wrote >= 0 && __wrote < __count) \ memset(__dst + __wrote + 1, 0, __count - __wrote - 1); \ __wrote; \ }) /** * strscpy_pad() - Copy a C-string into a sized buffer * @dst: Where to copy the string to * @src: Where to copy the string from * @...: Size of destination buffer * * Copy the string, or as much of it as fits, into the dest buffer. The * behavior is undefined if the string buffers overlap. The destination * buffer is always %NUL terminated, unless it's zero-sized. * * If the source string is shorter than the destination buffer, the * remaining bytes in the buffer will be filled with %NUL bytes. * * For full explanation of why you may want to consider using the * 'strscpy' functions please see the function docstring for strscpy(). * * Returns: * * The number of characters copied (not including the trailing %NULs) * * -E2BIG if count is 0 or @src was truncated. */ #define strscpy_pad(dst, src, ...) \ CONCATENATE(__strscpy_pad, COUNT_ARGS(__VA_ARGS__))(dst, src, __VA_ARGS__) #ifndef __HAVE_ARCH_STRCAT extern char * strcat(char *, const char *); #endif #ifndef __HAVE_ARCH_STRNCAT extern char * strncat(char *, const char *, __kernel_size_t); #endif #ifndef __HAVE_ARCH_STRLCAT extern size_t strlcat(char *, const char *, __kernel_size_t); #endif #ifndef __HAVE_ARCH_STRCMP extern int strcmp(const char *,const char *); #endif #ifndef __HAVE_ARCH_STRNCMP extern int strncmp(const char *,const char *,__kernel_size_t); #endif #ifndef __HAVE_ARCH_STRCASECMP extern int strcasecmp(const char *s1, const char *s2); #endif #ifndef __HAVE_ARCH_STRNCASECMP extern int strncasecmp(const char *s1, const char *s2, size_t n); #endif #ifndef __HAVE_ARCH_STRCHR extern char * strchr(const char *,int); #endif #ifndef __HAVE_ARCH_STRCHRNUL extern char * strchrnul(const char *,int); #endif extern char * strnchrnul(const char *, size_t, int); #ifndef __HAVE_ARCH_STRNCHR extern char * strnchr(const char *, size_t, int); #endif #ifndef __HAVE_ARCH_STRRCHR extern char * strrchr(const char *,int); #endif extern char * __must_check skip_spaces(const char *); extern char *strim(char *); static inline __must_check char *strstrip(char *str) { return strim(str); } #ifndef __HAVE_ARCH_STRSTR extern char * strstr(const char *, const char *); #endif #ifndef __HAVE_ARCH_STRNSTR extern char * strnstr(const char *, const char *, size_t); #endif #ifndef __HAVE_ARCH_STRLEN extern __kernel_size_t strlen(const char *); #endif #ifndef __HAVE_ARCH_STRNLEN extern __kernel_size_t strnlen(const char *,__kernel_size_t); #endif #ifndef __HAVE_ARCH_STRPBRK extern char * strpbrk(const char *,const char *); #endif #ifndef __HAVE_ARCH_STRSEP extern char * strsep(char **,const char *); #endif #ifndef __HAVE_ARCH_STRSPN extern __kernel_size_t strspn(const char *,const char *); #endif #ifndef __HAVE_ARCH_STRCSPN extern __kernel_size_t strcspn(const char *,const char *); #endif #ifndef __HAVE_ARCH_MEMSET extern void * memset(void *,int,__kernel_size_t); #endif #ifndef __HAVE_ARCH_MEMSET16 extern void *memset16(uint16_t *, uint16_t, __kernel_size_t); #endif #ifndef __HAVE_ARCH_MEMSET32 extern void *memset32(uint32_t *, uint32_t, __kernel_size_t); #endif #ifndef __HAVE_ARCH_MEMSET64 extern void *memset64(uint64_t *, uint64_t, __kernel_size_t); #endif static inline void *memset_l(unsigned long *p, unsigned long v, __kernel_size_t n) { if (BITS_PER_LONG == 32) return memset32((uint32_t *)p, v, n); else return memset64((uint64_t *)p, v, n); } static inline void *memset_p(void **p, void *v, __kernel_size_t n) { if (BITS_PER_LONG == 32) return memset32((uint32_t *)p, (uintptr_t)v, n); else return memset64((uint64_t *)p, (uintptr_t)v, n); } extern void **__memcat_p(void **a, void **b); #define memcat_p(a, b) ({ \ BUILD_BUG_ON_MSG(!__same_type(*(a), *(b)), \ "type mismatch in memcat_p()"); \ (typeof(*a) *)__memcat_p((void **)(a), (void **)(b)); \ }) #ifndef __HAVE_ARCH_MEMCPY extern void * memcpy(void *,const void *,__kernel_size_t); #endif #ifndef __HAVE_ARCH_MEMMOVE extern void * memmove(void *,const void *,__kernel_size_t); #endif #ifndef __HAVE_ARCH_MEMSCAN extern void * memscan(void *,int,__kernel_size_t); #endif #ifndef __HAVE_ARCH_MEMCMP extern int memcmp(const void *,const void *,__kernel_size_t); #endif #ifndef __HAVE_ARCH_BCMP extern int bcmp(const void *,const void *,__kernel_size_t); #endif #ifndef __HAVE_ARCH_MEMCHR extern void * memchr(const void *,int,__kernel_size_t); #endif #ifndef __HAVE_ARCH_MEMCPY_FLUSHCACHE static inline void memcpy_flushcache(void *dst, const void *src, size_t cnt) { memcpy(dst, src, cnt); } #endif void *memchr_inv(const void *s, int c, size_t n); char *strreplace(char *str, char old, char new); /** * mem_is_zero - Check if an area of memory is all 0's. * @s: The memory area * @n: The size of the area * * Return: True if the area of memory is all 0's. */ static inline bool mem_is_zero(const void *s, size_t n) { return !memchr_inv(s, 0, n); } extern void kfree_const(const void *x); extern char *kstrdup(const char *s, gfp_t gfp) __malloc; extern const char *kstrdup_const(const char *s, gfp_t gfp); extern char *kstrndup(const char *s, size_t len, gfp_t gfp); extern void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp) __realloc_size(2); #define kmemdup(...) alloc_hooks(kmemdup_noprof(__VA_ARGS__)) extern void *kvmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2); extern char *kmemdup_nul(const char *s, size_t len, gfp_t gfp); extern void *kmemdup_array(const void *src, size_t count, size_t element_size, gfp_t gfp) __realloc_size(2, 3); /* lib/argv_split.c */ extern char **argv_split(gfp_t gfp, const char *str, int *argcp); extern void argv_free(char **argv); DEFINE_FREE(argv_free, char **, if (!IS_ERR_OR_NULL(_T)) argv_free(_T)) /* lib/cmdline.c */ extern int get_option(char **str, int *pint); extern char *get_options(const char *str, int nints, int *ints); extern unsigned long long memparse(const char *ptr, char **retptr); extern bool parse_option_str(const char *str, const char *option); extern char *next_arg(char *args, char **param, char **val); extern bool sysfs_streq(const char *s1, const char *s2); int match_string(const char * const *array, size_t n, const char *string); int __sysfs_match_string(const char * const *array, size_t n, const char *s); /** * sysfs_match_string - matches given string in an array * @_a: array of strings * @_s: string to match with * * Helper for __sysfs_match_string(). Calculates the size of @a automatically. */ #define sysfs_match_string(_a, _s) __sysfs_match_string(_a, ARRAY_SIZE(_a), _s) #ifdef CONFIG_BINARY_PRINTF __printf(3, 0) int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args); __printf(3, 0) int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf); #endif extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, const void *from, size_t available); int ptr_to_hashval(const void *ptr, unsigned long *hashval_out); size_t memweight(const void *ptr, size_t bytes); /** * memzero_explicit - Fill a region of memory (e.g. sensitive * keying data) with 0s. * @s: Pointer to the start of the area. * @count: The size of the area. * * Note: usually using memset() is just fine (!), but in cases * where clearing out _local_ data at the end of a scope is * necessary, memzero_explicit() should be used instead in * order to prevent the compiler from optimising away zeroing. * * memzero_explicit() doesn't need an arch-specific version as * it just invokes the one of memset() implicitly. */ static inline void memzero_explicit(void *s, size_t count) { memset(s, 0, count); barrier_data(s); } /** * kbasename - return the last part of a pathname. * * @path: path to extract the filename from. * * Returns: * Pointer to the filename portion inside @path. If no '/' exists, * returns @path unchanged. */ static inline const char *kbasename(const char *path) { const char *tail = strrchr(path, '/'); return tail ? tail + 1 : path; } #if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) #include <linux/fortify-string.h> #endif #ifndef unsafe_memcpy #define unsafe_memcpy(dst, src, bytes, justification) \ memcpy(dst, src, bytes) #endif void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, int pad); /** * strtomem_pad - Copy NUL-terminated string to non-NUL-terminated buffer * * @dest: Pointer of destination character array (marked as __nonstring) * @src: Pointer to NUL-terminated string * @pad: Padding character to fill any remaining bytes of @dest after copy * * This is a replacement for strncpy() uses where the destination is not * a NUL-terminated string, but with bounds checking on the source size, and * an explicit padding character. If padding is not required, use strtomem(). * * Note that the size of @dest is not an argument, as the length of @dest * must be discoverable by the compiler. */ #define strtomem_pad(dest, src, pad) do { \ const size_t _dest_len = __must_be_byte_array(dest) + \ __must_be_noncstr(dest) + \ ARRAY_SIZE(dest); \ const size_t _src_len = __must_be_cstr(src) + \ __builtin_object_size(src, 1); \ \ BUILD_BUG_ON(!__builtin_constant_p(_dest_len) || \ _dest_len == (size_t)-1); \ memcpy_and_pad(dest, _dest_len, src, \ strnlen(src, min(_src_len, _dest_len)), pad); \ } while (0) /** * strtomem - Copy NUL-terminated string to non-NUL-terminated buffer * * @dest: Pointer of destination character array (marked as __nonstring) * @src: Pointer to NUL-terminated string * * This is a replacement for strncpy() uses where the destination is not * a NUL-terminated string, but with bounds checking on the source size, and * without trailing padding. If padding is required, use strtomem_pad(). * * Note that the size of @dest is not an argument, as the length of @dest * must be discoverable by the compiler. */ #define strtomem(dest, src) do { \ const size_t _dest_len = __must_be_byte_array(dest) + \ __must_be_noncstr(dest) + \ ARRAY_SIZE(dest); \ const size_t _src_len = __must_be_cstr(src) + \ __builtin_object_size(src, 1); \ \ BUILD_BUG_ON(!__builtin_constant_p(_dest_len) || \ _dest_len == (size_t)-1); \ memcpy(dest, src, strnlen(src, min(_src_len, _dest_len))); \ } while (0) /** * memtostr - Copy a possibly non-NUL-term string to a NUL-term string * @dest: Pointer to destination NUL-terminates string * @src: Pointer to character array (likely marked as __nonstring) * * This is a replacement for strncpy() uses where the source is not * a NUL-terminated string. * * Note that sizes of @dest and @src must be known at compile-time. */ #define memtostr(dest, src) do { \ const size_t _dest_len = __must_be_byte_array(dest) + \ __must_be_cstr(dest) + \ ARRAY_SIZE(dest); \ const size_t _src_len = __must_be_noncstr(src) + \ __builtin_object_size(src, 1); \ const size_t _src_chars = strnlen(src, _src_len); \ const size_t _copy_len = min(_dest_len - 1, _src_chars); \ \ BUILD_BUG_ON(!__builtin_constant_p(_dest_len) || \ !__builtin_constant_p(_src_len) || \ _dest_len == 0 || _dest_len == (size_t)-1 || \ _src_len == 0 || _src_len == (size_t)-1); \ memcpy(dest, src, _copy_len); \ dest[_copy_len] = '\0'; \ } while (0) /** * memtostr_pad - Copy a possibly non-NUL-term string to a NUL-term string * with NUL padding in the destination * @dest: Pointer to destination NUL-terminates string * @src: Pointer to character array (likely marked as __nonstring) * * This is a replacement for strncpy() uses where the source is not * a NUL-terminated string. * * Note that sizes of @dest and @src must be known at compile-time. */ #define memtostr_pad(dest, src) do { \ const size_t _dest_len = __must_be_byte_array(dest) + \ __must_be_cstr(dest) + \ ARRAY_SIZE(dest); \ const size_t _src_len = __must_be_noncstr(src) + \ __builtin_object_size(src, 1); \ const size_t _src_chars = strnlen(src, _src_len); \ const size_t _copy_len = min(_dest_len - 1, _src_chars); \ \ BUILD_BUG_ON(!__builtin_constant_p(_dest_len) || \ !__builtin_constant_p(_src_len) || \ _dest_len == 0 || _dest_len == (size_t)-1 || \ _src_len == 0 || _src_len == (size_t)-1); \ memcpy(dest, src, _copy_len); \ memset(&dest[_copy_len], 0, _dest_len - _copy_len); \ } while (0) /** * memset_after - Set a value after a struct member to the end of a struct * * @obj: Address of target struct instance * @v: Byte value to repeatedly write * @member: after which struct member to start writing bytes * * This is good for clearing padding following the given member. */ #define memset_after(obj, v, member) \ ({ \ u8 *__ptr = (u8 *)(obj); \ typeof(v) __val = (v); \ memset(__ptr + offsetofend(typeof(*(obj)), member), __val, \ sizeof(*(obj)) - offsetofend(typeof(*(obj)), member)); \ }) /** * memset_startat - Set a value starting at a member to the end of a struct * * @obj: Address of target struct instance * @v: Byte value to repeatedly write * @member: struct member to start writing at * * Note that if there is padding between the prior member and the target * member, memset_after() should be used to clear the prior padding. */ #define memset_startat(obj, v, member) \ ({ \ u8 *__ptr = (u8 *)(obj); \ typeof(v) __val = (v); \ memset(__ptr + offsetof(typeof(*(obj)), member), __val, \ sizeof(*(obj)) - offsetof(typeof(*(obj)), member)); \ }) /** * str_has_prefix - Test if a string has a given prefix * @str: The string to test * @prefix: The string to see if @str starts with * * A common way to test a prefix of a string is to do: * strncmp(str, prefix, sizeof(prefix) - 1) * * But this can lead to bugs due to typos, or if prefix is a pointer * and not a constant. Instead use str_has_prefix(). * * Returns: * * strlen(@prefix) if @str starts with @prefix * * 0 if @str does not start with @prefix */ static __always_inline size_t str_has_prefix(const char *str, const char *prefix) { size_t len = strlen(prefix); return strncmp(str, prefix, len) == 0 ? len : 0; } /** * strstarts - does @str start with @prefix? * @str: string to examine * @prefix: prefix to look for. * * Returns: * True if @str begins with @prefix. False in all other cases. */ static inline bool strstarts(const char *str, const char *prefix) { return strncmp(str, prefix, strlen(prefix)) == 0; } /** * strends - Check if a string ends with another string. * @str: NULL-terminated string to check against @suffix * @suffix: NULL-terminated string defining the suffix to look for in @str * * Returns: * True if @str ends with @suffix. False in all other cases. */ static inline bool __attribute__((nonnull(1, 2))) strends(const char *str, const char *suffix) { unsigned int str_len = strlen(str), suffix_len = strlen(suffix); if (str_len < suffix_len) return false; return !(strcmp(str + str_len - suffix_len, suffix)); } #endif /* _LINUX_STRING_H_ */
84 84 83 44 53 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 // SPDX-License-Identifier: GPL-2.0+ /* * Serial core port device driver * * Copyright (C) 2023 Texas Instruments Incorporated - https://www.ti.com/ * Author: Tony Lindgren <tony@atomide.com> */ #include <linux/device.h> #include <linux/module.h> #include <linux/of.h> #include <linux/platform_device.h> #include <linux/pm_runtime.h> #include <linux/pnp.h> #include <linux/property.h> #include <linux/serial_core.h> #include <linux/spinlock.h> #include "serial_base.h" #define SERIAL_PORT_AUTOSUSPEND_DELAY_MS 500 /* Only considers pending TX for now. Caller must take care of locking */ static int __serial_port_busy(struct uart_port *port) { return !uart_tx_stopped(port) && !kfifo_is_empty(&port->state->port.xmit_fifo); } static int serial_port_runtime_resume(struct device *dev) { struct serial_port_device *port_dev = to_serial_base_port_device(dev); struct uart_port *port; unsigned long flags; port = port_dev->port; if (port->flags & UPF_DEAD) goto out; /* Flush any pending TX for the port */ uart_port_lock_irqsave(port, &flags); if (!port_dev->tx_enabled) goto unlock; if (__serial_port_busy(port)) port->ops->start_tx(port); unlock: uart_port_unlock_irqrestore(port, flags); out: pm_runtime_mark_last_busy(dev); return 0; } static int serial_port_runtime_suspend(struct device *dev) { struct serial_port_device *port_dev = to_serial_base_port_device(dev); struct uart_port *port = port_dev->port; unsigned long flags; bool busy; if (port->flags & UPF_DEAD) return 0; /* * Nothing to do on pm_runtime_force_suspend(), see * DEFINE_RUNTIME_DEV_PM_OPS. */ if (!pm_runtime_enabled(dev)) return 0; uart_port_lock_irqsave(port, &flags); if (!port_dev->tx_enabled) { uart_port_unlock_irqrestore(port, flags); return 0; } busy = __serial_port_busy(port); if (busy) port->ops->start_tx(port); uart_port_unlock_irqrestore(port, flags); if (busy) pm_runtime_mark_last_busy(dev); return busy ? -EBUSY : 0; } static void serial_base_port_set_tx(struct uart_port *port, struct serial_port_device *port_dev, bool enabled) { unsigned long flags; uart_port_lock_irqsave(port, &flags); port_dev->tx_enabled = enabled; uart_port_unlock_irqrestore(port, flags); } void serial_base_port_startup(struct uart_port *port) { struct serial_port_device *port_dev = port->port_dev; serial_base_port_set_tx(port, port_dev, true); } void serial_base_port_shutdown(struct uart_port *port) { struct serial_port_device *port_dev = port->port_dev; serial_base_port_set_tx(port, port_dev, false); } static DEFINE_RUNTIME_DEV_PM_OPS(serial_port_pm, serial_port_runtime_suspend, serial_port_runtime_resume, NULL); static int serial_port_probe(struct device *dev) { pm_runtime_enable(dev); pm_runtime_set_autosuspend_delay(dev, SERIAL_PORT_AUTOSUSPEND_DELAY_MS); pm_runtime_use_autosuspend(dev); return 0; } static int serial_port_remove(struct device *dev) { pm_runtime_dont_use_autosuspend(dev); pm_runtime_disable(dev); return 0; } /* * Serial core port device init functions. Note that the physical serial * port device driver may not have completed probe at this point. */ int uart_add_one_port(struct uart_driver *drv, struct uart_port *port) { return serial_ctrl_register_port(drv, port); } EXPORT_SYMBOL(uart_add_one_port); void uart_remove_one_port(struct uart_driver *drv, struct uart_port *port) { serial_ctrl_unregister_port(drv, port); } EXPORT_SYMBOL(uart_remove_one_port); /** * __uart_read_properties - read firmware properties of the given UART port * @port: corresponding port * @use_defaults: apply defaults (when %true) or validate the values (when %false) * * The following device properties are supported: * - clock-frequency (optional) * - fifo-size (optional) * - no-loopback-test (optional) * - reg-shift (defaults may apply) * - reg-offset (value may be validated) * - reg-io-width (defaults may apply or value may be validated) * - interrupts (OF only) * - serial [alias ID] (OF only) * * If the port->dev is of struct platform_device type the interrupt line * will be retrieved via platform_get_irq() call against that device. * Otherwise it will be assigned by fwnode_irq_get() call. In both cases * the index 0 of the resource is used. * * The caller is responsible to initialize the following fields of the @port * ->dev (must be valid) * ->flags * ->iobase * ->mapbase * ->mapsize * ->regshift (if @use_defaults is false) * before calling this function. Alternatively the above mentioned fields * may be zeroed, in such case the only ones, that have associated properties * found, will be set to the respective values. * * If no error happened, the ->irq, ->mapbase, ->mapsize will be altered. * The ->iotype is always altered. * * When @use_defaults is true and the respective property is not found * the following values will be applied: * ->regshift = 0 * In this case IRQ must be provided, otherwise an error will be returned. * * When @use_defaults is false and the respective property is found * the following values will be validated: * - reg-io-width (->iotype) * - reg-offset (->mapsize against ->mapbase) * * Returns: 0 on success or negative errno on failure */ static int __uart_read_properties(struct uart_port *port, bool use_defaults) { struct device *dev = port->dev; u32 value; int ret; /* Read optional UART functional clock frequency */ device_property_read_u32(dev, "clock-frequency", &port->uartclk); /* Read the registers alignment (default: 8-bit) */ ret = device_property_read_u32(dev, "reg-shift", &value); if (ret) port->regshift = use_defaults ? 0 : port->regshift; else port->regshift = value; /* Read the registers I/O access type (default: MMIO 8-bit) */ ret = device_property_read_u32(dev, "reg-io-width", &value); if (ret) { port->iotype = port->iobase ? UPIO_PORT : UPIO_MEM; } else { switch (value) { case 1: port->iotype = UPIO_MEM; break; case 2: port->iotype = UPIO_MEM16; break; case 4: port->iotype = device_is_big_endian(dev) ? UPIO_MEM32BE : UPIO_MEM32; break; default: port->iotype = UPIO_UNKNOWN; break; } } if (!use_defaults && port->iotype == UPIO_UNKNOWN) { dev_err(dev, "Unsupported reg-io-width (%u)\n", value); return -EINVAL; } /* Read the address mapping base offset (default: no offset) */ ret = device_property_read_u32(dev, "reg-offset", &value); if (ret) value = 0; /* Check for shifted address mapping overflow */ if (!use_defaults && port->mapsize < value) { dev_err(dev, "reg-offset %u exceeds region size %pa\n", value, &port->mapsize); return -EINVAL; } port->mapbase += value; port->mapsize -= value; /* Read optional FIFO size */ device_property_read_u32(dev, "fifo-size", &port->fifosize); if (device_property_read_bool(dev, "no-loopback-test")) port->flags |= UPF_SKIP_TEST; /* Get index of serial line, if found in DT aliases */ ret = of_alias_get_id(dev_of_node(dev), "serial"); if (ret >= 0) port->line = ret; if (dev_is_platform(dev)) ret = platform_get_irq(to_platform_device(dev), 0); else if (dev_is_pnp(dev)) { ret = pnp_irq(to_pnp_dev(dev), 0); if (ret < 0) ret = -ENXIO; } else ret = fwnode_irq_get(dev_fwnode(dev), 0); if (ret == -EPROBE_DEFER) return ret; if (ret > 0) port->irq = ret; else if (use_defaults) /* By default IRQ support is mandatory */ return ret; else port->irq = 0; port->flags |= UPF_SHARE_IRQ; return 0; } int uart_read_port_properties(struct uart_port *port) { return __uart_read_properties(port, true); } EXPORT_SYMBOL_GPL(uart_read_port_properties); int uart_read_and_validate_port_properties(struct uart_port *port) { return __uart_read_properties(port, false); } EXPORT_SYMBOL_GPL(uart_read_and_validate_port_properties); static struct device_driver serial_port_driver = { .name = "port", .suppress_bind_attrs = true, .probe = serial_port_probe, .remove = serial_port_remove, .pm = pm_ptr(&serial_port_pm), }; int serial_base_port_init(void) { return serial_base_driver_register(&serial_port_driver); } void serial_base_port_exit(void) { serial_base_driver_unregister(&serial_port_driver); } MODULE_AUTHOR("Tony Lindgren <tony@atomide.com>"); MODULE_DESCRIPTION("Serial controller port driver"); MODULE_LICENSE("GPL");
26 26 23 10 64 65 48 65 65 65 67 66 20 21 20 21 20 21 44 43 44 43 2 44 43 1 43 44 44 2 2 2 2 9 1 1 1 8 8 8 3 1 7 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 // SPDX-License-Identifier: GPL-2.0 /* * fs/proc_namespace.c - handling of /proc/<pid>/{mounts,mountinfo,mountstats} * * In fact, that's a piece of procfs; it's *almost* isolated from * the rest of fs/proc, but has rather close relationships with * fs/namespace.c, thus here instead of fs/proc * */ #include <linux/mnt_namespace.h> #include <linux/nsproxy.h> #include <linux/security.h> #include <linux/fs_struct.h> #include <linux/sched/task.h> #include "proc/internal.h" /* only for get_proc_task() in ->open() */ #include "pnode.h" #include "internal.h" static __poll_t mounts_poll(struct file *file, poll_table *wait) { struct seq_file *m = file->private_data; struct proc_mounts *p = m->private; struct mnt_namespace *ns = p->ns; __poll_t res = EPOLLIN | EPOLLRDNORM; int event; poll_wait(file, &p->ns->poll, wait); event = READ_ONCE(ns->event); if (m->poll_event != event) { m->poll_event = event; res |= EPOLLERR | EPOLLPRI; } return res; } struct proc_fs_opts { int flag; const char *str; }; static int show_sb_opts(struct seq_file *m, struct super_block *sb) { static const struct proc_fs_opts fs_opts[] = { { SB_SYNCHRONOUS, ",sync" }, { SB_DIRSYNC, ",dirsync" }, { SB_MANDLOCK, ",mand" }, { SB_LAZYTIME, ",lazytime" }, { 0, NULL } }; const struct proc_fs_opts *fs_infop; for (fs_infop = fs_opts; fs_infop->flag; fs_infop++) { if (sb->s_flags & fs_infop->flag) seq_puts(m, fs_infop->str); } return security_sb_show_options(m, sb); } static void show_vfsmnt_opts(struct seq_file *m, struct vfsmount *mnt) { static const struct proc_fs_opts mnt_opts[] = { { MNT_NOSUID, ",nosuid" }, { MNT_NODEV, ",nodev" }, { MNT_NOEXEC, ",noexec" }, { MNT_NOATIME, ",noatime" }, { MNT_NODIRATIME, ",nodiratime" }, { MNT_RELATIME, ",relatime" }, { MNT_NOSYMFOLLOW, ",nosymfollow" }, { 0, NULL } }; const struct proc_fs_opts *fs_infop; for (fs_infop = mnt_opts; fs_infop->flag; fs_infop++) { if (mnt->mnt_flags & fs_infop->flag) seq_puts(m, fs_infop->str); } if (is_idmapped_mnt(mnt)) seq_puts(m, ",idmapped"); } static inline void mangle(struct seq_file *m, const char *s) { seq_escape(m, s, " \t\n\\#"); } static void show_type(struct seq_file *m, struct super_block *sb) { mangle(m, sb->s_type->name); if (sb->s_subtype) { seq_putc(m, '.'); mangle(m, sb->s_subtype); } } static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt) { struct proc_mounts *p = m->private; struct mount *r = real_mount(mnt); struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; struct super_block *sb = mnt_path.dentry->d_sb; int err; if (sb->s_op->show_devname) { err = sb->s_op->show_devname(m, mnt_path.dentry); if (err) goto out; } else { mangle(m, r->mnt_devname); } seq_putc(m, ' '); /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ err = seq_path_root(m, &mnt_path, &p->root, " \t\n\\"); if (err) goto out; seq_putc(m, ' '); show_type(m, sb); seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw"); err = show_sb_opts(m, sb); if (err) goto out; show_vfsmnt_opts(m, mnt); if (sb->s_op->show_options) err = sb->s_op->show_options(m, mnt_path.dentry); seq_puts(m, " 0 0\n"); out: return err; } static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) { struct proc_mounts *p = m->private; struct mount *r = real_mount(mnt); struct super_block *sb = mnt->mnt_sb; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; int err; seq_printf(m, "%i %i %u:%u ", r->mnt_id, r->mnt_parent->mnt_id, MAJOR(sb->s_dev), MINOR(sb->s_dev)); err = show_path(m, mnt->mnt_root); if (err) goto out; seq_putc(m, ' '); /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ err = seq_path_root(m, &mnt_path, &p->root, " \t\n\\"); if (err) goto out; seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); show_vfsmnt_opts(m, mnt); /* Tagged fields ("foo:X" or "bar") */ if (IS_MNT_SHARED(r)) seq_printf(m, " shared:%i", r->mnt_group_id); if (IS_MNT_SLAVE(r)) { int master = r->mnt_master->mnt_group_id; int dom = get_dominating_id(r, &p->root); seq_printf(m, " master:%i", master); if (dom && dom != master) seq_printf(m, " propagate_from:%i", dom); } if (IS_MNT_UNBINDABLE(r)) seq_puts(m, " unbindable"); /* Filesystem specific data */ seq_puts(m, " - "); show_type(m, sb); seq_putc(m, ' '); if (sb->s_op->show_devname) { err = sb->s_op->show_devname(m, mnt->mnt_root); if (err) goto out; } else { mangle(m, r->mnt_devname); } seq_puts(m, sb_rdonly(sb) ? " ro" : " rw"); err = show_sb_opts(m, sb); if (err) goto out; if (sb->s_op->show_options) err = sb->s_op->show_options(m, mnt->mnt_root); seq_putc(m, '\n'); out: return err; } static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt) { struct proc_mounts *p = m->private; struct mount *r = real_mount(mnt); struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; struct super_block *sb = mnt_path.dentry->d_sb; int err; /* device */ seq_puts(m, "device "); if (sb->s_op->show_devname) { err = sb->s_op->show_devname(m, mnt_path.dentry); if (err) goto out; } else { mangle(m, r->mnt_devname); } /* mount point */ seq_puts(m, " mounted on "); /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ err = seq_path_root(m, &mnt_path, &p->root, " \t\n\\"); if (err) goto out; seq_putc(m, ' '); /* file system type */ seq_puts(m, "with fstype "); show_type(m, sb); /* optional statistics */ if (sb->s_op->show_stats) { seq_putc(m, ' '); err = sb->s_op->show_stats(m, mnt_path.dentry); } seq_putc(m, '\n'); out: return err; } static int mounts_open_common(struct inode *inode, struct file *file, int (*show)(struct seq_file *, struct vfsmount *)) { struct task_struct *task = get_proc_task(inode); struct nsproxy *nsp; struct mnt_namespace *ns = NULL; struct path root; struct proc_mounts *p; struct seq_file *m; int ret = -EINVAL; if (!task) goto err; task_lock(task); nsp = task->nsproxy; if (!nsp || !nsp->mnt_ns) { task_unlock(task); put_task_struct(task); goto err; } ns = nsp->mnt_ns; get_mnt_ns(ns); if (!task->fs) { task_unlock(task); put_task_struct(task); ret = -ENOENT; goto err_put_ns; } get_fs_root(task->fs, &root); task_unlock(task); put_task_struct(task); ret = seq_open_private(file, &mounts_op, sizeof(struct proc_mounts)); if (ret) goto err_put_path; m = file->private_data; m->poll_event = ns->event; p = m->private; p->ns = ns; p->root = root; p->show = show; return 0; err_put_path: path_put(&root); err_put_ns: put_mnt_ns(ns); err: return ret; } static int mounts_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; struct proc_mounts *p = m->private; path_put(&p->root); put_mnt_ns(p->ns); return seq_release_private(inode, file); } static int mounts_open(struct inode *inode, struct file *file) { return mounts_open_common(inode, file, show_vfsmnt); } static int mountinfo_open(struct inode *inode, struct file *file) { return mounts_open_common(inode, file, show_mountinfo); } static int mountstats_open(struct inode *inode, struct file *file) { return mounts_open_common(inode, file, show_vfsstat); } const struct file_operations proc_mounts_operations = { .open = mounts_open, .read_iter = seq_read_iter, .splice_read = copy_splice_read, .llseek = seq_lseek, .release = mounts_release, .poll = mounts_poll, }; const struct file_operations proc_mountinfo_operations = { .open = mountinfo_open, .read_iter = seq_read_iter, .splice_read = copy_splice_read, .llseek = seq_lseek, .release = mounts_release, .poll = mounts_poll, }; const struct file_operations proc_mountstats_operations = { .open = mountstats_open, .read_iter = seq_read_iter, .splice_read = copy_splice_read, .llseek = seq_lseek, .release = mounts_release, };
9 1 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 // SPDX-License-Identifier: GPL-2.0-only /* module that allows mangling of the arp payload */ #include <linux/module.h> #include <linux/netfilter.h> #include <linux/netfilter_arp/arpt_mangle.h> #include <net/sock.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>"); MODULE_DESCRIPTION("arptables arp payload mangle target"); static unsigned int target(struct sk_buff *skb, const struct xt_action_param *par) { const struct arpt_mangle *mangle = par->targinfo; const struct arphdr *arp; unsigned char *arpptr; int pln, hln; if (skb_ensure_writable(skb, skb->len)) return NF_DROP; arp = arp_hdr(skb); arpptr = skb_network_header(skb) + sizeof(*arp); pln = arp->ar_pln; hln = arp->ar_hln; /* We assume that pln and hln were checked in the match */ if (mangle->flags & ARPT_MANGLE_SDEV) { if (ARPT_DEV_ADDR_LEN_MAX < hln || (arpptr + hln > skb_tail_pointer(skb))) return NF_DROP; memcpy(arpptr, mangle->src_devaddr, hln); } arpptr += hln; if (mangle->flags & ARPT_MANGLE_SIP) { if (ARPT_MANGLE_ADDR_LEN_MAX < pln || (arpptr + pln > skb_tail_pointer(skb))) return NF_DROP; memcpy(arpptr, &mangle->u_s.src_ip, pln); } arpptr += pln; if (mangle->flags & ARPT_MANGLE_TDEV) { if (ARPT_DEV_ADDR_LEN_MAX < hln || (arpptr + hln > skb_tail_pointer(skb))) return NF_DROP; memcpy(arpptr, mangle->tgt_devaddr, hln); } arpptr += hln; if (mangle->flags & ARPT_MANGLE_TIP) { if (ARPT_MANGLE_ADDR_LEN_MAX < pln || (arpptr + pln > skb_tail_pointer(skb))) return NF_DROP; memcpy(arpptr, &mangle->u_t.tgt_ip, pln); } return mangle->target; } static int checkentry(const struct xt_tgchk_param *par) { const struct arpt_mangle *mangle = par->targinfo; if (mangle->flags & ~ARPT_MANGLE_MASK || !(mangle->flags & ARPT_MANGLE_MASK)) return -EINVAL; if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT && mangle->target != XT_CONTINUE) return -EINVAL; return 0; } static struct xt_target arpt_mangle_reg __read_mostly = { .name = "mangle", .family = NFPROTO_ARP, .target = target, .targetsize = sizeof(struct arpt_mangle), .checkentry = checkentry, .me = THIS_MODULE, }; static int __init arpt_mangle_init(void) { return xt_register_target(&arpt_mangle_reg); } static void __exit arpt_mangle_fini(void) { xt_unregister_target(&arpt_mangle_reg); } module_init(arpt_mangle_init); module_exit(arpt_mangle_fini);
38 769 373 13230 31 16634 1453 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM sock #if !defined(_TRACE_SOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SOCK_H #include <net/sock.h> #include <net/ipv6.h> #include <linux/tracepoint.h> #include <linux/ipv6.h> #include <linux/tcp.h> #include <trace/events/net_probe_common.h> #define family_names \ EM(AF_INET) \ EMe(AF_INET6) /* The protocol traced by inet_sock_set_state */ #define inet_protocol_names \ EM(IPPROTO_TCP) \ EM(IPPROTO_SCTP) \ EMe(IPPROTO_MPTCP) #define tcp_state_names \ EM(TCP_ESTABLISHED) \ EM(TCP_SYN_SENT) \ EM(TCP_SYN_RECV) \ EM(TCP_FIN_WAIT1) \ EM(TCP_FIN_WAIT2) \ EM(TCP_TIME_WAIT) \ EM(TCP_CLOSE) \ EM(TCP_CLOSE_WAIT) \ EM(TCP_LAST_ACK) \ EM(TCP_LISTEN) \ EM(TCP_CLOSING) \ EMe(TCP_NEW_SYN_RECV) #define skmem_kind_names \ EM(SK_MEM_SEND) \ EMe(SK_MEM_RECV) /* enums need to be exported to user space */ #undef EM #undef EMe #define EM(a) TRACE_DEFINE_ENUM(a); #define EMe(a) TRACE_DEFINE_ENUM(a); family_names inet_protocol_names tcp_state_names skmem_kind_names #undef EM #undef EMe #define EM(a) { a, #a }, #define EMe(a) { a, #a } #define show_family_name(val) \ __print_symbolic(val, family_names) #define show_inet_protocol_name(val) \ __print_symbolic(val, inet_protocol_names) #define show_tcp_state_name(val) \ __print_symbolic(val, tcp_state_names) #define show_skmem_kind_names(val) \ __print_symbolic(val, skmem_kind_names) TRACE_EVENT(sock_rcvqueue_full, TP_PROTO(struct sock *sk, struct sk_buff *skb), TP_ARGS(sk, skb), TP_STRUCT__entry( __field(int, rmem_alloc) __field(unsigned int, truesize) __field(int, sk_rcvbuf) ), TP_fast_assign( __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc); __entry->truesize = skb->truesize; __entry->sk_rcvbuf = READ_ONCE(sk->sk_rcvbuf); ), TP_printk("rmem_alloc=%d truesize=%u sk_rcvbuf=%d", __entry->rmem_alloc, __entry->truesize, __entry->sk_rcvbuf) ); TRACE_EVENT(sock_exceed_buf_limit, TP_PROTO(struct sock *sk, struct proto *prot, long allocated, int kind), TP_ARGS(sk, prot, allocated, kind), TP_STRUCT__entry( __array(char, name, 32) __array(long, sysctl_mem, 3) __field(long, allocated) __field(int, sysctl_rmem) __field(int, rmem_alloc) __field(int, sysctl_wmem) __field(int, wmem_alloc) __field(int, wmem_queued) __field(int, kind) ), TP_fast_assign( strscpy(__entry->name, prot->name, 32); __entry->sysctl_mem[0] = READ_ONCE(prot->sysctl_mem[0]); __entry->sysctl_mem[1] = READ_ONCE(prot->sysctl_mem[1]); __entry->sysctl_mem[2] = READ_ONCE(prot->sysctl_mem[2]); __entry->allocated = allocated; __entry->sysctl_rmem = sk_get_rmem0(sk, prot); __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc); __entry->sysctl_wmem = sk_get_wmem0(sk, prot); __entry->wmem_alloc = refcount_read(&sk->sk_wmem_alloc); __entry->wmem_queued = READ_ONCE(sk->sk_wmem_queued); __entry->kind = kind; ), TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld sysctl_rmem=%d rmem_alloc=%d sysctl_wmem=%d wmem_alloc=%d wmem_queued=%d kind=%s", __entry->name, __entry->sysctl_mem[0], __entry->sysctl_mem[1], __entry->sysctl_mem[2], __entry->allocated, __entry->sysctl_rmem, __entry->rmem_alloc, __entry->sysctl_wmem, __entry->wmem_alloc, __entry->wmem_queued, show_skmem_kind_names(__entry->kind) ) ); TRACE_EVENT(inet_sock_set_state, TP_PROTO(const struct sock *sk, const int oldstate, const int newstate), TP_ARGS(sk, oldstate, newstate), TP_STRUCT__entry( __field(const void *, skaddr) __field(int, oldstate) __field(int, newstate) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(__u16, protocol) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); __be32 *p32; __entry->skaddr = sk; __entry->oldstate = oldstate; __entry->newstate = newstate; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); ), TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s", show_family_name(__entry->family), show_inet_protocol_name(__entry->protocol), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, show_tcp_state_name(__entry->oldstate), show_tcp_state_name(__entry->newstate)) ); TRACE_EVENT(inet_sk_error_report, TP_PROTO(const struct sock *sk), TP_ARGS(sk), TP_STRUCT__entry( __field(int, error) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(__u16, protocol) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); __be32 *p32; __entry->error = sk->sk_err; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); ), TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c error=%d", show_family_name(__entry->family), show_inet_protocol_name(__entry->protocol), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, __entry->error) ); TRACE_EVENT(sk_data_ready, TP_PROTO(const struct sock *sk), TP_ARGS(sk), TP_STRUCT__entry( __field(const void *, skaddr) __field(__u16, family) __field(__u16, protocol) __field(unsigned long, ip) ), TP_fast_assign( __entry->skaddr = sk; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->ip = _RET_IP_; ), TP_printk("family=%u protocol=%u func=%ps", __entry->family, __entry->protocol, (void *)__entry->ip) ); /* * sock send/recv msg length */ DECLARE_EVENT_CLASS(sock_msg_length, TP_PROTO(struct sock *sk, int ret, int flags), TP_ARGS(sk, ret, flags), TP_STRUCT__entry( __field(void *, sk) __field(__u16, family) __field(__u16, protocol) __field(int, ret) __field(int, flags) ), TP_fast_assign( __entry->sk = sk; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->ret = ret; __entry->flags = flags; ), TP_printk("sk address = %p, family = %s protocol = %s, length = %d, error = %d, flags = 0x%x", __entry->sk, show_family_name(__entry->family), show_inet_protocol_name(__entry->protocol), !(__entry->flags & MSG_PEEK) ? (__entry->ret > 0 ? __entry->ret : 0) : 0, __entry->ret < 0 ? __entry->ret : 0, __entry->flags) ); DEFINE_EVENT(sock_msg_length, sock_send_length, TP_PROTO(struct sock *sk, int ret, int flags), TP_ARGS(sk, ret, flags) ); DEFINE_EVENT(sock_msg_length, sock_recv_length, TP_PROTO(struct sock *sk, int ret, int flags), TP_ARGS(sk, ret, flags) ); #endif /* _TRACE_SOCK_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
22 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 // SPDX-License-Identifier: GPL-2.0-or-later /* mpi-sub-ui.c - Subtract an unsigned integer from an MPI. * * Copyright 1991, 1993, 1994, 1996, 1999-2002, 2004, 2012, 2013, 2015 * Free Software Foundation, Inc. * * This file was based on the GNU MP Library source file: * https://gmplib.org/repo/gmp-6.2/file/510b83519d1c/mpz/aors_ui.h * * The GNU MP Library is free software; you can redistribute it and/or modify * it under the terms of either: * * * the GNU Lesser General Public License as published by the Free * Software Foundation; either version 3 of the License, or (at your * option) any later version. * * or * * * the GNU General Public License as published by the Free Software * Foundation; either version 2 of the License, or (at your option) any * later version. * * or both in parallel, as here. * * The GNU MP Library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received copies of the GNU General Public License and the * GNU Lesser General Public License along with the GNU MP Library. If not, * see https://www.gnu.org/licenses/. */ #include <linux/export.h> #include "mpi-internal.h" int mpi_sub_ui(MPI w, MPI u, unsigned long vval) { if (u->nlimbs == 0) { if (mpi_resize(w, 1) < 0) return -ENOMEM; w->d[0] = vval; w->nlimbs = (vval != 0); w->sign = (vval != 0); return 0; } /* If not space for W (and possible carry), increase space. */ if (mpi_resize(w, u->nlimbs + 1)) return -ENOMEM; if (u->sign) { mpi_limb_t cy; cy = mpihelp_add_1(w->d, u->d, u->nlimbs, (mpi_limb_t) vval); w->d[u->nlimbs] = cy; w->nlimbs = u->nlimbs + cy; w->sign = 1; } else { /* The signs are different. Need exact comparison to determine * which operand to subtract from which. */ if (u->nlimbs == 1 && u->d[0] < vval) { w->d[0] = vval - u->d[0]; w->nlimbs = 1; w->sign = 1; } else { mpihelp_sub_1(w->d, u->d, u->nlimbs, (mpi_limb_t) vval); /* Size can decrease with at most one limb. */ w->nlimbs = (u->nlimbs - (w->d[u->nlimbs - 1] == 0)); w->sign = 0; } } mpi_normalize(w); return 0; } EXPORT_SYMBOL_GPL(mpi_sub_ui);
20 20 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 1 2 2 2 4 3 1 8 7 1 1 1 1 1 1 6 1 4 1 4 3 1 1 4 3 1 1 1 1 1 15 12 1 3 17 11 2 6 2 4 4 2 2 2 2 4 2 3 2 2 2 2 2 3 2 2 2 1 15 36 6 2 1 5 15 26 1 8 7 1 1 10 3 3 1 2 3 4 4 2 1 1 1 2 1 1 13 7 3 1 2 10 8 2 1 1 13 6 2 2 1 1 2 1 3 2 1 6 1 1 2 2 1 14 1 3 2 6 2 9 3 2 2 2 7 5 1 2 1 1 5 4 1 8 7 1 3 3 8 8 12 9 9 8 8 3 3 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 /* * Copyright (c) 2017 Mellanox Technologies. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <linux/module.h> #include <linux/pid.h> #include <linux/pid_namespace.h> #include <linux/mutex.h> #include <net/netlink.h> #include <rdma/rdma_cm.h> #include <rdma/rdma_netlink.h> #include "core_priv.h" #include "cma_priv.h" #include "restrack.h" #include "uverbs.h" /* * This determines whether a non-privileged user is allowed to specify a * controlled QKEY or not, when true non-privileged user is allowed to specify * a controlled QKEY. */ static bool privileged_qkey; typedef int (*res_fill_func_t)(struct sk_buff*, bool, struct rdma_restrack_entry*, uint32_t); /* * Sort array elements by the netlink attribute name */ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_CHARDEV] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_CHARDEV_ABI] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_CHARDEV_NAME] = { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_CHARDEV_TYPE] = { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE }, [RDMA_NLDEV_ATTR_DEV_DIM] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, .len = IB_DEVICE_NAME_MAX }, [RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_DEV_PROTOCOL] = { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_DRIVER] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_DRIVER_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_DRIVER_STRING] = { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_DRIVER_S32] = { .type = NLA_S32 }, [RDMA_NLDEV_ATTR_DRIVER_S64] = { .type = NLA_S64 }, [RDMA_NLDEV_ATTR_DRIVER_U32] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_DRIVER_U64] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_FW_VERSION] = { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_LID] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_LINK_TYPE] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ }, [RDMA_NLDEV_ATTR_LMC] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_NDEV_INDEX] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_NDEV_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ }, [RDMA_NLDEV_ATTR_NODE_GUID] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_PORT_INDEX] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_PORT_PHYS_STATE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_PORT_STATE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_RES_CM_ID] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_CM_IDN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_CQ] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_CQE] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_CQN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_CQ_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_CTX] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_CTXN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_CTX_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_DST_ADDR] = { .len = sizeof(struct __kernel_sockaddr_storage) }, [RDMA_NLDEV_ATTR_RES_IOVA] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_RES_KERN_NAME] = { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_RES_LKEY] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_LQPN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_MR] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_MRLEN] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_RES_MRN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_MR_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_RES_PD] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_PDN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_PD_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_PID] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_POLL_CTX] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_RES_PS] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_QP] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_QP_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_RAW] = { .type = NLA_BINARY }, [RDMA_NLDEV_ATTR_RES_RKEY] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_RQPN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_RQ_PSN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_SQ_PSN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_SRC_ADDR] = { .len = sizeof(struct __kernel_sockaddr_storage) }, [RDMA_NLDEV_ATTR_RES_STATE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_RES_SUMMARY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR]= { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME]= { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_RES_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_RES_SUBTYPE] = { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY]= { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_USECNT] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_RES_SRQ] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_SRQN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_SRQ_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_MIN_RANGE] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_MAX_RANGE] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_SM_LID] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_SUBNET_PREFIX] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_STAT_MODE] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_STAT_RES] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_STAT_COUNTER] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_STAT_COUNTER_ID] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_STAT_HWCOUNTERS] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME] = { .type = NLA_NUL_STRING }, [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID] = { .type = NLA_U32 }, [RDMA_NLDEV_NET_NS_FD] = { .type = NLA_U32 }, [RDMA_NLDEV_SYS_ATTR_NETNS_MODE] = { .type = NLA_U8 }, [RDMA_NLDEV_SYS_ATTR_COPY_ON_FORK] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_INDEX] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_DYNAMIC] = { .type = NLA_U8 }, [RDMA_NLDEV_SYS_ATTR_PRIVILEGED_QKEY_MODE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_DRIVER_DETAILS] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_DEV_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_PARENT_NAME] = { .type = NLA_NUL_STRING }, [RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_EVENT_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED] = { .type = NLA_U8 }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, enum rdma_nldev_print_type print_type) { if (nla_put_string(msg, RDMA_NLDEV_ATTR_DRIVER_STRING, name)) return -EMSGSIZE; if (print_type != RDMA_NLDEV_PRINT_TYPE_UNSPEC && nla_put_u8(msg, RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE, print_type)) return -EMSGSIZE; return 0; } static int _rdma_nl_put_driver_u32(struct sk_buff *msg, const char *name, enum rdma_nldev_print_type print_type, u32 value) { if (put_driver_name_print_type(msg, name, print_type)) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DRIVER_U32, value)) return -EMSGSIZE; return 0; } static int _rdma_nl_put_driver_u64(struct sk_buff *msg, const char *name, enum rdma_nldev_print_type print_type, u64 value) { if (put_driver_name_print_type(msg, name, print_type)) return -EMSGSIZE; if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_DRIVER_U64, value, RDMA_NLDEV_ATTR_PAD)) return -EMSGSIZE; return 0; } int rdma_nl_put_driver_string(struct sk_buff *msg, const char *name, const char *str) { if (put_driver_name_print_type(msg, name, RDMA_NLDEV_PRINT_TYPE_UNSPEC)) return -EMSGSIZE; if (nla_put_string(msg, RDMA_NLDEV_ATTR_DRIVER_STRING, str)) return -EMSGSIZE; return 0; } EXPORT_SYMBOL(rdma_nl_put_driver_string); int rdma_nl_put_driver_u32(struct sk_buff *msg, const char *name, u32 value) { return _rdma_nl_put_driver_u32(msg, name, RDMA_NLDEV_PRINT_TYPE_UNSPEC, value); } EXPORT_SYMBOL(rdma_nl_put_driver_u32); int rdma_nl_put_driver_u32_hex(struct sk_buff *msg, const char *name, u32 value) { return _rdma_nl_put_driver_u32(msg, name, RDMA_NLDEV_PRINT_TYPE_HEX, value); } EXPORT_SYMBOL(rdma_nl_put_driver_u32_hex); int rdma_nl_put_driver_u64(struct sk_buff *msg, const char *name, u64 value) { return _rdma_nl_put_driver_u64(msg, name, RDMA_NLDEV_PRINT_TYPE_UNSPEC, value); } EXPORT_SYMBOL(rdma_nl_put_driver_u64); int rdma_nl_put_driver_u64_hex(struct sk_buff *msg, const char *name, u64 value) { return _rdma_nl_put_driver_u64(msg, name, RDMA_NLDEV_PRINT_TYPE_HEX, value); } EXPORT_SYMBOL(rdma_nl_put_driver_u64_hex); bool rdma_nl_get_privileged_qkey(void) { return privileged_qkey; } EXPORT_SYMBOL(rdma_nl_get_privileged_qkey); static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index)) return -EMSGSIZE; if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, dev_name(&device->dev))) return -EMSGSIZE; return 0; } static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) { char fw[IB_FW_VERSION_NAME_MAX]; int ret = 0; u32 port; if (fill_nldev_handle(msg, device)) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, rdma_end_port(device))) return -EMSGSIZE; BUILD_BUG_ON(sizeof(device->attrs.device_cap_flags) != sizeof(u64)); if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS, device->attrs.device_cap_flags, RDMA_NLDEV_ATTR_PAD)) return -EMSGSIZE; ib_get_device_fw_str(device, fw); /* Device without FW has strlen(fw) = 0 */ if (strlen(fw) && nla_put_string(msg, RDMA_NLDEV_ATTR_FW_VERSION, fw)) return -EMSGSIZE; if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_NODE_GUID, be64_to_cpu(device->node_guid), RDMA_NLDEV_ATTR_PAD)) return -EMSGSIZE; if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SYS_IMAGE_GUID, be64_to_cpu(device->attrs.sys_image_guid), RDMA_NLDEV_ATTR_PAD)) return -EMSGSIZE; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_NODE_TYPE, device->node_type)) return -EMSGSIZE; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, device->use_cq_dim)) return -EMSGSIZE; if (device->type && nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_TYPE, device->type)) return -EMSGSIZE; if (device->parent && nla_put_string(msg, RDMA_NLDEV_ATTR_PARENT_NAME, dev_name(&device->parent->dev))) return -EMSGSIZE; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE, device->name_assign_type)) return -EMSGSIZE; /* * Link type is determined on first port and mlx4 device * which can potentially have two different link type for the same * IB device is considered as better to be avoided in the future, */ port = rdma_start_port(device); if (rdma_cap_opa_mad(device, port)) ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "opa"); else if (rdma_protocol_ib(device, port)) ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "ib"); else if (rdma_protocol_iwarp(device, port)) ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "iw"); else if (rdma_protocol_roce(device, port)) ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "roce"); else if (rdma_protocol_usnic(device, port)) ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "usnic"); return ret; } static int fill_port_info(struct sk_buff *msg, struct ib_device *device, u32 port, const struct net *net) { struct net_device *netdev = NULL; struct ib_port_attr attr; int ret; u64 cap_flags = 0; if (fill_nldev_handle(msg, device)) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) return -EMSGSIZE; ret = ib_query_port(device, port, &attr); if (ret) return ret; if (rdma_protocol_ib(device, port)) { BUILD_BUG_ON((sizeof(attr.port_cap_flags) + sizeof(attr.port_cap_flags2)) > sizeof(u64)); cap_flags = attr.port_cap_flags | ((u64)attr.port_cap_flags2 << 32); if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS, cap_flags, RDMA_NLDEV_ATTR_PAD)) return -EMSGSIZE; if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SUBNET_PREFIX, attr.subnet_prefix, RDMA_NLDEV_ATTR_PAD)) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_LID, attr.lid)) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_SM_LID, attr.sm_lid)) return -EMSGSIZE; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_LMC, attr.lmc)) return -EMSGSIZE; } if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_STATE, attr.state)) return -EMSGSIZE; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_PHYS_STATE, attr.phys_state)) return -EMSGSIZE; netdev = ib_device_get_netdev(device, port); if (netdev && net_eq(dev_net(netdev), net)) { ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex); if (ret) goto out; ret = nla_put_string(msg, RDMA_NLDEV_ATTR_NDEV_NAME, netdev->name); } out: dev_put(netdev); return ret; } static int fill_res_info_entry(struct sk_buff *msg, const char *name, u64 curr) { struct nlattr *entry_attr; entry_attr = nla_nest_start_noflag(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY); if (!entry_attr) return -EMSGSIZE; if (nla_put_string(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME, name)) goto err; if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR, curr, RDMA_NLDEV_ATTR_PAD)) goto err; nla_nest_end(msg, entry_attr); return 0; err: nla_nest_cancel(msg, entry_attr); return -EMSGSIZE; } static int fill_res_info(struct sk_buff *msg, struct ib_device *device, bool show_details) { static const char * const names[RDMA_RESTRACK_MAX] = { [RDMA_RESTRACK_PD] = "pd", [RDMA_RESTRACK_CQ] = "cq", [RDMA_RESTRACK_QP] = "qp", [RDMA_RESTRACK_CM_ID] = "cm_id", [RDMA_RESTRACK_MR] = "mr", [RDMA_RESTRACK_CTX] = "ctx", [RDMA_RESTRACK_SRQ] = "srq", }; struct nlattr *table_attr; int ret, i, curr; if (fill_nldev_handle(msg, device)) return -EMSGSIZE; table_attr = nla_nest_start_noflag(msg, RDMA_NLDEV_ATTR_RES_SUMMARY); if (!table_attr) return -EMSGSIZE; for (i = 0; i < RDMA_RESTRACK_MAX; i++) { if (!names[i]) continue; curr = rdma_restrack_count(device, i, show_details); ret = fill_res_info_entry(msg, names[i], curr); if (ret) goto err; } nla_nest_end(msg, table_attr); return 0; err: nla_nest_cancel(msg, table_attr); return ret; } static int fill_res_name_pid(struct sk_buff *msg, struct rdma_restrack_entry *res) { int err = 0; /* * For user resources, user is should read /proc/PID/comm to get the * name of the task file. */ if (rdma_is_kernel_res(res)) { err = nla_put_string(msg, RDMA_NLDEV_ATTR_RES_KERN_NAME, res->kern_name); } else { pid_t pid; pid = task_pid_vnr(res->task); /* * Task is dead and in zombie state. * There is no need to print PID anymore. */ if (pid) /* * This part is racy, task can be killed and PID will * be zero right here but it is ok, next query won't * return PID. We don't promise real-time reflection * of SW objects. */ err = nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PID, pid); } return err ? -EMSGSIZE : 0; } static int fill_res_qp_entry_query(struct sk_buff *msg, struct rdma_restrack_entry *res, struct ib_device *dev, struct ib_qp *qp) { struct ib_qp_init_attr qp_init_attr; struct ib_qp_attr qp_attr; int ret; ret = ib_query_qp(qp, &qp_attr, 0, &qp_init_attr); if (ret) return ret; if (qp->qp_type == IB_QPT_RC || qp->qp_type == IB_QPT_UC) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RQPN, qp_attr.dest_qp_num)) goto err; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RQ_PSN, qp_attr.rq_psn)) goto err; } if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_SQ_PSN, qp_attr.sq_psn)) goto err; if (qp->qp_type == IB_QPT_RC || qp->qp_type == IB_QPT_UC || qp->qp_type == IB_QPT_XRC_INI || qp->qp_type == IB_QPT_XRC_TGT) { if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE, qp_attr.path_mig_state)) goto err; } if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, qp->qp_type)) goto err; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, qp_attr.qp_state)) goto err; if (dev->ops.fill_res_qp_entry) return dev->ops.fill_res_qp_entry(msg, qp); return 0; err: return -EMSGSIZE; } static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_qp *qp = container_of(res, struct ib_qp, res); struct ib_device *dev = qp->device; int ret; if (port && port != qp->port) return -EAGAIN; /* In create_qp() port is not set yet */ if (qp->port && nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, qp->port)) return -EMSGSIZE; ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qp->qp_num); if (ret) return -EMSGSIZE; if (!rdma_is_kernel_res(res) && nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, qp->pd->res.id)) return -EMSGSIZE; ret = fill_res_name_pid(msg, res); if (ret) return -EMSGSIZE; return fill_res_qp_entry_query(msg, res, dev, qp); } static int fill_res_qp_raw_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_qp *qp = container_of(res, struct ib_qp, res); struct ib_device *dev = qp->device; if (port && port != qp->port) return -EAGAIN; if (!dev->ops.fill_res_qp_entry_raw) return -EINVAL; return dev->ops.fill_res_qp_entry_raw(msg, qp); } static int fill_res_cm_id_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct rdma_id_private *id_priv = container_of(res, struct rdma_id_private, res); struct ib_device *dev = id_priv->id.device; struct rdma_cm_id *cm_id = &id_priv->id; if (port && port != cm_id->port_num) return -EAGAIN; if (cm_id->port_num && nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, cm_id->port_num)) goto err; if (id_priv->qp_num) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, id_priv->qp_num)) goto err; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, cm_id->qp_type)) goto err; } if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PS, cm_id->ps)) goto err; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, id_priv->state)) goto err; if (cm_id->route.addr.src_addr.ss_family && nla_put(msg, RDMA_NLDEV_ATTR_RES_SRC_ADDR, sizeof(cm_id->route.addr.src_addr), &cm_id->route.addr.src_addr)) goto err; if (cm_id->route.addr.dst_addr.ss_family && nla_put(msg, RDMA_NLDEV_ATTR_RES_DST_ADDR, sizeof(cm_id->route.addr.dst_addr), &cm_id->route.addr.dst_addr)) goto err; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CM_IDN, res->id)) goto err; if (fill_res_name_pid(msg, res)) goto err; if (dev->ops.fill_res_cm_id_entry) return dev->ops.fill_res_cm_id_entry(msg, cm_id); return 0; err: return -EMSGSIZE; } static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_cq *cq = container_of(res, struct ib_cq, res); struct ib_device *dev = cq->device; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQE, cq->cqe)) return -EMSGSIZE; if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_USECNT, atomic_read(&cq->usecnt), RDMA_NLDEV_ATTR_PAD)) return -EMSGSIZE; /* Poll context is only valid for kernel CQs */ if (rdma_is_kernel_res(res) && nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_POLL_CTX, cq->poll_ctx)) return -EMSGSIZE; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, (cq->dim != NULL))) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, res->id)) return -EMSGSIZE; if (!rdma_is_kernel_res(res) && nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN, cq->uobject->uevent.uobject.context->res.id)) return -EMSGSIZE; if (fill_res_name_pid(msg, res)) return -EMSGSIZE; return (dev->ops.fill_res_cq_entry) ? dev->ops.fill_res_cq_entry(msg, cq) : 0; } static int fill_res_cq_raw_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_cq *cq = container_of(res, struct ib_cq, res); struct ib_device *dev = cq->device; if (!dev->ops.fill_res_cq_entry_raw) return -EINVAL; return dev->ops.fill_res_cq_entry_raw(msg, cq); } static int fill_res_mr_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_mr *mr = container_of(res, struct ib_mr, res); struct ib_device *dev = mr->pd->device; if (has_cap_net_admin) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RKEY, mr->rkey)) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LKEY, mr->lkey)) return -EMSGSIZE; } if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_MRLEN, mr->length, RDMA_NLDEV_ATTR_PAD)) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id)) return -EMSGSIZE; if (!rdma_is_kernel_res(res) && nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, mr->pd->res.id)) return -EMSGSIZE; if (fill_res_name_pid(msg, res)) return -EMSGSIZE; return (dev->ops.fill_res_mr_entry) ? dev->ops.fill_res_mr_entry(msg, mr) : 0; } static int fill_res_mr_raw_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_mr *mr = container_of(res, struct ib_mr, res); struct ib_device *dev = mr->pd->device; if (!dev->ops.fill_res_mr_entry_raw) return -EINVAL; return dev->ops.fill_res_mr_entry_raw(msg, mr); } static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_pd *pd = container_of(res, struct ib_pd, res); if (has_cap_net_admin) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY, pd->local_dma_lkey)) goto err; if ((pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) && nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY, pd->unsafe_global_rkey)) goto err; } if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_USECNT, atomic_read(&pd->usecnt), RDMA_NLDEV_ATTR_PAD)) goto err; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, res->id)) goto err; if (!rdma_is_kernel_res(res) && nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN, pd->uobject->context->res.id)) goto err; return fill_res_name_pid(msg, res); err: return -EMSGSIZE; } static int fill_res_ctx_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_ucontext *ctx = container_of(res, struct ib_ucontext, res); if (rdma_is_kernel_res(res)) return 0; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN, ctx->res.id)) return -EMSGSIZE; return fill_res_name_pid(msg, res); } static int fill_res_range_qp_entry(struct sk_buff *msg, uint32_t min_range, uint32_t max_range) { struct nlattr *entry_attr; if (!min_range) return 0; entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY); if (!entry_attr) return -EMSGSIZE; if (min_range == max_range) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, min_range)) goto err; } else { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_MIN_RANGE, min_range)) goto err; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_MAX_RANGE, max_range)) goto err; } nla_nest_end(msg, entry_attr); return 0; err: nla_nest_cancel(msg, entry_attr); return -EMSGSIZE; } static int fill_res_srq_qps(struct sk_buff *msg, struct ib_srq *srq) { uint32_t min_range = 0, prev = 0; struct rdma_restrack_entry *res; struct rdma_restrack_root *rt; struct nlattr *table_attr; struct ib_qp *qp = NULL; unsigned long id = 0; table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP); if (!table_attr) return -EMSGSIZE; rt = &srq->device->res[RDMA_RESTRACK_QP]; xa_lock(&rt->xa); xa_for_each(&rt->xa, id, res) { if (!rdma_restrack_get(res)) continue; qp = container_of(res, struct ib_qp, res); if (!qp->srq || (qp->srq->res.id != srq->res.id)) { rdma_restrack_put(res); continue; } if (qp->qp_num < prev) /* qp_num should be ascending */ goto err_loop; if (min_range == 0) { min_range = qp->qp_num; } else if (qp->qp_num > (prev + 1)) { if (fill_res_range_qp_entry(msg, min_range, prev)) goto err_loop; min_range = qp->qp_num; } prev = qp->qp_num; rdma_restrack_put(res); } xa_unlock(&rt->xa); if (fill_res_range_qp_entry(msg, min_range, prev)) goto err; nla_nest_end(msg, table_attr); return 0; err_loop: rdma_restrack_put(res); xa_unlock(&rt->xa); err: nla_nest_cancel(msg, table_attr); return -EMSGSIZE; } static int fill_res_srq_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_srq *srq = container_of(res, struct ib_srq, res); struct ib_device *dev = srq->device; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_SRQN, srq->res.id)) goto err; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, srq->srq_type)) goto err; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, srq->pd->res.id)) goto err; if (ib_srq_has_cq(srq->srq_type)) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, srq->ext.cq->res.id)) goto err; } if (fill_res_srq_qps(msg, srq)) goto err; if (fill_res_name_pid(msg, res)) goto err; if (dev->ops.fill_res_srq_entry) return dev->ops.fill_res_srq_entry(msg, srq); return 0; err: return -EMSGSIZE; } static int fill_res_srq_raw_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_srq *srq = container_of(res, struct ib_srq, res); struct ib_device *dev = srq->device; if (!dev->ops.fill_res_srq_entry_raw) return -EINVAL; return dev->ops.fill_res_srq_entry_raw(msg, srq); } static int fill_stat_counter_mode(struct sk_buff *msg, struct rdma_counter *counter) { struct rdma_counter_mode *m = &counter->mode; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, m->mode)) return -EMSGSIZE; if (m->mode == RDMA_COUNTER_MODE_AUTO) { if ((m->mask & RDMA_COUNTER_MASK_QP_TYPE) && nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, m->param.qp_type)) return -EMSGSIZE; if ((m->mask & RDMA_COUNTER_MASK_PID) && fill_res_name_pid(msg, &counter->res)) return -EMSGSIZE; } return 0; } static int fill_stat_counter_qp_entry(struct sk_buff *msg, u32 qpn) { struct nlattr *entry_attr; entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY); if (!entry_attr) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) goto err; nla_nest_end(msg, entry_attr); return 0; err: nla_nest_cancel(msg, entry_attr); return -EMSGSIZE; } static int fill_stat_counter_qps(struct sk_buff *msg, struct rdma_counter *counter) { struct rdma_restrack_entry *res; struct rdma_restrack_root *rt; struct nlattr *table_attr; struct ib_qp *qp = NULL; unsigned long id = 0; int ret = 0; table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP); if (!table_attr) return -EMSGSIZE; rt = &counter->device->res[RDMA_RESTRACK_QP]; xa_lock(&rt->xa); xa_for_each(&rt->xa, id, res) { qp = container_of(res, struct ib_qp, res); if (!qp->counter || (qp->counter->id != counter->id)) continue; ret = fill_stat_counter_qp_entry(msg, qp->qp_num); if (ret) goto err; } xa_unlock(&rt->xa); nla_nest_end(msg, table_attr); return 0; err: xa_unlock(&rt->xa); nla_nest_cancel(msg, table_attr); return ret; } int rdma_nl_stat_hwcounter_entry(struct sk_buff *msg, const char *name, u64 value) { struct nlattr *entry_attr; entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY); if (!entry_attr) return -EMSGSIZE; if (nla_put_string(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME, name)) goto err; if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE, value, RDMA_NLDEV_ATTR_PAD)) goto err; nla_nest_end(msg, entry_attr); return 0; err: nla_nest_cancel(msg, entry_attr); return -EMSGSIZE; } EXPORT_SYMBOL(rdma_nl_stat_hwcounter_entry); static int fill_stat_mr_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_mr *mr = container_of(res, struct ib_mr, res); struct ib_device *dev = mr->pd->device; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id)) goto err; if (dev->ops.fill_stat_mr_entry) return dev->ops.fill_stat_mr_entry(msg, mr); return 0; err: return -EMSGSIZE; } static int fill_stat_counter_hwcounters(struct sk_buff *msg, struct rdma_counter *counter) { struct rdma_hw_stats *st = counter->stats; struct nlattr *table_attr; int i; table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS); if (!table_attr) return -EMSGSIZE; mutex_lock(&st->lock); for (i = 0; i < st->num_counters; i++) { if (test_bit(i, st->is_disabled)) continue; if (rdma_nl_stat_hwcounter_entry(msg, st->descs[i].name, st->value[i])) goto err; } mutex_unlock(&st->lock); nla_nest_end(msg, table_attr); return 0; err: mutex_unlock(&st->lock); nla_nest_cancel(msg, table_attr); return -EMSGSIZE; } static int fill_res_counter_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct rdma_counter *counter = container_of(res, struct rdma_counter, res); if (port && port != counter->port) return -EAGAIN; /* Dump it even query failed */ rdma_counter_query_stats(counter); if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, counter->port) || nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, counter->id) || fill_stat_counter_mode(msg, counter) || fill_stat_counter_qps(msg, counter) || fill_stat_counter_hwcounters(msg, counter)) return -EMSGSIZE; return 0; } static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct ib_device *device; struct sk_buff *msg; u32 index; int err; err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { err = -ENOMEM; goto err; } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), 0, 0); if (!nlh) { err = -EMSGSIZE; goto err_free; } err = fill_dev_info(msg, device); if (err) goto err_free; nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_free: nlmsg_free(msg); err: ib_device_put(device); return err; } static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct ib_device *device; u32 index; int err; err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; if (tb[RDMA_NLDEV_ATTR_DEV_NAME]) { char name[IB_DEVICE_NAME_MAX] = {}; nla_strscpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME], IB_DEVICE_NAME_MAX); if (strlen(name) == 0) { err = -EINVAL; goto done; } err = ib_device_rename(device, name); goto done; } if (tb[RDMA_NLDEV_NET_NS_FD]) { u32 ns_fd; ns_fd = nla_get_u32(tb[RDMA_NLDEV_NET_NS_FD]); err = ib_device_set_netns_put(skb, device, ns_fd); goto put_done; } if (tb[RDMA_NLDEV_ATTR_DEV_DIM]) { u8 use_dim; use_dim = nla_get_u8(tb[RDMA_NLDEV_ATTR_DEV_DIM]); err = ib_device_set_dim(device, use_dim); goto done; } done: ib_device_put(device); put_done: return err; } static int _nldev_get_dumpit(struct ib_device *device, struct sk_buff *skb, struct netlink_callback *cb, unsigned int idx) { int start = cb->args[0]; struct nlmsghdr *nlh; if (idx < start) return 0; nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), 0, NLM_F_MULTI); if (!nlh || fill_dev_info(skb, device)) { nlmsg_cancel(skb, nlh); goto out; } nlmsg_end(skb, nlh); idx++; out: cb->args[0] = idx; return skb->len; } static int nldev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { /* * There is no need to take lock, because * we are relying on ib_core's locking. */ return ib_enum_all_devs(_nldev_get_dumpit, skb, cb); } static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct ib_device *device; struct sk_buff *msg; u32 index; u32 port; int err; err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); if (!rdma_is_port_valid(device, port)) { err = -EINVAL; goto err; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { err = -ENOMEM; goto err; } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), 0, 0); if (!nlh) { err = -EMSGSIZE; goto err_free; } err = fill_port_info(msg, device, port, sock_net(skb->sk)); if (err) goto err_free; nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_free: nlmsg_free(msg); err: ib_device_put(device); return err; } static int nldev_port_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct ib_device *device; int start = cb->args[0]; struct nlmsghdr *nlh; u32 idx = 0; u32 ifindex; int err; unsigned int p; err = __nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, NULL); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; ifindex = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), ifindex); if (!device) return -EINVAL; rdma_for_each_port (device, p) { /* * The dumpit function returns all information from specific * index. This specific index is taken from the netlink * messages request sent by user and it is available * in cb->args[0]. * * Usually, the user doesn't fill this field and it causes * to return everything. * */ if (idx < start) { idx++; continue; } nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_PORT_GET), 0, NLM_F_MULTI); if (!nlh || fill_port_info(skb, device, p, sock_net(skb->sk))) { nlmsg_cancel(skb, nlh); goto out; } idx++; nlmsg_end(skb, nlh); } out: ib_device_put(device); cb->args[0] = idx; return skb->len; } static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; bool show_details = false; struct ib_device *device; struct sk_buff *msg; u32 index; int ret; ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, extack); if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; if (tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]) show_details = nla_get_u8(tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto err; } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET), 0, 0); if (!nlh) { ret = -EMSGSIZE; goto err_free; } ret = fill_res_info(msg, device, show_details); if (ret) goto err_free; nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_free: nlmsg_free(msg); err: ib_device_put(device); return ret; } static int _nldev_res_get_dumpit(struct ib_device *device, struct sk_buff *skb, struct netlink_callback *cb, unsigned int idx) { int start = cb->args[0]; struct nlmsghdr *nlh; if (idx < start) return 0; nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET), 0, NLM_F_MULTI); if (!nlh || fill_res_info(skb, device, false)) { nlmsg_cancel(skb, nlh); goto out; } nlmsg_end(skb, nlh); idx++; out: cb->args[0] = idx; return skb->len; } static int nldev_res_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return ib_enum_all_devs(_nldev_res_get_dumpit, skb, cb); } struct nldev_fill_res_entry { enum rdma_nldev_attr nldev_attr; u8 flags; u32 entry; u32 id; }; enum nldev_res_flags { NLDEV_PER_DEV = 1 << 0, }; static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { [RDMA_RESTRACK_QP] = { .nldev_attr = RDMA_NLDEV_ATTR_RES_QP, .entry = RDMA_NLDEV_ATTR_RES_QP_ENTRY, .id = RDMA_NLDEV_ATTR_RES_LQPN, }, [RDMA_RESTRACK_CM_ID] = { .nldev_attr = RDMA_NLDEV_ATTR_RES_CM_ID, .entry = RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY, .id = RDMA_NLDEV_ATTR_RES_CM_IDN, }, [RDMA_RESTRACK_CQ] = { .nldev_attr = RDMA_NLDEV_ATTR_RES_CQ, .flags = NLDEV_PER_DEV, .entry = RDMA_NLDEV_ATTR_RES_CQ_ENTRY, .id = RDMA_NLDEV_ATTR_RES_CQN, }, [RDMA_RESTRACK_MR] = { .nldev_attr = RDMA_NLDEV_ATTR_RES_MR, .flags = NLDEV_PER_DEV, .entry = RDMA_NLDEV_ATTR_RES_MR_ENTRY, .id = RDMA_NLDEV_ATTR_RES_MRN, }, [RDMA_RESTRACK_PD] = { .nldev_attr = RDMA_NLDEV_ATTR_RES_PD, .flags = NLDEV_PER_DEV, .entry = RDMA_NLDEV_ATTR_RES_PD_ENTRY, .id = RDMA_NLDEV_ATTR_RES_PDN, }, [RDMA_RESTRACK_COUNTER] = { .nldev_attr = RDMA_NLDEV_ATTR_STAT_COUNTER, .entry = RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY, .id = RDMA_NLDEV_ATTR_STAT_COUNTER_ID, }, [RDMA_RESTRACK_CTX] = { .nldev_attr = RDMA_NLDEV_ATTR_RES_CTX, .flags = NLDEV_PER_DEV, .entry = RDMA_NLDEV_ATTR_RES_CTX_ENTRY, .id = RDMA_NLDEV_ATTR_RES_CTXN, }, [RDMA_RESTRACK_SRQ] = { .nldev_attr = RDMA_NLDEV_ATTR_RES_SRQ, .flags = NLDEV_PER_DEV, .entry = RDMA_NLDEV_ATTR_RES_SRQ_ENTRY, .id = RDMA_NLDEV_ATTR_RES_SRQN, }, }; static noinline_for_stack int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, enum rdma_restrack_type res_type, res_fill_func_t fill_func) { const struct nldev_fill_res_entry *fe = &fill_entries[res_type]; struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct rdma_restrack_entry *res; struct ib_device *device; u32 index, id, port = 0; bool has_cap_net_admin; struct sk_buff *msg; int ret; ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, extack); if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !fe->id || !tb[fe->id]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); if (!rdma_is_port_valid(device, port)) { ret = -EINVAL; goto err; } } if ((port && fe->flags & NLDEV_PER_DEV) || (!port && ~fe->flags & NLDEV_PER_DEV)) { ret = -EINVAL; goto err; } id = nla_get_u32(tb[fe->id]); res = rdma_restrack_get_byid(device, res_type, id); if (IS_ERR(res)) { ret = PTR_ERR(res); goto err; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto err_get; } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NL_GET_OP(nlh->nlmsg_type)), 0, 0); if (!nlh || fill_nldev_handle(msg, device)) { ret = -EMSGSIZE; goto err_free; } has_cap_net_admin = netlink_capable(skb, CAP_NET_ADMIN); ret = fill_func(msg, has_cap_net_admin, res, port); if (ret) goto err_free; rdma_restrack_put(res); nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_free: nlmsg_free(msg); err_get: rdma_restrack_put(res); err: ib_device_put(device); return ret; } static int res_get_common_dumpit(struct sk_buff *skb, struct netlink_callback *cb, enum rdma_restrack_type res_type, res_fill_func_t fill_func) { const struct nldev_fill_res_entry *fe = &fill_entries[res_type]; struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct rdma_restrack_entry *res; struct rdma_restrack_root *rt; int err, ret = 0, idx = 0; bool show_details = false; struct nlattr *table_attr; struct nlattr *entry_attr; struct ib_device *device; int start = cb->args[0]; bool has_cap_net_admin; struct nlmsghdr *nlh; unsigned long id; u32 index, port = 0; bool filled = false; err = __nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, NULL); /* * Right now, we are expecting the device index to get res information, * but it is possible to extend this code to return all devices in * one shot by checking the existence of RDMA_NLDEV_ATTR_DEV_INDEX. * if it doesn't exist, we will iterate over all devices. * * But it is not needed for now. */ if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; if (tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]) show_details = nla_get_u8(tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]); /* * If no PORT_INDEX is supplied, we will return all QPs from that device */ if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); if (!rdma_is_port_valid(device, port)) { ret = -EINVAL; goto err_index; } } nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NL_GET_OP(cb->nlh->nlmsg_type)), 0, NLM_F_MULTI); if (!nlh || fill_nldev_handle(skb, device)) { ret = -EMSGSIZE; goto err; } table_attr = nla_nest_start_noflag(skb, fe->nldev_attr); if (!table_attr) { ret = -EMSGSIZE; goto err; } has_cap_net_admin = netlink_capable(cb->skb, CAP_NET_ADMIN); rt = &device->res[res_type]; xa_lock(&rt->xa); /* * FIXME: if the skip ahead is something common this loop should * use xas_for_each & xas_pause to optimize, we can have a lot of * objects. */ xa_for_each(&rt->xa, id, res) { if (xa_get_mark(&rt->xa, res->id, RESTRACK_DD) && !show_details) goto next; if (idx < start || !rdma_restrack_get(res)) goto next; xa_unlock(&rt->xa); filled = true; entry_attr = nla_nest_start_noflag(skb, fe->entry); if (!entry_attr) { ret = -EMSGSIZE; rdma_restrack_put(res); goto msg_full; } ret = fill_func(skb, has_cap_net_admin, res, port); rdma_restrack_put(res); if (ret) { nla_nest_cancel(skb, entry_attr); if (ret == -EMSGSIZE) goto msg_full; if (ret == -EAGAIN) goto again; goto res_err; } nla_nest_end(skb, entry_attr); again: xa_lock(&rt->xa); next: idx++; } xa_unlock(&rt->xa); msg_full: nla_nest_end(skb, table_attr); nlmsg_end(skb, nlh); cb->args[0] = idx; /* * No more entries to fill, cancel the message and * return 0 to mark end of dumpit. */ if (!filled) goto err; ib_device_put(device); return skb->len; res_err: nla_nest_cancel(skb, table_attr); err: nlmsg_cancel(skb, nlh); err_index: ib_device_put(device); return ret; } #define RES_GET_FUNCS(name, type) \ static int nldev_res_get_##name##_dumpit(struct sk_buff *skb, \ struct netlink_callback *cb) \ { \ return res_get_common_dumpit(skb, cb, type, \ fill_res_##name##_entry); \ } \ static int nldev_res_get_##name##_doit(struct sk_buff *skb, \ struct nlmsghdr *nlh, \ struct netlink_ext_ack *extack) \ { \ return res_get_common_doit(skb, nlh, extack, type, \ fill_res_##name##_entry); \ } RES_GET_FUNCS(qp, RDMA_RESTRACK_QP); RES_GET_FUNCS(qp_raw, RDMA_RESTRACK_QP); RES_GET_FUNCS(cm_id, RDMA_RESTRACK_CM_ID); RES_GET_FUNCS(cq, RDMA_RESTRACK_CQ); RES_GET_FUNCS(cq_raw, RDMA_RESTRACK_CQ); RES_GET_FUNCS(pd, RDMA_RESTRACK_PD); RES_GET_FUNCS(mr, RDMA_RESTRACK_MR); RES_GET_FUNCS(mr_raw, RDMA_RESTRACK_MR); RES_GET_FUNCS(counter, RDMA_RESTRACK_COUNTER); RES_GET_FUNCS(ctx, RDMA_RESTRACK_CTX); RES_GET_FUNCS(srq, RDMA_RESTRACK_SRQ); RES_GET_FUNCS(srq_raw, RDMA_RESTRACK_SRQ); static LIST_HEAD(link_ops); static DECLARE_RWSEM(link_ops_rwsem); static const struct rdma_link_ops *link_ops_get(const char *type) { const struct rdma_link_ops *ops; list_for_each_entry(ops, &link_ops, list) { if (!strcmp(ops->type, type)) goto out; } ops = NULL; out: return ops; } void rdma_link_register(struct rdma_link_ops *ops) { down_write(&link_ops_rwsem); if (WARN_ON_ONCE(link_ops_get(ops->type))) goto out; list_add(&ops->list, &link_ops); out: up_write(&link_ops_rwsem); } EXPORT_SYMBOL(rdma_link_register); void rdma_link_unregister(struct rdma_link_ops *ops) { down_write(&link_ops_rwsem); list_del(&ops->list); up_write(&link_ops_rwsem); } EXPORT_SYMBOL(rdma_link_unregister); static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; char ibdev_name[IB_DEVICE_NAME_MAX]; const struct rdma_link_ops *ops; char ndev_name[IFNAMSIZ]; struct net_device *ndev; char type[IFNAMSIZ]; int err; err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] || !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME]) return -EINVAL; nla_strscpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME], sizeof(ibdev_name)); if (strchr(ibdev_name, '%') || strlen(ibdev_name) == 0) return -EINVAL; nla_strscpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type)); nla_strscpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME], sizeof(ndev_name)); ndev = dev_get_by_name(sock_net(skb->sk), ndev_name); if (!ndev) return -ENODEV; down_read(&link_ops_rwsem); ops = link_ops_get(type); #ifdef CONFIG_MODULES if (!ops) { up_read(&link_ops_rwsem); request_module("rdma-link-%s", type); down_read(&link_ops_rwsem); ops = link_ops_get(type); } #endif err = ops ? ops->newlink(ibdev_name, ndev) : -EINVAL; up_read(&link_ops_rwsem); dev_put(ndev); return err; } static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct ib_device *device; u32 index; int err; err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; if (!(device->attrs.kernel_cap_flags & IBK_ALLOW_USER_UNREG)) { ib_device_put(device); return -EINVAL; } ib_unregister_device_and_put(device); return 0; } static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; char client_name[RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE]; struct ib_client_nl_info data = {}; struct ib_device *ibdev = NULL; struct sk_buff *msg; u32 index; int err; err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, extack); if (err || !tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE]) return -EINVAL; nla_strscpy(client_name, tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE], sizeof(client_name)); if (tb[RDMA_NLDEV_ATTR_DEV_INDEX]) { index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); ibdev = ib_device_get_by_index(sock_net(skb->sk), index); if (!ibdev) return -EINVAL; if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { data.port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); if (!rdma_is_port_valid(ibdev, data.port)) { err = -EINVAL; goto out_put; } } else { data.port = -1; } } else if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { return -EINVAL; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { err = -ENOMEM; goto out_put; } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET_CHARDEV), 0, 0); if (!nlh) { err = -EMSGSIZE; goto out_nlmsg; } data.nl_msg = msg; err = ib_get_client_nl_info(ibdev, client_name, &data); if (err) goto out_nlmsg; err = nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CHARDEV, huge_encode_dev(data.cdev->devt), RDMA_NLDEV_ATTR_PAD); if (err) goto out_data; err = nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CHARDEV_ABI, data.abi, RDMA_NLDEV_ATTR_PAD); if (err) goto out_data; if (nla_put_string(msg, RDMA_NLDEV_ATTR_CHARDEV_NAME, dev_name(data.cdev))) { err = -EMSGSIZE; goto out_data; } nlmsg_end(msg, nlh); put_device(data.cdev); if (ibdev) ib_device_put(ibdev); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); out_data: put_device(data.cdev); out_nlmsg: nlmsg_free(msg); out_put: if (ibdev) ib_device_put(ibdev); return err; } static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct sk_buff *msg; int err; err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, extack); if (err) return err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_SYS_GET), 0, 0); if (!nlh) { nlmsg_free(msg); return -EMSGSIZE; } err = nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_NETNS_MODE, (u8)ib_devices_shared_netns); if (err) { nlmsg_free(msg); return err; } err = nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_PRIVILEGED_QKEY_MODE, (u8)privileged_qkey); if (err) { nlmsg_free(msg); return err; } err = nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_MONITOR_MODE, 1); if (err) { nlmsg_free(msg); return err; } /* * Copy-on-fork is supported. * See commits: * 70e806e4e645 ("mm: Do early cow for pinned pages during fork() for ptes") * 4eae4efa2c29 ("hugetlb: do early cow when page pinned on src mm") * for more details. Don't backport this without them. * * Return value ignored on purpose, assume copy-on-fork is not * supported in case of failure. */ nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_COPY_ON_FORK, 1); nlmsg_end(msg, nlh); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); } static int nldev_set_sys_set_netns_doit(struct nlattr *tb[]) { u8 enable; int err; enable = nla_get_u8(tb[RDMA_NLDEV_SYS_ATTR_NETNS_MODE]); /* Only 0 and 1 are supported */ if (enable > 1) return -EINVAL; err = rdma_compatdev_set(enable); return err; } static int nldev_set_sys_set_pqkey_doit(struct nlattr *tb[]) { u8 enable; enable = nla_get_u8(tb[RDMA_NLDEV_SYS_ATTR_PRIVILEGED_QKEY_MODE]); /* Only 0 and 1 are supported */ if (enable > 1) return -EINVAL; privileged_qkey = enable; return 0; } static int nldev_set_sys_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; int err; err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, extack); if (err) return -EINVAL; if (tb[RDMA_NLDEV_SYS_ATTR_NETNS_MODE]) return nldev_set_sys_set_netns_doit(tb); if (tb[RDMA_NLDEV_SYS_ATTR_PRIVILEGED_QKEY_MODE]) return nldev_set_sys_set_pqkey_doit(tb); return -EINVAL; } static int nldev_stat_set_mode_doit(struct sk_buff *msg, struct netlink_ext_ack *extack, struct nlattr *tb[], struct ib_device *device, u32 port) { u32 mode, mask = 0, qpn, cntn = 0; bool opcnt = false; int ret; /* Currently only counter for QP is supported */ if (!tb[RDMA_NLDEV_ATTR_STAT_RES] || nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP) return -EINVAL; if (tb[RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED]) opcnt = !!nla_get_u8( tb[RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED]); mode = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_MODE]); if (mode == RDMA_COUNTER_MODE_AUTO) { if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]) mask = nla_get_u32( tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]); return rdma_counter_set_auto_mode(device, port, mask, opcnt, extack); } if (!tb[RDMA_NLDEV_ATTR_RES_LQPN]) return -EINVAL; qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]); if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]) { cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]); ret = rdma_counter_bind_qpn(device, port, qpn, cntn); if (ret) return ret; } else { ret = rdma_counter_bind_qpn_alloc(device, port, qpn, &cntn); if (ret) return ret; } if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) || nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) { ret = -EMSGSIZE; goto err_fill; } return 0; err_fill: rdma_counter_unbind_qpn(device, port, qpn, cntn); return ret; } static int nldev_stat_set_counter_dynamic_doit(struct nlattr *tb[], struct ib_device *device, u32 port) { struct rdma_hw_stats *stats; struct nlattr *entry_attr; unsigned long *target; int rem, i, ret = 0; u32 index; stats = ib_get_hw_stats_port(device, port); if (!stats) return -EINVAL; target = kcalloc(BITS_TO_LONGS(stats->num_counters), sizeof(*stats->is_disabled), GFP_KERNEL); if (!target) return -ENOMEM; nla_for_each_nested(entry_attr, tb[RDMA_NLDEV_ATTR_STAT_HWCOUNTERS], rem) { index = nla_get_u32(entry_attr); if ((index >= stats->num_counters) || !(stats->descs[index].flags & IB_STAT_FLAG_OPTIONAL)) { ret = -EINVAL; goto out; } set_bit(index, target); } for (i = 0; i < stats->num_counters; i++) { if (!(stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL)) continue; ret = rdma_counter_modify(device, port, i, test_bit(i, target)); if (ret) goto out; } out: kfree(target); return ret; } static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct ib_device *device; struct sk_buff *msg; u32 index, port; int ret; ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, extack); if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); if (!rdma_is_port_valid(device, port)) { ret = -EINVAL; goto err_put_device; } if (!tb[RDMA_NLDEV_ATTR_STAT_MODE] && !tb[RDMA_NLDEV_ATTR_STAT_HWCOUNTERS]) { ret = -EINVAL; goto err_put_device; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto err_put_device; } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_STAT_SET), 0, 0); if (!nlh || fill_nldev_handle(msg, device) || nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) { ret = -EMSGSIZE; goto err_free_msg; } if (tb[RDMA_NLDEV_ATTR_STAT_MODE]) { ret = nldev_stat_set_mode_doit(msg, extack, tb, device, port); if (ret) goto err_free_msg; } if (tb[RDMA_NLDEV_ATTR_STAT_HWCOUNTERS]) { ret = nldev_stat_set_counter_dynamic_doit(tb, device, port); if (ret) goto err_free_msg; } nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_free_msg: nlmsg_free(msg); err_put_device: ib_device_put(device); return ret; } static int nldev_stat_del_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct ib_device *device; struct sk_buff *msg; u32 index, port, qpn, cntn; int ret; ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, extack); if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES] || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX] || !tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID] || !tb[RDMA_NLDEV_ATTR_RES_LQPN]) return -EINVAL; if (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); if (!rdma_is_port_valid(device, port)) { ret = -EINVAL; goto err; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto err; } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_STAT_SET), 0, 0); if (!nlh) { ret = -EMSGSIZE; goto err_fill; } cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]); qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]); if (fill_nldev_handle(msg, device) || nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) || nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) || nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) { ret = -EMSGSIZE; goto err_fill; } ret = rdma_counter_unbind_qpn(device, port, qpn, cntn); if (ret) goto err_fill; nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_fill: nlmsg_free(msg); err: ib_device_put(device); return ret; } static noinline_for_stack int stat_get_doit_default_counter(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, struct nlattr *tb[]) { struct rdma_hw_stats *stats; struct nlattr *table_attr; struct ib_device *device; int ret, num_cnts, i; struct sk_buff *msg; u32 index, port; u64 v; if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; if (!device->ops.alloc_hw_port_stats || !device->ops.get_hw_stats) { ret = -EINVAL; goto err; } port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); stats = ib_get_hw_stats_port(device, port); if (!stats) { ret = -EINVAL; goto err; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto err; } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_STAT_GET), 0, 0); if (!nlh || fill_nldev_handle(msg, device) || nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) { ret = -EMSGSIZE; goto err_msg; } mutex_lock(&stats->lock); num_cnts = device->ops.get_hw_stats(device, stats, port, 0); if (num_cnts < 0) { ret = -EINVAL; goto err_stats; } table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS); if (!table_attr) { ret = -EMSGSIZE; goto err_stats; } for (i = 0; i < num_cnts; i++) { if (test_bit(i, stats->is_disabled)) continue; v = stats->value[i] + rdma_counter_get_hwstat_value(device, port, i); if (rdma_nl_stat_hwcounter_entry(msg, stats->descs[i].name, v)) { ret = -EMSGSIZE; goto err_table; } } nla_nest_end(msg, table_attr); mutex_unlock(&stats->lock); nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_table: nla_nest_cancel(msg, table_attr); err_stats: mutex_unlock(&stats->lock); err_msg: nlmsg_free(msg); err: ib_device_put(device); return ret; } static noinline_for_stack int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, struct nlattr *tb[]) { static enum rdma_nl_counter_mode mode; static enum rdma_nl_counter_mask mask; struct ib_device *device; struct sk_buff *msg; u32 index, port; bool opcnt; int ret; if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]) return nldev_res_get_counter_doit(skb, nlh, extack); if (!tb[RDMA_NLDEV_ATTR_STAT_MODE] || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); if (!rdma_is_port_valid(device, port)) { ret = -EINVAL; goto err; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto err; } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_STAT_GET), 0, 0); if (!nlh) { ret = -EMSGSIZE; goto err_msg; } ret = rdma_counter_get_mode(device, port, &mode, &mask, &opcnt); if (ret) goto err_msg; if (fill_nldev_handle(msg, device) || nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) || nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, mode)) { ret = -EMSGSIZE; goto err_msg; } if ((mode == RDMA_COUNTER_MODE_AUTO) && nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, mask)) { ret = -EMSGSIZE; goto err_msg; } if ((mode == RDMA_COUNTER_MODE_AUTO) && nla_put_u8(msg, RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED, opcnt)) { ret = -EMSGSIZE; goto err_msg; } nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_msg: nlmsg_free(msg); err: ib_device_put(device); return ret; } static int nldev_stat_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; int ret; ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, extack); if (ret) return -EINVAL; if (!tb[RDMA_NLDEV_ATTR_STAT_RES]) return stat_get_doit_default_counter(skb, nlh, extack, tb); switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) { case RDMA_NLDEV_ATTR_RES_QP: ret = stat_get_doit_qp(skb, nlh, extack, tb); break; case RDMA_NLDEV_ATTR_RES_MR: ret = res_get_common_doit(skb, nlh, extack, RDMA_RESTRACK_MR, fill_stat_mr_entry); break; default: ret = -EINVAL; break; } return ret; } static int nldev_stat_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; int ret; ret = __nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, NULL); if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES]) return -EINVAL; switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) { case RDMA_NLDEV_ATTR_RES_QP: ret = nldev_res_get_counter_dumpit(skb, cb); break; case RDMA_NLDEV_ATTR_RES_MR: ret = res_get_common_dumpit(skb, cb, RDMA_RESTRACK_MR, fill_stat_mr_entry); break; default: ret = -EINVAL; break; } return ret; } static int nldev_stat_get_counter_status_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX], *table, *entry; struct rdma_hw_stats *stats; struct ib_device *device; struct sk_buff *msg; u32 devid, port; int ret, i; ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, extack); if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) return -EINVAL; devid = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), devid); if (!device) return -EINVAL; port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); if (!rdma_is_port_valid(device, port)) { ret = -EINVAL; goto err; } stats = ib_get_hw_stats_port(device, port); if (!stats) { ret = -EINVAL; goto err; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto err; } nlh = nlmsg_put( msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_STAT_GET_STATUS), 0, 0); ret = -EMSGSIZE; if (!nlh || fill_nldev_handle(msg, device) || nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) goto err_msg; table = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS); if (!table) goto err_msg; mutex_lock(&stats->lock); for (i = 0; i < stats->num_counters; i++) { entry = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY); if (!entry) goto err_msg_table; if (nla_put_string(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME, stats->descs[i].name) || nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_INDEX, i)) goto err_msg_entry; if ((stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL) && (nla_put_u8(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_DYNAMIC, !test_bit(i, stats->is_disabled)))) goto err_msg_entry; nla_nest_end(msg, entry); } mutex_unlock(&stats->lock); nla_nest_end(msg, table); nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_msg_entry: nla_nest_cancel(msg, entry); err_msg_table: mutex_unlock(&stats->lock); nla_nest_cancel(msg, table); err_msg: nlmsg_free(msg); err: ib_device_put(device); return ret; } static int nldev_newdev(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; enum rdma_nl_dev_type type; struct ib_device *parent; char name[IFNAMSIZ] = {}; u32 parentid; int ret; ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, extack); if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_DEV_NAME] || !tb[RDMA_NLDEV_ATTR_DEV_TYPE]) return -EINVAL; nla_strscpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME], sizeof(name)); type = nla_get_u8(tb[RDMA_NLDEV_ATTR_DEV_TYPE]); parentid = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); parent = ib_device_get_by_index(sock_net(skb->sk), parentid); if (!parent) return -EINVAL; ret = ib_add_sub_device(parent, type, name); ib_device_put(parent); return ret; } static int nldev_deldev(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct ib_device *device; u32 devid; int ret; ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, extack); if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; devid = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), devid); if (!device) return -EINVAL; return ib_del_sub_device_and_put(device); } static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = nldev_get_doit, .dump = nldev_get_dumpit, }, [RDMA_NLDEV_CMD_GET_CHARDEV] = { .doit = nldev_get_chardev, }, [RDMA_NLDEV_CMD_SET] = { .doit = nldev_set_doit, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_NEWLINK] = { .doit = nldev_newlink, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_DELLINK] = { .doit = nldev_dellink, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_PORT_GET] = { .doit = nldev_port_get_doit, .dump = nldev_port_get_dumpit, }, [RDMA_NLDEV_CMD_RES_GET] = { .doit = nldev_res_get_doit, .dump = nldev_res_get_dumpit, }, [RDMA_NLDEV_CMD_RES_QP_GET] = { .doit = nldev_res_get_qp_doit, .dump = nldev_res_get_qp_dumpit, }, [RDMA_NLDEV_CMD_RES_CM_ID_GET] = { .doit = nldev_res_get_cm_id_doit, .dump = nldev_res_get_cm_id_dumpit, }, [RDMA_NLDEV_CMD_RES_CQ_GET] = { .doit = nldev_res_get_cq_doit, .dump = nldev_res_get_cq_dumpit, }, [RDMA_NLDEV_CMD_RES_MR_GET] = { .doit = nldev_res_get_mr_doit, .dump = nldev_res_get_mr_dumpit, }, [RDMA_NLDEV_CMD_RES_PD_GET] = { .doit = nldev_res_get_pd_doit, .dump = nldev_res_get_pd_dumpit, }, [RDMA_NLDEV_CMD_RES_CTX_GET] = { .doit = nldev_res_get_ctx_doit, .dump = nldev_res_get_ctx_dumpit, }, [RDMA_NLDEV_CMD_RES_SRQ_GET] = { .doit = nldev_res_get_srq_doit, .dump = nldev_res_get_srq_dumpit, }, [RDMA_NLDEV_CMD_SYS_GET] = { .doit = nldev_sys_get_doit, }, [RDMA_NLDEV_CMD_SYS_SET] = { .doit = nldev_set_sys_set_doit, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_STAT_SET] = { .doit = nldev_stat_set_doit, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_STAT_GET] = { .doit = nldev_stat_get_doit, .dump = nldev_stat_get_dumpit, }, [RDMA_NLDEV_CMD_STAT_DEL] = { .doit = nldev_stat_del_doit, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_RES_QP_GET_RAW] = { .doit = nldev_res_get_qp_raw_doit, .dump = nldev_res_get_qp_raw_dumpit, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_RES_CQ_GET_RAW] = { .doit = nldev_res_get_cq_raw_doit, .dump = nldev_res_get_cq_raw_dumpit, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_RES_MR_GET_RAW] = { .doit = nldev_res_get_mr_raw_doit, .dump = nldev_res_get_mr_raw_dumpit, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_RES_SRQ_GET_RAW] = { .doit = nldev_res_get_srq_raw_doit, .dump = nldev_res_get_srq_raw_dumpit, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_STAT_GET_STATUS] = { .doit = nldev_stat_get_counter_status_doit, }, [RDMA_NLDEV_CMD_NEWDEV] = { .doit = nldev_newdev, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_DELDEV] = { .doit = nldev_deldev, .flags = RDMA_NL_ADMIN_PERM, }, }; static int fill_mon_netdev_rename(struct sk_buff *msg, struct ib_device *device, u32 port, const struct net *net) { struct net_device *netdev = ib_device_get_netdev(device, port); int ret = 0; if (!netdev || !net_eq(dev_net(netdev), net)) goto out; ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex); if (ret) goto out; ret = nla_put_string(msg, RDMA_NLDEV_ATTR_NDEV_NAME, netdev->name); out: dev_put(netdev); return ret; } static int fill_mon_netdev_association(struct sk_buff *msg, struct ib_device *device, u32 port, const struct net *net) { struct net_device *netdev = ib_device_get_netdev(device, port); int ret = 0; if (netdev && !net_eq(dev_net(netdev), net)) goto out; ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index); if (ret) goto out; ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, dev_name(&device->dev)); if (ret) goto out; ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port); if (ret) goto out; if (netdev) { ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex); if (ret) goto out; ret = nla_put_string(msg, RDMA_NLDEV_ATTR_NDEV_NAME, netdev->name); } out: dev_put(netdev); return ret; } static void rdma_nl_notify_err_msg(struct ib_device *device, u32 port_num, enum rdma_nl_notify_event_type type) { struct net_device *netdev; switch (type) { case RDMA_REGISTER_EVENT: dev_warn_ratelimited(&device->dev, "Failed to send RDMA monitor register device event\n"); break; case RDMA_UNREGISTER_EVENT: dev_warn_ratelimited(&device->dev, "Failed to send RDMA monitor unregister device event\n"); break; case RDMA_NETDEV_ATTACH_EVENT: netdev = ib_device_get_netdev(device, port_num); dev_warn_ratelimited(&device->dev, "Failed to send RDMA monitor netdev attach event: port %d netdev %d\n", port_num, netdev->ifindex); dev_put(netdev); break; case RDMA_NETDEV_DETACH_EVENT: dev_warn_ratelimited(&device->dev, "Failed to send RDMA monitor netdev detach event: port %d\n", port_num); break; case RDMA_RENAME_EVENT: dev_warn_ratelimited(&device->dev, "Failed to send RDMA monitor rename device event\n"); break; case RDMA_NETDEV_RENAME_EVENT: netdev = ib_device_get_netdev(device, port_num); dev_warn_ratelimited(&device->dev, "Failed to send RDMA monitor netdev rename event: port %d netdev %d\n", port_num, netdev->ifindex); dev_put(netdev); break; default: break; } } int rdma_nl_notify_event(struct ib_device *device, u32 port_num, enum rdma_nl_notify_event_type type) { struct sk_buff *skb; int ret = -EMSGSIZE; struct net *net; void *nlh; net = read_pnet(&device->coredev.rdma_net); if (!net) return -EINVAL; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return -ENOMEM; nlh = nlmsg_put(skb, 0, 0, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_MONITOR), 0, 0); if (!nlh) goto err_free; switch (type) { case RDMA_REGISTER_EVENT: case RDMA_UNREGISTER_EVENT: case RDMA_RENAME_EVENT: ret = fill_nldev_handle(skb, device); if (ret) goto err_free; break; case RDMA_NETDEV_ATTACH_EVENT: case RDMA_NETDEV_DETACH_EVENT: ret = fill_mon_netdev_association(skb, device, port_num, net); if (ret) goto err_free; break; case RDMA_NETDEV_RENAME_EVENT: ret = fill_mon_netdev_rename(skb, device, port_num, net); if (ret) goto err_free; break; default: break; } ret = nla_put_u8(skb, RDMA_NLDEV_ATTR_EVENT_TYPE, type); if (ret) goto err_free; nlmsg_end(skb, nlh); ret = rdma_nl_multicast(net, skb, RDMA_NL_GROUP_NOTIFY, GFP_KERNEL); if (ret && ret != -ESRCH) { skb = NULL; /* skb is freed in the netlink send-op handling */ goto err_free; } return 0; err_free: rdma_nl_notify_err_msg(device, port_num, type); nlmsg_free(skb); return ret; } void __init nldev_init(void) { rdma_nl_register(RDMA_NL_NLDEV, nldev_cb_table); } void nldev_exit(void) { rdma_nl_unregister(RDMA_NL_NLDEV); } MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_NLDEV, 5);
1 4 1 3 3 3 1 1 3 1 2 1 1 2 1 1 36 36 36 3 19 2 21 12 60 58 10 2 1 11 2 7 3 1 4 1 1 1 1 8 8 7 1 2 6 2 1 1 12 8 2 2 2 10 8 12 7 7 7 7 7 8 55 55 41 19 7 7 12 12 8 8 12 10 12 6 6 11 2 2 1 1 16 7 7 12 1 2 2 2 2 2 2 2 1 1 2 10 10 10 36 35 24 13 2 2 2 2 67 62 14 3 3 3 41 41 85 4 1 2 3 4 1 57 4 1 58 4 1 2 1 1 1 1 1 7 41 8 65 2 67 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 // SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2017 - 2019, Intel Corporation. */ #define pr_fmt(fmt) "MPTCP: " fmt #include <linux/kernel.h> #include <crypto/sha2.h> #include <net/tcp.h> #include <net/mptcp.h> #include "protocol.h" #include "mib.h" #include <trace/events/mptcp.h> static bool mptcp_cap_flag_sha256(u8 flags) { return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256; } static void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, int opsize, struct mptcp_options_received *mp_opt) { u8 subtype = *ptr >> 4; int expected_opsize; u16 subopt; u8 version; u8 flags; u8 i; switch (subtype) { case MPTCPOPT_MP_CAPABLE: /* strict size checking */ if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { if (skb->len > tcp_hdr(skb)->doff << 2) expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA; else expected_opsize = TCPOLEN_MPTCP_MPC_ACK; subopt = OPTION_MPTCP_MPC_ACK; } else { if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) { expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK; subopt = OPTION_MPTCP_MPC_SYNACK; } else { expected_opsize = TCPOLEN_MPTCP_MPC_SYN; subopt = OPTION_MPTCP_MPC_SYN; } } /* Cfr RFC 8684 Section 3.3.0: * If a checksum is present but its use had * not been negotiated in the MP_CAPABLE handshake, the receiver MUST * close the subflow with a RST, as it is not behaving as negotiated. * If a checksum is not present when its use has been negotiated, the * receiver MUST close the subflow with a RST, as it is considered * broken * We parse even option with mismatching csum presence, so that * later in subflow_data_ready we can trigger the reset. */ if (opsize != expected_opsize && (expected_opsize != TCPOLEN_MPTCP_MPC_ACK_DATA || opsize != TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM)) break; /* try to be gentle vs future versions on the initial syn */ version = *ptr++ & MPTCP_VERSION_MASK; if (opsize != TCPOLEN_MPTCP_MPC_SYN) { if (version != MPTCP_SUPPORTED_VERSION) break; } else if (version < MPTCP_SUPPORTED_VERSION) { break; } flags = *ptr++; if (!mptcp_cap_flag_sha256(flags) || (flags & MPTCP_CAP_EXTENSIBILITY)) break; /* RFC 6824, Section 3.1: * "For the Checksum Required bit (labeled "A"), if either * host requires the use of checksums, checksums MUST be used. * In other words, the only way for checksums not to be used * is if both hosts in their SYNs set A=0." */ if (flags & MPTCP_CAP_CHECKSUM_REQD) mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; mp_opt->deny_join_id0 = !!(flags & MPTCP_CAP_DENY_JOIN_ID0); mp_opt->suboptions |= subopt; if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) { mp_opt->sndr_key = get_unaligned_be64(ptr); ptr += 8; } if (opsize >= TCPOLEN_MPTCP_MPC_ACK) { mp_opt->rcvr_key = get_unaligned_be64(ptr); ptr += 8; } if (opsize >= TCPOLEN_MPTCP_MPC_ACK_DATA) { /* Section 3.1.: * "the data parameters in a MP_CAPABLE are semantically * equivalent to those in a DSS option and can be used * interchangeably." */ mp_opt->suboptions |= OPTION_MPTCP_DSS; mp_opt->use_map = 1; mp_opt->mpc_map = 1; mp_opt->data_len = get_unaligned_be16(ptr); ptr += 2; } if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM) { mp_opt->csum = get_unaligned((__force __sum16 *)ptr); mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; ptr += 2; } pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d csum=%u\n", version, flags, opsize, mp_opt->sndr_key, mp_opt->rcvr_key, mp_opt->data_len, mp_opt->csum); break; case MPTCPOPT_MP_JOIN: if (opsize == TCPOLEN_MPTCP_MPJ_SYN) { mp_opt->suboptions |= OPTION_MPTCP_MPJ_SYN; mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP; mp_opt->join_id = *ptr++; mp_opt->token = get_unaligned_be32(ptr); ptr += 4; mp_opt->nonce = get_unaligned_be32(ptr); ptr += 4; pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u\n", mp_opt->backup, mp_opt->join_id, mp_opt->token, mp_opt->nonce); } else if (opsize == TCPOLEN_MPTCP_MPJ_SYNACK) { mp_opt->suboptions |= OPTION_MPTCP_MPJ_SYNACK; mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP; mp_opt->join_id = *ptr++; mp_opt->thmac = get_unaligned_be64(ptr); ptr += 8; mp_opt->nonce = get_unaligned_be32(ptr); ptr += 4; pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u\n", mp_opt->backup, mp_opt->join_id, mp_opt->thmac, mp_opt->nonce); } else if (opsize == TCPOLEN_MPTCP_MPJ_ACK) { mp_opt->suboptions |= OPTION_MPTCP_MPJ_ACK; ptr += 2; memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN); pr_debug("MP_JOIN hmac\n"); } break; case MPTCPOPT_DSS: pr_debug("DSS\n"); ptr++; flags = (*ptr++) & MPTCP_DSS_FLAG_MASK; mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0; mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0; mp_opt->use_map = (flags & MPTCP_DSS_HAS_MAP) != 0; mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0; mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK); pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d\n", mp_opt->data_fin, mp_opt->dsn64, mp_opt->use_map, mp_opt->ack64, mp_opt->use_ack); expected_opsize = TCPOLEN_MPTCP_DSS_BASE; if (mp_opt->use_ack) { if (mp_opt->ack64) expected_opsize += TCPOLEN_MPTCP_DSS_ACK64; else expected_opsize += TCPOLEN_MPTCP_DSS_ACK32; } if (mp_opt->use_map) { if (mp_opt->dsn64) expected_opsize += TCPOLEN_MPTCP_DSS_MAP64; else expected_opsize += TCPOLEN_MPTCP_DSS_MAP32; } /* Always parse any csum presence combination, we will enforce * RFC 8684 Section 3.3.0 checks later in subflow_data_ready */ if (opsize != expected_opsize && opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) break; mp_opt->suboptions |= OPTION_MPTCP_DSS; if (mp_opt->use_ack) { if (mp_opt->ack64) { mp_opt->data_ack = get_unaligned_be64(ptr); ptr += 8; } else { mp_opt->data_ack = get_unaligned_be32(ptr); ptr += 4; } pr_debug("data_ack=%llu\n", mp_opt->data_ack); } if (mp_opt->use_map) { if (mp_opt->dsn64) { mp_opt->data_seq = get_unaligned_be64(ptr); ptr += 8; } else { mp_opt->data_seq = get_unaligned_be32(ptr); ptr += 4; } mp_opt->subflow_seq = get_unaligned_be32(ptr); ptr += 4; mp_opt->data_len = get_unaligned_be16(ptr); ptr += 2; if (opsize == expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) { mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; mp_opt->csum = get_unaligned((__force __sum16 *)ptr); ptr += 2; } pr_debug("data_seq=%llu subflow_seq=%u data_len=%u csum=%d:%u\n", mp_opt->data_seq, mp_opt->subflow_seq, mp_opt->data_len, !!(mp_opt->suboptions & OPTION_MPTCP_CSUMREQD), mp_opt->csum); } break; case MPTCPOPT_ADD_ADDR: mp_opt->echo = (*ptr++) & MPTCP_ADDR_ECHO; if (!mp_opt->echo) { if (opsize == TCPOLEN_MPTCP_ADD_ADDR || opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT) mp_opt->addr.family = AF_INET; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6 || opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT) mp_opt->addr.family = AF_INET6; #endif else break; } else { if (opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE || opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) mp_opt->addr.family = AF_INET; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE || opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) mp_opt->addr.family = AF_INET6; #endif else break; } mp_opt->suboptions |= OPTION_MPTCP_ADD_ADDR; mp_opt->addr.id = *ptr++; mp_opt->addr.port = 0; mp_opt->ahmac = 0; if (mp_opt->addr.family == AF_INET) { memcpy((u8 *)&mp_opt->addr.addr.s_addr, (u8 *)ptr, 4); ptr += 4; if (opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT || opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) { mp_opt->addr.port = htons(get_unaligned_be16(ptr)); ptr += 2; } } #if IS_ENABLED(CONFIG_MPTCP_IPV6) else { memcpy(mp_opt->addr.addr6.s6_addr, (u8 *)ptr, 16); ptr += 16; if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT || opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) { mp_opt->addr.port = htons(get_unaligned_be16(ptr)); ptr += 2; } } #endif if (!mp_opt->echo) { mp_opt->ahmac = get_unaligned_be64(ptr); ptr += 8; } pr_debug("ADD_ADDR%s: id=%d, ahmac=%llu, echo=%d, port=%d\n", (mp_opt->addr.family == AF_INET6) ? "6" : "", mp_opt->addr.id, mp_opt->ahmac, mp_opt->echo, ntohs(mp_opt->addr.port)); break; case MPTCPOPT_RM_ADDR: if (opsize < TCPOLEN_MPTCP_RM_ADDR_BASE + 1 || opsize > TCPOLEN_MPTCP_RM_ADDR_BASE + MPTCP_RM_IDS_MAX) break; ptr++; mp_opt->suboptions |= OPTION_MPTCP_RM_ADDR; mp_opt->rm_list.nr = opsize - TCPOLEN_MPTCP_RM_ADDR_BASE; for (i = 0; i < mp_opt->rm_list.nr; i++) mp_opt->rm_list.ids[i] = *ptr++; pr_debug("RM_ADDR: rm_list_nr=%d\n", mp_opt->rm_list.nr); break; case MPTCPOPT_MP_PRIO: if (opsize != TCPOLEN_MPTCP_PRIO) break; mp_opt->suboptions |= OPTION_MPTCP_PRIO; mp_opt->backup = *ptr++ & MPTCP_PRIO_BKUP; pr_debug("MP_PRIO: prio=%d\n", mp_opt->backup); break; case MPTCPOPT_MP_FASTCLOSE: if (opsize != TCPOLEN_MPTCP_FASTCLOSE) break; ptr += 2; mp_opt->rcvr_key = get_unaligned_be64(ptr); ptr += 8; mp_opt->suboptions |= OPTION_MPTCP_FASTCLOSE; pr_debug("MP_FASTCLOSE: recv_key=%llu\n", mp_opt->rcvr_key); break; case MPTCPOPT_RST: if (opsize != TCPOLEN_MPTCP_RST) break; if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) break; mp_opt->suboptions |= OPTION_MPTCP_RST; flags = *ptr++; mp_opt->reset_transient = flags & MPTCP_RST_TRANSIENT; mp_opt->reset_reason = *ptr; pr_debug("MP_RST: transient=%u reason=%u\n", mp_opt->reset_transient, mp_opt->reset_reason); break; case MPTCPOPT_MP_FAIL: if (opsize != TCPOLEN_MPTCP_FAIL) break; ptr += 2; mp_opt->suboptions |= OPTION_MPTCP_FAIL; mp_opt->fail_seq = get_unaligned_be64(ptr); pr_debug("MP_FAIL: data_seq=%llu\n", mp_opt->fail_seq); break; default: break; } } void mptcp_get_options(const struct sk_buff *skb, struct mptcp_options_received *mp_opt) { const struct tcphdr *th = tcp_hdr(skb); const unsigned char *ptr; int length; /* Ensure that casting the whole status to u32 is efficient and safe */ BUILD_BUG_ON(sizeof_field(struct mptcp_options_received, status) != sizeof(u32)); BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct mptcp_options_received, status), sizeof(u32))); *(u32 *)&mp_opt->status = 0; length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (const unsigned char *)(th + 1); while (length > 0) { int opcode = *ptr++; int opsize; switch (opcode) { case TCPOPT_EOL: return; case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ length--; continue; default: if (length < 2) return; opsize = *ptr++; if (opsize < 2) /* "silly options" */ return; if (opsize > length) return; /* don't parse partial options */ if (opcode == TCPOPT_MPTCP) mptcp_parse_option(skb, ptr, opsize, mp_opt); ptr += opsize - 2; length -= opsize; } } } bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, unsigned int *size, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); /* we will use snd_isn to detect first pkt [re]transmission * in mptcp_established_options_mp() */ subflow->snd_isn = TCP_SKB_CB(skb)->end_seq; if (subflow->request_mptcp) { if (unlikely(subflow_simultaneous_connect(sk))) { WARN_ON_ONCE(!mptcp_try_fallback(sk, MPTCP_MIB_SIMULTCONNFALLBACK)); /* Ensure mptcp_finish_connect() will not process the * MPC handshake. */ subflow->request_mptcp = 0; return false; } opts->suboptions = OPTION_MPTCP_MPC_SYN; opts->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk)); opts->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk)); *size = TCPOLEN_MPTCP_MPC_SYN; return true; } else if (subflow->request_join) { pr_debug("remote_token=%u, nonce=%u\n", subflow->remote_token, subflow->local_nonce); opts->suboptions = OPTION_MPTCP_MPJ_SYN; opts->join_id = subflow->local_id; opts->token = subflow->remote_token; opts->nonce = subflow->local_nonce; opts->backup = subflow->request_bkup; *size = TCPOLEN_MPTCP_MPJ_SYN; return true; } return false; } static void clear_3rdack_retransmission(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); sk_stop_timer(sk, &icsk->icsk_delack_timer); icsk->icsk_ack.ato = 0; icsk->icsk_ack.pending &= ~(ICSK_ACK_SCHED | ICSK_ACK_TIMER); } static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, bool snd_data_fin_enable, unsigned int *size, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_ext *mpext; unsigned int data_len; u8 len; /* When skb is not available, we better over-estimate the emitted * options len. A full DSS option (28 bytes) is longer than * TCPOLEN_MPTCP_MPC_ACK_DATA(22) or TCPOLEN_MPTCP_MPJ_ACK(24), so * tell the caller to defer the estimate to * mptcp_established_options_dss(), which will reserve enough space. */ if (!skb) return false; /* MPC/MPJ needed only on 3rd ack packet, DATA_FIN and TCP shutdown take precedence */ if (READ_ONCE(subflow->fully_established) || snd_data_fin_enable || subflow->snd_isn != TCP_SKB_CB(skb)->seq || sk->sk_state != TCP_ESTABLISHED) return false; if (subflow->mp_capable) { mpext = mptcp_get_ext(skb); data_len = mpext ? mpext->data_len : 0; /* we will check ops->data_len in mptcp_write_options() to * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and * TCPOLEN_MPTCP_MPC_ACK */ opts->data_len = data_len; opts->suboptions = OPTION_MPTCP_MPC_ACK; opts->sndr_key = subflow->local_key; opts->rcvr_key = subflow->remote_key; opts->csum_reqd = READ_ONCE(msk->csum_enabled); opts->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk)); /* Section 3.1. * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK * packets that start the first subflow of an MPTCP connection, * as well as the first packet that carries data */ if (data_len > 0) { len = TCPOLEN_MPTCP_MPC_ACK_DATA; if (opts->csum_reqd) { /* we need to propagate more info to csum the pseudo hdr */ opts->data_seq = mpext->data_seq; opts->subflow_seq = mpext->subflow_seq; opts->csum = mpext->csum; len += TCPOLEN_MPTCP_DSS_CHECKSUM; } *size = ALIGN(len, 4); } else { *size = TCPOLEN_MPTCP_MPC_ACK; } pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d\n", subflow, subflow->local_key, subflow->remote_key, data_len); return true; } else if (subflow->mp_join) { opts->suboptions = OPTION_MPTCP_MPJ_ACK; memcpy(opts->hmac, subflow->hmac, MPTCPOPT_HMAC_LEN); *size = TCPOLEN_MPTCP_MPJ_ACK; pr_debug("subflow=%p\n", subflow); /* we can use the full delegate action helper only from BH context * If we are in process context - sk is flushing the backlog at * socket lock release time - just set the appropriate flag, will * be handled by the release callback */ if (sock_owned_by_user(sk)) set_bit(MPTCP_DELEGATE_ACK, &subflow->delegated_status); else mptcp_subflow_delegate(subflow, MPTCP_DELEGATE_ACK); return true; } return false; } static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow, struct sk_buff *skb, struct mptcp_ext *ext) { /* The write_seq value has already been incremented, so the actual * sequence number for the DATA_FIN is one less. */ u64 data_fin_tx_seq = READ_ONCE(mptcp_sk(subflow->conn)->write_seq) - 1; if (!ext->use_map || !skb->len) { /* RFC6824 requires a DSS mapping with specific values * if DATA_FIN is set but no data payload is mapped */ ext->data_fin = 1; ext->use_map = 1; ext->dsn64 = 1; ext->data_seq = data_fin_tx_seq; ext->subflow_seq = 0; ext->data_len = 1; } else if (ext->data_seq + ext->data_len == data_fin_tx_seq) { /* If there's an existing DSS mapping and it is the * final mapping, DATA_FIN consumes 1 additional byte of * mapping space. */ ext->data_fin = 1; ext->data_len++; } } static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, bool snd_data_fin_enable, unsigned int *size, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); unsigned int dss_size = 0; struct mptcp_ext *mpext; unsigned int ack_size; bool ret = false; u64 ack_seq; opts->csum_reqd = READ_ONCE(msk->csum_enabled); mpext = skb ? mptcp_get_ext(skb) : NULL; if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) { unsigned int map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64; if (mpext) { if (opts->csum_reqd) map_size += TCPOLEN_MPTCP_DSS_CHECKSUM; opts->ext_copy = *mpext; } dss_size = map_size; if (skb && snd_data_fin_enable) mptcp_write_data_fin(subflow, skb, &opts->ext_copy); opts->suboptions = OPTION_MPTCP_DSS; ret = true; } /* passive sockets msk will set the 'can_ack' after accept(), even * if the first subflow may have the already the remote key handy */ opts->ext_copy.use_ack = 0; if (!READ_ONCE(msk->can_ack)) { *size = ALIGN(dss_size, 4); return ret; } ack_seq = READ_ONCE(msk->ack_seq); if (READ_ONCE(msk->use_64bit_ack)) { ack_size = TCPOLEN_MPTCP_DSS_ACK64; opts->ext_copy.data_ack = ack_seq; opts->ext_copy.ack64 = 1; } else { ack_size = TCPOLEN_MPTCP_DSS_ACK32; opts->ext_copy.data_ack32 = (uint32_t)ack_seq; opts->ext_copy.ack64 = 0; } opts->ext_copy.use_ack = 1; opts->suboptions = OPTION_MPTCP_DSS; /* Add kind/length/subtype/flag overhead if mapping is not populated */ if (dss_size == 0) ack_size += TCPOLEN_MPTCP_DSS_BASE; dss_size += ack_size; *size = ALIGN(dss_size, 4); return true; } static u64 add_addr_generate_hmac(u64 key1, u64 key2, struct mptcp_addr_info *addr) { u16 port = ntohs(addr->port); u8 hmac[SHA256_DIGEST_SIZE]; u8 msg[19]; int i = 0; msg[i++] = addr->id; if (addr->family == AF_INET) { memcpy(&msg[i], &addr->addr.s_addr, 4); i += 4; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (addr->family == AF_INET6) { memcpy(&msg[i], &addr->addr6.s6_addr, 16); i += 16; } #endif msg[i++] = port >> 8; msg[i++] = port & 0xFF; mptcp_crypto_hmac_sha(key1, key2, msg, i, hmac); return get_unaligned_be64(&hmac[SHA256_DIGEST_SIZE - sizeof(u64)]); } static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); bool drop_other_suboptions = false; unsigned int opt_size = *size; struct mptcp_addr_info addr; bool echo; int len; /* add addr will strip the existing options, be sure to avoid breaking * MPC/MPJ handshakes */ if (!mptcp_pm_should_add_signal(msk) || (opts->suboptions & (OPTION_MPTCP_MPJ_ACK | OPTION_MPTCP_MPC_ACK)) || !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &addr, &echo, &drop_other_suboptions)) return false; /* * Later on, mptcp_write_options() will enforce mutually exclusion with * DSS, bail out if such option is set and we can't drop it. */ if (drop_other_suboptions) remaining += opt_size; else if (opts->suboptions & OPTION_MPTCP_DSS) return false; len = mptcp_add_addr_len(addr.family, echo, !!addr.port); if (remaining < len) return false; *size = len; if (drop_other_suboptions) { pr_debug("drop other suboptions\n"); opts->suboptions = 0; /* note that e.g. DSS could have written into the memory * aliased by ahmac, we must reset the field here * to avoid appending the hmac even for ADD_ADDR echo * options */ opts->ahmac = 0; *size -= opt_size; } opts->addr = addr; opts->suboptions |= OPTION_MPTCP_ADD_ADDR; if (!echo) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDRTX); opts->ahmac = add_addr_generate_hmac(READ_ONCE(msk->local_key), READ_ONCE(msk->remote_key), &opts->addr); } else { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADDTX); } pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d\n", opts->addr.id, opts->ahmac, echo, ntohs(opts->addr.port)); return true; } static bool mptcp_established_options_rm_addr(struct sock *sk, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_rm_list rm_list; int i, len; if (!mptcp_pm_should_rm_signal(msk) || !(mptcp_pm_rm_addr_signal(msk, remaining, &rm_list))) return false; len = mptcp_rm_addr_len(&rm_list); if (len < 0) return false; if (remaining < len) return false; *size = len; opts->suboptions |= OPTION_MPTCP_RM_ADDR; opts->rm_list = rm_list; for (i = 0; i < opts->rm_list.nr; i++) pr_debug("rm_list_ids[%d]=%d\n", i, opts->rm_list.ids[i]); MPTCP_ADD_STATS(sock_net(sk), MPTCP_MIB_RMADDRTX, opts->rm_list.nr); return true; } static bool mptcp_established_options_mp_prio(struct sock *sk, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); /* can't send MP_PRIO with MPC, as they share the same option space: * 'backup'. Also it makes no sense at all */ if (!subflow->send_mp_prio || (opts->suboptions & OPTIONS_MPTCP_MPC)) return false; /* account for the trailing 'nop' option */ if (remaining < TCPOLEN_MPTCP_PRIO_ALIGN) return false; *size = TCPOLEN_MPTCP_PRIO_ALIGN; opts->suboptions |= OPTION_MPTCP_PRIO; opts->backup = subflow->request_bkup; pr_debug("prio=%d\n", opts->backup); return true; } static noinline bool mptcp_established_options_rst(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); if (remaining < TCPOLEN_MPTCP_RST) return false; *size = TCPOLEN_MPTCP_RST; opts->suboptions |= OPTION_MPTCP_RST; opts->reset_transient = subflow->reset_transient; opts->reset_reason = subflow->reset_reason; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTTX); return true; } static bool mptcp_established_options_fastclose(struct sock *sk, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); if (likely(!subflow->send_fastclose)) return false; if (remaining < TCPOLEN_MPTCP_FASTCLOSE) return false; *size = TCPOLEN_MPTCP_FASTCLOSE; opts->suboptions |= OPTION_MPTCP_FASTCLOSE; opts->rcvr_key = READ_ONCE(msk->remote_key); pr_debug("FASTCLOSE key=%llu\n", opts->rcvr_key); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSETX); return true; } static bool mptcp_established_options_mp_fail(struct sock *sk, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); if (likely(!subflow->send_mp_fail)) return false; if (remaining < TCPOLEN_MPTCP_FAIL) return false; *size = TCPOLEN_MPTCP_FAIL; opts->suboptions |= OPTION_MPTCP_FAIL; opts->fail_seq = subflow->map_seq; pr_debug("MP_FAIL fail_seq=%llu\n", opts->fail_seq); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILTX); return true; } bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); unsigned int opt_size = 0; bool snd_data_fin; bool ret = false; opts->suboptions = 0; /* Force later mptcp_write_options(), but do not use any actual * option space. */ if (unlikely(__mptcp_check_fallback(msk) && !mptcp_check_infinite_map(skb))) return true; if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) { if (mptcp_established_options_fastclose(sk, &opt_size, remaining, opts) || mptcp_established_options_mp_fail(sk, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; } /* MP_RST can be used with MP_FASTCLOSE and MP_FAIL if there is room */ if (mptcp_established_options_rst(sk, skb, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; } return true; } snd_data_fin = mptcp_data_fin_enabled(msk); if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, opts)) ret = true; else if (mptcp_established_options_dss(sk, skb, snd_data_fin, &opt_size, opts)) { unsigned int mp_fail_size; ret = true; if (mptcp_established_options_mp_fail(sk, &mp_fail_size, remaining - opt_size, opts)) { *size += opt_size + mp_fail_size; remaining -= opt_size - mp_fail_size; return true; } } /* we reserved enough space for the above options, and exceeding the * TCP option space would be fatal */ if (WARN_ON_ONCE(opt_size > remaining)) return false; *size += opt_size; remaining -= opt_size; if (mptcp_established_options_add_addr(sk, skb, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; ret = true; } else if (mptcp_established_options_rm_addr(sk, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; ret = true; } if (mptcp_established_options_mp_prio(sk, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; ret = true; } return ret; } bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, struct mptcp_out_options *opts) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); if (subflow_req->mp_capable) { opts->suboptions = OPTION_MPTCP_MPC_SYNACK; opts->sndr_key = subflow_req->local_key; opts->csum_reqd = subflow_req->csum_reqd; opts->allow_join_id0 = subflow_req->allow_join_id0; *size = TCPOLEN_MPTCP_MPC_SYNACK; pr_debug("subflow_req=%p, local_key=%llu\n", subflow_req, subflow_req->local_key); return true; } else if (subflow_req->mp_join) { opts->suboptions = OPTION_MPTCP_MPJ_SYNACK; opts->backup = subflow_req->request_bkup; opts->join_id = subflow_req->local_id; opts->thmac = subflow_req->thmac; opts->nonce = subflow_req->local_nonce; pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u\n", subflow_req, opts->backup, opts->join_id, opts->thmac, opts->nonce); *size = TCPOLEN_MPTCP_MPJ_SYNACK; return true; } return false; } static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_subflow_context *subflow, struct sk_buff *skb, struct mptcp_options_received *mp_opt) { /* here we can process OoO, in-window pkts, only in-sequence 4th ack * will make the subflow fully established */ if (likely(READ_ONCE(subflow->fully_established))) { /* on passive sockets, check for 3rd ack retransmission * note that msk is always set by subflow_syn_recv_sock() * for mp_join subflows */ if (TCP_SKB_CB(skb)->seq == subflow->ssn_offset + 1 && TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq && subflow->mp_join && (mp_opt->suboptions & OPTIONS_MPTCP_MPJ) && !subflow->request_join) tcp_send_ack(ssk); goto check_notify; } /* we must process OoO packets before the first subflow is fully * established. OoO packets are instead a protocol violation * for MP_JOIN subflows as the peer must not send any data * before receiving the forth ack - cfr. RFC 8684 section 3.2. */ if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) { if (subflow->mp_join) goto reset; if (subflow->is_mptfo && mp_opt->suboptions & OPTION_MPTCP_MPC_ACK) goto set_fully_established; return subflow->mp_capable; } if (subflow->remote_key_valid && (((mp_opt->suboptions & OPTION_MPTCP_DSS) && mp_opt->use_ack) || ((mp_opt->suboptions & OPTION_MPTCP_ADD_ADDR) && (!mp_opt->echo || subflow->mp_join)))) { /* subflows are fully established as soon as we get any * additional ack, including ADD_ADDR. */ goto set_fully_established; } /* If the first established packet does not contain MP_CAPABLE + data * then fallback to TCP. Fallback scenarios requires a reset for * MP_JOIN subflows. */ if (!(mp_opt->suboptions & OPTIONS_MPTCP_MPC)) { if (subflow->mp_join) goto reset; subflow->mp_capable = 0; if (!mptcp_try_fallback(ssk, MPTCP_MIB_MPCAPABLEDATAFALLBACK)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_FALLBACKFAILED); goto reset; } return false; } if (unlikely(!READ_ONCE(msk->pm.server_side))) pr_warn_once("bogus mpc option on established client sk"); set_fully_established: if (mp_opt->deny_join_id0) WRITE_ONCE(msk->pm.remote_deny_join_id0, true); mptcp_data_lock((struct sock *)msk); __mptcp_subflow_fully_established(msk, subflow, mp_opt); mptcp_data_unlock((struct sock *)msk); check_notify: /* if the subflow is not already linked into the conn_list, we can't * notify the PM: this subflow is still on the listener queue * and the PM possibly acquiring the subflow lock could race with * the listener close */ if (likely(subflow->pm_notified) || list_empty(&subflow->node)) return true; subflow->pm_notified = 1; if (subflow->mp_join) { clear_3rdack_retransmission(ssk); mptcp_pm_subflow_established(msk); } else { mptcp_pm_fully_established(msk, ssk); } return true; reset: mptcp_subflow_reset(ssk); return false; } u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq) { u32 old_seq32, cur_seq32; old_seq32 = (u32)old_seq; cur_seq32 = (u32)cur_seq; cur_seq = (old_seq & GENMASK_ULL(63, 32)) + cur_seq32; if (unlikely(cur_seq32 < old_seq32 && before(old_seq32, cur_seq32))) return cur_seq + (1LL << 32); /* reverse wrap could happen, too */ if (unlikely(cur_seq32 > old_seq32 && after(old_seq32, cur_seq32))) return cur_seq - (1LL << 32); return cur_seq; } static void __mptcp_snd_una_update(struct mptcp_sock *msk, u64 new_snd_una) { msk->bytes_acked += new_snd_una - msk->snd_una; WRITE_ONCE(msk->snd_una, new_snd_una); } static void rwin_update(struct mptcp_sock *msk, struct sock *ssk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct tcp_sock *tp = tcp_sk(ssk); u64 mptcp_rcv_wnd; /* Avoid touching extra cachelines if TCP is going to accept this * skb without filling the TCP-level window even with a possibly * outdated mptcp-level rwin. */ if (!skb->len || skb->len < tcp_receive_window(tp)) return; mptcp_rcv_wnd = atomic64_read(&msk->rcv_wnd_sent); if (!after64(mptcp_rcv_wnd, subflow->rcv_wnd_sent)) return; /* Some other subflow grew the mptcp-level rwin since rcv_wup, * resync. */ tp->rcv_wnd += mptcp_rcv_wnd - subflow->rcv_wnd_sent; subflow->rcv_wnd_sent = mptcp_rcv_wnd; } static void ack_update_msk(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_options_received *mp_opt) { u64 new_wnd_end, new_snd_una, snd_nxt = READ_ONCE(msk->snd_nxt); struct sock *sk = (struct sock *)msk; u64 old_snd_una; mptcp_data_lock(sk); /* avoid ack expansion on update conflict, to reduce the risk of * wrongly expanding to a future ack sequence number, which is way * more dangerous than missing an ack */ old_snd_una = msk->snd_una; new_snd_una = mptcp_expand_seq(old_snd_una, mp_opt->data_ack, mp_opt->ack64); /* ACK for data not even sent yet? Ignore.*/ if (unlikely(after64(new_snd_una, snd_nxt))) new_snd_una = old_snd_una; new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd; if (after64(new_wnd_end, msk->wnd_end)) WRITE_ONCE(msk->wnd_end, new_wnd_end); /* this assumes mptcp_incoming_options() is invoked after tcp_ack() */ if (after64(msk->wnd_end, snd_nxt)) __mptcp_check_push(sk, ssk); if (after64(new_snd_una, old_snd_una)) { __mptcp_snd_una_update(msk, new_snd_una); __mptcp_data_acked(sk); } msk->last_ack_recv = tcp_jiffies32; mptcp_data_unlock(sk); trace_ack_update_msk(mp_opt->data_ack, old_snd_una, new_snd_una, new_wnd_end, READ_ONCE(msk->wnd_end)); } bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit) { /* Skip if DATA_FIN was already received. * If updating simultaneously with the recvmsg loop, values * should match. If they mismatch, the peer is misbehaving and * we will prefer the most recent information. */ if (READ_ONCE(msk->rcv_data_fin)) return false; WRITE_ONCE(msk->rcv_data_fin_seq, mptcp_expand_seq(READ_ONCE(msk->ack_seq), data_fin_seq, use_64bit)); WRITE_ONCE(msk->rcv_data_fin, 1); return true; } static bool add_addr_hmac_valid(struct mptcp_sock *msk, struct mptcp_options_received *mp_opt) { u64 hmac = 0; if (mp_opt->echo) return true; hmac = add_addr_generate_hmac(READ_ONCE(msk->remote_key), READ_ONCE(msk->local_key), &mp_opt->addr); pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n", msk, hmac, mp_opt->ahmac); return hmac == mp_opt->ahmac; } /* Return false in case of error (or subflow has been reset), * else return true. */ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_options_received mp_opt; struct mptcp_ext *mpext; if (__mptcp_check_fallback(msk)) { /* Keep it simple and unconditionally trigger send data cleanup and * pending queue spooling. We will need to acquire the data lock * for more accurate checks, and once the lock is acquired, such * helpers are cheap. */ mptcp_data_lock(subflow->conn); if (sk_stream_memory_free(sk)) __mptcp_check_push(subflow->conn, sk); /* on fallback we just need to ignore the msk-level snd_una, as * this is really plain TCP */ __mptcp_snd_una_update(msk, READ_ONCE(msk->snd_nxt)); __mptcp_data_acked(subflow->conn); mptcp_data_unlock(subflow->conn); return true; } mptcp_get_options(skb, &mp_opt); /* The subflow can be in close state only if check_fully_established() * just sent a reset. If so, tell the caller to ignore the current packet. */ if (!check_fully_established(msk, sk, subflow, skb, &mp_opt)) return sk->sk_state != TCP_CLOSE; if (unlikely(mp_opt.suboptions != OPTION_MPTCP_DSS)) { if ((mp_opt.suboptions & OPTION_MPTCP_FASTCLOSE) && READ_ONCE(msk->local_key) == mp_opt.rcvr_key) { WRITE_ONCE(msk->rcv_fastclose, true); mptcp_schedule_work((struct sock *)msk); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSERX); } if ((mp_opt.suboptions & OPTION_MPTCP_ADD_ADDR) && add_addr_hmac_valid(msk, &mp_opt)) { if (!mp_opt.echo) { mptcp_pm_add_addr_received(sk, &mp_opt.addr); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDR); } else { mptcp_pm_add_addr_echoed(msk, &mp_opt.addr); mptcp_pm_del_add_timer(msk, &mp_opt.addr, true); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADD); } if (mp_opt.addr.port) MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_PORTADD); } if (mp_opt.suboptions & OPTION_MPTCP_RM_ADDR) mptcp_pm_rm_addr_received(msk, &mp_opt.rm_list); if (mp_opt.suboptions & OPTION_MPTCP_PRIO) { mptcp_pm_mp_prio_received(sk, mp_opt.backup); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPPRIORX); } if (mp_opt.suboptions & OPTION_MPTCP_FAIL) { mptcp_pm_mp_fail_received(sk, mp_opt.fail_seq); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILRX); } if (mp_opt.suboptions & OPTION_MPTCP_RST) { subflow->reset_seen = 1; subflow->reset_reason = mp_opt.reset_reason; subflow->reset_transient = mp_opt.reset_transient; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTRX); } if (!(mp_opt.suboptions & OPTION_MPTCP_DSS)) return true; } /* we can't wait for recvmsg() to update the ack_seq, otherwise * monodirectional flows will stuck */ if (mp_opt.use_ack) ack_update_msk(msk, sk, &mp_opt); rwin_update(msk, sk, skb); /* Zero-data-length packets are dropped by the caller and not * propagated to the MPTCP layer, so the skb extension does not * need to be allocated or populated. DATA_FIN information, if * present, needs to be updated here before the skb is freed. */ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { if (mp_opt.data_fin && mp_opt.data_len == 1 && mptcp_update_rcv_data_fin(msk, mp_opt.data_seq, mp_opt.dsn64)) mptcp_schedule_work((struct sock *)msk); return true; } mpext = skb_ext_add(skb, SKB_EXT_MPTCP); if (!mpext) return false; memset(mpext, 0, sizeof(*mpext)); if (likely(mp_opt.use_map)) { if (mp_opt.mpc_map) { /* this is an MP_CAPABLE carrying MPTCP data * we know this map the first chunk of data */ mptcp_crypto_key_sha(subflow->remote_key, NULL, &mpext->data_seq); mpext->data_seq++; mpext->subflow_seq = 1; mpext->dsn64 = 1; mpext->mpc_map = 1; mpext->data_fin = 0; } else { mpext->data_seq = mp_opt.data_seq; mpext->subflow_seq = mp_opt.subflow_seq; mpext->dsn64 = mp_opt.dsn64; mpext->data_fin = mp_opt.data_fin; } mpext->data_len = mp_opt.data_len; mpext->use_map = 1; mpext->csum_reqd = !!(mp_opt.suboptions & OPTION_MPTCP_CSUMREQD); if (mpext->csum_reqd) mpext->csum = mp_opt.csum; } return true; } static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th) { const struct sock *ssk = (const struct sock *)tp; struct mptcp_subflow_context *subflow; u64 ack_seq, rcv_wnd_old, rcv_wnd_new; struct mptcp_sock *msk; u32 new_win; u64 win; subflow = mptcp_subflow_ctx(ssk); msk = mptcp_sk(subflow->conn); ack_seq = READ_ONCE(msk->ack_seq); rcv_wnd_new = ack_seq + tp->rcv_wnd; rcv_wnd_old = atomic64_read(&msk->rcv_wnd_sent); if (after64(rcv_wnd_new, rcv_wnd_old)) { u64 rcv_wnd; for (;;) { rcv_wnd = atomic64_cmpxchg(&msk->rcv_wnd_sent, rcv_wnd_old, rcv_wnd_new); if (rcv_wnd == rcv_wnd_old) break; rcv_wnd_old = rcv_wnd; if (before64(rcv_wnd_new, rcv_wnd_old)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICTUPDATE); goto raise_win; } MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICT); } goto update_wspace; } if (rcv_wnd_new != rcv_wnd_old) { raise_win: /* The msk-level rcv wnd is after the tcp level one, * sync the latter. */ rcv_wnd_new = rcv_wnd_old; win = rcv_wnd_old - ack_seq; tp->rcv_wnd = min_t(u64, win, U32_MAX); new_win = tp->rcv_wnd; /* Make sure we do not exceed the maximum possible * scaled window. */ if (unlikely(th->syn)) new_win = min(new_win, 65535U) << tp->rx_opt.rcv_wscale; if (!tp->rx_opt.rcv_wscale && READ_ONCE(sock_net(ssk)->ipv4.sysctl_tcp_workaround_signed_windows)) new_win = min(new_win, MAX_TCP_WINDOW); else new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); /* RFC1323 scaling applied */ new_win >>= tp->rx_opt.rcv_wscale; th->window = htons(new_win); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDSHARED); } update_wspace: WRITE_ONCE(msk->old_wspace, tp->rcv_wnd); subflow->rcv_wnd_sent = rcv_wnd_new; } static void mptcp_track_rwin(struct tcp_sock *tp) { const struct sock *ssk = (const struct sock *)tp; struct mptcp_subflow_context *subflow; struct mptcp_sock *msk; if (!ssk) return; subflow = mptcp_subflow_ctx(ssk); msk = mptcp_sk(subflow->conn); WRITE_ONCE(msk->old_wspace, tp->rcv_wnd); } __sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum) { struct csum_pseudo_header header; __wsum csum; /* cfr RFC 8684 3.3.1.: * the data sequence number used in the pseudo-header is * always the 64-bit value, irrespective of what length is used in the * DSS option itself. */ header.data_seq = cpu_to_be64(data_seq); header.subflow_seq = htonl(subflow_seq); header.data_len = htons(data_len); header.csum = 0; csum = csum_partial(&header, sizeof(header), sum); return csum_fold(csum); } static __sum16 mptcp_make_csum(const struct mptcp_ext *mpext) { return __mptcp_make_csum(mpext->data_seq, mpext->subflow_seq, mpext->data_len, ~csum_unfold(mpext->csum)); } static void put_len_csum(u16 len, __sum16 csum, void *data) { __sum16 *sumptr = data + 2; __be16 *ptr = data; put_unaligned_be16(len, ptr); put_unaligned(csum, sumptr); } void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp, struct mptcp_out_options *opts) { const struct sock *ssk = (const struct sock *)tp; struct mptcp_subflow_context *subflow; /* Which options can be used together? * * X: mutually exclusive * O: often used together * C: can be used together in some cases * P: could be used together but we prefer not to (optimisations) * * Opt: | MPC | MPJ | DSS | ADD | RM | PRIO | FAIL | FC | * ------|------|------|------|------|------|------|------|------| * MPC |------|------|------|------|------|------|------|------| * MPJ | X |------|------|------|------|------|------|------| * DSS | X | X |------|------|------|------|------|------| * ADD | X | X | P |------|------|------|------|------| * RM | C | C | C | P |------|------|------|------| * PRIO | X | C | C | C | C |------|------|------| * FAIL | X | X | C | X | X | X |------|------| * FC | X | X | X | X | X | X | X |------| * RST | X | X | X | X | X | X | O | O | * ------|------|------|------|------|------|------|------|------| * * The same applies in mptcp_established_options() function. */ if (likely(OPTION_MPTCP_DSS & opts->suboptions)) { struct mptcp_ext *mpext = &opts->ext_copy; u8 len = TCPOLEN_MPTCP_DSS_BASE; u8 flags = 0; if (mpext->use_ack) { flags = MPTCP_DSS_HAS_ACK; if (mpext->ack64) { len += TCPOLEN_MPTCP_DSS_ACK64; flags |= MPTCP_DSS_ACK64; } else { len += TCPOLEN_MPTCP_DSS_ACK32; } } if (mpext->use_map) { len += TCPOLEN_MPTCP_DSS_MAP64; /* Use only 64-bit mapping flags for now, add * support for optional 32-bit mappings later. */ flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64; if (mpext->data_fin) flags |= MPTCP_DSS_DATA_FIN; if (opts->csum_reqd) len += TCPOLEN_MPTCP_DSS_CHECKSUM; } *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags); if (mpext->use_ack) { if (mpext->ack64) { put_unaligned_be64(mpext->data_ack, ptr); ptr += 2; } else { put_unaligned_be32(mpext->data_ack32, ptr); ptr += 1; } } if (mpext->use_map) { put_unaligned_be64(mpext->data_seq, ptr); ptr += 2; put_unaligned_be32(mpext->subflow_seq, ptr); ptr += 1; if (opts->csum_reqd) { /* data_len == 0 is reserved for the infinite mapping, * the checksum will also be set to 0. */ put_len_csum(mpext->data_len, (mpext->data_len ? mptcp_make_csum(mpext) : 0), ptr); } else { put_unaligned_be32(mpext->data_len << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); } ptr += 1; } /* We might need to add MP_FAIL options in rare cases */ if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions)) goto mp_fail; } else if (OPTIONS_MPTCP_MPC & opts->suboptions) { u8 len, flag = MPTCP_CAP_HMAC_SHA256; if (OPTION_MPTCP_MPC_SYN & opts->suboptions) { len = TCPOLEN_MPTCP_MPC_SYN; } else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) { len = TCPOLEN_MPTCP_MPC_SYNACK; } else if (opts->data_len) { len = TCPOLEN_MPTCP_MPC_ACK_DATA; if (opts->csum_reqd) len += TCPOLEN_MPTCP_DSS_CHECKSUM; } else { len = TCPOLEN_MPTCP_MPC_ACK; } if (opts->csum_reqd) flag |= MPTCP_CAP_CHECKSUM_REQD; if (!opts->allow_join_id0) flag |= MPTCP_CAP_DENY_JOIN_ID0; *ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len, MPTCP_SUPPORTED_VERSION, flag); if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & opts->suboptions)) goto mp_capable_done; put_unaligned_be64(opts->sndr_key, ptr); ptr += 2; if (!((OPTION_MPTCP_MPC_ACK) & opts->suboptions)) goto mp_capable_done; put_unaligned_be64(opts->rcvr_key, ptr); ptr += 2; if (!opts->data_len) goto mp_capable_done; if (opts->csum_reqd) { put_len_csum(opts->data_len, __mptcp_make_csum(opts->data_seq, opts->subflow_seq, opts->data_len, ~csum_unfold(opts->csum)), ptr); } else { put_unaligned_be32(opts->data_len << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); } ptr += 1; /* MPC is additionally mutually exclusive with MP_PRIO */ goto mp_capable_done; } else if (OPTIONS_MPTCP_MPJ & opts->suboptions) { if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) { *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, TCPOLEN_MPTCP_MPJ_SYN, opts->backup, opts->join_id); put_unaligned_be32(opts->token, ptr); ptr += 1; put_unaligned_be32(opts->nonce, ptr); ptr += 1; } else if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) { *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, TCPOLEN_MPTCP_MPJ_SYNACK, opts->backup, opts->join_id); put_unaligned_be64(opts->thmac, ptr); ptr += 2; put_unaligned_be32(opts->nonce, ptr); ptr += 1; } else { *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, TCPOLEN_MPTCP_MPJ_ACK, 0, 0); memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN); ptr += 5; } } else if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE; u8 echo = MPTCP_ADDR_ECHO; #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (opts->addr.family == AF_INET6) len = TCPOLEN_MPTCP_ADD_ADDR6_BASE; #endif if (opts->addr.port) len += TCPOLEN_MPTCP_PORT_LEN; if (opts->ahmac) { len += sizeof(opts->ahmac); echo = 0; } *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, len, echo, opts->addr.id); if (opts->addr.family == AF_INET) { memcpy((u8 *)ptr, (u8 *)&opts->addr.addr.s_addr, 4); ptr += 1; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (opts->addr.family == AF_INET6) { memcpy((u8 *)ptr, opts->addr.addr6.s6_addr, 16); ptr += 4; } #endif if (!opts->addr.port) { if (opts->ahmac) { put_unaligned_be64(opts->ahmac, ptr); ptr += 2; } } else { u16 port = ntohs(opts->addr.port); if (opts->ahmac) { u8 *bptr = (u8 *)ptr; put_unaligned_be16(port, bptr); bptr += 2; put_unaligned_be64(opts->ahmac, bptr); bptr += 8; put_unaligned_be16(TCPOPT_NOP << 8 | TCPOPT_NOP, bptr); ptr += 3; } else { put_unaligned_be32(port << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); ptr += 1; } } } else if (unlikely(OPTION_MPTCP_FASTCLOSE & opts->suboptions)) { /* FASTCLOSE is mutually exclusive with others except RST */ *ptr++ = mptcp_option(MPTCPOPT_MP_FASTCLOSE, TCPOLEN_MPTCP_FASTCLOSE, 0, 0); put_unaligned_be64(opts->rcvr_key, ptr); ptr += 2; if (OPTION_MPTCP_RST & opts->suboptions) goto mp_rst; return; } else if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions)) { mp_fail: /* MP_FAIL is mutually exclusive with others except RST */ subflow = mptcp_subflow_ctx(ssk); subflow->send_mp_fail = 0; *ptr++ = mptcp_option(MPTCPOPT_MP_FAIL, TCPOLEN_MPTCP_FAIL, 0, 0); put_unaligned_be64(opts->fail_seq, ptr); ptr += 2; if (OPTION_MPTCP_RST & opts->suboptions) goto mp_rst; return; } else if (unlikely(OPTION_MPTCP_RST & opts->suboptions)) { mp_rst: *ptr++ = mptcp_option(MPTCPOPT_RST, TCPOLEN_MPTCP_RST, opts->reset_transient, opts->reset_reason); return; } else if (unlikely(!opts->suboptions)) { /* Fallback to TCP */ mptcp_track_rwin(tp); return; } if (OPTION_MPTCP_PRIO & opts->suboptions) { subflow = mptcp_subflow_ctx(ssk); subflow->send_mp_prio = 0; *ptr++ = mptcp_option(MPTCPOPT_MP_PRIO, TCPOLEN_MPTCP_PRIO, opts->backup, TCPOPT_NOP); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPPRIOTX); } mp_capable_done: if (OPTION_MPTCP_RM_ADDR & opts->suboptions) { u8 i = 1; *ptr++ = mptcp_option(MPTCPOPT_RM_ADDR, TCPOLEN_MPTCP_RM_ADDR_BASE + opts->rm_list.nr, 0, opts->rm_list.ids[0]); while (i < opts->rm_list.nr) { u8 id1, id2, id3, id4; id1 = opts->rm_list.ids[i]; id2 = i + 1 < opts->rm_list.nr ? opts->rm_list.ids[i + 1] : TCPOPT_NOP; id3 = i + 2 < opts->rm_list.nr ? opts->rm_list.ids[i + 2] : TCPOPT_NOP; id4 = i + 3 < opts->rm_list.nr ? opts->rm_list.ids[i + 3] : TCPOPT_NOP; put_unaligned_be32(id1 << 24 | id2 << 16 | id3 << 8 | id4, ptr); ptr += 1; i += 4; } } if (tp) mptcp_set_rwin(tp, th); } __be32 mptcp_get_reset_option(const struct sk_buff *skb) { const struct mptcp_ext *ext = mptcp_get_ext(skb); u8 flags, reason; if (ext) { flags = ext->reset_transient; reason = ext->reset_reason; return mptcp_option(MPTCPOPT_RST, TCPOLEN_MPTCP_RST, flags, reason); } return htonl(0u); } EXPORT_SYMBOL_GPL(mptcp_get_reset_option);
5 4 3 4 2 2 2 2 1 6 13 2 2 53 42 42 9 8 1 1 4 4 1 1 2 1 2 1 1 1 1 1 3 1 2 2 1 2 4 4 5 2 13 5 1 5 5 3 1 1 1 41 25 1 1 13 1 39 13 1 1 3 8 3 1 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 // SPDX-License-Identifier: GPL-2.0-or-later /* * Ioctl handler * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/capability.h> #include <linux/compat.h> #include <linux/kernel.h> #include <linux/if_bridge.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/times.h> #include <net/net_namespace.h> #include <linux/uaccess.h> #include "br_private.h" static int get_bridge_ifindices(struct net *net, int *indices, int num) { struct net_device *dev; int i = 0; rcu_read_lock(); for_each_netdev_rcu(net, dev) { if (i >= num) break; if (netif_is_bridge_master(dev)) indices[i++] = dev->ifindex; } rcu_read_unlock(); return i; } /* called with RTNL */ static void get_port_ifindices(struct net_bridge *br, int *ifindices, int num) { struct net_bridge_port *p; list_for_each_entry(p, &br->port_list, list) { if (p->port_no < num) ifindices[p->port_no] = p->dev->ifindex; } } /* * Format up to a page worth of forwarding table entries * userbuf -- where to copy result * maxnum -- maximum number of entries desired * (limited to a page for sanity) * offset -- number of records to skip */ static int get_fdb_entries(struct net_bridge *br, void __user *userbuf, unsigned long maxnum, unsigned long offset) { int num; void *buf; size_t size; /* Clamp size to PAGE_SIZE, test maxnum to avoid overflow */ if (maxnum > PAGE_SIZE/sizeof(struct __fdb_entry)) maxnum = PAGE_SIZE/sizeof(struct __fdb_entry); size = maxnum * sizeof(struct __fdb_entry); buf = kmalloc(size, GFP_USER); if (!buf) return -ENOMEM; num = br_fdb_fillbuf(br, buf, maxnum, offset); if (num > 0) { if (copy_to_user(userbuf, buf, array_size(num, sizeof(struct __fdb_entry)))) num = -EFAULT; } kfree(buf); return num; } /* called with RTNL */ static int add_del_if(struct net_bridge *br, int ifindex, int isadd) { struct net *net = dev_net(br->dev); struct net_device *dev; int ret; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; dev = __dev_get_by_index(net, ifindex); if (dev == NULL) return -EINVAL; if (isadd) ret = br_add_if(br, dev, NULL); else ret = br_del_if(br, dev); return ret; } #define BR_UARGS_MAX 4 static int br_dev_read_uargs(unsigned long *args, size_t nr_args, void __user **argp, void __user *data) { int ret; if (nr_args < 2 || nr_args > BR_UARGS_MAX) return -EINVAL; if (in_compat_syscall()) { unsigned int cargs[BR_UARGS_MAX]; int i; ret = copy_from_user(cargs, data, nr_args * sizeof(*cargs)); if (ret) goto fault; for (i = 0; i < nr_args; ++i) args[i] = cargs[i]; *argp = compat_ptr(args[1]); } else { ret = copy_from_user(args, data, nr_args * sizeof(*args)); if (ret) goto fault; *argp = (void __user *)args[1]; } return 0; fault: return -EFAULT; } /* * Legacy ioctl's through SIOCDEVPRIVATE * This interface is deprecated because it was too difficult * to do the translation for 32/64bit ioctl compatibility. */ int br_dev_siocdevprivate(struct net_device *dev, struct ifreq *rq, void __user *data, int cmd) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *p = NULL; unsigned long args[4]; void __user *argp; int ret; ret = br_dev_read_uargs(args, ARRAY_SIZE(args), &argp, data); if (ret) return ret; switch (args[0]) { case BRCTL_ADD_IF: case BRCTL_DEL_IF: return add_del_if(br, args[1], args[0] == BRCTL_ADD_IF); case BRCTL_GET_BRIDGE_INFO: { struct __bridge_info b; memset(&b, 0, sizeof(struct __bridge_info)); rcu_read_lock(); memcpy(&b.designated_root, &br->designated_root, 8); memcpy(&b.bridge_id, &br->bridge_id, 8); b.root_path_cost = br->root_path_cost; b.max_age = jiffies_to_clock_t(br->max_age); b.hello_time = jiffies_to_clock_t(br->hello_time); b.forward_delay = br->forward_delay; b.bridge_max_age = br->bridge_max_age; b.bridge_hello_time = br->bridge_hello_time; b.bridge_forward_delay = jiffies_to_clock_t(br->bridge_forward_delay); b.topology_change = br->topology_change; b.topology_change_detected = br->topology_change_detected; b.root_port = br->root_port; b.stp_enabled = (br->stp_enabled != BR_NO_STP); b.ageing_time = jiffies_to_clock_t(br->ageing_time); b.hello_timer_value = br_timer_value(&br->hello_timer); b.tcn_timer_value = br_timer_value(&br->tcn_timer); b.topology_change_timer_value = br_timer_value(&br->topology_change_timer); b.gc_timer_value = br_timer_value(&br->gc_work.timer); rcu_read_unlock(); if (copy_to_user((void __user *)args[1], &b, sizeof(b))) return -EFAULT; return 0; } case BRCTL_GET_PORT_LIST: { int num, *indices; num = args[2]; if (num < 0) return -EINVAL; if (num == 0) num = 256; if (num > BR_MAX_PORTS) num = BR_MAX_PORTS; indices = kcalloc(num, sizeof(int), GFP_KERNEL); if (indices == NULL) return -ENOMEM; get_port_ifindices(br, indices, num); if (copy_to_user(argp, indices, array_size(num, sizeof(int)))) num = -EFAULT; kfree(indices); return num; } case BRCTL_SET_BRIDGE_FORWARD_DELAY: if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; ret = br_set_forward_delay(br, args[1]); break; case BRCTL_SET_BRIDGE_HELLO_TIME: if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; ret = br_set_hello_time(br, args[1]); break; case BRCTL_SET_BRIDGE_MAX_AGE: if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; ret = br_set_max_age(br, args[1]); break; case BRCTL_SET_AGEING_TIME: if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; ret = br_set_ageing_time(br, args[1]); break; case BRCTL_GET_PORT_INFO: { struct __port_info p; struct net_bridge_port *pt; rcu_read_lock(); if ((pt = br_get_port(br, args[2])) == NULL) { rcu_read_unlock(); return -EINVAL; } memset(&p, 0, sizeof(struct __port_info)); memcpy(&p.designated_root, &pt->designated_root, 8); memcpy(&p.designated_bridge, &pt->designated_bridge, 8); p.port_id = pt->port_id; p.designated_port = pt->designated_port; p.path_cost = pt->path_cost; p.designated_cost = pt->designated_cost; p.state = pt->state; p.top_change_ack = pt->topology_change_ack; p.config_pending = pt->config_pending; p.message_age_timer_value = br_timer_value(&pt->message_age_timer); p.forward_delay_timer_value = br_timer_value(&pt->forward_delay_timer); p.hold_timer_value = br_timer_value(&pt->hold_timer); rcu_read_unlock(); if (copy_to_user(argp, &p, sizeof(p))) return -EFAULT; return 0; } case BRCTL_SET_BRIDGE_STP_STATE: if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; ret = br_stp_set_enabled(br, args[1], NULL); break; case BRCTL_SET_BRIDGE_PRIORITY: if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; br_stp_set_bridge_priority(br, args[1]); ret = 0; break; case BRCTL_SET_PORT_PRIORITY: { if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; spin_lock_bh(&br->lock); if ((p = br_get_port(br, args[1])) == NULL) ret = -EINVAL; else ret = br_stp_set_port_priority(p, args[2]); spin_unlock_bh(&br->lock); break; } case BRCTL_SET_PATH_COST: { if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; spin_lock_bh(&br->lock); if ((p = br_get_port(br, args[1])) == NULL) ret = -EINVAL; else ret = br_stp_set_path_cost(p, args[2]); spin_unlock_bh(&br->lock); break; } case BRCTL_GET_FDB_ENTRIES: return get_fdb_entries(br, argp, args[2], args[3]); default: ret = -EOPNOTSUPP; } if (!ret) { if (p) br_ifinfo_notify(RTM_NEWLINK, NULL, p); else netdev_state_change(br->dev); } return ret; } static int old_deviceless(struct net *net, void __user *data) { unsigned long args[3]; void __user *argp; int ret; ret = br_dev_read_uargs(args, ARRAY_SIZE(args), &argp, data); if (ret) return ret; switch (args[0]) { case BRCTL_GET_VERSION: return BRCTL_VERSION; case BRCTL_GET_BRIDGES: { int *indices; int ret = 0; if (args[2] >= 2048) return -ENOMEM; indices = kcalloc(args[2], sizeof(int), GFP_KERNEL); if (indices == NULL) return -ENOMEM; args[2] = get_bridge_ifindices(net, indices, args[2]); ret = copy_to_user(argp, indices, array_size(args[2], sizeof(int))) ? -EFAULT : args[2]; kfree(indices); return ret; } case BRCTL_ADD_BRIDGE: case BRCTL_DEL_BRIDGE: { char buf[IFNAMSIZ]; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (copy_from_user(buf, argp, IFNAMSIZ)) return -EFAULT; buf[IFNAMSIZ-1] = 0; if (args[0] == BRCTL_ADD_BRIDGE) return br_add_bridge(net, buf); return br_del_bridge(net, buf); } } return -EOPNOTSUPP; } int br_ioctl_stub(struct net *net, unsigned int cmd, void __user *uarg) { int ret = -EOPNOTSUPP; struct ifreq ifr; if (cmd == SIOCBRADDIF || cmd == SIOCBRDELIF) { void __user *data; char *colon; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (get_user_ifreq(&ifr, &data, uarg)) return -EFAULT; ifr.ifr_name[IFNAMSIZ - 1] = 0; colon = strchr(ifr.ifr_name, ':'); if (colon) *colon = 0; } rtnl_lock(); switch (cmd) { case SIOCGIFBR: case SIOCSIFBR: ret = old_deviceless(net, uarg); break; case SIOCBRADDBR: case SIOCBRDELBR: { char buf[IFNAMSIZ]; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } if (copy_from_user(buf, uarg, IFNAMSIZ)) { ret = -EFAULT; break; } buf[IFNAMSIZ-1] = 0; if (cmd == SIOCBRADDBR) ret = br_add_bridge(net, buf); else ret = br_del_bridge(net, buf); } break; case SIOCBRADDIF: case SIOCBRDELIF: { struct net_device *dev; dev = __dev_get_by_name(net, ifr.ifr_name); if (!dev || !netif_device_present(dev)) { ret = -ENODEV; break; } if (!netif_is_bridge_master(dev)) { ret = -EOPNOTSUPP; break; } ret = add_del_if(netdev_priv(dev), ifr.ifr_ifindex, cmd == SIOCBRADDIF); } break; } rtnl_unlock(); return ret; }
8 12 3 8 9 9 8 8 8 8 8 8 8 8 7 8 8 8 8 7 8 7 7 7 2 2 2 2 14 14 3 6 4 2 1 1 6 6 9 8 8 4 8 8 6 2 14 13 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 // SPDX-License-Identifier: GPL-2.0-or-later /* * CCM: Counter with CBC-MAC * * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com> */ #include <crypto/internal/aead.h> #include <crypto/internal/cipher.h> #include <crypto/internal/hash.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <crypto/utils.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/string.h> struct ccm_instance_ctx { struct crypto_skcipher_spawn ctr; struct crypto_ahash_spawn mac; }; struct crypto_ccm_ctx { struct crypto_ahash *mac; struct crypto_skcipher *ctr; }; struct crypto_rfc4309_ctx { struct crypto_aead *child; u8 nonce[3]; }; struct crypto_rfc4309_req_ctx { struct scatterlist src[3]; struct scatterlist dst[3]; struct aead_request subreq; }; struct crypto_ccm_req_priv_ctx { u8 odata[16]; u8 idata[16]; u8 auth_tag[16]; u32 flags; struct scatterlist src[3]; struct scatterlist dst[3]; union { struct ahash_request ahreq; struct skcipher_request skreq; }; }; struct cbcmac_tfm_ctx { struct crypto_cipher *child; }; static inline struct crypto_ccm_req_priv_ctx *crypto_ccm_reqctx( struct aead_request *req) { unsigned long align = crypto_aead_alignmask(crypto_aead_reqtfm(req)); return (void *)PTR_ALIGN((u8 *)aead_request_ctx(req), align + 1); } static int set_msg_len(u8 *block, unsigned int msglen, int csize) { __be32 data; memset(block, 0, csize); block += csize; if (csize >= 4) csize = 4; else if (msglen > (1 << (8 * csize))) return -EOVERFLOW; data = cpu_to_be32(msglen); memcpy(block - csize, (u8 *)&data + 4 - csize, csize); return 0; } static int crypto_ccm_setkey(struct crypto_aead *aead, const u8 *key, unsigned int keylen) { struct crypto_ccm_ctx *ctx = crypto_aead_ctx(aead); struct crypto_skcipher *ctr = ctx->ctr; struct crypto_ahash *mac = ctx->mac; int err; crypto_skcipher_clear_flags(ctr, CRYPTO_TFM_REQ_MASK); crypto_skcipher_set_flags(ctr, crypto_aead_get_flags(aead) & CRYPTO_TFM_REQ_MASK); err = crypto_skcipher_setkey(ctr, key, keylen); if (err) return err; crypto_ahash_clear_flags(mac, CRYPTO_TFM_REQ_MASK); crypto_ahash_set_flags(mac, crypto_aead_get_flags(aead) & CRYPTO_TFM_REQ_MASK); return crypto_ahash_setkey(mac, key, keylen); } static int crypto_ccm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) { switch (authsize) { case 4: case 6: case 8: case 10: case 12: case 14: case 16: break; default: return -EINVAL; } return 0; } static int format_input(u8 *info, struct aead_request *req, unsigned int cryptlen) { struct crypto_aead *aead = crypto_aead_reqtfm(req); unsigned int lp = req->iv[0]; unsigned int l = lp + 1; unsigned int m; m = crypto_aead_authsize(aead); memcpy(info, req->iv, 16); /* format control info per RFC 3610 and * NIST Special Publication 800-38C */ *info |= (8 * ((m - 2) / 2)); if (req->assoclen) *info |= 64; return set_msg_len(info + 16 - l, cryptlen, l); } static int format_adata(u8 *adata, unsigned int a) { int len = 0; /* add control info for associated data * RFC 3610 and NIST Special Publication 800-38C */ if (a < 65280) { *(__be16 *)adata = cpu_to_be16(a); len = 2; } else { *(__be16 *)adata = cpu_to_be16(0xfffe); *(__be32 *)&adata[2] = cpu_to_be32(a); len = 6; } return len; } static int crypto_ccm_auth(struct aead_request *req, struct scatterlist *plain, unsigned int cryptlen) { struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req); struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_ccm_ctx *ctx = crypto_aead_ctx(aead); struct ahash_request *ahreq = &pctx->ahreq; unsigned int assoclen = req->assoclen; struct scatterlist sg[3]; u8 *odata = pctx->odata; u8 *idata = pctx->idata; int ilen, err; /* format control data for input */ err = format_input(odata, req, cryptlen); if (err) goto out; sg_init_table(sg, 3); sg_set_buf(&sg[0], odata, 16); /* format associated data and compute into mac */ if (assoclen) { ilen = format_adata(idata, assoclen); sg_set_buf(&sg[1], idata, ilen); sg_chain(sg, 3, req->src); } else { ilen = 0; sg_chain(sg, 2, req->src); } ahash_request_set_tfm(ahreq, ctx->mac); ahash_request_set_callback(ahreq, pctx->flags, NULL, NULL); ahash_request_set_crypt(ahreq, sg, NULL, assoclen + ilen + 16); err = crypto_ahash_init(ahreq); if (err) goto out; err = crypto_ahash_update(ahreq); if (err) goto out; /* we need to pad the MAC input to a round multiple of the block size */ ilen = 16 - (assoclen + ilen) % 16; if (ilen < 16) { memset(idata, 0, ilen); sg_init_table(sg, 2); sg_set_buf(&sg[0], idata, ilen); if (plain) sg_chain(sg, 2, plain); plain = sg; cryptlen += ilen; } ahash_request_set_crypt(ahreq, plain, odata, cryptlen); err = crypto_ahash_finup(ahreq); out: return err; } static void crypto_ccm_encrypt_done(void *data, int err) { struct aead_request *req = data; struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req); u8 *odata = pctx->odata; if (!err) scatterwalk_map_and_copy(odata, req->dst, req->assoclen + req->cryptlen, crypto_aead_authsize(aead), 1); aead_request_complete(req, err); } static inline int crypto_ccm_check_iv(const u8 *iv) { /* 2 <= L <= 8, so 1 <= L' <= 7. */ if (1 > iv[0] || iv[0] > 7) return -EINVAL; return 0; } static int crypto_ccm_init_crypt(struct aead_request *req, u8 *tag) { struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req); struct scatterlist *sg; u8 *iv = req->iv; int err; err = crypto_ccm_check_iv(iv); if (err) return err; pctx->flags = aead_request_flags(req); /* Note: rfc 3610 and NIST 800-38C require counter of * zero to encrypt auth tag. */ memset(iv + 15 - iv[0], 0, iv[0] + 1); sg_init_table(pctx->src, 3); sg_set_buf(pctx->src, tag, 16); sg = scatterwalk_ffwd(pctx->src + 1, req->src, req->assoclen); if (sg != pctx->src + 1) sg_chain(pctx->src, 2, sg); if (req->src != req->dst) { sg_init_table(pctx->dst, 3); sg_set_buf(pctx->dst, tag, 16); sg = scatterwalk_ffwd(pctx->dst + 1, req->dst, req->assoclen); if (sg != pctx->dst + 1) sg_chain(pctx->dst, 2, sg); } return 0; } static int crypto_ccm_encrypt(struct aead_request *req) { struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_ccm_ctx *ctx = crypto_aead_ctx(aead); struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req); struct skcipher_request *skreq = &pctx->skreq; struct scatterlist *dst; unsigned int cryptlen = req->cryptlen; u8 *odata = pctx->odata; u8 *iv = req->iv; int err; err = crypto_ccm_init_crypt(req, odata); if (err) return err; err = crypto_ccm_auth(req, sg_next(pctx->src), cryptlen); if (err) return err; dst = pctx->src; if (req->src != req->dst) dst = pctx->dst; skcipher_request_set_tfm(skreq, ctx->ctr); skcipher_request_set_callback(skreq, pctx->flags, crypto_ccm_encrypt_done, req); skcipher_request_set_crypt(skreq, pctx->src, dst, cryptlen + 16, iv); err = crypto_skcipher_encrypt(skreq); if (err) return err; /* copy authtag to end of dst */ scatterwalk_map_and_copy(odata, sg_next(dst), cryptlen, crypto_aead_authsize(aead), 1); return err; } static void crypto_ccm_decrypt_done(void *data, int err) { struct aead_request *req = data; struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req); struct crypto_aead *aead = crypto_aead_reqtfm(req); unsigned int authsize = crypto_aead_authsize(aead); unsigned int cryptlen = req->cryptlen - authsize; struct scatterlist *dst; pctx->flags = 0; dst = sg_next(req->src == req->dst ? pctx->src : pctx->dst); if (!err) { err = crypto_ccm_auth(req, dst, cryptlen); if (!err && crypto_memneq(pctx->auth_tag, pctx->odata, authsize)) err = -EBADMSG; } aead_request_complete(req, err); } static int crypto_ccm_decrypt(struct aead_request *req) { struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_ccm_ctx *ctx = crypto_aead_ctx(aead); struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req); struct skcipher_request *skreq = &pctx->skreq; struct scatterlist *dst; unsigned int authsize = crypto_aead_authsize(aead); unsigned int cryptlen = req->cryptlen; u8 *authtag = pctx->auth_tag; u8 *odata = pctx->odata; u8 *iv = pctx->idata; int err; cryptlen -= authsize; err = crypto_ccm_init_crypt(req, authtag); if (err) return err; scatterwalk_map_and_copy(authtag, sg_next(pctx->src), cryptlen, authsize, 0); dst = pctx->src; if (req->src != req->dst) dst = pctx->dst; memcpy(iv, req->iv, 16); skcipher_request_set_tfm(skreq, ctx->ctr); skcipher_request_set_callback(skreq, pctx->flags, crypto_ccm_decrypt_done, req); skcipher_request_set_crypt(skreq, pctx->src, dst, cryptlen + 16, iv); err = crypto_skcipher_decrypt(skreq); if (err) return err; err = crypto_ccm_auth(req, sg_next(dst), cryptlen); if (err) return err; /* verify */ if (crypto_memneq(authtag, odata, authsize)) return -EBADMSG; return err; } static int crypto_ccm_init_tfm(struct crypto_aead *tfm) { struct aead_instance *inst = aead_alg_instance(tfm); struct ccm_instance_ctx *ictx = aead_instance_ctx(inst); struct crypto_ccm_ctx *ctx = crypto_aead_ctx(tfm); struct crypto_ahash *mac; struct crypto_skcipher *ctr; unsigned long align; int err; mac = crypto_spawn_ahash(&ictx->mac); if (IS_ERR(mac)) return PTR_ERR(mac); ctr = crypto_spawn_skcipher(&ictx->ctr); err = PTR_ERR(ctr); if (IS_ERR(ctr)) goto err_free_mac; ctx->mac = mac; ctx->ctr = ctr; align = crypto_aead_alignmask(tfm); align &= ~(crypto_tfm_ctx_alignment() - 1); crypto_aead_set_reqsize( tfm, align + sizeof(struct crypto_ccm_req_priv_ctx) + max(crypto_ahash_reqsize(mac), crypto_skcipher_reqsize(ctr))); return 0; err_free_mac: crypto_free_ahash(mac); return err; } static void crypto_ccm_exit_tfm(struct crypto_aead *tfm) { struct crypto_ccm_ctx *ctx = crypto_aead_ctx(tfm); crypto_free_ahash(ctx->mac); crypto_free_skcipher(ctx->ctr); } static void crypto_ccm_free(struct aead_instance *inst) { struct ccm_instance_ctx *ctx = aead_instance_ctx(inst); crypto_drop_ahash(&ctx->mac); crypto_drop_skcipher(&ctx->ctr); kfree(inst); } static int crypto_ccm_create_common(struct crypto_template *tmpl, struct rtattr **tb, const char *ctr_name, const char *mac_name) { struct skcipher_alg_common *ctr; u32 mask; struct aead_instance *inst; struct ccm_instance_ctx *ictx; struct hash_alg_common *mac; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AEAD, &mask); if (err) return err; inst = kzalloc(sizeof(*inst) + sizeof(*ictx), GFP_KERNEL); if (!inst) return -ENOMEM; ictx = aead_instance_ctx(inst); err = crypto_grab_ahash(&ictx->mac, aead_crypto_instance(inst), mac_name, 0, mask | CRYPTO_ALG_ASYNC); if (err) goto err_free_inst; mac = crypto_spawn_ahash_alg(&ictx->mac); err = -EINVAL; if (strncmp(mac->base.cra_name, "cbcmac(", 7) != 0 || mac->digestsize != 16) goto err_free_inst; err = crypto_grab_skcipher(&ictx->ctr, aead_crypto_instance(inst), ctr_name, 0, mask); if (err) goto err_free_inst; ctr = crypto_spawn_skcipher_alg_common(&ictx->ctr); /* The skcipher algorithm must be CTR mode, using 16-byte blocks. */ err = -EINVAL; if (strncmp(ctr->base.cra_name, "ctr(", 4) != 0 || ctr->ivsize != 16 || ctr->base.cra_blocksize != 1) goto err_free_inst; /* ctr and cbcmac must use the same underlying block cipher. */ if (strcmp(ctr->base.cra_name + 4, mac->base.cra_name + 7) != 0) goto err_free_inst; err = -ENAMETOOLONG; if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, "ccm(%s", ctr->base.cra_name + 4) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME, "ccm_base(%s,%s)", ctr->base.cra_driver_name, mac->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; inst->alg.base.cra_priority = (mac->base.cra_priority + ctr->base.cra_priority) / 2; inst->alg.base.cra_blocksize = 1; inst->alg.base.cra_alignmask = ctr->base.cra_alignmask; inst->alg.ivsize = 16; inst->alg.chunksize = ctr->chunksize; inst->alg.maxauthsize = 16; inst->alg.base.cra_ctxsize = sizeof(struct crypto_ccm_ctx); inst->alg.init = crypto_ccm_init_tfm; inst->alg.exit = crypto_ccm_exit_tfm; inst->alg.setkey = crypto_ccm_setkey; inst->alg.setauthsize = crypto_ccm_setauthsize; inst->alg.encrypt = crypto_ccm_encrypt; inst->alg.decrypt = crypto_ccm_decrypt; inst->free = crypto_ccm_free; err = aead_register_instance(tmpl, inst); if (err) { err_free_inst: crypto_ccm_free(inst); } return err; } static int crypto_ccm_create(struct crypto_template *tmpl, struct rtattr **tb) { const char *cipher_name; char ctr_name[CRYPTO_MAX_ALG_NAME]; char mac_name[CRYPTO_MAX_ALG_NAME]; cipher_name = crypto_attr_alg_name(tb[1]); if (IS_ERR(cipher_name)) return PTR_ERR(cipher_name); if (snprintf(ctr_name, CRYPTO_MAX_ALG_NAME, "ctr(%s)", cipher_name) >= CRYPTO_MAX_ALG_NAME) return -ENAMETOOLONG; if (snprintf(mac_name, CRYPTO_MAX_ALG_NAME, "cbcmac(%s)", cipher_name) >= CRYPTO_MAX_ALG_NAME) return -ENAMETOOLONG; return crypto_ccm_create_common(tmpl, tb, ctr_name, mac_name); } static int crypto_ccm_base_create(struct crypto_template *tmpl, struct rtattr **tb) { const char *ctr_name; const char *mac_name; ctr_name = crypto_attr_alg_name(tb[1]); if (IS_ERR(ctr_name)) return PTR_ERR(ctr_name); mac_name = crypto_attr_alg_name(tb[2]); if (IS_ERR(mac_name)) return PTR_ERR(mac_name); return crypto_ccm_create_common(tmpl, tb, ctr_name, mac_name); } static int crypto_rfc4309_setkey(struct crypto_aead *parent, const u8 *key, unsigned int keylen) { struct crypto_rfc4309_ctx *ctx = crypto_aead_ctx(parent); struct crypto_aead *child = ctx->child; if (keylen < 3) return -EINVAL; keylen -= 3; memcpy(ctx->nonce, key + keylen, 3); crypto_aead_clear_flags(child, CRYPTO_TFM_REQ_MASK); crypto_aead_set_flags(child, crypto_aead_get_flags(parent) & CRYPTO_TFM_REQ_MASK); return crypto_aead_setkey(child, key, keylen); } static int crypto_rfc4309_setauthsize(struct crypto_aead *parent, unsigned int authsize) { struct crypto_rfc4309_ctx *ctx = crypto_aead_ctx(parent); switch (authsize) { case 8: case 12: case 16: break; default: return -EINVAL; } return crypto_aead_setauthsize(ctx->child, authsize); } static struct aead_request *crypto_rfc4309_crypt(struct aead_request *req) { struct crypto_rfc4309_req_ctx *rctx = aead_request_ctx(req); struct aead_request *subreq = &rctx->subreq; struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_rfc4309_ctx *ctx = crypto_aead_ctx(aead); struct crypto_aead *child = ctx->child; struct scatterlist *sg; u8 *iv = PTR_ALIGN((u8 *)(subreq + 1) + crypto_aead_reqsize(child), crypto_aead_alignmask(child) + 1); /* L' */ iv[0] = 3; memcpy(iv + 1, ctx->nonce, 3); memcpy(iv + 4, req->iv, 8); scatterwalk_map_and_copy(iv + 16, req->src, 0, req->assoclen - 8, 0); sg_init_table(rctx->src, 3); sg_set_buf(rctx->src, iv + 16, req->assoclen - 8); sg = scatterwalk_ffwd(rctx->src + 1, req->src, req->assoclen); if (sg != rctx->src + 1) sg_chain(rctx->src, 2, sg); if (req->src != req->dst) { sg_init_table(rctx->dst, 3); sg_set_buf(rctx->dst, iv + 16, req->assoclen - 8); sg = scatterwalk_ffwd(rctx->dst + 1, req->dst, req->assoclen); if (sg != rctx->dst + 1) sg_chain(rctx->dst, 2, sg); } aead_request_set_tfm(subreq, child); aead_request_set_callback(subreq, req->base.flags, req->base.complete, req->base.data); aead_request_set_crypt(subreq, rctx->src, req->src == req->dst ? rctx->src : rctx->dst, req->cryptlen, iv); aead_request_set_ad(subreq, req->assoclen - 8); return subreq; } static int crypto_rfc4309_encrypt(struct aead_request *req) { if (req->assoclen != 16 && req->assoclen != 20) return -EINVAL; req = crypto_rfc4309_crypt(req); return crypto_aead_encrypt(req); } static int crypto_rfc4309_decrypt(struct aead_request *req) { if (req->assoclen != 16 && req->assoclen != 20) return -EINVAL; req = crypto_rfc4309_crypt(req); return crypto_aead_decrypt(req); } static int crypto_rfc4309_init_tfm(struct crypto_aead *tfm) { struct aead_instance *inst = aead_alg_instance(tfm); struct crypto_aead_spawn *spawn = aead_instance_ctx(inst); struct crypto_rfc4309_ctx *ctx = crypto_aead_ctx(tfm); struct crypto_aead *aead; unsigned long align; aead = crypto_spawn_aead(spawn); if (IS_ERR(aead)) return PTR_ERR(aead); ctx->child = aead; align = crypto_aead_alignmask(aead); align &= ~(crypto_tfm_ctx_alignment() - 1); crypto_aead_set_reqsize( tfm, sizeof(struct crypto_rfc4309_req_ctx) + ALIGN(crypto_aead_reqsize(aead), crypto_tfm_ctx_alignment()) + align + 32); return 0; } static void crypto_rfc4309_exit_tfm(struct crypto_aead *tfm) { struct crypto_rfc4309_ctx *ctx = crypto_aead_ctx(tfm); crypto_free_aead(ctx->child); } static void crypto_rfc4309_free(struct aead_instance *inst) { crypto_drop_aead(aead_instance_ctx(inst)); kfree(inst); } static int crypto_rfc4309_create(struct crypto_template *tmpl, struct rtattr **tb) { u32 mask; struct aead_instance *inst; struct crypto_aead_spawn *spawn; struct aead_alg *alg; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AEAD, &mask); if (err) return err; inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); if (!inst) return -ENOMEM; spawn = aead_instance_ctx(inst); err = crypto_grab_aead(spawn, aead_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; alg = crypto_spawn_aead_alg(spawn); err = -EINVAL; /* We only support 16-byte blocks. */ if (crypto_aead_alg_ivsize(alg) != 16) goto err_free_inst; /* Not a stream cipher? */ if (alg->base.cra_blocksize != 1) goto err_free_inst; err = -ENAMETOOLONG; if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, "rfc4309(%s)", alg->base.cra_name) >= CRYPTO_MAX_ALG_NAME || snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME, "rfc4309(%s)", alg->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; inst->alg.base.cra_priority = alg->base.cra_priority; inst->alg.base.cra_blocksize = 1; inst->alg.base.cra_alignmask = alg->base.cra_alignmask; inst->alg.ivsize = 8; inst->alg.chunksize = crypto_aead_alg_chunksize(alg); inst->alg.maxauthsize = 16; inst->alg.base.cra_ctxsize = sizeof(struct crypto_rfc4309_ctx); inst->alg.init = crypto_rfc4309_init_tfm; inst->alg.exit = crypto_rfc4309_exit_tfm; inst->alg.setkey = crypto_rfc4309_setkey; inst->alg.setauthsize = crypto_rfc4309_setauthsize; inst->alg.encrypt = crypto_rfc4309_encrypt; inst->alg.decrypt = crypto_rfc4309_decrypt; inst->free = crypto_rfc4309_free; err = aead_register_instance(tmpl, inst); if (err) { err_free_inst: crypto_rfc4309_free(inst); } return err; } static int crypto_cbcmac_digest_setkey(struct crypto_shash *parent, const u8 *inkey, unsigned int keylen) { struct cbcmac_tfm_ctx *ctx = crypto_shash_ctx(parent); return crypto_cipher_setkey(ctx->child, inkey, keylen); } static int crypto_cbcmac_digest_init(struct shash_desc *pdesc) { int bs = crypto_shash_digestsize(pdesc->tfm); u8 *dg = shash_desc_ctx(pdesc); memset(dg, 0, bs); return 0; } static int crypto_cbcmac_digest_update(struct shash_desc *pdesc, const u8 *p, unsigned int len) { struct crypto_shash *parent = pdesc->tfm; struct cbcmac_tfm_ctx *tctx = crypto_shash_ctx(parent); struct crypto_cipher *tfm = tctx->child; int bs = crypto_shash_digestsize(parent); u8 *dg = shash_desc_ctx(pdesc); do { crypto_xor(dg, p, bs); crypto_cipher_encrypt_one(tfm, dg, dg); p += bs; len -= bs; } while (len >= bs); return len; } static int crypto_cbcmac_digest_finup(struct shash_desc *pdesc, const u8 *src, unsigned int len, u8 *out) { struct crypto_shash *parent = pdesc->tfm; struct cbcmac_tfm_ctx *tctx = crypto_shash_ctx(parent); struct crypto_cipher *tfm = tctx->child; int bs = crypto_shash_digestsize(parent); u8 *dg = shash_desc_ctx(pdesc); if (len) { crypto_xor(dg, src, len); crypto_cipher_encrypt_one(tfm, out, dg); return 0; } memcpy(out, dg, bs); return 0; } static int cbcmac_init_tfm(struct crypto_tfm *tfm) { struct crypto_cipher *cipher; struct crypto_instance *inst = (void *)tfm->__crt_alg; struct crypto_cipher_spawn *spawn = crypto_instance_ctx(inst); struct cbcmac_tfm_ctx *ctx = crypto_tfm_ctx(tfm); cipher = crypto_spawn_cipher(spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->child = cipher; return 0; }; static void cbcmac_exit_tfm(struct crypto_tfm *tfm) { struct cbcmac_tfm_ctx *ctx = crypto_tfm_ctx(tfm); crypto_free_cipher(ctx->child); } static int cbcmac_create(struct crypto_template *tmpl, struct rtattr **tb) { struct shash_instance *inst; struct crypto_cipher_spawn *spawn; struct crypto_alg *alg; u32 mask; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SHASH, &mask); if (err) return err; inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); if (!inst) return -ENOMEM; spawn = shash_instance_ctx(inst); err = crypto_grab_cipher(spawn, shash_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; alg = crypto_spawn_cipher_alg(spawn); err = crypto_inst_setname(shash_crypto_instance(inst), tmpl->name, alg); if (err) goto err_free_inst; inst->alg.base.cra_priority = alg->cra_priority; inst->alg.base.cra_blocksize = alg->cra_blocksize; inst->alg.digestsize = alg->cra_blocksize; inst->alg.descsize = alg->cra_blocksize; inst->alg.base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY; inst->alg.base.cra_ctxsize = sizeof(struct cbcmac_tfm_ctx); inst->alg.base.cra_init = cbcmac_init_tfm; inst->alg.base.cra_exit = cbcmac_exit_tfm; inst->alg.init = crypto_cbcmac_digest_init; inst->alg.update = crypto_cbcmac_digest_update; inst->alg.finup = crypto_cbcmac_digest_finup; inst->alg.setkey = crypto_cbcmac_digest_setkey; inst->free = shash_free_singlespawn_instance; err = shash_register_instance(tmpl, inst); if (err) { err_free_inst: shash_free_singlespawn_instance(inst); } return err; } static struct crypto_template crypto_ccm_tmpls[] = { { .name = "cbcmac", .create = cbcmac_create, .module = THIS_MODULE, }, { .name = "ccm_base", .create = crypto_ccm_base_create, .module = THIS_MODULE, }, { .name = "ccm", .create = crypto_ccm_create, .module = THIS_MODULE, }, { .name = "rfc4309", .create = crypto_rfc4309_create, .module = THIS_MODULE, }, }; static int __init crypto_ccm_module_init(void) { return crypto_register_templates(crypto_ccm_tmpls, ARRAY_SIZE(crypto_ccm_tmpls)); } static void __exit crypto_ccm_module_exit(void) { crypto_unregister_templates(crypto_ccm_tmpls, ARRAY_SIZE(crypto_ccm_tmpls)); } module_init(crypto_ccm_module_init); module_exit(crypto_ccm_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Counter with CBC MAC"); MODULE_ALIAS_CRYPTO("ccm_base"); MODULE_ALIAS_CRYPTO("rfc4309"); MODULE_ALIAS_CRYPTO("ccm"); MODULE_ALIAS_CRYPTO("cbcmac"); MODULE_IMPORT_NS("CRYPTO_INTERNAL");
470 152 6 234 234 185 51 154 153 17 138 3 1 2 155 155 6 106 58 155 155 6 3 6 17 138 151 6 62 6 142 467 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2002 International Business Machines, Corp. * * This file is part of the SCTP kernel implementation * * These functions are the methods for accessing the SCTP inqueue. * * An SCTP inqueue is a queue into which you push SCTP packets * (which might be bundles or fragments of chunks) and out of which you * pop SCTP whole chunks. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <linux/interrupt.h> #include <linux/slab.h> /* Initialize an SCTP inqueue. */ void sctp_inq_init(struct sctp_inq *queue) { INIT_LIST_HEAD(&queue->in_chunk_list); queue->in_progress = NULL; /* Create a task for delivering data. */ INIT_WORK(&queue->immediate, NULL); } /* Properly release the chunk which is being worked on. */ static inline void sctp_inq_chunk_free(struct sctp_chunk *chunk) { if (chunk->head_skb) chunk->skb = chunk->head_skb; sctp_chunk_free(chunk); } /* Release the memory associated with an SCTP inqueue. */ void sctp_inq_free(struct sctp_inq *queue) { struct sctp_chunk *chunk, *tmp; /* Empty the queue. */ list_for_each_entry_safe(chunk, tmp, &queue->in_chunk_list, list) { list_del_init(&chunk->list); sctp_chunk_free(chunk); } /* If there is a packet which is currently being worked on, * free it as well. */ if (queue->in_progress) { sctp_inq_chunk_free(queue->in_progress); queue->in_progress = NULL; } } /* Put a new packet in an SCTP inqueue. * We assume that packet->sctp_hdr is set and in host byte order. */ void sctp_inq_push(struct sctp_inq *q, struct sctp_chunk *chunk) { /* Directly call the packet handling routine. */ if (chunk->rcvr->dead) { sctp_chunk_free(chunk); return; } /* We are now calling this either from the soft interrupt * or from the backlog processing. * Eventually, we should clean up inqueue to not rely * on the BH related data structures. */ list_add_tail(&chunk->list, &q->in_chunk_list); if (chunk->asoc) chunk->asoc->stats.ipackets++; q->immediate.func(&q->immediate); } /* Peek at the next chunk on the inqeue. */ struct sctp_chunkhdr *sctp_inq_peek(struct sctp_inq *queue) { struct sctp_chunk *chunk; struct sctp_chunkhdr *ch = NULL; chunk = queue->in_progress; /* If there is no more chunks in this packet, say so */ if (chunk->singleton || chunk->end_of_packet || chunk->pdiscard) return NULL; ch = (struct sctp_chunkhdr *)chunk->chunk_end; return ch; } /* Extract a chunk from an SCTP inqueue. * * WARNING: If you need to put the chunk on another queue, you need to * make a shallow copy (clone) of it. */ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue) { struct sctp_chunk *chunk; struct sctp_chunkhdr *ch = NULL; /* The assumption is that we are safe to process the chunks * at this time. */ chunk = queue->in_progress; if (chunk) { /* There is a packet that we have been working on. * Any post processing work to do before we move on? */ if (chunk->singleton || chunk->end_of_packet || chunk->pdiscard) { if (chunk->head_skb == chunk->skb) { chunk->skb = skb_shinfo(chunk->skb)->frag_list; goto new_skb; } if (chunk->skb->next) { chunk->skb = chunk->skb->next; goto new_skb; } sctp_inq_chunk_free(chunk); chunk = queue->in_progress = NULL; } else { /* Nothing to do. Next chunk in the packet, please. */ ch = (struct sctp_chunkhdr *)chunk->chunk_end; /* Force chunk->skb->data to chunk->chunk_end. */ skb_pull(chunk->skb, chunk->chunk_end - chunk->skb->data); /* We are guaranteed to pull a SCTP header. */ } } /* Do we need to take the next packet out of the queue to process? */ if (!chunk) { struct list_head *entry; next_chunk: /* Is the queue empty? */ entry = sctp_list_dequeue(&queue->in_chunk_list); if (!entry) return NULL; chunk = list_entry(entry, struct sctp_chunk, list); if (skb_is_gso(chunk->skb) && skb_is_gso_sctp(chunk->skb)) { /* GSO-marked skbs but without frags, handle * them normally */ if (skb_shinfo(chunk->skb)->frag_list) chunk->head_skb = chunk->skb; /* skbs with "cover letter" */ if (chunk->head_skb && chunk->skb->data_len == chunk->skb->len) { if (WARN_ON(!skb_shinfo(chunk->skb)->frag_list)) { __SCTP_INC_STATS(dev_net(chunk->skb->dev), SCTP_MIB_IN_PKT_DISCARDS); sctp_chunk_free(chunk); goto next_chunk; } chunk->skb = skb_shinfo(chunk->skb)->frag_list; } } if (chunk->asoc) sock_rps_save_rxhash(chunk->asoc->base.sk, chunk->skb); queue->in_progress = chunk; new_skb: /* This is the first chunk in the packet. */ ch = (struct sctp_chunkhdr *)chunk->skb->data; chunk->singleton = 1; chunk->data_accepted = 0; chunk->pdiscard = 0; chunk->auth = 0; chunk->has_asconf = 0; chunk->end_of_packet = 0; if (chunk->head_skb) { struct sctp_input_cb *cb = SCTP_INPUT_CB(chunk->skb), *head_cb = SCTP_INPUT_CB(chunk->head_skb); cb->chunk = head_cb->chunk; cb->af = head_cb->af; } } chunk->chunk_hdr = ch; chunk->chunk_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length)); skb_pull(chunk->skb, sizeof(*ch)); chunk->subh.v = NULL; /* Subheader is no longer valid. */ if (chunk->chunk_end + sizeof(*ch) <= skb_tail_pointer(chunk->skb)) { /* This is not a singleton */ chunk->singleton = 0; } else if (chunk->chunk_end > skb_tail_pointer(chunk->skb)) { /* Discard inside state machine. */ chunk->pdiscard = 1; chunk->chunk_end = skb_tail_pointer(chunk->skb); } else { /* We are at the end of the packet, so mark the chunk * in case we need to send a SACK. */ chunk->end_of_packet = 1; } pr_debug("+++sctp_inq_pop+++ chunk:%p[%s], length:%d, skb->len:%d\n", chunk, sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)), ntohs(chunk->chunk_hdr->length), chunk->skb->len); return chunk; } /* Set a top-half handler. * * Originally, we the top-half handler was scheduled as a BH. We now * call the handler directly in sctp_inq_push() at a time that * we know we are lock safe. * The intent is that this routine will pull stuff out of the * inqueue and process it. */ void sctp_inq_set_th_handler(struct sctp_inq *q, work_func_t callback) { INIT_WORK(&q->immediate, callback); }
93 93 36 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 // SPDX-License-Identifier: GPL-2.0-or-later /* * Sysfs attributes of bridge * Linux ethernet bridge * * Authors: * Stephen Hemminger <shemminger@osdl.org> */ #include <linux/capability.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/if_bridge.h> #include <linux/rtnetlink.h> #include <linux/spinlock.h> #include <linux/times.h> #include <linux/sched/signal.h> #include "br_private.h" /* IMPORTANT: new bridge options must be added with netlink support only * please do not add new sysfs entries */ #define to_bridge(cd) ((struct net_bridge *)netdev_priv(to_net_dev(cd))) /* * Common code for storing bridge parameters. */ static ssize_t store_bridge_parm(struct device *d, const char *buf, size_t len, int (*set)(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack)) { struct net_bridge *br = to_bridge(d); struct netlink_ext_ack extack = {0}; unsigned long val; int err; if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; err = kstrtoul(buf, 0, &val); if (err != 0) return err; if (!rtnl_trylock()) return restart_syscall(); err = (*set)(br, val, &extack); if (!err) netdev_state_change(br->dev); if (extack._msg) { if (err) br_err(br, "%s\n", extack._msg); else br_warn(br, "%s\n", extack._msg); } rtnl_unlock(); return err ? err : len; } static ssize_t forward_delay_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->forward_delay)); } static int set_forward_delay(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_set_forward_delay(br, val); } static ssize_t forward_delay_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_forward_delay); } static DEVICE_ATTR_RW(forward_delay); static ssize_t hello_time_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%lu\n", jiffies_to_clock_t(to_bridge(d)->hello_time)); } static int set_hello_time(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_set_hello_time(br, val); } static ssize_t hello_time_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_hello_time); } static DEVICE_ATTR_RW(hello_time); static ssize_t max_age_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%lu\n", jiffies_to_clock_t(to_bridge(d)->max_age)); } static int set_max_age(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_set_max_age(br, val); } static ssize_t max_age_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_max_age); } static DEVICE_ATTR_RW(max_age); static ssize_t ageing_time_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->ageing_time)); } static int set_ageing_time(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_set_ageing_time(br, val); } static ssize_t ageing_time_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_ageing_time); } static DEVICE_ATTR_RW(ageing_time); static ssize_t stp_state_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br->stp_enabled); } static int set_stp_state(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_stp_set_enabled(br, val, extack); } static ssize_t stp_state_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_stp_state); } static DEVICE_ATTR_RW(stp_state); static ssize_t group_fwd_mask_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%#x\n", br->group_fwd_mask); } static int set_group_fwd_mask(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { if (val & BR_GROUPFWD_RESTRICTED) return -EINVAL; br->group_fwd_mask = val; return 0; } static ssize_t group_fwd_mask_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_group_fwd_mask); } static DEVICE_ATTR_RW(group_fwd_mask); static ssize_t priority_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]); } static int set_priority(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_stp_set_bridge_priority(br, (u16) val); return 0; } static ssize_t priority_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_priority); } static DEVICE_ATTR_RW(priority); static ssize_t root_id_show(struct device *d, struct device_attribute *attr, char *buf) { return br_show_bridge_id(buf, &to_bridge(d)->designated_root); } static DEVICE_ATTR_RO(root_id); static ssize_t bridge_id_show(struct device *d, struct device_attribute *attr, char *buf) { return br_show_bridge_id(buf, &to_bridge(d)->bridge_id); } static DEVICE_ATTR_RO(bridge_id); static ssize_t root_port_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", to_bridge(d)->root_port); } static DEVICE_ATTR_RO(root_port); static ssize_t root_path_cost_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", to_bridge(d)->root_path_cost); } static DEVICE_ATTR_RO(root_path_cost); static ssize_t topology_change_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", to_bridge(d)->topology_change); } static DEVICE_ATTR_RO(topology_change); static ssize_t topology_change_detected_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br->topology_change_detected); } static DEVICE_ATTR_RO(topology_change_detected); static ssize_t hello_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%ld\n", br_timer_value(&br->hello_timer)); } static DEVICE_ATTR_RO(hello_timer); static ssize_t tcn_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%ld\n", br_timer_value(&br->tcn_timer)); } static DEVICE_ATTR_RO(tcn_timer); static ssize_t topology_change_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%ld\n", br_timer_value(&br->topology_change_timer)); } static DEVICE_ATTR_RO(topology_change_timer); static ssize_t gc_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%ld\n", br_timer_value(&br->gc_work.timer)); } static DEVICE_ATTR_RO(gc_timer); static ssize_t group_addr_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%pM\n", br->group_addr); } static ssize_t group_addr_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { struct net_bridge *br = to_bridge(d); u8 new_addr[6]; if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (!mac_pton(buf, new_addr)) return -EINVAL; if (!is_link_local_ether_addr(new_addr)) return -EINVAL; if (new_addr[5] == 1 || /* 802.3x Pause address */ new_addr[5] == 2 || /* 802.3ad Slow protocols */ new_addr[5] == 3) /* 802.1X PAE address */ return -EINVAL; if (!rtnl_trylock()) return restart_syscall(); spin_lock_bh(&br->lock); ether_addr_copy(br->group_addr, new_addr); spin_unlock_bh(&br->lock); br_opt_toggle(br, BROPT_GROUP_ADDR_SET, true); br_recalculate_fwd_mask(br); netdev_state_change(br->dev); rtnl_unlock(); return len; } static DEVICE_ATTR_RW(group_addr); static int set_flush(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { struct net_bridge_fdb_flush_desc desc = { .flags_mask = BIT(BR_FDB_STATIC) }; br_fdb_flush(br, &desc); return 0; } static ssize_t flush_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_flush); } static DEVICE_ATTR_WO(flush); static ssize_t no_linklocal_learn_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br_boolopt_get(br, BR_BOOLOPT_NO_LL_LEARN)); } static int set_no_linklocal_learn(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_boolopt_toggle(br, BR_BOOLOPT_NO_LL_LEARN, !!val, extack); } static ssize_t no_linklocal_learn_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_no_linklocal_learn); } static DEVICE_ATTR_RW(no_linklocal_learn); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t multicast_router_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br->multicast_ctx.multicast_router); } static int set_multicast_router(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_multicast_set_router(&br->multicast_ctx, val); } static ssize_t multicast_router_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_multicast_router); } static DEVICE_ATTR_RW(multicast_router); static ssize_t multicast_snooping_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_ENABLED)); } static ssize_t multicast_snooping_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, br_multicast_toggle); } static DEVICE_ATTR_RW(multicast_snooping); static ssize_t multicast_query_use_ifaddr_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR)); } static int set_query_use_ifaddr(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_MULTICAST_QUERY_USE_IFADDR, !!val); return 0; } static ssize_t multicast_query_use_ifaddr_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_query_use_ifaddr); } static DEVICE_ATTR_RW(multicast_query_use_ifaddr); static ssize_t multicast_querier_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br->multicast_ctx.multicast_querier); } static int set_multicast_querier(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_multicast_set_querier(&br->multicast_ctx, val); } static ssize_t multicast_querier_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_multicast_querier); } static DEVICE_ATTR_RW(multicast_querier); static ssize_t hash_elasticity_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%u\n", RHT_ELASTICITY); } static int set_elasticity(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { /* 16 is RHT_ELASTICITY */ NL_SET_ERR_MSG_MOD(extack, "the hash_elasticity option has been deprecated and is always 16"); return 0; } static ssize_t hash_elasticity_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_elasticity); } static DEVICE_ATTR_RW(hash_elasticity); static ssize_t hash_max_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br->hash_max); } static int set_hash_max(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->hash_max = val; return 0; } static ssize_t hash_max_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_hash_max); } static DEVICE_ATTR_RW(hash_max); static ssize_t multicast_igmp_version_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br->multicast_ctx.multicast_igmp_version); } static int set_multicast_igmp_version(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_multicast_set_igmp_version(&br->multicast_ctx, val); } static ssize_t multicast_igmp_version_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_multicast_igmp_version); } static DEVICE_ATTR_RW(multicast_igmp_version); static ssize_t multicast_last_member_count_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br->multicast_ctx.multicast_last_member_count); } static int set_last_member_count(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->multicast_ctx.multicast_last_member_count = val; return 0; } static ssize_t multicast_last_member_count_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_last_member_count); } static DEVICE_ATTR_RW(multicast_last_member_count); static ssize_t multicast_startup_query_count_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br->multicast_ctx.multicast_startup_query_count); } static int set_startup_query_count(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->multicast_ctx.multicast_startup_query_count = val; return 0; } static ssize_t multicast_startup_query_count_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_startup_query_count); } static DEVICE_ATTR_RW(multicast_startup_query_count); static ssize_t multicast_last_member_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->multicast_ctx.multicast_last_member_interval)); } static int set_last_member_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->multicast_ctx.multicast_last_member_interval = clock_t_to_jiffies(val); return 0; } static ssize_t multicast_last_member_interval_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_last_member_interval); } static DEVICE_ATTR_RW(multicast_last_member_interval); static ssize_t multicast_membership_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->multicast_ctx.multicast_membership_interval)); } static int set_membership_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->multicast_ctx.multicast_membership_interval = clock_t_to_jiffies(val); return 0; } static ssize_t multicast_membership_interval_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_membership_interval); } static DEVICE_ATTR_RW(multicast_membership_interval); static ssize_t multicast_querier_interval_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->multicast_ctx.multicast_querier_interval)); } static int set_querier_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->multicast_ctx.multicast_querier_interval = clock_t_to_jiffies(val); return 0; } static ssize_t multicast_querier_interval_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_querier_interval); } static DEVICE_ATTR_RW(multicast_querier_interval); static ssize_t multicast_query_interval_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->multicast_ctx.multicast_query_interval)); } static int set_query_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_multicast_set_query_intvl(&br->multicast_ctx, val); return 0; } static ssize_t multicast_query_interval_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_query_interval); } static DEVICE_ATTR_RW(multicast_query_interval); static ssize_t multicast_query_response_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf( buf, "%lu\n", jiffies_to_clock_t(br->multicast_ctx.multicast_query_response_interval)); } static int set_query_response_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->multicast_ctx.multicast_query_response_interval = clock_t_to_jiffies(val); return 0; } static ssize_t multicast_query_response_interval_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_query_response_interval); } static DEVICE_ATTR_RW(multicast_query_response_interval); static ssize_t multicast_startup_query_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf( buf, "%lu\n", jiffies_to_clock_t(br->multicast_ctx.multicast_startup_query_interval)); } static int set_startup_query_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_multicast_set_startup_query_intvl(&br->multicast_ctx, val); return 0; } static ssize_t multicast_startup_query_interval_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_startup_query_interval); } static DEVICE_ATTR_RW(multicast_startup_query_interval); static ssize_t multicast_stats_enabled_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)); } static int set_stats_enabled(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_MULTICAST_STATS_ENABLED, !!val); return 0; } static ssize_t multicast_stats_enabled_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_stats_enabled); } static DEVICE_ATTR_RW(multicast_stats_enabled); #if IS_ENABLED(CONFIG_IPV6) static ssize_t multicast_mld_version_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br->multicast_ctx.multicast_mld_version); } static int set_multicast_mld_version(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_multicast_set_mld_version(&br->multicast_ctx, val); } static ssize_t multicast_mld_version_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_multicast_mld_version); } static DEVICE_ATTR_RW(multicast_mld_version); #endif #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static ssize_t nf_call_iptables_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IPTABLES)); } static int set_nf_call_iptables(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_NF_CALL_IPTABLES, !!val); return 0; } static ssize_t nf_call_iptables_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_nf_call_iptables); } static DEVICE_ATTR_RW(nf_call_iptables); static ssize_t nf_call_ip6tables_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IP6TABLES)); } static int set_nf_call_ip6tables(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_NF_CALL_IP6TABLES, !!val); return 0; } static ssize_t nf_call_ip6tables_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_nf_call_ip6tables); } static DEVICE_ATTR_RW(nf_call_ip6tables); static ssize_t nf_call_arptables_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_ARPTABLES)); } static int set_nf_call_arptables(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_NF_CALL_ARPTABLES, !!val); return 0; } static ssize_t nf_call_arptables_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_nf_call_arptables); } static DEVICE_ATTR_RW(nf_call_arptables); #endif #ifdef CONFIG_BRIDGE_VLAN_FILTERING static ssize_t vlan_filtering_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br_opt_get(br, BROPT_VLAN_ENABLED)); } static ssize_t vlan_filtering_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, br_vlan_filter_toggle); } static DEVICE_ATTR_RW(vlan_filtering); static ssize_t vlan_protocol_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%#06x\n", ntohs(br->vlan_proto)); } static ssize_t vlan_protocol_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, br_vlan_set_proto); } static DEVICE_ATTR_RW(vlan_protocol); static ssize_t default_pvid_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br->default_pvid); } static ssize_t default_pvid_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, br_vlan_set_default_pvid); } static DEVICE_ATTR_RW(default_pvid); static ssize_t vlan_stats_enabled_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_ENABLED)); } static int set_vlan_stats_enabled(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_vlan_set_stats(br, val); } static ssize_t vlan_stats_enabled_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_vlan_stats_enabled); } static DEVICE_ATTR_RW(vlan_stats_enabled); static ssize_t vlan_stats_per_port_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)); } static int set_vlan_stats_per_port(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_vlan_set_stats_per_port(br, val); } static ssize_t vlan_stats_per_port_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_vlan_stats_per_port); } static DEVICE_ATTR_RW(vlan_stats_per_port); #endif static struct attribute *bridge_attrs[] = { &dev_attr_forward_delay.attr, &dev_attr_hello_time.attr, &dev_attr_max_age.attr, &dev_attr_ageing_time.attr, &dev_attr_stp_state.attr, &dev_attr_group_fwd_mask.attr, &dev_attr_priority.attr, &dev_attr_bridge_id.attr, &dev_attr_root_id.attr, &dev_attr_root_path_cost.attr, &dev_attr_root_port.attr, &dev_attr_topology_change.attr, &dev_attr_topology_change_detected.attr, &dev_attr_hello_timer.attr, &dev_attr_tcn_timer.attr, &dev_attr_topology_change_timer.attr, &dev_attr_gc_timer.attr, &dev_attr_group_addr.attr, &dev_attr_flush.attr, &dev_attr_no_linklocal_learn.attr, #ifdef CONFIG_BRIDGE_IGMP_SNOOPING &dev_attr_multicast_router.attr, &dev_attr_multicast_snooping.attr, &dev_attr_multicast_querier.attr, &dev_attr_multicast_query_use_ifaddr.attr, &dev_attr_hash_elasticity.attr, &dev_attr_hash_max.attr, &dev_attr_multicast_last_member_count.attr, &dev_attr_multicast_startup_query_count.attr, &dev_attr_multicast_last_member_interval.attr, &dev_attr_multicast_membership_interval.attr, &dev_attr_multicast_querier_interval.attr, &dev_attr_multicast_query_interval.attr, &dev_attr_multicast_query_response_interval.attr, &dev_attr_multicast_startup_query_interval.attr, &dev_attr_multicast_stats_enabled.attr, &dev_attr_multicast_igmp_version.attr, #if IS_ENABLED(CONFIG_IPV6) &dev_attr_multicast_mld_version.attr, #endif #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) &dev_attr_nf_call_iptables.attr, &dev_attr_nf_call_ip6tables.attr, &dev_attr_nf_call_arptables.attr, #endif #ifdef CONFIG_BRIDGE_VLAN_FILTERING &dev_attr_vlan_filtering.attr, &dev_attr_vlan_protocol.attr, &dev_attr_default_pvid.attr, &dev_attr_vlan_stats_enabled.attr, &dev_attr_vlan_stats_per_port.attr, #endif NULL }; static const struct attribute_group bridge_group = { .name = SYSFS_BRIDGE_ATTR, .attrs = bridge_attrs, }; /* * Export the forwarding information table as a binary file * The records are struct __fdb_entry. * * Returns the number of bytes read. */ static ssize_t brforward_read(struct file *filp, struct kobject *kobj, const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct device *dev = kobj_to_dev(kobj); struct net_bridge *br = to_bridge(dev); int n; /* must read whole records */ if (off % sizeof(struct __fdb_entry) != 0) return -EINVAL; n = br_fdb_fillbuf(br, buf, count / sizeof(struct __fdb_entry), off / sizeof(struct __fdb_entry)); if (n > 0) n *= sizeof(struct __fdb_entry); return n; } static const struct bin_attribute bridge_forward = { .attr = { .name = SYSFS_BRIDGE_FDB, .mode = 0444, }, .read = brforward_read, }; /* * Add entries in sysfs onto the existing network class device * for the bridge. * Adds a attribute group "bridge" containing tuning parameters. * Binary attribute containing the forward table * Sub directory to hold links to interfaces. * * Note: the ifobj exists only to be a subdirectory * to hold links. The ifobj exists in same data structure * as it's parent the bridge so reference counting works. */ int br_sysfs_addbr(struct net_device *dev) { struct kobject *brobj = &dev->dev.kobj; struct net_bridge *br = netdev_priv(dev); int err; err = sysfs_create_group(brobj, &bridge_group); if (err) { pr_info("%s: can't create group %s/%s\n", __func__, dev->name, bridge_group.name); goto out1; } err = sysfs_create_bin_file(brobj, &bridge_forward); if (err) { pr_info("%s: can't create attribute file %s/%s\n", __func__, dev->name, bridge_forward.attr.name); goto out2; } br->ifobj = kobject_create_and_add(SYSFS_BRIDGE_PORT_SUBDIR, brobj); if (!br->ifobj) { pr_info("%s: can't add kobject (directory) %s/%s\n", __func__, dev->name, SYSFS_BRIDGE_PORT_SUBDIR); err = -ENOMEM; goto out3; } return 0; out3: sysfs_remove_bin_file(&dev->dev.kobj, &bridge_forward); out2: sysfs_remove_group(&dev->dev.kobj, &bridge_group); out1: return err; } void br_sysfs_delbr(struct net_device *dev) { struct kobject *kobj = &dev->dev.kobj; struct net_bridge *br = netdev_priv(dev); kobject_put(br->ifobj); sysfs_remove_bin_file(kobj, &bridge_forward); sysfs_remove_group(kobj, &bridge_group); }
9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 /* * Non-physical true random number generator based on timing jitter -- * Jitter RNG standalone code. * * Copyright Stephan Mueller <smueller@chronox.de>, 2015 - 2023 * * Design * ====== * * See https://www.chronox.de/jent.html * * License * ======= * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, and the entire permission notice in its entirety, * including the disclaimer of warranties. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * ALTERNATIVELY, this product may be distributed under the terms of * the GNU General Public License, in which case the provisions of the GPL2 are * required INSTEAD OF the above restrictions. (This clause is * necessary due to a potential bad interaction between the GPL and * the restrictions contained in a BSD-style copyright.) * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ /* * This Jitterentropy RNG is based on the jitterentropy library * version 3.4.0 provided at https://www.chronox.de/jent.html */ #ifdef __OPTIMIZE__ #error "The CPU Jitter random number generator must not be compiled with optimizations. See documentation. Use the compiler switch -O0 for compiling jitterentropy.c." #endif typedef unsigned long long __u64; typedef long long __s64; typedef unsigned int __u32; typedef unsigned char u8; #define NULL ((void *) 0) /* The entropy pool */ struct rand_data { /* SHA3-256 is used as conditioner */ #define DATA_SIZE_BITS 256 /* all data values that are vital to maintain the security * of the RNG are marked as SENSITIVE. A user must not * access that information while the RNG executes its loops to * calculate the next random value. */ void *hash_state; /* SENSITIVE hash state entropy pool */ __u64 prev_time; /* SENSITIVE Previous time stamp */ __u64 last_delta; /* SENSITIVE stuck test */ __s64 last_delta2; /* SENSITIVE stuck test */ unsigned int flags; /* Flags used to initialize */ unsigned int osr; /* Oversample rate */ #define JENT_MEMORY_ACCESSLOOPS 128 #define JENT_MEMORY_SIZE \ (CONFIG_CRYPTO_JITTERENTROPY_MEMORY_BLOCKS * \ CONFIG_CRYPTO_JITTERENTROPY_MEMORY_BLOCKSIZE) unsigned char *mem; /* Memory access location with size of * memblocks * memblocksize */ unsigned int memlocation; /* Pointer to byte in *mem */ unsigned int memblocks; /* Number of memory blocks in *mem */ unsigned int memblocksize; /* Size of one memory block in bytes */ unsigned int memaccessloops; /* Number of memory accesses per random * bit generation */ /* Repetition Count Test */ unsigned int rct_count; /* Number of stuck values */ /* Adaptive Proportion Test cutoff values */ unsigned int apt_cutoff; /* Intermittent health test failure */ unsigned int apt_cutoff_permanent; /* Permanent health test failure */ #define JENT_APT_WINDOW_SIZE 512 /* Data window size */ /* LSB of time stamp to process */ #define JENT_APT_LSB 16 #define JENT_APT_WORD_MASK (JENT_APT_LSB - 1) unsigned int apt_observations; /* Number of collected observations */ unsigned int apt_count; /* APT counter */ unsigned int apt_base; /* APT base reference */ unsigned int health_failure; /* Record health failure */ unsigned int apt_base_set:1; /* APT base reference set? */ }; /* Flags that can be used to initialize the RNG */ #define JENT_DISABLE_MEMORY_ACCESS (1<<2) /* Disable memory access for more * entropy, saves MEMORY_SIZE RAM for * entropy collector */ /* -- error codes for init function -- */ #define JENT_ENOTIME 1 /* Timer service not available */ #define JENT_ECOARSETIME 2 /* Timer too coarse for RNG */ #define JENT_ENOMONOTONIC 3 /* Timer is not monotonic increasing */ #define JENT_EVARVAR 5 /* Timer does not produce variations of * variations (2nd derivation of time is * zero). */ #define JENT_ESTUCK 8 /* Too many stuck results during init. */ #define JENT_EHEALTH 9 /* Health test failed during initialization */ #define JENT_ERCT 10 /* RCT failed during initialization */ #define JENT_EHASH 11 /* Hash self test failed */ #define JENT_EMEM 12 /* Can't allocate memory for initialization */ #define JENT_RCT_FAILURE 1 /* Failure in RCT health test. */ #define JENT_APT_FAILURE 2 /* Failure in APT health test. */ #define JENT_PERMANENT_FAILURE_SHIFT 16 #define JENT_PERMANENT_FAILURE(x) (x << JENT_PERMANENT_FAILURE_SHIFT) #define JENT_RCT_FAILURE_PERMANENT JENT_PERMANENT_FAILURE(JENT_RCT_FAILURE) #define JENT_APT_FAILURE_PERMANENT JENT_PERMANENT_FAILURE(JENT_APT_FAILURE) /* * The output n bits can receive more than n bits of min entropy, of course, * but the fixed output of the conditioning function can only asymptotically * approach the output size bits of min entropy, not attain that bound. Random * maps will tend to have output collisions, which reduces the creditable * output entropy (that is what SP 800-90B Section 3.1.5.1.2 attempts to bound). * * The value "64" is justified in Appendix A.4 of the current 90C draft, * and aligns with NIST's in "epsilon" definition in this document, which is * that a string can be considered "full entropy" if you can bound the min * entropy in each bit of output to at least 1-epsilon, where epsilon is * required to be <= 2^(-32). */ #define JENT_ENTROPY_SAFETY_FACTOR 64 #include <linux/array_size.h> #include <linux/fips.h> #include <linux/minmax.h> #include "jitterentropy.h" /*************************************************************************** * Adaptive Proportion Test * * This test complies with SP800-90B section 4.4.2. ***************************************************************************/ /* * See the SP 800-90B comment #10b for the corrected cutoff for the SP 800-90B * APT. * https://www.untruth.org/~josh/sp80090b/UL%20SP800-90B-final%20comments%20v1.9%2020191212.pdf * In the syntax of R, this is C = 2 + qbinom(1 − 2^(−30), 511, 2^(-1/osr)). * (The original formula wasn't correct because the first symbol must * necessarily have been observed, so there is no chance of observing 0 of these * symbols.) * * For the alpha < 2^-53, R cannot be used as it uses a float data type without * arbitrary precision. A SageMath script is used to calculate those cutoff * values. * * For any value above 14, this yields the maximal allowable value of 512 * (by FIPS 140-2 IG 7.19 Resolution # 16, we cannot choose a cutoff value that * renders the test unable to fail). */ static const unsigned int jent_apt_cutoff_lookup[15] = { 325, 422, 459, 477, 488, 494, 499, 502, 505, 507, 508, 509, 510, 511, 512 }; static const unsigned int jent_apt_cutoff_permanent_lookup[15] = { 355, 447, 479, 494, 502, 507, 510, 512, 512, 512, 512, 512, 512, 512, 512 }; static void jent_apt_init(struct rand_data *ec, unsigned int osr) { /* * Establish the apt_cutoff based on the presumed entropy rate of * 1/osr. */ if (osr >= ARRAY_SIZE(jent_apt_cutoff_lookup)) { ec->apt_cutoff = jent_apt_cutoff_lookup[ ARRAY_SIZE(jent_apt_cutoff_lookup) - 1]; ec->apt_cutoff_permanent = jent_apt_cutoff_permanent_lookup[ ARRAY_SIZE(jent_apt_cutoff_permanent_lookup) - 1]; } else { ec->apt_cutoff = jent_apt_cutoff_lookup[osr - 1]; ec->apt_cutoff_permanent = jent_apt_cutoff_permanent_lookup[osr - 1]; } } /* * Reset the APT counter * * @ec [in] Reference to entropy collector */ static void jent_apt_reset(struct rand_data *ec, unsigned int delta_masked) { /* Reset APT counter */ ec->apt_count = 0; ec->apt_base = delta_masked; ec->apt_observations = 0; } /* * Insert a new entropy event into APT * * @ec [in] Reference to entropy collector * @delta_masked [in] Masked time delta to process */ static void jent_apt_insert(struct rand_data *ec, unsigned int delta_masked) { /* Initialize the base reference */ if (!ec->apt_base_set) { ec->apt_base = delta_masked; ec->apt_base_set = 1; return; } if (delta_masked == ec->apt_base) { ec->apt_count++; /* Note, ec->apt_count starts with one. */ if (ec->apt_count >= ec->apt_cutoff_permanent) ec->health_failure |= JENT_APT_FAILURE_PERMANENT; else if (ec->apt_count >= ec->apt_cutoff) ec->health_failure |= JENT_APT_FAILURE; } ec->apt_observations++; if (ec->apt_observations >= JENT_APT_WINDOW_SIZE) jent_apt_reset(ec, delta_masked); } /*************************************************************************** * Stuck Test and its use as Repetition Count Test * * The Jitter RNG uses an enhanced version of the Repetition Count Test * (RCT) specified in SP800-90B section 4.4.1. Instead of counting identical * back-to-back values, the input to the RCT is the counting of the stuck * values during the generation of one Jitter RNG output block. * * The RCT is applied with an alpha of 2^{-30} compliant to FIPS 140-2 IG 9.8. * * During the counting operation, the Jitter RNG always calculates the RCT * cut-off value of C. If that value exceeds the allowed cut-off value, * the Jitter RNG output block will be calculated completely but discarded at * the end. The caller of the Jitter RNG is informed with an error code. ***************************************************************************/ /* * Repetition Count Test as defined in SP800-90B section 4.4.1 * * @ec [in] Reference to entropy collector * @stuck [in] Indicator whether the value is stuck */ static void jent_rct_insert(struct rand_data *ec, int stuck) { if (stuck) { ec->rct_count++; /* * The cutoff value is based on the following consideration: * alpha = 2^-30 or 2^-60 as recommended in SP800-90B. * In addition, we require an entropy value H of 1/osr as this * is the minimum entropy required to provide full entropy. * Note, we collect (DATA_SIZE_BITS + ENTROPY_SAFETY_FACTOR)*osr * deltas for inserting them into the entropy pool which should * then have (close to) DATA_SIZE_BITS bits of entropy in the * conditioned output. * * Note, ec->rct_count (which equals to value B in the pseudo * code of SP800-90B section 4.4.1) starts with zero. Hence * we need to subtract one from the cutoff value as calculated * following SP800-90B. Thus C = ceil(-log_2(alpha)/H) = 30*osr * or 60*osr. */ if ((unsigned int)ec->rct_count >= (60 * ec->osr)) { ec->rct_count = -1; ec->health_failure |= JENT_RCT_FAILURE_PERMANENT; } else if ((unsigned int)ec->rct_count >= (30 * ec->osr)) { ec->rct_count = -1; ec->health_failure |= JENT_RCT_FAILURE; } } else { /* Reset RCT */ ec->rct_count = 0; } } static inline __u64 jent_delta(__u64 prev, __u64 next) { #define JENT_UINT64_MAX (__u64)(~((__u64) 0)) return (prev < next) ? (next - prev) : (JENT_UINT64_MAX - prev + 1 + next); } /* * Stuck test by checking the: * 1st derivative of the jitter measurement (time delta) * 2nd derivative of the jitter measurement (delta of time deltas) * 3rd derivative of the jitter measurement (delta of delta of time deltas) * * All values must always be non-zero. * * @ec [in] Reference to entropy collector * @current_delta [in] Jitter time delta * * @return * 0 jitter measurement not stuck (good bit) * 1 jitter measurement stuck (reject bit) */ static int jent_stuck(struct rand_data *ec, __u64 current_delta) { __u64 delta2 = jent_delta(ec->last_delta, current_delta); __u64 delta3 = jent_delta(ec->last_delta2, delta2); ec->last_delta = current_delta; ec->last_delta2 = delta2; /* * Insert the result of the comparison of two back-to-back time * deltas. */ jent_apt_insert(ec, current_delta); if (!current_delta || !delta2 || !delta3) { /* RCT with a stuck bit */ jent_rct_insert(ec, 1); return 1; } /* RCT with a non-stuck bit */ jent_rct_insert(ec, 0); return 0; } /* * Report any health test failures * * @ec [in] Reference to entropy collector * * @return a bitmask indicating which tests failed * 0 No health test failure * 1 RCT failure * 2 APT failure * 1<<JENT_PERMANENT_FAILURE_SHIFT RCT permanent failure * 2<<JENT_PERMANENT_FAILURE_SHIFT APT permanent failure */ static unsigned int jent_health_failure(struct rand_data *ec) { /* Test is only enabled in FIPS mode */ if (!fips_enabled) return 0; return ec->health_failure; } /*************************************************************************** * Noise sources ***************************************************************************/ /* * Update of the loop count used for the next round of * an entropy collection. * * Input: * @bits is the number of low bits of the timer to consider * @min is the number of bits we shift the timer value to the right at * the end to make sure we have a guaranteed minimum value * * @return Newly calculated loop counter */ static __u64 jent_loop_shuffle(unsigned int bits, unsigned int min) { __u64 time = 0; __u64 shuffle = 0; unsigned int i = 0; unsigned int mask = (1<<bits) - 1; jent_get_nstime(&time); /* * We fold the time value as much as possible to ensure that as many * bits of the time stamp are included as possible. */ for (i = 0; ((DATA_SIZE_BITS + bits - 1) / bits) > i; i++) { shuffle ^= time & mask; time = time >> bits; } /* * We add a lower boundary value to ensure we have a minimum * RNG loop count. */ return (shuffle + (1<<min)); } /* * CPU Jitter noise source -- this is the noise source based on the CPU * execution time jitter * * This function injects the individual bits of the time value into the * entropy pool using a hash. * * ec [in] entropy collector * time [in] time stamp to be injected * stuck [in] Is the time stamp identified as stuck? * * Output: * updated hash context in the entropy collector or error code */ static int jent_condition_data(struct rand_data *ec, __u64 time, int stuck) { #define SHA3_HASH_LOOP (1<<3) struct { int rct_count; unsigned int apt_observations; unsigned int apt_count; unsigned int apt_base; } addtl = { ec->rct_count, ec->apt_observations, ec->apt_count, ec->apt_base }; return jent_hash_time(ec->hash_state, time, (u8 *)&addtl, sizeof(addtl), SHA3_HASH_LOOP, stuck); } /* * Memory Access noise source -- this is a noise source based on variations in * memory access times * * This function performs memory accesses which will add to the timing * variations due to an unknown amount of CPU wait states that need to be * added when accessing memory. The memory size should be larger than the L1 * caches as outlined in the documentation and the associated testing. * * The L1 cache has a very high bandwidth, albeit its access rate is usually * slower than accessing CPU registers. Therefore, L1 accesses only add minimal * variations as the CPU has hardly to wait. Starting with L2, significant * variations are added because L2 typically does not belong to the CPU any more * and therefore a wider range of CPU wait states is necessary for accesses. * L3 and real memory accesses have even a wider range of wait states. However, * to reliably access either L3 or memory, the ec->mem memory must be quite * large which is usually not desirable. * * @ec [in] Reference to the entropy collector with the memory access data -- if * the reference to the memory block to be accessed is NULL, this noise * source is disabled * @loop_cnt [in] if a value not equal to 0 is set, use the given value * number of loops to perform the LFSR */ static void jent_memaccess(struct rand_data *ec, __u64 loop_cnt) { unsigned int wrap = 0; __u64 i = 0; #define MAX_ACC_LOOP_BIT 7 #define MIN_ACC_LOOP_BIT 0 __u64 acc_loop_cnt = jent_loop_shuffle(MAX_ACC_LOOP_BIT, MIN_ACC_LOOP_BIT); if (NULL == ec || NULL == ec->mem) return; wrap = ec->memblocksize * ec->memblocks; /* * testing purposes -- allow test app to set the counter, not * needed during runtime */ if (loop_cnt) acc_loop_cnt = loop_cnt; for (i = 0; i < (ec->memaccessloops + acc_loop_cnt); i++) { unsigned char *tmpval = ec->mem + ec->memlocation; /* * memory access: just add 1 to one byte, * wrap at 255 -- memory access implies read * from and write to memory location */ *tmpval = (*tmpval + 1) & 0xff; /* * Addition of memblocksize - 1 to pointer * with wrap around logic to ensure that every * memory location is hit evenly */ ec->memlocation = ec->memlocation + ec->memblocksize - 1; ec->memlocation = ec->memlocation % wrap; } } /*************************************************************************** * Start of entropy processing logic ***************************************************************************/ /* * This is the heart of the entropy generation: calculate time deltas and * use the CPU jitter in the time deltas. The jitter is injected into the * entropy pool. * * WARNING: ensure that ->prev_time is primed before using the output * of this function! This can be done by calling this function * and not using its result. * * @ec [in] Reference to entropy collector * * @return result of stuck test */ static int jent_measure_jitter(struct rand_data *ec, __u64 *ret_current_delta) { __u64 time = 0; __u64 current_delta = 0; int stuck; /* Invoke one noise source before time measurement to add variations */ jent_memaccess(ec, 0); /* * Get time stamp and calculate time delta to previous * invocation to measure the timing variations */ jent_get_nstime(&time); current_delta = jent_delta(ec->prev_time, time); ec->prev_time = time; /* Check whether we have a stuck measurement. */ stuck = jent_stuck(ec, current_delta); /* Now call the next noise sources which also injects the data */ if (jent_condition_data(ec, current_delta, stuck)) stuck = 1; /* return the raw entropy value */ if (ret_current_delta) *ret_current_delta = current_delta; return stuck; } /* * Generator of one 64 bit random number * Function fills rand_data->hash_state * * @ec [in] Reference to entropy collector */ static void jent_gen_entropy(struct rand_data *ec) { unsigned int k = 0, safety_factor = 0; if (fips_enabled) safety_factor = JENT_ENTROPY_SAFETY_FACTOR; /* priming of the ->prev_time value */ jent_measure_jitter(ec, NULL); while (!jent_health_failure(ec)) { /* If a stuck measurement is received, repeat measurement */ if (jent_measure_jitter(ec, NULL)) continue; /* * We multiply the loop value with ->osr to obtain the * oversampling rate requested by the caller */ if (++k >= ((DATA_SIZE_BITS + safety_factor) * ec->osr)) break; } } /* * Entry function: Obtain entropy for the caller. * * This function invokes the entropy gathering logic as often to generate * as many bytes as requested by the caller. The entropy gathering logic * creates 64 bit per invocation. * * This function truncates the last 64 bit entropy value output to the exact * size specified by the caller. * * @ec [in] Reference to entropy collector * @data [in] pointer to buffer for storing random data -- buffer must already * exist * @len [in] size of the buffer, specifying also the requested number of random * in bytes * * @return 0 when request is fulfilled or an error * * The following error codes can occur: * -1 entropy_collector is NULL or the generation failed * -2 Intermittent health failure * -3 Permanent health failure */ int jent_read_entropy(struct rand_data *ec, unsigned char *data, unsigned int len) { unsigned char *p = data; if (!ec) return -1; while (len > 0) { unsigned int tocopy, health_test_result; jent_gen_entropy(ec); health_test_result = jent_health_failure(ec); if (health_test_result > JENT_PERMANENT_FAILURE_SHIFT) { /* * At this point, the Jitter RNG instance is considered * as a failed instance. There is no rerun of the * startup test any more, because the caller * is assumed to not further use this instance. */ return -3; } else if (health_test_result) { /* * Perform startup health tests and return permanent * error if it fails. */ if (jent_entropy_init(0, 0, NULL, ec)) { /* Mark the permanent error */ ec->health_failure &= JENT_RCT_FAILURE_PERMANENT | JENT_APT_FAILURE_PERMANENT; return -3; } return -2; } tocopy = min(DATA_SIZE_BITS / 8, len); if (jent_read_random_block(ec->hash_state, p, tocopy)) return -1; len -= tocopy; p += tocopy; } return 0; } /*************************************************************************** * Initialization logic ***************************************************************************/ struct rand_data *jent_entropy_collector_alloc(unsigned int osr, unsigned int flags, void *hash_state) { struct rand_data *entropy_collector; entropy_collector = jent_zalloc(sizeof(struct rand_data)); if (!entropy_collector) return NULL; if (!(flags & JENT_DISABLE_MEMORY_ACCESS)) { /* Allocate memory for adding variations based on memory * access */ entropy_collector->mem = jent_kvzalloc(JENT_MEMORY_SIZE); if (!entropy_collector->mem) { jent_zfree(entropy_collector); return NULL; } entropy_collector->memblocksize = CONFIG_CRYPTO_JITTERENTROPY_MEMORY_BLOCKSIZE; entropy_collector->memblocks = CONFIG_CRYPTO_JITTERENTROPY_MEMORY_BLOCKS; entropy_collector->memaccessloops = JENT_MEMORY_ACCESSLOOPS; } /* verify and set the oversampling rate */ if (osr == 0) osr = 1; /* H_submitter = 1 / osr */ entropy_collector->osr = osr; entropy_collector->flags = flags; entropy_collector->hash_state = hash_state; /* Initialize the APT */ jent_apt_init(entropy_collector, osr); /* fill the data pad with non-zero values */ jent_gen_entropy(entropy_collector); return entropy_collector; } void jent_entropy_collector_free(struct rand_data *entropy_collector) { jent_kvzfree(entropy_collector->mem, JENT_MEMORY_SIZE); entropy_collector->mem = NULL; jent_zfree(entropy_collector); } int jent_entropy_init(unsigned int osr, unsigned int flags, void *hash_state, struct rand_data *p_ec) { /* * If caller provides an allocated ec, reuse it which implies that the * health test entropy data is used to further still the available * entropy pool. */ struct rand_data *ec = p_ec; int i, time_backwards = 0, ret = 0, ec_free = 0; unsigned int health_test_result; if (!ec) { ec = jent_entropy_collector_alloc(osr, flags, hash_state); if (!ec) return JENT_EMEM; ec_free = 1; } else { /* Reset the APT */ jent_apt_reset(ec, 0); /* Ensure that a new APT base is obtained */ ec->apt_base_set = 0; /* Reset the RCT */ ec->rct_count = 0; /* Reset intermittent, leave permanent health test result */ ec->health_failure &= (~JENT_RCT_FAILURE); ec->health_failure &= (~JENT_APT_FAILURE); } /* We could perform statistical tests here, but the problem is * that we only have a few loop counts to do testing. These * loop counts may show some slight skew and we produce * false positives. * * Moreover, only old systems show potentially problematic * jitter entropy that could potentially be caught here. But * the RNG is intended for hardware that is available or widely * used, but not old systems that are long out of favor. Thus, * no statistical tests. */ /* * We could add a check for system capabilities such as clock_getres or * check for CONFIG_X86_TSC, but it does not make much sense as the * following sanity checks verify that we have a high-resolution * timer. */ /* * TESTLOOPCOUNT needs some loops to identify edge systems. 100 is * definitely too little. * * SP800-90B requires at least 1024 initial test cycles. */ #define TESTLOOPCOUNT 1024 #define CLEARCACHE 100 for (i = 0; (TESTLOOPCOUNT + CLEARCACHE) > i; i++) { __u64 start_time = 0, end_time = 0, delta = 0; /* Invoke core entropy collection logic */ jent_measure_jitter(ec, &delta); end_time = ec->prev_time; start_time = ec->prev_time - delta; /* test whether timer works */ if (!start_time || !end_time) { ret = JENT_ENOTIME; goto out; } /* * test whether timer is fine grained enough to provide * delta even when called shortly after each other -- this * implies that we also have a high resolution timer */ if (!delta || (end_time == start_time)) { ret = JENT_ECOARSETIME; goto out; } /* * up to here we did not modify any variable that will be * evaluated later, but we already performed some work. Thus we * already have had an impact on the caches, branch prediction, * etc. with the goal to clear it to get the worst case * measurements. */ if (i < CLEARCACHE) continue; /* test whether we have an increasing timer */ if (!(end_time > start_time)) time_backwards++; } /* * we allow up to three times the time running backwards. * CLOCK_REALTIME is affected by adjtime and NTP operations. Thus, * if such an operation just happens to interfere with our test, it * should not fail. The value of 3 should cover the NTP case being * performed during our test run. */ if (time_backwards > 3) { ret = JENT_ENOMONOTONIC; goto out; } /* Did we encounter a health test failure? */ health_test_result = jent_health_failure(ec); if (health_test_result) { ret = (health_test_result & JENT_RCT_FAILURE) ? JENT_ERCT : JENT_EHEALTH; goto out; } out: if (ec_free) jent_entropy_collector_free(ec); return ret; }
49 22 57 57 18 51 12 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 // SPDX-License-Identifier: GPL-2.0-or-later /* * ALSA sequencer System services Client * Copyright (c) 1998-1999 by Frank van de Pol <fvdpol@coil.demon.nl> */ #include <linux/init.h> #include <linux/export.h> #include <linux/slab.h> #include <sound/core.h> #include "seq_system.h" #include "seq_timer.h" #include "seq_queue.h" /* internal client that provide system services, access to timer etc. */ /* * Port "Timer" * - send tempo /start/stop etc. events to this port to manipulate the * queue's timer. The queue address is specified in * data.queue.queue. * - this port supports subscription. The received timer events are * broadcasted to all subscribed clients. The modified tempo * value is stored on data.queue.value. * The modifier client/port is not send. * * Port "Announce" * - does not receive message * - supports supscription. For each client or port attaching to or * detaching from the system an announcement is send to the subscribed * clients. * * Idea: the subscription mechanism might also work handy for distributing * synchronisation and timing information. In this case we would ideally have * a list of subscribers for each type of sync (time, tick), for each timing * queue. * * NOTE: the queue to be started, stopped, etc. must be specified * in data.queue.addr.queue field. queue is used only for * scheduling, and no longer referred as affected queue. * They are used only for timer broadcast (see above). * -- iwai */ /* client id of our system client */ static int sysclient = -1; /* port id numbers for this client */ static int announce_port = -1; /* number of subscriptions to announce port */ static int announce_subscribed; /* fill standard header data, source port & channel are filled in */ static int setheader(struct snd_seq_event * ev, int client, int port) { if (announce_port < 0 || !announce_subscribed) return -ENODEV; memset(ev, 0, sizeof(struct snd_seq_event)); ev->flags &= ~SNDRV_SEQ_EVENT_LENGTH_MASK; ev->flags |= SNDRV_SEQ_EVENT_LENGTH_FIXED; ev->source.client = sysclient; ev->source.port = announce_port; ev->dest.client = SNDRV_SEQ_ADDRESS_SUBSCRIBERS; /* fill data */ /*ev->data.addr.queue = SNDRV_SEQ_ADDRESS_UNKNOWN;*/ ev->data.addr.client = client; ev->data.addr.port = port; return 0; } /* entry points for broadcasting system events */ void snd_seq_system_broadcast(int client, int port, int type, bool atomic) { struct snd_seq_event ev; if (setheader(&ev, client, port) < 0) return; ev.type = type; snd_seq_kernel_client_dispatch(sysclient, &ev, atomic, 0); } EXPORT_SYMBOL_GPL(snd_seq_system_broadcast); /* entry points for broadcasting system events */ int snd_seq_system_notify(int client, int port, struct snd_seq_event *ev, bool atomic) { ev->flags = SNDRV_SEQ_EVENT_LENGTH_FIXED; ev->source.client = sysclient; ev->source.port = announce_port; ev->dest.client = client; ev->dest.port = port; return snd_seq_kernel_client_dispatch(sysclient, ev, atomic, 0); } /* call-back handler for timer events */ static int event_input_timer(struct snd_seq_event * ev, int direct, void *private_data, int atomic, int hop) { return snd_seq_control_queue(ev, atomic, hop); } static int sys_announce_subscribe(void *private_data, struct snd_seq_port_subscribe *info) { announce_subscribed++; return 0; } static int sys_announce_unsubscribe(void *private_data, struct snd_seq_port_subscribe *info) { if (snd_BUG_ON(!announce_subscribed)) return 0; announce_subscribed--; return 0; } /* register our internal client */ int __init snd_seq_system_client_init(void) { struct snd_seq_port_callback pcallbacks; struct snd_seq_port_info *port; int err; port = kzalloc(sizeof(*port), GFP_KERNEL); if (!port) return -ENOMEM; memset(&pcallbacks, 0, sizeof(pcallbacks)); pcallbacks.owner = THIS_MODULE; pcallbacks.event_input = event_input_timer; /* register client */ sysclient = snd_seq_create_kernel_client(NULL, 0, "System"); if (sysclient < 0) { kfree(port); return sysclient; } /* register timer */ strscpy(port->name, "Timer"); port->capability = SNDRV_SEQ_PORT_CAP_WRITE; /* accept queue control */ port->capability |= SNDRV_SEQ_PORT_CAP_READ|SNDRV_SEQ_PORT_CAP_SUBS_READ; /* for broadcast */ port->kernel = &pcallbacks; port->type = 0; port->flags = SNDRV_SEQ_PORT_FLG_GIVEN_PORT; port->addr.client = sysclient; port->addr.port = SNDRV_SEQ_PORT_SYSTEM_TIMER; err = snd_seq_kernel_client_ctl(sysclient, SNDRV_SEQ_IOCTL_CREATE_PORT, port); if (err < 0) goto error_port; /* register announcement port */ strscpy(port->name, "Announce"); port->capability = SNDRV_SEQ_PORT_CAP_READ|SNDRV_SEQ_PORT_CAP_SUBS_READ; /* for broadcast only */ pcallbacks.event_input = NULL; pcallbacks.subscribe = sys_announce_subscribe; pcallbacks.unsubscribe = sys_announce_unsubscribe; port->kernel = &pcallbacks; port->type = 0; port->flags = SNDRV_SEQ_PORT_FLG_GIVEN_PORT; port->addr.client = sysclient; port->addr.port = SNDRV_SEQ_PORT_SYSTEM_ANNOUNCE; err = snd_seq_kernel_client_ctl(sysclient, SNDRV_SEQ_IOCTL_CREATE_PORT, port); if (err < 0) goto error_port; announce_port = port->addr.port; kfree(port); return 0; error_port: snd_seq_system_client_done(); kfree(port); return err; } /* unregister our internal client */ void snd_seq_system_client_done(void) { int oldsysclient = sysclient; if (oldsysclient >= 0) { sysclient = -1; announce_port = -1; snd_seq_delete_kernel_client(oldsysclient); } }
101 101 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 /* * net/tipc/core.c: TIPC module code * * Copyright (c) 2003-2006, 2013, Ericsson AB * Copyright (c) 2005-2006, 2010-2013, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "name_table.h" #include "subscr.h" #include "bearer.h" #include "net.h" #include "socket.h" #include "bcast.h" #include "node.h" #include "crypto.h" #include <linux/module.h> /* configurable TIPC parameters */ unsigned int tipc_net_id __read_mostly; int sysctl_tipc_rmem[3] __read_mostly; /* min/default/max */ static int __net_init tipc_init_net(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); int err; tn->net_id = 4711; tn->node_addr = 0; tn->trial_addr = 0; tn->addr_trial_end = 0; tn->capabilities = TIPC_NODE_CAPABILITIES; INIT_WORK(&tn->work, tipc_net_finalize_work); memset(tn->node_id, 0, sizeof(tn->node_id)); memset(tn->node_id_string, 0, sizeof(tn->node_id_string)); tn->mon_threshold = TIPC_DEF_MON_THRESHOLD; get_random_bytes(&tn->random, sizeof(int)); INIT_LIST_HEAD(&tn->node_list); spin_lock_init(&tn->node_list_lock); #ifdef CONFIG_TIPC_CRYPTO err = tipc_crypto_start(&tn->crypto_tx, net, NULL); if (err) goto out_crypto; #endif err = tipc_sk_rht_init(net); if (err) goto out_sk_rht; err = tipc_nametbl_init(net); if (err) goto out_nametbl; err = tipc_bcast_init(net); if (err) goto out_bclink; err = tipc_attach_loopback(net); if (err) goto out_bclink; return 0; out_bclink: tipc_nametbl_stop(net); out_nametbl: tipc_sk_rht_destroy(net); out_sk_rht: #ifdef CONFIG_TIPC_CRYPTO tipc_crypto_stop(&tn->crypto_tx); out_crypto: #endif return err; } static void __net_exit tipc_exit_net(struct net *net) { struct tipc_net *tn = tipc_net(net); tipc_detach_loopback(net); tipc_net_stop(net); /* Make sure the tipc_net_finalize_work() finished */ cancel_work_sync(&tn->work); tipc_bcast_stop(net); tipc_nametbl_stop(net); tipc_sk_rht_destroy(net); #ifdef CONFIG_TIPC_CRYPTO tipc_crypto_stop(&tipc_net(net)->crypto_tx); #endif while (atomic_read(&tn->wq_count)) cond_resched(); } static void __net_exit tipc_pernet_pre_exit(struct net *net) { tipc_node_pre_cleanup_net(net); } static struct pernet_operations tipc_pernet_pre_exit_ops = { .pre_exit = tipc_pernet_pre_exit, }; static struct pernet_operations tipc_net_ops = { .init = tipc_init_net, .exit = tipc_exit_net, .id = &tipc_net_id, .size = sizeof(struct tipc_net), }; static struct pernet_operations tipc_topsrv_net_ops = { .init = tipc_topsrv_init_net, .exit = tipc_topsrv_exit_net, }; static int __init tipc_init(void) { int err; pr_info("Activated (version " TIPC_MOD_VER ")\n"); sysctl_tipc_rmem[0] = RCVBUF_MIN; sysctl_tipc_rmem[1] = RCVBUF_DEF; sysctl_tipc_rmem[2] = RCVBUF_MAX; err = tipc_register_sysctl(); if (err) goto out_sysctl; err = register_pernet_device(&tipc_net_ops); if (err) goto out_pernet; err = tipc_socket_init(); if (err) goto out_socket; err = register_pernet_device(&tipc_topsrv_net_ops); if (err) goto out_pernet_topsrv; err = register_pernet_subsys(&tipc_pernet_pre_exit_ops); if (err) goto out_register_pernet_subsys; err = tipc_bearer_setup(); if (err) goto out_bearer; err = tipc_netlink_start(); if (err) goto out_netlink; err = tipc_netlink_compat_start(); if (err) goto out_netlink_compat; pr_info("Started in single node mode\n"); return 0; out_netlink_compat: tipc_netlink_stop(); out_netlink: tipc_bearer_cleanup(); out_bearer: unregister_pernet_subsys(&tipc_pernet_pre_exit_ops); out_register_pernet_subsys: unregister_pernet_device(&tipc_topsrv_net_ops); out_pernet_topsrv: tipc_socket_stop(); out_socket: unregister_pernet_device(&tipc_net_ops); out_pernet: tipc_unregister_sysctl(); out_sysctl: pr_err("Unable to start in single node mode\n"); return err; } static void __exit tipc_exit(void) { tipc_netlink_compat_stop(); tipc_netlink_stop(); tipc_bearer_cleanup(); unregister_pernet_subsys(&tipc_pernet_pre_exit_ops); unregister_pernet_device(&tipc_topsrv_net_ops); tipc_socket_stop(); unregister_pernet_device(&tipc_net_ops); tipc_unregister_sysctl(); pr_info("Deactivated\n"); } module_init(tipc_init); module_exit(tipc_exit); MODULE_DESCRIPTION("TIPC: Transparent Inter Process Communication"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_VERSION(TIPC_MOD_VER);
7 7 7 7 3 2 10 10 10 10 10 2 2 8 1 4 1 5 1 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2022 Bobby Eshleman <bobby.eshleman@bytedance.com> * * Based off of net/unix/unix_bpf.c */ #include <linux/bpf.h> #include <linux/module.h> #include <linux/skmsg.h> #include <linux/socket.h> #include <linux/wait.h> #include <net/af_vsock.h> #include <net/sock.h> #define vsock_sk_has_data(__sk, __psock) \ ({ !skb_queue_empty(&(__sk)->sk_receive_queue) || \ !skb_queue_empty(&(__psock)->ingress_skb) || \ !list_empty(&(__psock)->ingress_msg); \ }) static struct proto *vsock_prot_saved __read_mostly; static DEFINE_SPINLOCK(vsock_prot_lock); static struct proto vsock_bpf_prot; static bool vsock_has_data(struct sock *sk, struct sk_psock *psock) { struct vsock_sock *vsk = vsock_sk(sk); s64 ret; ret = vsock_connectible_has_data(vsk); if (ret > 0) return true; return vsock_sk_has_data(sk, psock); } static bool vsock_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo) { bool ret; DEFINE_WAIT_FUNC(wait, woken_wake_function); if (sk->sk_shutdown & RCV_SHUTDOWN) return true; if (!timeo) return false; add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); ret = vsock_has_data(sk, psock); if (!ret) { wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); ret = vsock_has_data(sk, psock); } sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); return ret; } static int __vsock_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { struct socket *sock = sk->sk_socket; int err; if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) err = __vsock_connectible_recvmsg(sock, msg, len, flags); else if (sk->sk_type == SOCK_DGRAM) err = __vsock_dgram_recvmsg(sock, msg, len, flags); else err = -EPROTOTYPE; return err; } static int vsock_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct sk_psock *psock; struct vsock_sock *vsk; int copied; psock = sk_psock_get(sk); if (unlikely(!psock)) return __vsock_recvmsg(sk, msg, len, flags); lock_sock(sk); vsk = vsock_sk(sk); if (WARN_ON_ONCE(!vsk->transport)) { copied = -ENODEV; goto out; } if (vsock_has_data(sk, psock) && sk_psock_queue_empty(psock)) { release_sock(sk); sk_psock_put(sk, psock); return __vsock_recvmsg(sk, msg, len, flags); } copied = sk_msg_recvmsg(sk, psock, msg, len, flags); while (copied == 0) { long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); if (!vsock_msg_wait_data(sk, psock, timeo)) { copied = -EAGAIN; break; } if (sk_psock_queue_empty(psock)) { release_sock(sk); sk_psock_put(sk, psock); return __vsock_recvmsg(sk, msg, len, flags); } copied = sk_msg_recvmsg(sk, psock, msg, len, flags); } out: release_sock(sk); sk_psock_put(sk, psock); return copied; } static void vsock_bpf_rebuild_protos(struct proto *prot, const struct proto *base) { *prot = *base; prot->close = sock_map_close; prot->recvmsg = vsock_bpf_recvmsg; prot->sock_is_readable = sk_msg_is_readable; } static void vsock_bpf_check_needs_rebuild(struct proto *ops) { /* Paired with the smp_store_release() below. */ if (unlikely(ops != smp_load_acquire(&vsock_prot_saved))) { spin_lock_bh(&vsock_prot_lock); if (likely(ops != vsock_prot_saved)) { vsock_bpf_rebuild_protos(&vsock_bpf_prot, ops); /* Make sure proto function pointers are updated before publishing the * pointer to the struct. */ smp_store_release(&vsock_prot_saved, ops); } spin_unlock_bh(&vsock_prot_lock); } } int vsock_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { struct vsock_sock *vsk; if (restore) { sk->sk_write_space = psock->saved_write_space; sock_replace_proto(sk, psock->sk_proto); return 0; } vsk = vsock_sk(sk); if (!vsk->transport) return -ENODEV; if (!vsk->transport->read_skb) return -EOPNOTSUPP; vsock_bpf_check_needs_rebuild(psock->sk_proto); sock_replace_proto(sk, &vsock_bpf_prot); return 0; } void __init vsock_bpf_build_proto(void) { vsock_bpf_rebuild_protos(&vsock_bpf_prot, &vsock_proto); }
20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 /* * Copyright (c) 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. * Copyright (c) 2005-2017 Mellanox Technologies. All rights reserved. * Copyright (c) 2005 Voltaire, Inc. All rights reserved. * Copyright (c) 2005 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef RDMA_CORE_H #define RDMA_CORE_H #include <linux/idr.h> #include <rdma/uverbs_types.h> #include <rdma/uverbs_ioctl.h> #include <rdma/ib_verbs.h> #include <linux/mutex.h> struct ib_uverbs_device; void uverbs_destroy_ufile_hw(struct ib_uverbs_file *ufile, enum rdma_remove_reason reason); int uobj_destroy(struct ib_uobject *uobj, struct uverbs_attr_bundle *attrs); /* * Get an ib_uobject that corresponds to the given id from ufile, assuming * the object is from the given type. Lock it to the required access when * applicable. * This function could create (access == NEW), destroy (access == DESTROY) * or unlock (access == READ || access == WRITE) objects if required. * The action will be finalized only when uverbs_finalize_object or * uverbs_finalize_objects are called. */ struct ib_uobject * uverbs_get_uobject_from_file(u16 object_id, enum uverbs_obj_access access, s64 id, struct uverbs_attr_bundle *attrs); void uverbs_finalize_object(struct ib_uobject *uobj, enum uverbs_obj_access access, bool hw_obj_valid, bool commit, struct uverbs_attr_bundle *attrs); int uverbs_output_written(const struct uverbs_attr_bundle *bundle, size_t idx); void setup_ufile_idr_uobject(struct ib_uverbs_file *ufile); void release_ufile_idr_uobject(struct ib_uverbs_file *ufile); struct ib_udata *uverbs_get_cleared_udata(struct uverbs_attr_bundle *attrs); /* * This is the runtime description of the uverbs API, used by the syscall * machinery to validate and dispatch calls. */ /* * Depending on ID the slot pointer in the radix tree points at one of these * structs. */ struct uverbs_api_ioctl_method { int(__rcu *handler)(struct uverbs_attr_bundle *attrs); DECLARE_BITMAP(attr_mandatory, UVERBS_API_ATTR_BKEY_LEN); u16 bundle_size; u8 use_stack:1; u8 driver_method:1; u8 disabled:1; u8 has_udata:1; u8 key_bitmap_len; u8 destroy_bkey; }; struct uverbs_api_write_method { int (*handler)(struct uverbs_attr_bundle *attrs); u8 disabled:1; u8 is_ex:1; u8 has_udata:1; u8 has_resp:1; u8 req_size; u8 resp_size; }; struct uverbs_api_attr { struct uverbs_attr_spec spec; }; struct uverbs_api { /* radix tree contains struct uverbs_api_* pointers */ struct radix_tree_root radix; enum rdma_driver_id driver_id; unsigned int num_write; unsigned int num_write_ex; struct uverbs_api_write_method notsupp_method; const struct uverbs_api_write_method **write_methods; const struct uverbs_api_write_method **write_ex_methods; }; /* * Get an uverbs_api_object that corresponds to the given object_id. * Note: * -ENOMSG means that any object is allowed to match during lookup. */ static inline const struct uverbs_api_object * uapi_get_object(struct uverbs_api *uapi, u16 object_id) { const struct uverbs_api_object *res; if (object_id == UVERBS_IDR_ANY_OBJECT) return ERR_PTR(-ENOMSG); res = radix_tree_lookup(&uapi->radix, uapi_key_obj(object_id)); if (!res) return ERR_PTR(-ENOENT); return res; } char *uapi_key_format(char *S, unsigned int key); struct uverbs_api *uverbs_alloc_api(struct ib_device *ibdev); void uverbs_disassociate_api_pre(struct ib_uverbs_device *uverbs_dev); void uverbs_disassociate_api(struct uverbs_api *uapi); void uverbs_destroy_api(struct uverbs_api *uapi); void uapi_compute_bundle_size(struct uverbs_api_ioctl_method *method_elm, unsigned int num_attrs); void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile); extern const struct uapi_definition uverbs_def_obj_async_fd[]; extern const struct uapi_definition uverbs_def_obj_counters[]; extern const struct uapi_definition uverbs_def_obj_cq[]; extern const struct uapi_definition uverbs_def_obj_device[]; extern const struct uapi_definition uverbs_def_obj_dm[]; extern const struct uapi_definition uverbs_def_obj_dmah[]; extern const struct uapi_definition uverbs_def_obj_flow_action[]; extern const struct uapi_definition uverbs_def_obj_intf[]; extern const struct uapi_definition uverbs_def_obj_mr[]; extern const struct uapi_definition uverbs_def_obj_qp[]; extern const struct uapi_definition uverbs_def_obj_srq[]; extern const struct uapi_definition uverbs_def_obj_wq[]; extern const struct uapi_definition uverbs_def_write_intf[]; static inline const struct uverbs_api_write_method * uapi_get_method(const struct uverbs_api *uapi, u32 command) { u32 cmd_idx = command & IB_USER_VERBS_CMD_COMMAND_MASK; if (command & ~(u32)(IB_USER_VERBS_CMD_FLAG_EXTENDED | IB_USER_VERBS_CMD_COMMAND_MASK)) return ERR_PTR(-EINVAL); if (command & IB_USER_VERBS_CMD_FLAG_EXTENDED) { if (cmd_idx >= uapi->num_write_ex) return ERR_PTR(-EOPNOTSUPP); return uapi->write_ex_methods[cmd_idx]; } if (cmd_idx >= uapi->num_write) return ERR_PTR(-EOPNOTSUPP); return uapi->write_methods[cmd_idx]; } void uverbs_fill_udata(struct uverbs_attr_bundle *bundle, struct ib_udata *udata, unsigned int attr_in, unsigned int attr_out); #endif /* RDMA_CORE_H */
7 1 1 3 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 /* Kernel module to match connection tracking byte counter. * GPL (C) 2002 Martin Devera (devik@cdi.cz). */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/bitops.h> #include <linux/skbuff.h> #include <linux/math64.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_connbytes.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_acct.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("Xtables: Number of packets/bytes per connection matching"); MODULE_ALIAS("ipt_connbytes"); MODULE_ALIAS("ip6t_connbytes"); static bool connbytes_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_connbytes_info *sinfo = par->matchinfo; const struct nf_conn *ct; enum ip_conntrack_info ctinfo; u_int64_t what = 0; /* initialize to make gcc happy */ u_int64_t bytes = 0; u_int64_t pkts = 0; const struct nf_conn_acct *acct; const struct nf_conn_counter *counters; ct = nf_ct_get(skb, &ctinfo); if (!ct) return false; acct = nf_conn_acct_find(ct); if (!acct) return false; counters = acct->counter; switch (sinfo->what) { case XT_CONNBYTES_PKTS: switch (sinfo->direction) { case XT_CONNBYTES_DIR_ORIGINAL: what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets); break; case XT_CONNBYTES_DIR_REPLY: what = atomic64_read(&counters[IP_CT_DIR_REPLY].packets); break; case XT_CONNBYTES_DIR_BOTH: what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets); what += atomic64_read(&counters[IP_CT_DIR_REPLY].packets); break; } break; case XT_CONNBYTES_BYTES: switch (sinfo->direction) { case XT_CONNBYTES_DIR_ORIGINAL: what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes); break; case XT_CONNBYTES_DIR_REPLY: what = atomic64_read(&counters[IP_CT_DIR_REPLY].bytes); break; case XT_CONNBYTES_DIR_BOTH: what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes); what += atomic64_read(&counters[IP_CT_DIR_REPLY].bytes); break; } break; case XT_CONNBYTES_AVGPKT: switch (sinfo->direction) { case XT_CONNBYTES_DIR_ORIGINAL: bytes = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes); pkts = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets); break; case XT_CONNBYTES_DIR_REPLY: bytes = atomic64_read(&counters[IP_CT_DIR_REPLY].bytes); pkts = atomic64_read(&counters[IP_CT_DIR_REPLY].packets); break; case XT_CONNBYTES_DIR_BOTH: bytes = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes) + atomic64_read(&counters[IP_CT_DIR_REPLY].bytes); pkts = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets) + atomic64_read(&counters[IP_CT_DIR_REPLY].packets); break; } if (pkts != 0) what = div64_u64(bytes, pkts); break; } if (sinfo->count.to >= sinfo->count.from) return what <= sinfo->count.to && what >= sinfo->count.from; else /* inverted */ return what < sinfo->count.to || what > sinfo->count.from; } static int connbytes_mt_check(const struct xt_mtchk_param *par) { const struct xt_connbytes_info *sinfo = par->matchinfo; int ret; if (sinfo->what != XT_CONNBYTES_PKTS && sinfo->what != XT_CONNBYTES_BYTES && sinfo->what != XT_CONNBYTES_AVGPKT) return -EINVAL; if (sinfo->direction != XT_CONNBYTES_DIR_ORIGINAL && sinfo->direction != XT_CONNBYTES_DIR_REPLY && sinfo->direction != XT_CONNBYTES_DIR_BOTH) return -EINVAL; ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) { pr_info_ratelimited("cannot load conntrack support for proto=%u\n", par->family); return ret; } /* * This filter cannot function correctly unless connection tracking * accounting is enabled, so complain in the hope that someone notices. */ if (!nf_ct_acct_enabled(par->net)) { pr_warn("Forcing CT accounting to be enabled\n"); nf_ct_set_acct(par->net, true); } return ret; } static void connbytes_mt_destroy(const struct xt_mtdtor_param *par) { nf_ct_netns_put(par->net, par->family); } static struct xt_match connbytes_mt_reg __read_mostly = { .name = "connbytes", .revision = 0, .family = NFPROTO_UNSPEC, .checkentry = connbytes_mt_check, .match = connbytes_mt, .destroy = connbytes_mt_destroy, .matchsize = sizeof(struct xt_connbytes_info), .me = THIS_MODULE, }; static int __init connbytes_mt_init(void) { return xt_register_match(&connbytes_mt_reg); } static void __exit connbytes_mt_exit(void) { xt_unregister_match(&connbytes_mt_reg); } module_init(connbytes_mt_init); module_exit(connbytes_mt_exit);
4 1 3 93 15 89 3703 3682 273 17 2 3 148 114 116 1263 1263 21 428 3 1 1 1 8 1 1 536 141 399 538 12 316 393 76 363 205 10 11 187 8 1 5 1 1 6 5 1 8 8 127 225 949 1281 349 947 209 352 62 31 31 36 16755 16623 31 16746 16820 167 182 1 18 18 9 106 95 17 10 9 4 13 53 34 19 53 62 32 12 37 9 10 10 10 104 32 17 45 4 26 23 3 23 3 26 26 26 26 26 281 1504 1453 1503 1511 64 22 41 44 34 9 129 132 5 4 123 795 5 791 749 34 790 41 41 1116 626 20 473 322 167 1565 1 228 144 102 9 41 38 37 2 4 2 5 2 2 7 1114 136 136 757 757 10 737 14 31 947 953 6 1 5 139 50 10 63 31 6 1120 2 2 1114 3 30 1088 16 16 110 23 972 148 9 552 139 330 1 5 5 3 2 447 447 446 136 328 445 132 1 2 1 6 25 34 66 129 38 37 867 850 77 2 852 844 850 852 11 2 147 16 2 137 141 146 146 138 3 87 38 2 6 2 118 136 31 120 121 88 49 45 3 7 742 710 68 713 707 687 713 1 33 55 27 63 90 18 1 87 86 55 36 918 1 50 4 264 656 889 917 354 17 1 332 9 338 358 5 1 5 4017 212 2 4023 4228 301 1 4268 4222 4234 1905 1903 214 2 1690 165 2 1910 1899 1927 1926 209 15 1 209 208 213 213 15127 10 15089 60 15061 65 14522 35 1583 76 15932 8 937 62 14998 15037 4 8 3 14211 416 481 885 883 14697 338 912 524 36 13763 831 824 314 14506 476 15001 15014 15001 79 14999 36 1 1356 3 13752 13682 13823 13817 1338 1 65 1 1080 915 1090 4 23 1302 1338 939 993 281 992 357 752 198 7 28 748 935 939 46 929 63 1 19 2 283 264 294 294 666 666 37 2 148 512 1 555 320 549 481 3 68 3 158 102 87 634 2 585 15 71 672 669 3 3 1 432 13 74 822 321 318 366 368 111 58 58 115 11 2 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 // SPDX-License-Identifier: GPL-2.0-or-later /* * NET An implementation of the SOCKET network access protocol. * * Version: @(#)socket.c 1.1.93 18/02/95 * * Authors: Orest Zborowski, <obz@Kodak.COM> * Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * * Fixes: * Anonymous : NOTSOCK/BADF cleanup. Error fix in * shutdown() * Alan Cox : verify_area() fixes * Alan Cox : Removed DDI * Jonathan Kamens : SOCK_DGRAM reconnect bug * Alan Cox : Moved a load of checks to the very * top level. * Alan Cox : Move address structures to/from user * mode above the protocol layers. * Rob Janssen : Allow 0 length sends. * Alan Cox : Asynchronous I/O support (cribbed from the * tty drivers). * Niibe Yutaka : Asynchronous I/O for writes (4.4BSD style) * Jeff Uphoff : Made max number of sockets command-line * configurable. * Matti Aarnio : Made the number of sockets dynamic, * to be allocated when needed, and mr. * Uphoff's max is used as max to be * allowed to allocate. * Linus : Argh. removed all the socket allocation * altogether: it's in the inode now. * Alan Cox : Made sock_alloc()/sock_release() public * for NetROM and future kernel nfsd type * stuff. * Alan Cox : sendmsg/recvmsg basics. * Tom Dyas : Export net symbols. * Marcin Dalecki : Fixed problems with CONFIG_NET="n". * Alan Cox : Added thread locking to sys_* calls * for sockets. May have errors at the * moment. * Kevin Buhr : Fixed the dumb errors in the above. * Andi Kleen : Some small cleanups, optimizations, * and fixed a copy_from_user() bug. * Tigran Aivazian : sys_send(args) calls sys_sendto(args, NULL, 0) * Tigran Aivazian : Made listen(2) backlog sanity checks * protocol-independent * * This module is effectively the top level interface to the BSD socket * paradigm. * * Based upon Swansea University Computer Society NET3.039 */ #include <linux/bpf-cgroup.h> #include <linux/ethtool.h> #include <linux/mm.h> #include <linux/socket.h> #include <linux/file.h> #include <linux/splice.h> #include <linux/net.h> #include <linux/interrupt.h> #include <linux/thread_info.h> #include <linux/rcupdate.h> #include <linux/netdevice.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/mutex.h> #include <linux/if_bridge.h> #include <linux/if_vlan.h> #include <linux/ptp_classify.h> #include <linux/init.h> #include <linux/poll.h> #include <linux/cache.h> #include <linux/module.h> #include <linux/highmem.h> #include <linux/mount.h> #include <linux/pseudo_fs.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/compat.h> #include <linux/kmod.h> #include <linux/audit.h> #include <linux/wireless.h> #include <linux/nsproxy.h> #include <linux/magic.h> #include <linux/slab.h> #include <linux/xattr.h> #include <linux/nospec.h> #include <linux/indirect_call_wrapper.h> #include <linux/io_uring/net.h> #include <linux/uaccess.h> #include <asm/unistd.h> #include <net/compat.h> #include <net/wext.h> #include <net/cls_cgroup.h> #include <net/sock.h> #include <linux/netfilter.h> #include <linux/if_tun.h> #include <linux/ipv6_route.h> #include <linux/route.h> #include <linux/termios.h> #include <linux/sockios.h> #include <net/busy_poll.h> #include <linux/errqueue.h> #include <linux/ptp_clock_kernel.h> #include <trace/events/sock.h> #include "core/dev.h" #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sysctl_net_busy_read __read_mostly; unsigned int sysctl_net_busy_poll __read_mostly; #endif static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to); static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from); static int sock_mmap(struct file *file, struct vm_area_struct *vma); static int sock_close(struct inode *inode, struct file *file); static __poll_t sock_poll(struct file *file, struct poll_table_struct *wait); static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #ifdef CONFIG_COMPAT static long compat_sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #endif static int sock_fasync(int fd, struct file *filp, int on); static ssize_t sock_splice_read(struct file *file, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); static void sock_splice_eof(struct file *file); #ifdef CONFIG_PROC_FS static void sock_show_fdinfo(struct seq_file *m, struct file *f) { struct socket *sock = f->private_data; const struct proto_ops *ops = READ_ONCE(sock->ops); if (ops->show_fdinfo) ops->show_fdinfo(m, sock); } #else #define sock_show_fdinfo NULL #endif /* * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear * in the operation structures but are done directly via the socketcall() multiplexor. */ static const struct file_operations socket_file_ops = { .owner = THIS_MODULE, .read_iter = sock_read_iter, .write_iter = sock_write_iter, .poll = sock_poll, .unlocked_ioctl = sock_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_sock_ioctl, #endif .uring_cmd = io_uring_cmd_sock, .mmap = sock_mmap, .release = sock_close, .fasync = sock_fasync, .splice_write = splice_to_socket, .splice_read = sock_splice_read, .splice_eof = sock_splice_eof, .show_fdinfo = sock_show_fdinfo, }; static const char * const pf_family_names[] = { [PF_UNSPEC] = "PF_UNSPEC", [PF_UNIX] = "PF_UNIX/PF_LOCAL", [PF_INET] = "PF_INET", [PF_AX25] = "PF_AX25", [PF_IPX] = "PF_IPX", [PF_APPLETALK] = "PF_APPLETALK", [PF_NETROM] = "PF_NETROM", [PF_BRIDGE] = "PF_BRIDGE", [PF_ATMPVC] = "PF_ATMPVC", [PF_X25] = "PF_X25", [PF_INET6] = "PF_INET6", [PF_ROSE] = "PF_ROSE", [PF_DECnet] = "PF_DECnet", [PF_NETBEUI] = "PF_NETBEUI", [PF_SECURITY] = "PF_SECURITY", [PF_KEY] = "PF_KEY", [PF_NETLINK] = "PF_NETLINK/PF_ROUTE", [PF_PACKET] = "PF_PACKET", [PF_ASH] = "PF_ASH", [PF_ECONET] = "PF_ECONET", [PF_ATMSVC] = "PF_ATMSVC", [PF_RDS] = "PF_RDS", [PF_SNA] = "PF_SNA", [PF_IRDA] = "PF_IRDA", [PF_PPPOX] = "PF_PPPOX", [PF_WANPIPE] = "PF_WANPIPE", [PF_LLC] = "PF_LLC", [PF_IB] = "PF_IB", [PF_MPLS] = "PF_MPLS", [PF_CAN] = "PF_CAN", [PF_TIPC] = "PF_TIPC", [PF_BLUETOOTH] = "PF_BLUETOOTH", [PF_IUCV] = "PF_IUCV", [PF_RXRPC] = "PF_RXRPC", [PF_ISDN] = "PF_ISDN", [PF_PHONET] = "PF_PHONET", [PF_IEEE802154] = "PF_IEEE802154", [PF_CAIF] = "PF_CAIF", [PF_ALG] = "PF_ALG", [PF_NFC] = "PF_NFC", [PF_VSOCK] = "PF_VSOCK", [PF_KCM] = "PF_KCM", [PF_QIPCRTR] = "PF_QIPCRTR", [PF_SMC] = "PF_SMC", [PF_XDP] = "PF_XDP", [PF_MCTP] = "PF_MCTP", }; /* * The protocol list. Each protocol is registered in here. */ static DEFINE_SPINLOCK(net_family_lock); static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly; /* * Support routines. * Move socket addresses back and forth across the kernel/user * divide and look after the messy bits. */ /** * move_addr_to_kernel - copy a socket address into kernel space * @uaddr: Address in user space * @kaddr: Address in kernel space * @ulen: Length in user space * * The address is copied into kernel space. If the provided address is * too long an error code of -EINVAL is returned. If the copy gives * invalid addresses -EFAULT is returned. On a success 0 is returned. */ int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr) { if (ulen < 0 || ulen > sizeof(struct sockaddr_storage)) return -EINVAL; if (ulen == 0) return 0; if (copy_from_user(kaddr, uaddr, ulen)) return -EFAULT; return audit_sockaddr(ulen, kaddr); } /** * move_addr_to_user - copy an address to user space * @kaddr: kernel space address * @klen: length of address in kernel * @uaddr: user space address * @ulen: pointer to user length field * * The value pointed to by ulen on entry is the buffer length available. * This is overwritten with the buffer space used. -EINVAL is returned * if an overlong buffer is specified or a negative buffer size. -EFAULT * is returned if either the buffer or the length field are not * accessible. * After copying the data up to the limit the user specifies, the true * length of the data is written over the length limit the user * specified. Zero is returned for a success. */ static int move_addr_to_user(struct sockaddr_storage *kaddr, int klen, void __user *uaddr, int __user *ulen) { int len; BUG_ON(klen > sizeof(struct sockaddr_storage)); if (can_do_masked_user_access()) ulen = masked_user_access_begin(ulen); else if (!user_access_begin(ulen, 4)) return -EFAULT; unsafe_get_user(len, ulen, efault_end); if (len > klen) len = klen; /* * "fromlen shall refer to the value before truncation.." * 1003.1g */ if (len >= 0) unsafe_put_user(klen, ulen, efault_end); user_access_end(); if (len) { if (len < 0) return -EINVAL; if (audit_sockaddr(klen, kaddr)) return -ENOMEM; if (copy_to_user(uaddr, kaddr, len)) return -EFAULT; } return 0; efault_end: user_access_end(); return -EFAULT; } static struct kmem_cache *sock_inode_cachep __ro_after_init; static struct inode *sock_alloc_inode(struct super_block *sb) { struct socket_alloc *ei; ei = alloc_inode_sb(sb, sock_inode_cachep, GFP_KERNEL); if (!ei) return NULL; init_waitqueue_head(&ei->socket.wq.wait); ei->socket.wq.fasync_list = NULL; ei->socket.wq.flags = 0; ei->socket.state = SS_UNCONNECTED; ei->socket.flags = 0; ei->socket.ops = NULL; ei->socket.sk = NULL; ei->socket.file = NULL; return &ei->vfs_inode; } static void sock_free_inode(struct inode *inode) { struct socket_alloc *ei; ei = container_of(inode, struct socket_alloc, vfs_inode); kmem_cache_free(sock_inode_cachep, ei); } static void init_once(void *foo) { struct socket_alloc *ei = (struct socket_alloc *)foo; inode_init_once(&ei->vfs_inode); } static void init_inodecache(void) { sock_inode_cachep = kmem_cache_create("sock_inode_cache", sizeof(struct socket_alloc), 0, (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT), init_once); BUG_ON(sock_inode_cachep == NULL); } static const struct super_operations sockfs_ops = { .alloc_inode = sock_alloc_inode, .free_inode = sock_free_inode, .statfs = simple_statfs, }; /* * sockfs_dname() is called from d_path(). */ static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen) { return dynamic_dname(buffer, buflen, "socket:[%lu]", d_inode(dentry)->i_ino); } static const struct dentry_operations sockfs_dentry_operations = { .d_dname = sockfs_dname, }; static int sockfs_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *suffix, void *value, size_t size) { if (value) { if (dentry->d_name.len + 1 > size) return -ERANGE; memcpy(value, dentry->d_name.name, dentry->d_name.len + 1); } return dentry->d_name.len + 1; } #define XATTR_SOCKPROTONAME_SUFFIX "sockprotoname" #define XATTR_NAME_SOCKPROTONAME (XATTR_SYSTEM_PREFIX XATTR_SOCKPROTONAME_SUFFIX) #define XATTR_NAME_SOCKPROTONAME_LEN (sizeof(XATTR_NAME_SOCKPROTONAME)-1) static const struct xattr_handler sockfs_xattr_handler = { .name = XATTR_NAME_SOCKPROTONAME, .get = sockfs_xattr_get, }; static int sockfs_security_xattr_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *suffix, const void *value, size_t size, int flags) { /* Handled by LSM. */ return -EAGAIN; } static const struct xattr_handler sockfs_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, .set = sockfs_security_xattr_set, }; static const struct xattr_handler * const sockfs_xattr_handlers[] = { &sockfs_xattr_handler, &sockfs_security_xattr_handler, NULL }; static int sockfs_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, SOCKFS_MAGIC); if (!ctx) return -ENOMEM; ctx->ops = &sockfs_ops; ctx->dops = &sockfs_dentry_operations; ctx->xattr = sockfs_xattr_handlers; return 0; } static struct vfsmount *sock_mnt __read_mostly; static struct file_system_type sock_fs_type = { .name = "sockfs", .init_fs_context = sockfs_init_fs_context, .kill_sb = kill_anon_super, }; /* * Obtains the first available file descriptor and sets it up for use. * * These functions create file structures and maps them to fd space * of the current process. On success it returns file descriptor * and file struct implicitly stored in sock->file. * Note that another thread may close file descriptor before we return * from this function. We use the fact that now we do not refer * to socket after mapping. If one day we will need it, this * function will increment ref. count on file by 1. * * In any case returned fd MAY BE not valid! * This race condition is unavoidable * with shared fd spaces, we cannot solve it inside kernel, * but we take care of internal coherence yet. */ /** * sock_alloc_file - Bind a &socket to a &file * @sock: socket * @flags: file status flags * @dname: protocol name * * Returns the &file bound with @sock, implicitly storing it * in sock->file. If dname is %NULL, sets to "". * * On failure @sock is released, and an ERR pointer is returned. * * This function uses GFP_KERNEL internally. */ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname) { struct file *file; if (!dname) dname = sock->sk ? sock->sk->sk_prot_creator->name : ""; file = alloc_file_pseudo(SOCK_INODE(sock), sock_mnt, dname, O_RDWR | (flags & O_NONBLOCK), &socket_file_ops); if (IS_ERR(file)) { sock_release(sock); return file; } file->f_mode |= FMODE_NOWAIT; sock->file = file; file->private_data = sock; stream_open(SOCK_INODE(sock), file); /* * Disable permission and pre-content events, but enable legacy * inotify events for legacy users. */ file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM); return file; } EXPORT_SYMBOL(sock_alloc_file); static int sock_map_fd(struct socket *sock, int flags) { struct file *newfile; int fd = get_unused_fd_flags(flags); if (unlikely(fd < 0)) { sock_release(sock); return fd; } newfile = sock_alloc_file(sock, flags, NULL); if (!IS_ERR(newfile)) { fd_install(fd, newfile); return fd; } put_unused_fd(fd); return PTR_ERR(newfile); } /** * sock_from_file - Return the &socket bounded to @file. * @file: file * * On failure returns %NULL. */ struct socket *sock_from_file(struct file *file) { if (likely(file->f_op == &socket_file_ops)) return file->private_data; /* set in sock_alloc_file */ return NULL; } EXPORT_SYMBOL(sock_from_file); /** * sockfd_lookup - Go from a file number to its socket slot * @fd: file handle * @err: pointer to an error code return * * The file handle passed in is locked and the socket it is bound * to is returned. If an error occurs the err pointer is overwritten * with a negative errno code and NULL is returned. The function checks * for both invalid handles and passing a handle which is not a socket. * * On a success the socket object pointer is returned. */ struct socket *sockfd_lookup(int fd, int *err) { struct file *file; struct socket *sock; file = fget(fd); if (!file) { *err = -EBADF; return NULL; } sock = sock_from_file(file); if (!sock) { *err = -ENOTSOCK; fput(file); } return sock; } EXPORT_SYMBOL(sockfd_lookup); static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer, size_t size) { ssize_t len; ssize_t used = 0; len = security_inode_listsecurity(d_inode(dentry), buffer, size); if (len < 0) return len; used += len; if (buffer) { if (size < used) return -ERANGE; buffer += len; } len = (XATTR_NAME_SOCKPROTONAME_LEN + 1); used += len; if (buffer) { if (size < used) return -ERANGE; memcpy(buffer, XATTR_NAME_SOCKPROTONAME, len); buffer += len; } return used; } static int sockfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { int err = simple_setattr(&nop_mnt_idmap, dentry, iattr); if (!err && (iattr->ia_valid & ATTR_UID)) { struct socket *sock = SOCKET_I(d_inode(dentry)); if (sock->sk) { /* Paired with READ_ONCE() in sk_uid() */ WRITE_ONCE(sock->sk->sk_uid, iattr->ia_uid); } else { err = -ENOENT; } } return err; } static const struct inode_operations sockfs_inode_ops = { .listxattr = sockfs_listxattr, .setattr = sockfs_setattr, }; /** * sock_alloc - allocate a socket * * Allocate a new inode and socket object. The two are bound together * and initialised. The socket is then returned. If we are out of inodes * NULL is returned. This functions uses GFP_KERNEL internally. */ struct socket *sock_alloc(void) { struct inode *inode; struct socket *sock; inode = new_inode_pseudo(sock_mnt->mnt_sb); if (!inode) return NULL; sock = SOCKET_I(inode); inode->i_ino = get_next_ino(); inode->i_mode = S_IFSOCK | S_IRWXUGO; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_op = &sockfs_inode_ops; return sock; } EXPORT_SYMBOL(sock_alloc); static void __sock_release(struct socket *sock, struct inode *inode) { const struct proto_ops *ops = READ_ONCE(sock->ops); if (ops) { struct module *owner = ops->owner; if (inode) inode_lock(inode); ops->release(sock); sock->sk = NULL; if (inode) inode_unlock(inode); sock->ops = NULL; module_put(owner); } if (sock->wq.fasync_list) pr_err("%s: fasync list not empty!\n", __func__); if (!sock->file) { iput(SOCK_INODE(sock)); return; } sock->file = NULL; } /** * sock_release - close a socket * @sock: socket to close * * The socket is released from the protocol stack if it has a release * callback, and the inode is then released if the socket is bound to * an inode not a file. */ void sock_release(struct socket *sock) { __sock_release(sock, NULL); } EXPORT_SYMBOL(sock_release); void __sock_tx_timestamp(__u32 tsflags, __u8 *tx_flags) { u8 flags = *tx_flags; if (tsflags & SOF_TIMESTAMPING_TX_HARDWARE) flags |= SKBTX_HW_TSTAMP_NOBPF; if (tsflags & SOF_TIMESTAMPING_TX_SOFTWARE) flags |= SKBTX_SW_TSTAMP; if (tsflags & SOF_TIMESTAMPING_TX_SCHED) flags |= SKBTX_SCHED_TSTAMP; if (tsflags & SOF_TIMESTAMPING_TX_COMPLETION) flags |= SKBTX_COMPLETION_TSTAMP; *tx_flags = flags; } EXPORT_SYMBOL(__sock_tx_timestamp); INDIRECT_CALLABLE_DECLARE(int inet_sendmsg(struct socket *, struct msghdr *, size_t)); INDIRECT_CALLABLE_DECLARE(int inet6_sendmsg(struct socket *, struct msghdr *, size_t)); static noinline void call_trace_sock_send_length(struct sock *sk, int ret, int flags) { trace_sock_send_length(sk, ret, 0); } static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) { int ret = INDIRECT_CALL_INET(READ_ONCE(sock->ops)->sendmsg, inet6_sendmsg, inet_sendmsg, sock, msg, msg_data_left(msg)); BUG_ON(ret == -EIOCBQUEUED); if (trace_sock_send_length_enabled()) call_trace_sock_send_length(sock->sk, ret, 0); return ret; } static int __sock_sendmsg(struct socket *sock, struct msghdr *msg) { int err = security_socket_sendmsg(sock, msg, msg_data_left(msg)); return err ?: sock_sendmsg_nosec(sock, msg); } /** * sock_sendmsg - send a message through @sock * @sock: socket * @msg: message to send * * Sends @msg through @sock, passing through LSM. * Returns the number of bytes sent, or an error code. */ int sock_sendmsg(struct socket *sock, struct msghdr *msg) { struct sockaddr_storage *save_addr = (struct sockaddr_storage *)msg->msg_name; struct sockaddr_storage address; int save_len = msg->msg_namelen; int ret; if (msg->msg_name) { memcpy(&address, msg->msg_name, msg->msg_namelen); msg->msg_name = &address; } ret = __sock_sendmsg(sock, msg); msg->msg_name = save_addr; msg->msg_namelen = save_len; return ret; } EXPORT_SYMBOL(sock_sendmsg); /** * kernel_sendmsg - send a message through @sock (kernel-space) * @sock: socket * @msg: message header * @vec: kernel vec * @num: vec array length * @size: total message data size * * Builds the message data with @vec and sends it through @sock. * Returns the number of bytes sent, or an error code. */ int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size) { iov_iter_kvec(&msg->msg_iter, ITER_SOURCE, vec, num, size); return sock_sendmsg(sock, msg); } EXPORT_SYMBOL(kernel_sendmsg); static bool skb_is_err_queue(const struct sk_buff *skb) { /* pkt_type of skbs enqueued on the error queue are set to * PACKET_OUTGOING in skb_set_err_queue(). This is only safe to do * in recvmsg, since skbs received on a local socket will never * have a pkt_type of PACKET_OUTGOING. */ return skb->pkt_type == PACKET_OUTGOING; } /* On transmit, software and hardware timestamps are returned independently. * As the two skb clones share the hardware timestamp, which may be updated * before the software timestamp is received, a hardware TX timestamp may be * returned only if there is no software TX timestamp. Ignore false software * timestamps, which may be made in the __sock_recv_timestamp() call when the * option SO_TIMESTAMP_OLD(NS) is enabled on the socket, even when the skb has a * hardware timestamp. */ static bool skb_is_swtx_tstamp(const struct sk_buff *skb, int false_tstamp) { return skb->tstamp && !false_tstamp && skb_is_err_queue(skb); } static ktime_t get_timestamp(struct sock *sk, struct sk_buff *skb, int *if_index) { bool cycles = READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_BIND_PHC; struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); struct net_device *orig_dev; ktime_t hwtstamp; rcu_read_lock(); orig_dev = dev_get_by_napi_id(skb_napi_id(skb)); if (orig_dev) { *if_index = orig_dev->ifindex; hwtstamp = netdev_get_tstamp(orig_dev, shhwtstamps, cycles); } else { hwtstamp = shhwtstamps->hwtstamp; } rcu_read_unlock(); return hwtstamp; } static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb, int if_index) { struct scm_ts_pktinfo ts_pktinfo; struct net_device *orig_dev; if (!skb_mac_header_was_set(skb)) return; memset(&ts_pktinfo, 0, sizeof(ts_pktinfo)); if (!if_index) { rcu_read_lock(); orig_dev = dev_get_by_napi_id(skb_napi_id(skb)); if (orig_dev) if_index = orig_dev->ifindex; rcu_read_unlock(); } ts_pktinfo.if_index = if_index; ts_pktinfo.pkt_length = skb->len - skb_mac_offset(skb); put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_PKTINFO, sizeof(ts_pktinfo), &ts_pktinfo); } bool skb_has_tx_timestamp(struct sk_buff *skb, const struct sock *sk) { const struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); u32 tsflags = READ_ONCE(sk->sk_tsflags); if (serr->ee.ee_errno != ENOMSG || serr->ee.ee_origin != SO_EE_ORIGIN_TIMESTAMPING) return false; /* software time stamp available and wanted */ if ((tsflags & SOF_TIMESTAMPING_SOFTWARE) && skb->tstamp) return true; /* hardware time stamps available and wanted */ return (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && skb_hwtstamps(skb)->hwtstamp; } int skb_get_tx_timestamp(struct sk_buff *skb, struct sock *sk, struct timespec64 *ts) { u32 tsflags = READ_ONCE(sk->sk_tsflags); ktime_t hwtstamp; int if_index = 0; if ((tsflags & SOF_TIMESTAMPING_SOFTWARE) && ktime_to_timespec64_cond(skb->tstamp, ts)) return SOF_TIMESTAMPING_TX_SOFTWARE; if (!(tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) || skb_is_swtx_tstamp(skb, false)) return -ENOENT; if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NETDEV) hwtstamp = get_timestamp(sk, skb, &if_index); else hwtstamp = skb_hwtstamps(skb)->hwtstamp; if (tsflags & SOF_TIMESTAMPING_BIND_PHC) hwtstamp = ptp_convert_timestamp(&hwtstamp, READ_ONCE(sk->sk_bind_phc)); if (!ktime_to_timespec64_cond(hwtstamp, ts)) return -ENOENT; return SOF_TIMESTAMPING_TX_HARDWARE; } /* * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP) */ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP); int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); struct scm_timestamping_internal tss; int empty = 1, false_tstamp = 0; struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); int if_index; ktime_t hwtstamp; u32 tsflags; /* Race occurred between timestamp enabling and packet receiving. Fill in the current time for now. */ if (need_software_tstamp && skb->tstamp == 0) { __net_timestamp(skb); false_tstamp = 1; } if (need_software_tstamp) { if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) { if (new_tstamp) { struct __kernel_sock_timeval tv; skb_get_new_timestamp(skb, &tv); put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW, sizeof(tv), &tv); } else { struct __kernel_old_timeval tv; skb_get_timestamp(skb, &tv); put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, sizeof(tv), &tv); } } else { if (new_tstamp) { struct __kernel_timespec ts; skb_get_new_timestampns(skb, &ts); put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW, sizeof(ts), &ts); } else { struct __kernel_old_timespec ts; skb_get_timestampns(skb, &ts); put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD, sizeof(ts), &ts); } } } memset(&tss, 0, sizeof(tss)); tsflags = READ_ONCE(sk->sk_tsflags); if ((tsflags & SOF_TIMESTAMPING_SOFTWARE && (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE || skb_is_err_queue(skb) || !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER))) && ktime_to_timespec64_cond(skb->tstamp, tss.ts + 0)) empty = 0; if (shhwtstamps && (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE && (tsflags & SOF_TIMESTAMPING_RX_HARDWARE || skb_is_err_queue(skb) || !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER))) && !skb_is_swtx_tstamp(skb, false_tstamp)) { if_index = 0; if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NETDEV) hwtstamp = get_timestamp(sk, skb, &if_index); else hwtstamp = shhwtstamps->hwtstamp; if (tsflags & SOF_TIMESTAMPING_BIND_PHC) hwtstamp = ptp_convert_timestamp(&hwtstamp, READ_ONCE(sk->sk_bind_phc)); if (ktime_to_timespec64_cond(hwtstamp, tss.ts + 2)) { empty = 0; if ((tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) && !skb_is_err_queue(skb)) put_ts_pktinfo(msg, skb, if_index); } } if (!empty) { if (sock_flag(sk, SOCK_TSTAMP_NEW)) put_cmsg_scm_timestamping64(msg, &tss); else put_cmsg_scm_timestamping(msg, &tss); if (skb_is_err_queue(skb) && skb->len && SKB_EXT_ERR(skb)->opt_stats) put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_OPT_STATS, skb->len, skb->data); } } EXPORT_SYMBOL_GPL(__sock_recv_timestamp); #ifdef CONFIG_WIRELESS void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { int ack; if (!sock_flag(sk, SOCK_WIFI_STATUS)) return; if (!skb->wifi_acked_valid) return; ack = skb->wifi_acked; put_cmsg(msg, SOL_SOCKET, SCM_WIFI_STATUS, sizeof(ack), &ack); } EXPORT_SYMBOL_GPL(__sock_recv_wifi_status); #endif static inline void sock_recv_drops(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { if (sock_flag(sk, SOCK_RXQ_OVFL) && skb && SOCK_SKB_CB(skb)->dropcount) put_cmsg(msg, SOL_SOCKET, SO_RXQ_OVFL, sizeof(__u32), &SOCK_SKB_CB(skb)->dropcount); } static void sock_recv_mark(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { if (sock_flag(sk, SOCK_RCVMARK) && skb) { /* We must use a bounce buffer for CONFIG_HARDENED_USERCOPY=y */ __u32 mark = skb->mark; put_cmsg(msg, SOL_SOCKET, SO_MARK, sizeof(__u32), &mark); } } static void sock_recv_priority(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { if (sock_flag(sk, SOCK_RCVPRIORITY) && skb) { __u32 priority = skb->priority; put_cmsg(msg, SOL_SOCKET, SO_PRIORITY, sizeof(__u32), &priority); } } void __sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { sock_recv_timestamp(msg, sk, skb); sock_recv_drops(msg, sk, skb); sock_recv_mark(msg, sk, skb); sock_recv_priority(msg, sk, skb); } EXPORT_SYMBOL_GPL(__sock_recv_cmsgs); INDIRECT_CALLABLE_DECLARE(int inet_recvmsg(struct socket *, struct msghdr *, size_t, int)); INDIRECT_CALLABLE_DECLARE(int inet6_recvmsg(struct socket *, struct msghdr *, size_t, int)); static noinline void call_trace_sock_recv_length(struct sock *sk, int ret, int flags) { trace_sock_recv_length(sk, ret, flags); } static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg, int flags) { int ret = INDIRECT_CALL_INET(READ_ONCE(sock->ops)->recvmsg, inet6_recvmsg, inet_recvmsg, sock, msg, msg_data_left(msg), flags); if (trace_sock_recv_length_enabled()) call_trace_sock_recv_length(sock->sk, ret, flags); return ret; } /** * sock_recvmsg - receive a message from @sock * @sock: socket * @msg: message to receive * @flags: message flags * * Receives @msg from @sock, passing through LSM. Returns the total number * of bytes received, or an error. */ int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags) { int err = security_socket_recvmsg(sock, msg, msg_data_left(msg), flags); return err ?: sock_recvmsg_nosec(sock, msg, flags); } EXPORT_SYMBOL(sock_recvmsg); /** * kernel_recvmsg - Receive a message from a socket (kernel space) * @sock: The socket to receive the message from * @msg: Received message * @vec: Input s/g array for message data * @num: Size of input s/g array * @size: Number of bytes to read * @flags: Message flags (MSG_DONTWAIT, etc...) * * On return the msg structure contains the scatter/gather array passed in the * vec argument. The array is modified so that it consists of the unfilled * portion of the original array. * * The returned value is the total number of bytes received, or an error. */ int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size, int flags) { msg->msg_control_is_user = false; iov_iter_kvec(&msg->msg_iter, ITER_DEST, vec, num, size); return sock_recvmsg(sock, msg, flags); } EXPORT_SYMBOL(kernel_recvmsg); static ssize_t sock_splice_read(struct file *file, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct socket *sock = file->private_data; const struct proto_ops *ops; ops = READ_ONCE(sock->ops); if (unlikely(!ops->splice_read)) return copy_splice_read(file, ppos, pipe, len, flags); return ops->splice_read(sock, ppos, pipe, len, flags); } static void sock_splice_eof(struct file *file) { struct socket *sock = file->private_data; const struct proto_ops *ops; ops = READ_ONCE(sock->ops); if (ops->splice_eof) ops->splice_eof(sock); } static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct socket *sock = file->private_data; struct msghdr msg = {.msg_iter = *to, .msg_iocb = iocb}; ssize_t res; if (file->f_flags & O_NONBLOCK || (iocb->ki_flags & IOCB_NOWAIT)) msg.msg_flags = MSG_DONTWAIT; if (iocb->ki_pos != 0) return -ESPIPE; if (!iov_iter_count(to)) /* Match SYS5 behaviour */ return 0; res = sock_recvmsg(sock, &msg, msg.msg_flags); *to = msg.msg_iter; return res; } static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct socket *sock = file->private_data; struct msghdr msg = {.msg_iter = *from, .msg_iocb = iocb}; ssize_t res; if (iocb->ki_pos != 0) return -ESPIPE; if (file->f_flags & O_NONBLOCK || (iocb->ki_flags & IOCB_NOWAIT)) msg.msg_flags = MSG_DONTWAIT; if (sock->type == SOCK_SEQPACKET) msg.msg_flags |= MSG_EOR; if (iocb->ki_flags & IOCB_NOSIGNAL) msg.msg_flags |= MSG_NOSIGNAL; res = __sock_sendmsg(sock, &msg); *from = msg.msg_iter; return res; } /* * Atomic setting of ioctl hooks to avoid race * with module unload. */ static DEFINE_MUTEX(br_ioctl_mutex); static int (*br_ioctl_hook)(struct net *net, unsigned int cmd, void __user *uarg); void brioctl_set(int (*hook)(struct net *net, unsigned int cmd, void __user *uarg)) { mutex_lock(&br_ioctl_mutex); br_ioctl_hook = hook; mutex_unlock(&br_ioctl_mutex); } EXPORT_SYMBOL(brioctl_set); int br_ioctl_call(struct net *net, unsigned int cmd, void __user *uarg) { int err = -ENOPKG; if (!br_ioctl_hook) request_module("bridge"); mutex_lock(&br_ioctl_mutex); if (br_ioctl_hook) err = br_ioctl_hook(net, cmd, uarg); mutex_unlock(&br_ioctl_mutex); return err; } static DEFINE_MUTEX(vlan_ioctl_mutex); static int (*vlan_ioctl_hook) (struct net *, void __user *arg); void vlan_ioctl_set(int (*hook) (struct net *, void __user *)) { mutex_lock(&vlan_ioctl_mutex); vlan_ioctl_hook = hook; mutex_unlock(&vlan_ioctl_mutex); } EXPORT_SYMBOL(vlan_ioctl_set); static long sock_do_ioctl(struct net *net, struct socket *sock, unsigned int cmd, unsigned long arg) { const struct proto_ops *ops = READ_ONCE(sock->ops); struct ifreq ifr; bool need_copyout; int err; void __user *argp = (void __user *)arg; void __user *data; err = ops->ioctl(sock, cmd, arg); /* * If this ioctl is unknown try to hand it down * to the NIC driver. */ if (err != -ENOIOCTLCMD) return err; if (!is_socket_ioctl_cmd(cmd)) return -ENOTTY; if (get_user_ifreq(&ifr, &data, argp)) return -EFAULT; err = dev_ioctl(net, cmd, &ifr, data, &need_copyout); if (!err && need_copyout) if (put_user_ifreq(&ifr, argp)) return -EFAULT; return err; } /* * With an ioctl, arg may well be a user mode pointer, but we don't know * what to do with it - that's up to the protocol still. */ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) { const struct proto_ops *ops; struct socket *sock; struct sock *sk; void __user *argp = (void __user *)arg; int pid, err; struct net *net; sock = file->private_data; ops = READ_ONCE(sock->ops); sk = sock->sk; net = sock_net(sk); if (unlikely(cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))) { struct ifreq ifr; void __user *data; bool need_copyout; if (get_user_ifreq(&ifr, &data, argp)) return -EFAULT; err = dev_ioctl(net, cmd, &ifr, data, &need_copyout); if (!err && need_copyout) if (put_user_ifreq(&ifr, argp)) return -EFAULT; } else #ifdef CONFIG_WEXT_CORE if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { err = wext_handle_ioctl(net, cmd, argp); } else #endif switch (cmd) { case FIOSETOWN: case SIOCSPGRP: err = -EFAULT; if (get_user(pid, (int __user *)argp)) break; err = f_setown(sock->file, pid, 1); break; case FIOGETOWN: case SIOCGPGRP: err = put_user(f_getown(sock->file), (int __user *)argp); break; case SIOCGIFBR: case SIOCSIFBR: case SIOCBRADDBR: case SIOCBRDELBR: case SIOCBRADDIF: case SIOCBRDELIF: err = br_ioctl_call(net, cmd, argp); break; case SIOCGIFVLAN: case SIOCSIFVLAN: err = -ENOPKG; if (!vlan_ioctl_hook) request_module("8021q"); mutex_lock(&vlan_ioctl_mutex); if (vlan_ioctl_hook) err = vlan_ioctl_hook(net, argp); mutex_unlock(&vlan_ioctl_mutex); break; case SIOCGSKNS: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = open_related_ns(&net->ns, get_net_ns); break; case SIOCGSTAMP_OLD: case SIOCGSTAMPNS_OLD: if (!ops->gettstamp) { err = -ENOIOCTLCMD; break; } err = ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_OLD, !IS_ENABLED(CONFIG_64BIT)); break; case SIOCGSTAMP_NEW: case SIOCGSTAMPNS_NEW: if (!ops->gettstamp) { err = -ENOIOCTLCMD; break; } err = ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_NEW, false); break; case SIOCGIFCONF: err = dev_ifconf(net, argp); break; default: err = sock_do_ioctl(net, sock, cmd, arg); break; } return err; } /** * sock_create_lite - creates a socket * @family: protocol family (AF_INET, ...) * @type: communication type (SOCK_STREAM, ...) * @protocol: protocol (0, ...) * @res: new socket * * Creates a new socket and assigns it to @res, passing through LSM. * The new socket initialization is not complete, see kernel_accept(). * Returns 0 or an error. On failure @res is set to %NULL. * This function internally uses GFP_KERNEL. */ int sock_create_lite(int family, int type, int protocol, struct socket **res) { int err; struct socket *sock = NULL; err = security_socket_create(family, type, protocol, 1); if (err) goto out; sock = sock_alloc(); if (!sock) { err = -ENOMEM; goto out; } sock->type = type; err = security_socket_post_create(sock, family, type, protocol, 1); if (err) goto out_release; out: *res = sock; return err; out_release: sock_release(sock); sock = NULL; goto out; } EXPORT_SYMBOL(sock_create_lite); /* No kernel lock held - perfect */ static __poll_t sock_poll(struct file *file, poll_table *wait) { struct socket *sock = file->private_data; const struct proto_ops *ops = READ_ONCE(sock->ops); __poll_t events = poll_requested_events(wait), flag = 0; if (!ops->poll) return 0; if (sk_can_busy_loop(sock->sk)) { /* poll once if requested by the syscall */ if (events & POLL_BUSY_LOOP) sk_busy_loop(sock->sk, 1); /* if this socket can poll_ll, tell the system call */ flag = POLL_BUSY_LOOP; } return ops->poll(file, sock, wait) | flag; } static int sock_mmap(struct file *file, struct vm_area_struct *vma) { struct socket *sock = file->private_data; return READ_ONCE(sock->ops)->mmap(file, sock, vma); } static int sock_close(struct inode *inode, struct file *filp) { __sock_release(SOCKET_I(inode), inode); return 0; } /* * Update the socket async list * * Fasync_list locking strategy. * * 1. fasync_list is modified only under process context socket lock * i.e. under semaphore. * 2. fasync_list is used under read_lock(&sk->sk_callback_lock) * or under socket lock */ static int sock_fasync(int fd, struct file *filp, int on) { struct socket *sock = filp->private_data; struct sock *sk = sock->sk; struct socket_wq *wq = &sock->wq; if (sk == NULL) return -EINVAL; lock_sock(sk); fasync_helper(fd, filp, on, &wq->fasync_list); if (!wq->fasync_list) sock_reset_flag(sk, SOCK_FASYNC); else sock_set_flag(sk, SOCK_FASYNC); release_sock(sk); return 0; } /* This function may be called only under rcu_lock */ int sock_wake_async(struct socket_wq *wq, int how, int band) { if (!wq || !wq->fasync_list) return -1; switch (how) { case SOCK_WAKE_WAITD: if (test_bit(SOCKWQ_ASYNC_WAITDATA, &wq->flags)) break; goto call_kill; case SOCK_WAKE_SPACE: if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags)) break; fallthrough; case SOCK_WAKE_IO: call_kill: kill_fasync(&wq->fasync_list, SIGIO, band); break; case SOCK_WAKE_URG: kill_fasync(&wq->fasync_list, SIGURG, band); } return 0; } EXPORT_SYMBOL(sock_wake_async); /** * __sock_create - creates a socket * @net: net namespace * @family: protocol family (AF_INET, ...) * @type: communication type (SOCK_STREAM, ...) * @protocol: protocol (0, ...) * @res: new socket * @kern: boolean for kernel space sockets * * Creates a new socket and assigns it to @res, passing through LSM. * Returns 0 or an error. On failure @res is set to %NULL. @kern must * be set to true if the socket resides in kernel space. * This function internally uses GFP_KERNEL. */ int __sock_create(struct net *net, int family, int type, int protocol, struct socket **res, int kern) { int err; struct socket *sock; const struct net_proto_family *pf; /* * Check protocol is in range */ if (family < 0 || family >= NPROTO) return -EAFNOSUPPORT; if (type < 0 || type >= SOCK_MAX) return -EINVAL; /* Compatibility. This uglymoron is moved from INET layer to here to avoid deadlock in module load. */ if (family == PF_INET && type == SOCK_PACKET) { pr_info_once("%s uses obsolete (PF_INET,SOCK_PACKET)\n", current->comm); family = PF_PACKET; } err = security_socket_create(family, type, protocol, kern); if (err) return err; /* * Allocate the socket and allow the family to set things up. if * the protocol is 0, the family is instructed to select an appropriate * default. */ sock = sock_alloc(); if (!sock) { net_warn_ratelimited("socket: no more sockets\n"); return -ENFILE; /* Not exactly a match, but its the closest posix thing */ } sock->type = type; #ifdef CONFIG_MODULES /* Attempt to load a protocol module if the find failed. * * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user * requested real, full-featured networking support upon configuration. * Otherwise module support will break! */ if (rcu_access_pointer(net_families[family]) == NULL) request_module("net-pf-%d", family); #endif rcu_read_lock(); pf = rcu_dereference(net_families[family]); err = -EAFNOSUPPORT; if (!pf) goto out_release; /* * We will call the ->create function, that possibly is in a loadable * module, so we have to bump that loadable module refcnt first. */ if (!try_module_get(pf->owner)) goto out_release; /* Now protected by module ref count */ rcu_read_unlock(); err = pf->create(net, sock, protocol, kern); if (err < 0) { /* ->create should release the allocated sock->sk object on error * and make sure sock->sk is set to NULL to avoid use-after-free */ DEBUG_NET_WARN_ONCE(sock->sk, "%ps must clear sock->sk on failure, family: %d, type: %d, protocol: %d\n", pf->create, family, type, protocol); goto out_module_put; } /* * Now to bump the refcnt of the [loadable] module that owns this * socket at sock_release time we decrement its refcnt. */ if (!try_module_get(sock->ops->owner)) goto out_module_busy; /* * Now that we're done with the ->create function, the [loadable] * module can have its refcnt decremented */ module_put(pf->owner); err = security_socket_post_create(sock, family, type, protocol, kern); if (err) goto out_sock_release; *res = sock; return 0; out_module_busy: err = -EAFNOSUPPORT; out_module_put: sock->ops = NULL; module_put(pf->owner); out_sock_release: sock_release(sock); return err; out_release: rcu_read_unlock(); goto out_sock_release; } EXPORT_SYMBOL(__sock_create); /** * sock_create - creates a socket * @family: protocol family (AF_INET, ...) * @type: communication type (SOCK_STREAM, ...) * @protocol: protocol (0, ...) * @res: new socket * * A wrapper around __sock_create(). * Returns 0 or an error. This function internally uses GFP_KERNEL. */ int sock_create(int family, int type, int protocol, struct socket **res) { return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0); } EXPORT_SYMBOL(sock_create); /** * sock_create_kern - creates a socket (kernel space) * @net: net namespace * @family: protocol family (AF_INET, ...) * @type: communication type (SOCK_STREAM, ...) * @protocol: protocol (0, ...) * @res: new socket * * A wrapper around __sock_create(). * Returns 0 or an error. This function internally uses GFP_KERNEL. */ int sock_create_kern(struct net *net, int family, int type, int protocol, struct socket **res) { return __sock_create(net, family, type, protocol, res, 1); } EXPORT_SYMBOL(sock_create_kern); static struct socket *__sys_socket_create(int family, int type, int protocol) { struct socket *sock; int retval; /* Check the SOCK_* constants for consistency. */ BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK); BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK); BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK); if ((type & ~SOCK_TYPE_MASK) & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return ERR_PTR(-EINVAL); type &= SOCK_TYPE_MASK; retval = sock_create(family, type, protocol, &sock); if (retval < 0) return ERR_PTR(retval); return sock; } struct file *__sys_socket_file(int family, int type, int protocol) { struct socket *sock; int flags; sock = __sys_socket_create(family, type, protocol); if (IS_ERR(sock)) return ERR_CAST(sock); flags = type & ~SOCK_TYPE_MASK; if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; return sock_alloc_file(sock, flags, NULL); } /* A hook for bpf progs to attach to and update socket protocol. * * A static noinline declaration here could cause the compiler to * optimize away the function. A global noinline declaration will * keep the definition, but may optimize away the callsite. * Therefore, __weak is needed to ensure that the call is still * emitted, by telling the compiler that we don't know what the * function might eventually be. */ __bpf_hook_start(); __weak noinline int update_socket_protocol(int family, int type, int protocol) { return protocol; } __bpf_hook_end(); int __sys_socket(int family, int type, int protocol) { struct socket *sock; int flags; sock = __sys_socket_create(family, type, update_socket_protocol(family, type, protocol)); if (IS_ERR(sock)) return PTR_ERR(sock); flags = type & ~SOCK_TYPE_MASK; if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; return sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); } SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol) { return __sys_socket(family, type, protocol); } /* * Create a pair of connected sockets. */ int __sys_socketpair(int family, int type, int protocol, int __user *usockvec) { struct socket *sock1, *sock2; int fd1, fd2, err; struct file *newfile1, *newfile2; int flags; flags = type & ~SOCK_TYPE_MASK; if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; type &= SOCK_TYPE_MASK; if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; /* * reserve descriptors and make sure we won't fail * to return them to userland. */ fd1 = get_unused_fd_flags(flags); if (unlikely(fd1 < 0)) return fd1; fd2 = get_unused_fd_flags(flags); if (unlikely(fd2 < 0)) { put_unused_fd(fd1); return fd2; } err = put_user(fd1, &usockvec[0]); if (err) goto out; err = put_user(fd2, &usockvec[1]); if (err) goto out; /* * Obtain the first socket and check if the underlying protocol * supports the socketpair call. */ err = sock_create(family, type, protocol, &sock1); if (unlikely(err < 0)) goto out; err = sock_create(family, type, protocol, &sock2); if (unlikely(err < 0)) { sock_release(sock1); goto out; } err = security_socket_socketpair(sock1, sock2); if (unlikely(err)) { sock_release(sock2); sock_release(sock1); goto out; } err = READ_ONCE(sock1->ops)->socketpair(sock1, sock2); if (unlikely(err < 0)) { sock_release(sock2); sock_release(sock1); goto out; } newfile1 = sock_alloc_file(sock1, flags, NULL); if (IS_ERR(newfile1)) { err = PTR_ERR(newfile1); sock_release(sock2); goto out; } newfile2 = sock_alloc_file(sock2, flags, NULL); if (IS_ERR(newfile2)) { err = PTR_ERR(newfile2); fput(newfile1); goto out; } audit_fd_pair(fd1, fd2); fd_install(fd1, newfile1); fd_install(fd2, newfile2); return 0; out: put_unused_fd(fd2); put_unused_fd(fd1); return err; } SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol, int __user *, usockvec) { return __sys_socketpair(family, type, protocol, usockvec); } int __sys_bind_socket(struct socket *sock, struct sockaddr_storage *address, int addrlen) { int err; err = security_socket_bind(sock, (struct sockaddr *)address, addrlen); if (!err) err = READ_ONCE(sock->ops)->bind(sock, (struct sockaddr_unsized *)address, addrlen); return err; } /* * Bind a name to a socket. Nothing much to do here since it's * the protocol's responsibility to handle the local address. * * We move the socket address to kernel space before we call * the protocol layer (having also checked the address is ok). */ int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen) { struct socket *sock; struct sockaddr_storage address; CLASS(fd, f)(fd); int err; if (fd_empty(f)) return -EBADF; sock = sock_from_file(fd_file(f)); if (unlikely(!sock)) return -ENOTSOCK; err = move_addr_to_kernel(umyaddr, addrlen, &address); if (unlikely(err)) return err; return __sys_bind_socket(sock, &address, addrlen); } SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen) { return __sys_bind(fd, umyaddr, addrlen); } /* * Perform a listen. Basically, we allow the protocol to do anything * necessary for a listen, and if that works, we mark the socket as * ready for listening. */ int __sys_listen_socket(struct socket *sock, int backlog) { int somaxconn, err; somaxconn = READ_ONCE(sock_net(sock->sk)->core.sysctl_somaxconn); if ((unsigned int)backlog > somaxconn) backlog = somaxconn; err = security_socket_listen(sock, backlog); if (!err) err = READ_ONCE(sock->ops)->listen(sock, backlog); return err; } int __sys_listen(int fd, int backlog) { CLASS(fd, f)(fd); struct socket *sock; if (fd_empty(f)) return -EBADF; sock = sock_from_file(fd_file(f)); if (unlikely(!sock)) return -ENOTSOCK; return __sys_listen_socket(sock, backlog); } SYSCALL_DEFINE2(listen, int, fd, int, backlog) { return __sys_listen(fd, backlog); } struct file *do_accept(struct file *file, struct proto_accept_arg *arg, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags) { struct socket *sock, *newsock; struct file *newfile; int err, len; struct sockaddr_storage address; const struct proto_ops *ops; sock = sock_from_file(file); if (!sock) return ERR_PTR(-ENOTSOCK); newsock = sock_alloc(); if (!newsock) return ERR_PTR(-ENFILE); ops = READ_ONCE(sock->ops); newsock->type = sock->type; newsock->ops = ops; /* * We don't need try_module_get here, as the listening socket (sock) * has the protocol module (sock->ops->owner) held. */ __module_get(ops->owner); newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name); if (IS_ERR(newfile)) return newfile; err = security_socket_accept(sock, newsock); if (err) goto out_fd; arg->flags |= sock->file->f_flags; err = ops->accept(sock, newsock, arg); if (err < 0) goto out_fd; if (upeer_sockaddr) { len = ops->getname(newsock, (struct sockaddr *)&address, 2); if (len < 0) { err = -ECONNABORTED; goto out_fd; } err = move_addr_to_user(&address, len, upeer_sockaddr, upeer_addrlen); if (err < 0) goto out_fd; } /* File flags are not inherited via accept() unlike another OSes. */ return newfile; out_fd: fput(newfile); return ERR_PTR(err); } static int __sys_accept4_file(struct file *file, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags) { struct proto_accept_arg arg = { }; if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; return FD_ADD(flags, do_accept(file, &arg, upeer_sockaddr, upeer_addrlen, flags)); } /* * For accept, we attempt to create a new socket, set up the link * with the client, wake up the client, then return the new * connected fd. We collect the address of the connector in kernel * space and move it to user at the very end. This is unclean because * we open the socket then return an error. * * 1003.1g adds the ability to recvmsg() to query connection pending * status to recvmsg. We need to add that support in a way thats * clean when we restructure accept also. */ int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags) { CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; return __sys_accept4_file(fd_file(f), upeer_sockaddr, upeer_addrlen, flags); } SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr, int __user *, upeer_addrlen, int, flags) { return __sys_accept4(fd, upeer_sockaddr, upeer_addrlen, flags); } SYSCALL_DEFINE3(accept, int, fd, struct sockaddr __user *, upeer_sockaddr, int __user *, upeer_addrlen) { return __sys_accept4(fd, upeer_sockaddr, upeer_addrlen, 0); } /* * Attempt to connect to a socket with the server address. The address * is in user space so we verify it is OK and move it to kernel space. * * For 1003.1g we need to add clean support for a bind to AF_UNSPEC to * break bindings * * NOTE: 1003.1g draft 6.3 is broken with respect to AX.25/NetROM and * other SEQPACKET protocols that take time to connect() as it doesn't * include the -EINPROGRESS status for such sockets. */ int __sys_connect_file(struct file *file, struct sockaddr_storage *address, int addrlen, int file_flags) { struct socket *sock; int err; sock = sock_from_file(file); if (!sock) { err = -ENOTSOCK; goto out; } err = security_socket_connect(sock, (struct sockaddr *)address, addrlen); if (err) goto out; err = READ_ONCE(sock->ops)->connect(sock, (struct sockaddr_unsized *)address, addrlen, sock->file->f_flags | file_flags); out: return err; } int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen) { struct sockaddr_storage address; CLASS(fd, f)(fd); int ret; if (fd_empty(f)) return -EBADF; ret = move_addr_to_kernel(uservaddr, addrlen, &address); if (ret) return ret; return __sys_connect_file(fd_file(f), &address, addrlen, 0); } SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr, int, addrlen) { return __sys_connect(fd, uservaddr, addrlen); } int do_getsockname(struct socket *sock, int peer, struct sockaddr __user *usockaddr, int __user *usockaddr_len) { struct sockaddr_storage address; int err; if (peer) err = security_socket_getpeername(sock); else err = security_socket_getsockname(sock); if (err) return err; err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, peer); if (err < 0) return err; /* "err" is actually length in this case */ return move_addr_to_user(&address, err, usockaddr, usockaddr_len); } /* * Get the remote or local address ('name') of a socket object. Move the * obtained name to user space. */ int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, int __user *usockaddr_len, int peer) { struct socket *sock; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; sock = sock_from_file(fd_file(f)); if (unlikely(!sock)) return -ENOTSOCK; return do_getsockname(sock, peer, usockaddr, usockaddr_len); } SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr, int __user *, usockaddr_len) { return __sys_getsockname(fd, usockaddr, usockaddr_len, 0); } SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr, int __user *, usockaddr_len) { return __sys_getsockname(fd, usockaddr, usockaddr_len, 1); } /* * Send a datagram to a given address. We move the address into kernel * space and check the user space data area is readable before invoking * the protocol. */ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, struct sockaddr __user *addr, int addr_len) { struct socket *sock; struct sockaddr_storage address; int err; struct msghdr msg; err = import_ubuf(ITER_SOURCE, buff, len, &msg.msg_iter); if (unlikely(err)) return err; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; sock = sock_from_file(fd_file(f)); if (unlikely(!sock)) return -ENOTSOCK; msg.msg_name = NULL; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_namelen = 0; msg.msg_ubuf = NULL; if (addr) { err = move_addr_to_kernel(addr, addr_len, &address); if (err < 0) return err; msg.msg_name = (struct sockaddr *)&address; msg.msg_namelen = addr_len; } flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; msg.msg_flags = flags; return __sock_sendmsg(sock, &msg); } SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len, unsigned int, flags, struct sockaddr __user *, addr, int, addr_len) { return __sys_sendto(fd, buff, len, flags, addr, addr_len); } /* * Send a datagram down a socket. */ SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len, unsigned int, flags) { return __sys_sendto(fd, buff, len, flags, NULL, 0); } /* * Receive a frame from the socket and optionally record the address of the * sender. We verify the buffers are writable and if needed move the * sender address from kernel to user space. */ int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags, struct sockaddr __user *addr, int __user *addr_len) { struct sockaddr_storage address; struct msghdr msg = { /* Save some cycles and don't copy the address if not needed */ .msg_name = addr ? (struct sockaddr *)&address : NULL, }; struct socket *sock; int err, err2; err = import_ubuf(ITER_DEST, ubuf, size, &msg.msg_iter); if (unlikely(err)) return err; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; sock = sock_from_file(fd_file(f)); if (unlikely(!sock)) return -ENOTSOCK; if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; err = sock_recvmsg(sock, &msg, flags); if (err >= 0 && addr != NULL) { err2 = move_addr_to_user(&address, msg.msg_namelen, addr, addr_len); if (err2 < 0) err = err2; } return err; } SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size, unsigned int, flags, struct sockaddr __user *, addr, int __user *, addr_len) { return __sys_recvfrom(fd, ubuf, size, flags, addr, addr_len); } /* * Receive a datagram from a socket. */ SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size, unsigned int, flags) { return __sys_recvfrom(fd, ubuf, size, flags, NULL, NULL); } static bool sock_use_custom_sol_socket(const struct socket *sock) { return test_bit(SOCK_CUSTOM_SOCKOPT, &sock->flags); } int do_sock_setsockopt(struct socket *sock, bool compat, int level, int optname, sockptr_t optval, int optlen) { const struct proto_ops *ops; char *kernel_optval = NULL; int err; if (optlen < 0) return -EINVAL; err = security_socket_setsockopt(sock, level, optname); if (err) goto out_put; if (!compat) err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, &optname, optval, &optlen, &kernel_optval); if (err < 0) goto out_put; if (err > 0) { err = 0; goto out_put; } if (kernel_optval) optval = KERNEL_SOCKPTR(kernel_optval); ops = READ_ONCE(sock->ops); if (level == SOL_SOCKET && !sock_use_custom_sol_socket(sock)) err = sock_setsockopt(sock, level, optname, optval, optlen); else if (unlikely(!ops->setsockopt)) err = -EOPNOTSUPP; else err = ops->setsockopt(sock, level, optname, optval, optlen); kfree(kernel_optval); out_put: return err; } EXPORT_SYMBOL(do_sock_setsockopt); /* Set a socket option. Because we don't know the option lengths we have * to pass the user mode parameter for the protocols to sort out. */ int __sys_setsockopt(int fd, int level, int optname, char __user *user_optval, int optlen) { sockptr_t optval = USER_SOCKPTR(user_optval); bool compat = in_compat_syscall(); struct socket *sock; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; sock = sock_from_file(fd_file(f)); if (unlikely(!sock)) return -ENOTSOCK; return do_sock_setsockopt(sock, compat, level, optname, optval, optlen); } SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, char __user *, optval, int, optlen) { return __sys_setsockopt(fd, level, optname, optval, optlen); } INDIRECT_CALLABLE_DECLARE(bool tcp_bpf_bypass_getsockopt(int level, int optname)); int do_sock_getsockopt(struct socket *sock, bool compat, int level, int optname, sockptr_t optval, sockptr_t optlen) { int max_optlen __maybe_unused = 0; const struct proto_ops *ops; int err; err = security_socket_getsockopt(sock, level, optname); if (err) return err; if (!compat) copy_from_sockptr(&max_optlen, optlen, sizeof(int)); ops = READ_ONCE(sock->ops); if (level == SOL_SOCKET) { err = sk_getsockopt(sock->sk, level, optname, optval, optlen); } else if (unlikely(!ops->getsockopt)) { err = -EOPNOTSUPP; } else { if (WARN_ONCE(optval.is_kernel || optlen.is_kernel, "Invalid argument type")) return -EOPNOTSUPP; err = ops->getsockopt(sock, level, optname, optval.user, optlen.user); } if (!compat) err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname, optval, optlen, max_optlen, err); return err; } EXPORT_SYMBOL(do_sock_getsockopt); /* * Get a socket option. Because we don't know the option lengths we have * to pass a user mode parameter for the protocols to sort out. */ int __sys_getsockopt(int fd, int level, int optname, char __user *optval, int __user *optlen) { struct socket *sock; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; sock = sock_from_file(fd_file(f)); if (unlikely(!sock)) return -ENOTSOCK; return do_sock_getsockopt(sock, in_compat_syscall(), level, optname, USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); } SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname, char __user *, optval, int __user *, optlen) { return __sys_getsockopt(fd, level, optname, optval, optlen); } /* * Shutdown a socket. */ int __sys_shutdown_sock(struct socket *sock, int how) { int err; err = security_socket_shutdown(sock, how); if (!err) err = READ_ONCE(sock->ops)->shutdown(sock, how); return err; } int __sys_shutdown(int fd, int how) { struct socket *sock; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; sock = sock_from_file(fd_file(f)); if (unlikely(!sock)) return -ENOTSOCK; return __sys_shutdown_sock(sock, how); } SYSCALL_DEFINE2(shutdown, int, fd, int, how) { return __sys_shutdown(fd, how); } /* A couple of helpful macros for getting the address of the 32/64 bit * fields which are the same type (int / unsigned) on our platforms. */ #define COMPAT_MSG(msg, member) ((MSG_CMSG_COMPAT & flags) ? &msg##_compat->member : &msg->member) #define COMPAT_NAMELEN(msg) COMPAT_MSG(msg, msg_namelen) #define COMPAT_FLAGS(msg) COMPAT_MSG(msg, msg_flags) struct used_address { struct sockaddr_storage name; unsigned int name_len; }; int __copy_msghdr(struct msghdr *kmsg, struct user_msghdr *msg, struct sockaddr __user **save_addr) { ssize_t err; kmsg->msg_control_is_user = true; kmsg->msg_get_inq = 0; kmsg->msg_control_user = msg->msg_control; kmsg->msg_controllen = msg->msg_controllen; kmsg->msg_flags = msg->msg_flags; kmsg->msg_namelen = msg->msg_namelen; if (!msg->msg_name) kmsg->msg_namelen = 0; if (kmsg->msg_namelen < 0) return -EINVAL; if (kmsg->msg_namelen > sizeof(struct sockaddr_storage)) kmsg->msg_namelen = sizeof(struct sockaddr_storage); if (save_addr) *save_addr = msg->msg_name; if (msg->msg_name && kmsg->msg_namelen) { if (!save_addr) { err = move_addr_to_kernel(msg->msg_name, kmsg->msg_namelen, kmsg->msg_name); if (err < 0) return err; } } else { kmsg->msg_name = NULL; kmsg->msg_namelen = 0; } if (msg->msg_iovlen > UIO_MAXIOV) return -EMSGSIZE; kmsg->msg_iocb = NULL; kmsg->msg_ubuf = NULL; return 0; } static int copy_msghdr_from_user(struct msghdr *kmsg, struct user_msghdr __user *umsg, struct sockaddr __user **save_addr, struct iovec **iov) { struct user_msghdr msg; ssize_t err; if (copy_from_user(&msg, umsg, sizeof(*umsg))) return -EFAULT; err = __copy_msghdr(kmsg, &msg, save_addr); if (err) return err; err = import_iovec(save_addr ? ITER_DEST : ITER_SOURCE, msg.msg_iov, msg.msg_iovlen, UIO_FASTIOV, iov, &kmsg->msg_iter); return err < 0 ? err : 0; } static int ____sys_sendmsg(struct socket *sock, struct msghdr *msg_sys, unsigned int flags, struct used_address *used_address, unsigned int allowed_msghdr_flags) { unsigned char ctl[sizeof(struct cmsghdr) + 20] __aligned(sizeof(__kernel_size_t)); /* 20 is size of ipv6_pktinfo */ unsigned char *ctl_buf = ctl; int ctl_len; ssize_t err; err = -ENOBUFS; if (msg_sys->msg_controllen > INT_MAX) goto out; flags |= (msg_sys->msg_flags & allowed_msghdr_flags); ctl_len = msg_sys->msg_controllen; if ((MSG_CMSG_COMPAT & flags) && ctl_len) { err = cmsghdr_from_user_compat_to_kern(msg_sys, sock->sk, ctl, sizeof(ctl)); if (err) goto out; ctl_buf = msg_sys->msg_control; ctl_len = msg_sys->msg_controllen; } else if (ctl_len) { BUILD_BUG_ON(sizeof(struct cmsghdr) != CMSG_ALIGN(sizeof(struct cmsghdr))); if (ctl_len > sizeof(ctl)) { ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL); if (ctl_buf == NULL) goto out; } err = -EFAULT; if (copy_from_user(ctl_buf, msg_sys->msg_control_user, ctl_len)) goto out_freectl; msg_sys->msg_control = ctl_buf; msg_sys->msg_control_is_user = false; } flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; msg_sys->msg_flags = flags; if (sock->file->f_flags & O_NONBLOCK) msg_sys->msg_flags |= MSG_DONTWAIT; /* * If this is sendmmsg() and current destination address is same as * previously succeeded address, omit asking LSM's decision. * used_address->name_len is initialized to UINT_MAX so that the first * destination address never matches. */ if (used_address && msg_sys->msg_name && used_address->name_len == msg_sys->msg_namelen && !memcmp(&used_address->name, msg_sys->msg_name, used_address->name_len)) { err = sock_sendmsg_nosec(sock, msg_sys); goto out_freectl; } err = __sock_sendmsg(sock, msg_sys); /* * If this is sendmmsg() and sending to current destination address was * successful, remember it. */ if (used_address && err >= 0) { used_address->name_len = msg_sys->msg_namelen; if (msg_sys->msg_name) memcpy(&used_address->name, msg_sys->msg_name, used_address->name_len); } out_freectl: if (ctl_buf != ctl) sock_kfree_s(sock->sk, ctl_buf, ctl_len); out: return err; } static int sendmsg_copy_msghdr(struct msghdr *msg, struct user_msghdr __user *umsg, unsigned flags, struct iovec **iov) { int err; if (flags & MSG_CMSG_COMPAT) { struct compat_msghdr __user *msg_compat; msg_compat = (struct compat_msghdr __user *) umsg; err = get_compat_msghdr(msg, msg_compat, NULL, iov); } else { err = copy_msghdr_from_user(msg, umsg, NULL, iov); } if (err < 0) return err; return 0; } static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, struct msghdr *msg_sys, unsigned int flags, struct used_address *used_address, unsigned int allowed_msghdr_flags) { struct sockaddr_storage address; struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; ssize_t err; msg_sys->msg_name = &address; err = sendmsg_copy_msghdr(msg_sys, msg, flags, &iov); if (err < 0) return err; err = ____sys_sendmsg(sock, msg_sys, flags, used_address, allowed_msghdr_flags); kfree(iov); return err; } /* * BSD sendmsg interface */ long __sys_sendmsg_sock(struct socket *sock, struct msghdr *msg, unsigned int flags) { return ____sys_sendmsg(sock, msg, flags, NULL, 0); } long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, bool forbid_cmsg_compat) { struct msghdr msg_sys; struct socket *sock; if (forbid_cmsg_compat && (flags & MSG_CMSG_COMPAT)) return -EINVAL; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; sock = sock_from_file(fd_file(f)); if (unlikely(!sock)) return -ENOTSOCK; return ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL, 0); } SYSCALL_DEFINE3(sendmsg, int, fd, struct user_msghdr __user *, msg, unsigned int, flags) { return __sys_sendmsg(fd, msg, flags, true); } /* * Linux sendmmsg interface */ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, bool forbid_cmsg_compat) { int err, datagrams; struct socket *sock; struct mmsghdr __user *entry; struct compat_mmsghdr __user *compat_entry; struct msghdr msg_sys; struct used_address used_address; unsigned int oflags = flags; if (forbid_cmsg_compat && (flags & MSG_CMSG_COMPAT)) return -EINVAL; if (vlen > UIO_MAXIOV) vlen = UIO_MAXIOV; datagrams = 0; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; sock = sock_from_file(fd_file(f)); if (unlikely(!sock)) return -ENOTSOCK; used_address.name_len = UINT_MAX; entry = mmsg; compat_entry = (struct compat_mmsghdr __user *)mmsg; err = 0; flags |= MSG_BATCH; while (datagrams < vlen) { if (datagrams == vlen - 1) flags = oflags; if (MSG_CMSG_COMPAT & flags) { err = ___sys_sendmsg(sock, (struct user_msghdr __user *)compat_entry, &msg_sys, flags, &used_address, MSG_EOR); if (err < 0) break; err = __put_user(err, &compat_entry->msg_len); ++compat_entry; } else { err = ___sys_sendmsg(sock, (struct user_msghdr __user *)entry, &msg_sys, flags, &used_address, MSG_EOR); if (err < 0) break; err = put_user(err, &entry->msg_len); ++entry; } if (err) break; ++datagrams; if (msg_data_left(&msg_sys)) break; cond_resched(); } /* We only return an error if no datagrams were able to be sent */ if (datagrams != 0) return datagrams; return err; } SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags) { return __sys_sendmmsg(fd, mmsg, vlen, flags, true); } static int recvmsg_copy_msghdr(struct msghdr *msg, struct user_msghdr __user *umsg, unsigned flags, struct sockaddr __user **uaddr, struct iovec **iov) { ssize_t err; if (MSG_CMSG_COMPAT & flags) { struct compat_msghdr __user *msg_compat; msg_compat = (struct compat_msghdr __user *) umsg; err = get_compat_msghdr(msg, msg_compat, uaddr, iov); } else { err = copy_msghdr_from_user(msg, umsg, uaddr, iov); } if (err < 0) return err; return 0; } static int ____sys_recvmsg(struct socket *sock, struct msghdr *msg_sys, struct user_msghdr __user *msg, struct sockaddr __user *uaddr, unsigned int flags, int nosec) { struct compat_msghdr __user *msg_compat = (struct compat_msghdr __user *) msg; int __user *uaddr_len = COMPAT_NAMELEN(msg); struct sockaddr_storage addr; unsigned long cmsg_ptr; int len; ssize_t err; msg_sys->msg_name = &addr; cmsg_ptr = (unsigned long)msg_sys->msg_control; msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT); /* We assume all kernel code knows the size of sockaddr_storage */ msg_sys->msg_namelen = 0; if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; if (unlikely(nosec)) err = sock_recvmsg_nosec(sock, msg_sys, flags); else err = sock_recvmsg(sock, msg_sys, flags); if (err < 0) goto out; len = err; if (uaddr != NULL) { err = move_addr_to_user(&addr, msg_sys->msg_namelen, uaddr, uaddr_len); if (err < 0) goto out; } err = __put_user((msg_sys->msg_flags & ~MSG_CMSG_COMPAT), COMPAT_FLAGS(msg)); if (err) goto out; if (MSG_CMSG_COMPAT & flags) err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr, &msg_compat->msg_controllen); else err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr, &msg->msg_controllen); if (err) goto out; err = len; out: return err; } static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg, struct msghdr *msg_sys, unsigned int flags, int nosec) { struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; /* user mode address pointers */ struct sockaddr __user *uaddr; ssize_t err; err = recvmsg_copy_msghdr(msg_sys, msg, flags, &uaddr, &iov); if (err < 0) return err; err = ____sys_recvmsg(sock, msg_sys, msg, uaddr, flags, nosec); kfree(iov); return err; } /* * BSD recvmsg interface */ long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg, struct user_msghdr __user *umsg, struct sockaddr __user *uaddr, unsigned int flags) { return ____sys_recvmsg(sock, msg, umsg, uaddr, flags, 0); } long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, bool forbid_cmsg_compat) { struct msghdr msg_sys; struct socket *sock; if (forbid_cmsg_compat && (flags & MSG_CMSG_COMPAT)) return -EINVAL; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; sock = sock_from_file(fd_file(f)); if (unlikely(!sock)) return -ENOTSOCK; return ___sys_recvmsg(sock, msg, &msg_sys, flags, 0); } SYSCALL_DEFINE3(recvmsg, int, fd, struct user_msghdr __user *, msg, unsigned int, flags) { return __sys_recvmsg(fd, msg, flags, true); } /* * Linux recvmmsg interface */ static int do_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, struct timespec64 *timeout) { int err = 0, datagrams; struct socket *sock; struct mmsghdr __user *entry; struct compat_mmsghdr __user *compat_entry; struct msghdr msg_sys; struct timespec64 end_time; struct timespec64 timeout64; if (timeout && poll_select_set_timeout(&end_time, timeout->tv_sec, timeout->tv_nsec)) return -EINVAL; datagrams = 0; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; sock = sock_from_file(fd_file(f)); if (unlikely(!sock)) return -ENOTSOCK; if (likely(!(flags & MSG_ERRQUEUE))) { err = sock_error(sock->sk); if (err) return err; } entry = mmsg; compat_entry = (struct compat_mmsghdr __user *)mmsg; while (datagrams < vlen) { /* * No need to ask LSM for more than the first datagram. */ if (MSG_CMSG_COMPAT & flags) { err = ___sys_recvmsg(sock, (struct user_msghdr __user *)compat_entry, &msg_sys, flags & ~MSG_WAITFORONE, datagrams); if (err < 0) break; err = __put_user(err, &compat_entry->msg_len); ++compat_entry; } else { err = ___sys_recvmsg(sock, (struct user_msghdr __user *)entry, &msg_sys, flags & ~MSG_WAITFORONE, datagrams); if (err < 0) break; err = put_user(err, &entry->msg_len); ++entry; } if (err) break; ++datagrams; /* MSG_WAITFORONE turns on MSG_DONTWAIT after one packet */ if (flags & MSG_WAITFORONE) flags |= MSG_DONTWAIT; if (timeout) { ktime_get_ts64(&timeout64); *timeout = timespec64_sub(end_time, timeout64); if (timeout->tv_sec < 0) { timeout->tv_sec = timeout->tv_nsec = 0; break; } /* Timeout, return less than vlen datagrams */ if (timeout->tv_nsec == 0 && timeout->tv_sec == 0) break; } /* Out of band data, return right away */ if (msg_sys.msg_flags & MSG_OOB) break; cond_resched(); } if (err == 0) return datagrams; if (datagrams == 0) return err; /* * We may return less entries than requested (vlen) if the * sock is non block and there aren't enough datagrams... */ if (err != -EAGAIN) { /* * ... or if recvmsg returns an error after we * received some datagrams, where we record the * error to return on the next call or if the * app asks about it using getsockopt(SO_ERROR). */ WRITE_ONCE(sock->sk->sk_err, -err); } return datagrams; } int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, struct __kernel_timespec __user *timeout, struct old_timespec32 __user *timeout32) { int datagrams; struct timespec64 timeout_sys; if (timeout && get_timespec64(&timeout_sys, timeout)) return -EFAULT; if (timeout32 && get_old_timespec32(&timeout_sys, timeout32)) return -EFAULT; if (!timeout && !timeout32) return do_recvmmsg(fd, mmsg, vlen, flags, NULL); datagrams = do_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys); if (datagrams <= 0) return datagrams; if (timeout && put_timespec64(&timeout_sys, timeout)) datagrams = -EFAULT; if (timeout32 && put_old_timespec32(&timeout_sys, timeout32)) datagrams = -EFAULT; return datagrams; } SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags, struct __kernel_timespec __user *, timeout) { if (flags & MSG_CMSG_COMPAT) return -EINVAL; return __sys_recvmmsg(fd, mmsg, vlen, flags, timeout, NULL); } #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE5(recvmmsg_time32, int, fd, struct mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags, struct old_timespec32 __user *, timeout) { if (flags & MSG_CMSG_COMPAT) return -EINVAL; return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL, timeout); } #endif #ifdef __ARCH_WANT_SYS_SOCKETCALL /* Argument list sizes for sys_socketcall */ #define AL(x) ((x) * sizeof(unsigned long)) static const unsigned char nargs[21] = { AL(0), AL(3), AL(3), AL(3), AL(2), AL(3), AL(3), AL(3), AL(4), AL(4), AL(4), AL(6), AL(6), AL(2), AL(5), AL(5), AL(3), AL(3), AL(4), AL(5), AL(4) }; #undef AL /* * System call vectors. * * Argument checking cleaned up. Saved 20% in size. * This function doesn't need to set the kernel lock because * it is set by the callees. */ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) { unsigned long a[AUDITSC_ARGS]; unsigned long a0, a1; int err; unsigned int len; if (call < 1 || call > SYS_SENDMMSG) return -EINVAL; call = array_index_nospec(call, SYS_SENDMMSG + 1); len = nargs[call]; if (len > sizeof(a)) return -EINVAL; /* copy_from_user should be SMP safe. */ if (copy_from_user(a, args, len)) return -EFAULT; err = audit_socketcall(nargs[call] / sizeof(unsigned long), a); if (err) return err; a0 = a[0]; a1 = a[1]; switch (call) { case SYS_SOCKET: err = __sys_socket(a0, a1, a[2]); break; case SYS_BIND: err = __sys_bind(a0, (struct sockaddr __user *)a1, a[2]); break; case SYS_CONNECT: err = __sys_connect(a0, (struct sockaddr __user *)a1, a[2]); break; case SYS_LISTEN: err = __sys_listen(a0, a1); break; case SYS_ACCEPT: err = __sys_accept4(a0, (struct sockaddr __user *)a1, (int __user *)a[2], 0); break; case SYS_GETSOCKNAME: err = __sys_getsockname(a0, (struct sockaddr __user *)a1, (int __user *)a[2], 0); break; case SYS_GETPEERNAME: err = __sys_getsockname(a0, (struct sockaddr __user *)a1, (int __user *)a[2], 1); break; case SYS_SOCKETPAIR: err = __sys_socketpair(a0, a1, a[2], (int __user *)a[3]); break; case SYS_SEND: err = __sys_sendto(a0, (void __user *)a1, a[2], a[3], NULL, 0); break; case SYS_SENDTO: err = __sys_sendto(a0, (void __user *)a1, a[2], a[3], (struct sockaddr __user *)a[4], a[5]); break; case SYS_RECV: err = __sys_recvfrom(a0, (void __user *)a1, a[2], a[3], NULL, NULL); break; case SYS_RECVFROM: err = __sys_recvfrom(a0, (void __user *)a1, a[2], a[3], (struct sockaddr __user *)a[4], (int __user *)a[5]); break; case SYS_SHUTDOWN: err = __sys_shutdown(a0, a1); break; case SYS_SETSOCKOPT: err = __sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]); break; case SYS_GETSOCKOPT: err = __sys_getsockopt(a0, a1, a[2], (char __user *)a[3], (int __user *)a[4]); break; case SYS_SENDMSG: err = __sys_sendmsg(a0, (struct user_msghdr __user *)a1, a[2], true); break; case SYS_SENDMMSG: err = __sys_sendmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3], true); break; case SYS_RECVMSG: err = __sys_recvmsg(a0, (struct user_msghdr __user *)a1, a[2], true); break; case SYS_RECVMMSG: if (IS_ENABLED(CONFIG_64BIT)) err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3], (struct __kernel_timespec __user *)a[4], NULL); else err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3], NULL, (struct old_timespec32 __user *)a[4]); break; case SYS_ACCEPT4: err = __sys_accept4(a0, (struct sockaddr __user *)a1, (int __user *)a[2], a[3]); break; default: err = -EINVAL; break; } return err; } #endif /* __ARCH_WANT_SYS_SOCKETCALL */ /** * sock_register - add a socket protocol handler * @ops: description of protocol * * This function is called by a protocol handler that wants to * advertise its address family, and have it linked into the * socket interface. The value ops->family corresponds to the * socket system call protocol family. */ int sock_register(const struct net_proto_family *ops) { int err; if (ops->family >= NPROTO) { pr_crit("protocol %d >= NPROTO(%d)\n", ops->family, NPROTO); return -ENOBUFS; } spin_lock(&net_family_lock); if (rcu_dereference_protected(net_families[ops->family], lockdep_is_held(&net_family_lock))) err = -EEXIST; else { rcu_assign_pointer(net_families[ops->family], ops); err = 0; } spin_unlock(&net_family_lock); pr_info("NET: Registered %s protocol family\n", pf_family_names[ops->family]); return err; } EXPORT_SYMBOL(sock_register); /** * sock_unregister - remove a protocol handler * @family: protocol family to remove * * This function is called by a protocol handler that wants to * remove its address family, and have it unlinked from the * new socket creation. * * If protocol handler is a module, then it can use module reference * counts to protect against new references. If protocol handler is not * a module then it needs to provide its own protection in * the ops->create routine. */ void sock_unregister(int family) { BUG_ON(family < 0 || family >= NPROTO); spin_lock(&net_family_lock); RCU_INIT_POINTER(net_families[family], NULL); spin_unlock(&net_family_lock); synchronize_rcu(); pr_info("NET: Unregistered %s protocol family\n", pf_family_names[family]); } EXPORT_SYMBOL(sock_unregister); bool sock_is_registered(int family) { return family < NPROTO && rcu_access_pointer(net_families[family]); } static int __init sock_init(void) { int err; /* * Initialize the network sysctl infrastructure. */ err = net_sysctl_init(); if (err) goto out; /* * Initialize skbuff SLAB cache */ skb_init(); /* * Initialize the protocols module. */ init_inodecache(); err = register_filesystem(&sock_fs_type); if (err) goto out; sock_mnt = kern_mount(&sock_fs_type); if (IS_ERR(sock_mnt)) { err = PTR_ERR(sock_mnt); goto out_mount; } /* The real protocol initialization is performed in later initcalls. */ #ifdef CONFIG_NETFILTER err = netfilter_init(); if (err) goto out; #endif ptp_classifier_init(); out: return err; out_mount: unregister_filesystem(&sock_fs_type); goto out; } core_initcall(sock_init); /* early initcall */ #ifdef CONFIG_PROC_FS void socket_seq_show(struct seq_file *seq) { seq_printf(seq, "sockets: used %d\n", sock_inuse_get(seq->private)); } #endif /* CONFIG_PROC_FS */ /* Handle the fact that while struct ifreq has the same *layout* on * 32/64 for everything but ifreq::ifru_ifmap and ifreq::ifru_data, * which are handled elsewhere, it still has different *size* due to * ifreq::ifru_ifmap (which is 16 bytes on 32 bit, 24 bytes on 64-bit, * resulting in struct ifreq being 32 and 40 bytes respectively). * As a result, if the struct happens to be at the end of a page and * the next page isn't readable/writable, we get a fault. To prevent * that, copy back and forth to the full size. */ int get_user_ifreq(struct ifreq *ifr, void __user **ifrdata, void __user *arg) { if (in_compat_syscall()) { struct compat_ifreq *ifr32 = (struct compat_ifreq *)ifr; memset(ifr, 0, sizeof(*ifr)); if (copy_from_user(ifr32, arg, sizeof(*ifr32))) return -EFAULT; if (ifrdata) *ifrdata = compat_ptr(ifr32->ifr_data); return 0; } if (copy_from_user(ifr, arg, sizeof(*ifr))) return -EFAULT; if (ifrdata) *ifrdata = ifr->ifr_data; return 0; } EXPORT_SYMBOL(get_user_ifreq); int put_user_ifreq(struct ifreq *ifr, void __user *arg) { size_t size = sizeof(*ifr); if (in_compat_syscall()) size = sizeof(struct compat_ifreq); if (copy_to_user(arg, ifr, size)) return -EFAULT; return 0; } EXPORT_SYMBOL(put_user_ifreq); #ifdef CONFIG_COMPAT static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32) { compat_uptr_t uptr32; struct ifreq ifr; void __user *saved; int err; if (get_user_ifreq(&ifr, NULL, uifr32)) return -EFAULT; if (get_user(uptr32, &uifr32->ifr_settings.ifs_ifsu)) return -EFAULT; saved = ifr.ifr_settings.ifs_ifsu.raw_hdlc; ifr.ifr_settings.ifs_ifsu.raw_hdlc = compat_ptr(uptr32); err = dev_ioctl(net, SIOCWANDEV, &ifr, NULL, NULL); if (!err) { ifr.ifr_settings.ifs_ifsu.raw_hdlc = saved; if (put_user_ifreq(&ifr, uifr32)) err = -EFAULT; } return err; } /* Handle ioctls that use ifreq::ifr_data and just need struct ifreq converted */ static int compat_ifr_data_ioctl(struct net *net, unsigned int cmd, struct compat_ifreq __user *u_ifreq32) { struct ifreq ifreq; void __user *data; if (!is_socket_ioctl_cmd(cmd)) return -ENOTTY; if (get_user_ifreq(&ifreq, &data, u_ifreq32)) return -EFAULT; ifreq.ifr_data = data; return dev_ioctl(net, cmd, &ifreq, data, NULL); } static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = compat_ptr(arg); struct sock *sk = sock->sk; struct net *net = sock_net(sk); const struct proto_ops *ops; if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) return sock_ioctl(file, cmd, (unsigned long)argp); switch (cmd) { case SIOCWANDEV: return compat_siocwandev(net, argp); case SIOCGSTAMP_OLD: case SIOCGSTAMPNS_OLD: ops = READ_ONCE(sock->ops); if (!ops->gettstamp) return -ENOIOCTLCMD; return ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_OLD, !COMPAT_USE_64BIT_TIME); case SIOCETHTOOL: case SIOCBONDSLAVEINFOQUERY: case SIOCBONDINFOQUERY: case SIOCSHWTSTAMP: case SIOCGHWTSTAMP: return compat_ifr_data_ioctl(net, cmd, argp); case FIOSETOWN: case SIOCSPGRP: case FIOGETOWN: case SIOCGPGRP: case SIOCBRADDBR: case SIOCBRDELBR: case SIOCBRADDIF: case SIOCBRDELIF: case SIOCGIFVLAN: case SIOCSIFVLAN: case SIOCGSKNS: case SIOCGSTAMP_NEW: case SIOCGSTAMPNS_NEW: case SIOCGIFCONF: case SIOCSIFBR: case SIOCGIFBR: return sock_ioctl(file, cmd, arg); case SIOCGIFFLAGS: case SIOCSIFFLAGS: case SIOCGIFMAP: case SIOCSIFMAP: case SIOCGIFMETRIC: case SIOCSIFMETRIC: case SIOCGIFMTU: case SIOCSIFMTU: case SIOCGIFMEM: case SIOCSIFMEM: case SIOCGIFHWADDR: case SIOCSIFHWADDR: case SIOCADDMULTI: case SIOCDELMULTI: case SIOCGIFINDEX: case SIOCGIFADDR: case SIOCSIFADDR: case SIOCSIFHWBROADCAST: case SIOCDIFADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCSIFPFLAGS: case SIOCGIFPFLAGS: case SIOCGIFTXQLEN: case SIOCSIFTXQLEN: case SIOCGIFNAME: case SIOCSIFNAME: case SIOCGMIIPHY: case SIOCGMIIREG: case SIOCSMIIREG: case SIOCBONDENSLAVE: case SIOCBONDRELEASE: case SIOCBONDSETHWADDR: case SIOCBONDCHANGEACTIVE: case SIOCSARP: case SIOCGARP: case SIOCDARP: case SIOCOUTQ: case SIOCOUTQNSD: case SIOCATMARK: return sock_do_ioctl(net, sock, cmd, arg); } return -ENOIOCTLCMD; } static long compat_sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct socket *sock = file->private_data; const struct proto_ops *ops = READ_ONCE(sock->ops); int ret = -ENOIOCTLCMD; struct sock *sk; struct net *net; sk = sock->sk; net = sock_net(sk); if (ops->compat_ioctl) ret = ops->compat_ioctl(sock, cmd, arg); if (ret == -ENOIOCTLCMD && (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)) ret = compat_wext_handle_ioctl(net, cmd, arg); if (ret == -ENOIOCTLCMD) ret = compat_sock_ioctl_trans(file, sock, cmd, arg); return ret; } #endif /** * kernel_bind - bind an address to a socket (kernel space) * @sock: socket * @addr: address * @addrlen: length of address * * Returns 0 or an error. */ int kernel_bind(struct socket *sock, struct sockaddr_unsized *addr, int addrlen) { struct sockaddr_storage address; memcpy(&address, addr, addrlen); return READ_ONCE(sock->ops)->bind(sock, (struct sockaddr_unsized *)&address, addrlen); } EXPORT_SYMBOL(kernel_bind); /** * kernel_listen - move socket to listening state (kernel space) * @sock: socket * @backlog: pending connections queue size * * Returns 0 or an error. */ int kernel_listen(struct socket *sock, int backlog) { return READ_ONCE(sock->ops)->listen(sock, backlog); } EXPORT_SYMBOL(kernel_listen); /** * kernel_accept - accept a connection (kernel space) * @sock: listening socket * @newsock: new connected socket * @flags: flags * * @flags must be SOCK_CLOEXEC, SOCK_NONBLOCK or 0. * If it fails, @newsock is guaranteed to be %NULL. * Returns 0 or an error. */ int kernel_accept(struct socket *sock, struct socket **newsock, int flags) { struct sock *sk = sock->sk; const struct proto_ops *ops = READ_ONCE(sock->ops); struct proto_accept_arg arg = { .flags = flags, .kern = true, }; int err; err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol, newsock); if (err < 0) goto done; err = ops->accept(sock, *newsock, &arg); if (err < 0) { sock_release(*newsock); *newsock = NULL; goto done; } (*newsock)->ops = ops; __module_get(ops->owner); done: return err; } EXPORT_SYMBOL(kernel_accept); /** * kernel_connect - connect a socket (kernel space) * @sock: socket * @addr: address * @addrlen: address length * @flags: flags (O_NONBLOCK, ...) * * For datagram sockets, @addr is the address to which datagrams are sent * by default, and the only address from which datagrams are received. * For stream sockets, attempts to connect to @addr. * Returns 0 or an error code. */ int kernel_connect(struct socket *sock, struct sockaddr_unsized *addr, int addrlen, int flags) { struct sockaddr_storage address; memcpy(&address, addr, addrlen); return READ_ONCE(sock->ops)->connect(sock, (struct sockaddr_unsized *)&address, addrlen, flags); } EXPORT_SYMBOL(kernel_connect); /** * kernel_getsockname - get the address which the socket is bound (kernel space) * @sock: socket * @addr: address holder * * Fills the @addr pointer with the address which the socket is bound. * Returns the length of the address in bytes or an error code. */ int kernel_getsockname(struct socket *sock, struct sockaddr *addr) { return READ_ONCE(sock->ops)->getname(sock, addr, 0); } EXPORT_SYMBOL(kernel_getsockname); /** * kernel_getpeername - get the address which the socket is connected (kernel space) * @sock: socket * @addr: address holder * * Fills the @addr pointer with the address which the socket is connected. * Returns the length of the address in bytes or an error code. */ int kernel_getpeername(struct socket *sock, struct sockaddr *addr) { return READ_ONCE(sock->ops)->getname(sock, addr, 1); } EXPORT_SYMBOL(kernel_getpeername); /** * kernel_sock_shutdown - shut down part of a full-duplex connection (kernel space) * @sock: socket * @how: connection part * * Returns 0 or an error. */ int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how) { return READ_ONCE(sock->ops)->shutdown(sock, how); } EXPORT_SYMBOL(kernel_sock_shutdown); /** * kernel_sock_ip_overhead - returns the IP overhead imposed by a socket * @sk: socket * * This routine returns the IP overhead imposed by a socket i.e. * the length of the underlying IP header, depending on whether * this is an IPv4 or IPv6 socket and the length from IP options turned * on at the socket. Assumes that the caller has a lock on the socket. */ u32 kernel_sock_ip_overhead(struct sock *sk) { struct inet_sock *inet; struct ip_options_rcu *opt; u32 overhead = 0; #if IS_ENABLED(CONFIG_IPV6) struct ipv6_pinfo *np; struct ipv6_txoptions *optv6 = NULL; #endif /* IS_ENABLED(CONFIG_IPV6) */ if (!sk) return overhead; switch (sk->sk_family) { case AF_INET: inet = inet_sk(sk); overhead += sizeof(struct iphdr); opt = rcu_dereference_protected(inet->inet_opt, sock_owned_by_user(sk)); if (opt) overhead += opt->opt.optlen; return overhead; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: np = inet6_sk(sk); overhead += sizeof(struct ipv6hdr); if (np) optv6 = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); if (optv6) overhead += (optv6->opt_flen + optv6->opt_nflen); return overhead; #endif /* IS_ENABLED(CONFIG_IPV6) */ default: /* Returns 0 overhead if the socket is not ipv4 or ipv6 */ return overhead; } } EXPORT_SYMBOL(kernel_sock_ip_overhead);
1276 27 737 737 147 123 208 1 54 55 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 /* SPDX-License-Identifier: GPL-2.0-only */ /* * net busy poll support * Copyright(c) 2013 Intel Corporation. * * Author: Eliezer Tamir * * Contact Information: * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net> */ #ifndef _LINUX_NET_BUSY_POLL_H #define _LINUX_NET_BUSY_POLL_H #include <linux/netdevice.h> #include <linux/sched/clock.h> #include <linux/sched/signal.h> #include <net/ip.h> #include <net/xdp.h> /* 0 - Reserved to indicate value not set * 1..NR_CPUS - Reserved for sender_cpu * NR_CPUS+1..~0 - Region available for NAPI IDs */ #define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1)) static inline bool napi_id_valid(unsigned int napi_id) { return napi_id >= MIN_NAPI_ID; } #define BUSY_POLL_BUDGET 8 #ifdef CONFIG_NET_RX_BUSY_POLL struct napi_struct; extern unsigned int sysctl_net_busy_read __read_mostly; extern unsigned int sysctl_net_busy_poll __read_mostly; static inline bool net_busy_loop_on(void) { return READ_ONCE(sysctl_net_busy_poll); } static inline bool sk_can_busy_loop(const struct sock *sk) { return READ_ONCE(sk->sk_ll_usec) && !signal_pending(current); } bool sk_busy_loop_end(void *p, unsigned long start_time); void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget); void napi_busy_loop_rcu(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget); void napi_suspend_irqs(unsigned int napi_id); void napi_resume_irqs(unsigned int napi_id); #else /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long net_busy_loop_on(void) { return 0; } static inline bool sk_can_busy_loop(struct sock *sk) { return false; } #endif /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long busy_loop_current_time(void) { #ifdef CONFIG_NET_RX_BUSY_POLL return (unsigned long)(ktime_get_ns() >> 10); #else return 0; #endif } /* in poll/select we use the global sysctl_net_ll_poll value */ static inline bool busy_loop_timeout(unsigned long start_time) { #ifdef CONFIG_NET_RX_BUSY_POLL unsigned long bp_usec = READ_ONCE(sysctl_net_busy_poll); if (bp_usec) { unsigned long end_time = start_time + bp_usec; unsigned long now = busy_loop_current_time(); return time_after(now, end_time); } #endif return true; } static inline bool sk_busy_loop_timeout(struct sock *sk, unsigned long start_time) { #ifdef CONFIG_NET_RX_BUSY_POLL unsigned long bp_usec = READ_ONCE(sk->sk_ll_usec); if (bp_usec) { unsigned long end_time = start_time + bp_usec; unsigned long now = busy_loop_current_time(); return time_after(now, end_time); } #endif return true; } static inline void sk_busy_loop(struct sock *sk, int nonblock) { #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int napi_id = READ_ONCE(sk->sk_napi_id); if (napi_id_valid(napi_id)) napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk, READ_ONCE(sk->sk_prefer_busy_poll), READ_ONCE(sk->sk_busy_poll_budget) ?: BUSY_POLL_BUDGET); #endif } /* used in the NIC receive handler to mark the skb */ static inline void __skb_mark_napi_id(struct sk_buff *skb, const struct gro_node *gro) { #ifdef CONFIG_NET_RX_BUSY_POLL /* If the skb was already marked with a valid NAPI ID, avoid overwriting * it. */ if (!napi_id_valid(skb->napi_id)) skb->napi_id = gro->cached_napi_id; #endif } static inline void skb_mark_napi_id(struct sk_buff *skb, const struct napi_struct *napi) { __skb_mark_napi_id(skb, &napi->gro); } /* used in the protocol handler to propagate the napi_id to the socket */ static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL if (unlikely(READ_ONCE(sk->sk_napi_id) != skb->napi_id)) WRITE_ONCE(sk->sk_napi_id, skb->napi_id); #endif sk_rx_queue_update(sk, skb); } /* Variant of sk_mark_napi_id() for passive flow setup, * as sk->sk_napi_id and sk->sk_rx_queue_mapping content * needs to be set. */ static inline void sk_mark_napi_id_set(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL WRITE_ONCE(sk->sk_napi_id, skb->napi_id); #endif sk_rx_queue_set(sk, skb); } static inline void __sk_mark_napi_id_once(struct sock *sk, unsigned int napi_id) { #ifdef CONFIG_NET_RX_BUSY_POLL if (!READ_ONCE(sk->sk_napi_id)) WRITE_ONCE(sk->sk_napi_id, napi_id); #endif } /* variant used for unconnected sockets */ static inline void sk_mark_napi_id_once(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL __sk_mark_napi_id_once(sk, skb->napi_id); #endif } #endif /* _LINUX_NET_BUSY_POLL_H */
1 9 9 21 16 27 14 9 9 9 8 1 7 2 5 62 60 80 123 161 2 2 3 1 2 1 48 48 9 48 40 47 15 48 48 10 47 48 3 42 17 26 46 48 60 45 12 57 53 27 27 3 21 21 2 2 2 2 2 12 12 8 4 12 12 12 12 6 6 6 5 1 5 6 2 8 4 10 4 11 11 1 9 9 60 62 3 57 20 20 20 20 9 20 20 20 5 5 20 31 31 31 30 12 15 27 31 18 36 8 1 2 6 3 3 4 3 1 49 51 7 49 48 48 49 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 // SPDX-License-Identifier: GPL-2.0-or-later /* * ALSA sequencer Timing queue handling * Copyright (c) 1998-1999 by Frank van de Pol <fvdpol@coil.demon.nl> * * MAJOR CHANGES * Nov. 13, 1999 Takashi Iwai <iwai@ww.uni-erlangen.de> * - Queues are allocated dynamically via ioctl. * - When owner client is deleted, all owned queues are deleted, too. * - Owner of unlocked queue is kept unmodified even if it is * manipulated by other clients. * - Owner field in SET_QUEUE_OWNER ioctl must be identical with the * caller client. i.e. Changing owner to a third client is not * allowed. * * Aug. 30, 2000 Takashi Iwai * - Queues are managed in static array again, but with better way. * The API itself is identical. * - The queue is locked when struct snd_seq_queue pointer is returned via * queueptr(). This pointer *MUST* be released afterward by * queuefree(ptr). * - Addition of experimental sync support. */ #include <linux/init.h> #include <linux/slab.h> #include <sound/core.h> #include "seq_memory.h" #include "seq_queue.h" #include "seq_clientmgr.h" #include "seq_fifo.h" #include "seq_timer.h" #include "seq_info.h" /* list of allocated queues */ static struct snd_seq_queue *queue_list[SNDRV_SEQ_MAX_QUEUES]; static DEFINE_SPINLOCK(queue_list_lock); /* number of queues allocated */ static int num_queues; int snd_seq_queue_get_cur_queues(void) { return num_queues; } /*----------------------------------------------------------------*/ /* assign queue id and insert to list */ static int queue_list_add(struct snd_seq_queue *q) { int i; guard(spinlock_irqsave)(&queue_list_lock); for (i = 0; i < SNDRV_SEQ_MAX_QUEUES; i++) { if (! queue_list[i]) { queue_list[i] = q; q->queue = i; num_queues++; return i; } } return -1; } static struct snd_seq_queue *queue_list_remove(int id, int client) { struct snd_seq_queue *q; guard(spinlock_irqsave)(&queue_list_lock); q = queue_list[id]; if (q) { guard(spinlock)(&q->owner_lock); if (q->owner == client) { /* found */ q->klocked = 1; queue_list[id] = NULL; num_queues--; return q; } } return NULL; } /*----------------------------------------------------------------*/ /* create new queue (constructor) */ static struct snd_seq_queue *queue_new(int owner, int locked) { struct snd_seq_queue *q; q = kzalloc(sizeof(*q), GFP_KERNEL); if (!q) return NULL; spin_lock_init(&q->owner_lock); spin_lock_init(&q->check_lock); mutex_init(&q->timer_mutex); snd_use_lock_init(&q->use_lock); q->queue = -1; q->tickq = snd_seq_prioq_new(); q->timeq = snd_seq_prioq_new(); q->timer = snd_seq_timer_new(); if (q->tickq == NULL || q->timeq == NULL || q->timer == NULL) { snd_seq_prioq_delete(&q->tickq); snd_seq_prioq_delete(&q->timeq); snd_seq_timer_delete(&q->timer); kfree(q); return NULL; } q->owner = owner; q->locked = locked; q->klocked = 0; return q; } /* delete queue (destructor) */ static void queue_delete(struct snd_seq_queue *q) { /* stop and release the timer */ mutex_lock(&q->timer_mutex); snd_seq_timer_stop(q->timer); snd_seq_timer_close(q); mutex_unlock(&q->timer_mutex); /* wait until access free */ snd_use_lock_sync(&q->use_lock); /* release resources... */ snd_seq_prioq_delete(&q->tickq); snd_seq_prioq_delete(&q->timeq); snd_seq_timer_delete(&q->timer); kfree(q); } /*----------------------------------------------------------------*/ /* delete all existing queues */ void snd_seq_queues_delete(void) { int i; /* clear list */ for (i = 0; i < SNDRV_SEQ_MAX_QUEUES; i++) { if (queue_list[i]) queue_delete(queue_list[i]); } } static void queue_use(struct snd_seq_queue *queue, int client, int use); /* allocate a new queue - * return pointer to new queue or ERR_PTR(-errno) for error * The new queue's use_lock is set to 1. It is the caller's responsibility to * call snd_use_lock_free(&q->use_lock). */ struct snd_seq_queue *snd_seq_queue_alloc(int client, int locked, unsigned int info_flags) { struct snd_seq_queue *q; q = queue_new(client, locked); if (q == NULL) return ERR_PTR(-ENOMEM); q->info_flags = info_flags; queue_use(q, client, 1); snd_use_lock_use(&q->use_lock); if (queue_list_add(q) < 0) { snd_use_lock_free(&q->use_lock); queue_delete(q); return ERR_PTR(-ENOMEM); } return q; } /* delete a queue - queue must be owned by the client */ int snd_seq_queue_delete(int client, int queueid) { struct snd_seq_queue *q; if (queueid < 0 || queueid >= SNDRV_SEQ_MAX_QUEUES) return -EINVAL; q = queue_list_remove(queueid, client); if (q == NULL) return -EINVAL; queue_delete(q); return 0; } /* return pointer to queue structure for specified id */ struct snd_seq_queue *queueptr(int queueid) { struct snd_seq_queue *q; if (queueid < 0 || queueid >= SNDRV_SEQ_MAX_QUEUES) return NULL; guard(spinlock_irqsave)(&queue_list_lock); q = queue_list[queueid]; if (q) snd_use_lock_use(&q->use_lock); return q; } /* return the (first) queue matching with the specified name */ struct snd_seq_queue *snd_seq_queue_find_name(char *name) { int i; for (i = 0; i < SNDRV_SEQ_MAX_QUEUES; i++) { struct snd_seq_queue *q __free(snd_seq_queue) = NULL; q = queueptr(i); if (q) { if (strncmp(q->name, name, sizeof(q->name)) == 0) return no_free_ptr(q); } } return NULL; } /* -------------------------------------------------------- */ #define MAX_CELL_PROCESSES_IN_QUEUE 1000 void snd_seq_check_queue(struct snd_seq_queue *q, int atomic, int hop) { struct snd_seq_event_cell *cell; snd_seq_tick_time_t cur_tick; snd_seq_real_time_t cur_time; int processed = 0; if (q == NULL) return; /* make this function non-reentrant */ scoped_guard(spinlock_irqsave, &q->check_lock) { if (q->check_blocked) { q->check_again = 1; return; /* other thread is already checking queues */ } q->check_blocked = 1; } __again: /* Process tick queue... */ cur_tick = snd_seq_timer_get_cur_tick(q->timer); for (;;) { cell = snd_seq_prioq_cell_out(q->tickq, &cur_tick); if (!cell) break; snd_seq_dispatch_event(cell, atomic, hop); if (++processed >= MAX_CELL_PROCESSES_IN_QUEUE) goto out; /* the rest processed at the next batch */ } /* Process time queue... */ cur_time = snd_seq_timer_get_cur_time(q->timer, false); for (;;) { cell = snd_seq_prioq_cell_out(q->timeq, &cur_time); if (!cell) break; snd_seq_dispatch_event(cell, atomic, hop); if (++processed >= MAX_CELL_PROCESSES_IN_QUEUE) goto out; /* the rest processed at the next batch */ } out: /* free lock */ scoped_guard(spinlock_irqsave, &q->check_lock) { if (q->check_again) { q->check_again = 0; if (processed < MAX_CELL_PROCESSES_IN_QUEUE) goto __again; } q->check_blocked = 0; } } /* enqueue a event to singe queue */ int snd_seq_enqueue_event(struct snd_seq_event_cell *cell, int atomic, int hop) { int dest, err; struct snd_seq_queue *q __free(snd_seq_queue) = NULL; if (snd_BUG_ON(!cell)) return -EINVAL; dest = cell->event.queue; /* destination queue */ q = queueptr(dest); if (q == NULL) return -EINVAL; /* handle relative time stamps, convert them into absolute */ if ((cell->event.flags & SNDRV_SEQ_TIME_MODE_MASK) == SNDRV_SEQ_TIME_MODE_REL) { switch (cell->event.flags & SNDRV_SEQ_TIME_STAMP_MASK) { case SNDRV_SEQ_TIME_STAMP_TICK: cell->event.time.tick += q->timer->tick.cur_tick; break; case SNDRV_SEQ_TIME_STAMP_REAL: snd_seq_inc_real_time(&cell->event.time.time, &q->timer->cur_time); break; } cell->event.flags &= ~SNDRV_SEQ_TIME_MODE_MASK; cell->event.flags |= SNDRV_SEQ_TIME_MODE_ABS; } /* enqueue event in the real-time or midi queue */ switch (cell->event.flags & SNDRV_SEQ_TIME_STAMP_MASK) { case SNDRV_SEQ_TIME_STAMP_TICK: err = snd_seq_prioq_cell_in(q->tickq, cell); break; case SNDRV_SEQ_TIME_STAMP_REAL: default: err = snd_seq_prioq_cell_in(q->timeq, cell); break; } if (err < 0) return err; /* trigger dispatching */ snd_seq_check_queue(q, atomic, hop); return 0; } /*----------------------------------------------------------------*/ static inline int check_access(struct snd_seq_queue *q, int client) { return (q->owner == client) || (!q->locked && !q->klocked); } /* check if the client has permission to modify queue parameters. * if it does, lock the queue */ static int queue_access_lock(struct snd_seq_queue *q, int client) { int access_ok; guard(spinlock_irqsave)(&q->owner_lock); access_ok = check_access(q, client); if (access_ok) q->klocked = 1; return access_ok; } /* unlock the queue */ static inline void queue_access_unlock(struct snd_seq_queue *q) { guard(spinlock_irqsave)(&q->owner_lock); q->klocked = 0; } /* exported - only checking permission */ int snd_seq_queue_check_access(int queueid, int client) { struct snd_seq_queue *q __free(snd_seq_queue) = queueptr(queueid); if (! q) return 0; guard(spinlock_irqsave)(&q->owner_lock); return check_access(q, client); } /*----------------------------------------------------------------*/ /* * change queue's owner and permission */ int snd_seq_queue_set_owner(int queueid, int client, int locked) { struct snd_seq_queue *q __free(snd_seq_queue) = queueptr(queueid); if (q == NULL) return -EINVAL; if (!queue_access_lock(q, client)) return -EPERM; scoped_guard(spinlock_irqsave, &q->owner_lock) { q->locked = locked ? 1 : 0; q->owner = client; } queue_access_unlock(q); return 0; } /*----------------------------------------------------------------*/ /* open timer - * q->use mutex should be down before calling this function to avoid * confliction with snd_seq_queue_use() */ int snd_seq_queue_timer_open(int queueid) { int result = 0; struct snd_seq_queue *queue __free(snd_seq_queue) = NULL; struct snd_seq_timer *tmr; queue = queueptr(queueid); if (queue == NULL) return -EINVAL; tmr = queue->timer; result = snd_seq_timer_open(queue); if (result < 0) { snd_seq_timer_defaults(tmr); result = snd_seq_timer_open(queue); } return result; } /* close timer - * q->use mutex should be down before calling this function */ int snd_seq_queue_timer_close(int queueid) { struct snd_seq_queue *queue __free(snd_seq_queue) = NULL; int result = 0; queue = queueptr(queueid); if (queue == NULL) return -EINVAL; snd_seq_timer_close(queue); return result; } /* change queue tempo and ppq */ int snd_seq_queue_timer_set_tempo(int queueid, int client, struct snd_seq_queue_tempo *info) { struct snd_seq_queue *q __free(snd_seq_queue) = queueptr(queueid); int result; if (q == NULL) return -EINVAL; if (!queue_access_lock(q, client)) return -EPERM; result = snd_seq_timer_set_tempo_ppq(q->timer, info->tempo, info->ppq, info->tempo_base); if (result >= 0 && info->skew_base > 0) result = snd_seq_timer_set_skew(q->timer, info->skew_value, info->skew_base); queue_access_unlock(q); return result; } /* use or unuse this queue */ static void queue_use(struct snd_seq_queue *queue, int client, int use) { if (use) { if (!test_and_set_bit(client, queue->clients_bitmap)) queue->clients++; } else { if (test_and_clear_bit(client, queue->clients_bitmap)) queue->clients--; } if (queue->clients) { if (use && queue->clients == 1) snd_seq_timer_defaults(queue->timer); snd_seq_timer_open(queue); } else { snd_seq_timer_close(queue); } } /* use or unuse this queue - * if it is the first client, starts the timer. * if it is not longer used by any clients, stop the timer. */ int snd_seq_queue_use(int queueid, int client, int use) { struct snd_seq_queue *queue __free(snd_seq_queue) = NULL; queue = queueptr(queueid); if (queue == NULL) return -EINVAL; guard(mutex)(&queue->timer_mutex); queue_use(queue, client, use); return 0; } /* * check if queue is used by the client * return negative value if the queue is invalid. * return 0 if not used, 1 if used. */ int snd_seq_queue_is_used(int queueid, int client) { struct snd_seq_queue *q __free(snd_seq_queue) = NULL; q = queueptr(queueid); if (q == NULL) return -EINVAL; /* invalid queue */ return test_bit(client, q->clients_bitmap) ? 1 : 0; } /*----------------------------------------------------------------*/ /* final stage notification - * remove cells for no longer exist client (for non-owned queue) * or delete this queue (for owned queue) */ void snd_seq_queue_client_leave(int client) { int i; /* delete own queues from queue list */ for (i = 0; i < SNDRV_SEQ_MAX_QUEUES; i++) { struct snd_seq_queue *q = queue_list_remove(i, client); if (q) queue_delete(q); } /* remove cells from existing queues - * they are not owned by this client */ for (i = 0; i < SNDRV_SEQ_MAX_QUEUES; i++) { struct snd_seq_queue *q __free(snd_seq_queue) = queueptr(i); if (!q) continue; if (test_bit(client, q->clients_bitmap)) { snd_seq_prioq_leave(q->tickq, client, 0); snd_seq_prioq_leave(q->timeq, client, 0); snd_seq_queue_use(q->queue, client, 0); } } } /*----------------------------------------------------------------*/ /* remove cells based on flush criteria */ void snd_seq_queue_remove_cells(int client, struct snd_seq_remove_events *info) { int i; for (i = 0; i < SNDRV_SEQ_MAX_QUEUES; i++) { struct snd_seq_queue *q __free(snd_seq_queue) = queueptr(i); if (!q) continue; if (test_bit(client, q->clients_bitmap) && (! (info->remove_mode & SNDRV_SEQ_REMOVE_DEST) || q->queue == info->queue)) { snd_seq_prioq_remove_events(q->tickq, client, info); snd_seq_prioq_remove_events(q->timeq, client, info); } } } /*----------------------------------------------------------------*/ /* * send events to all subscribed ports */ static void queue_broadcast_event(struct snd_seq_queue *q, struct snd_seq_event *ev, int atomic, int hop) { struct snd_seq_event sev; sev = *ev; sev.flags = SNDRV_SEQ_TIME_STAMP_TICK|SNDRV_SEQ_TIME_MODE_ABS; sev.time.tick = q->timer->tick.cur_tick; sev.queue = q->queue; sev.data.queue.queue = q->queue; /* broadcast events from Timer port */ sev.source.client = SNDRV_SEQ_CLIENT_SYSTEM; sev.source.port = SNDRV_SEQ_PORT_SYSTEM_TIMER; sev.dest.client = SNDRV_SEQ_ADDRESS_SUBSCRIBERS; snd_seq_kernel_client_dispatch(SNDRV_SEQ_CLIENT_SYSTEM, &sev, atomic, hop); } /* * process a received queue-control event. * this function is exported for seq_sync.c. */ static void snd_seq_queue_process_event(struct snd_seq_queue *q, struct snd_seq_event *ev, int atomic, int hop) { switch (ev->type) { case SNDRV_SEQ_EVENT_START: snd_seq_prioq_leave(q->tickq, ev->source.client, 1); snd_seq_prioq_leave(q->timeq, ev->source.client, 1); if (! snd_seq_timer_start(q->timer)) queue_broadcast_event(q, ev, atomic, hop); break; case SNDRV_SEQ_EVENT_CONTINUE: if (! snd_seq_timer_continue(q->timer)) queue_broadcast_event(q, ev, atomic, hop); break; case SNDRV_SEQ_EVENT_STOP: snd_seq_timer_stop(q->timer); queue_broadcast_event(q, ev, atomic, hop); break; case SNDRV_SEQ_EVENT_TEMPO: snd_seq_timer_set_tempo(q->timer, ev->data.queue.param.value); queue_broadcast_event(q, ev, atomic, hop); break; case SNDRV_SEQ_EVENT_SETPOS_TICK: if (snd_seq_timer_set_position_tick(q->timer, ev->data.queue.param.time.tick) == 0) { queue_broadcast_event(q, ev, atomic, hop); } break; case SNDRV_SEQ_EVENT_SETPOS_TIME: if (snd_seq_timer_set_position_time(q->timer, ev->data.queue.param.time.time) == 0) { queue_broadcast_event(q, ev, atomic, hop); } break; case SNDRV_SEQ_EVENT_QUEUE_SKEW: if (snd_seq_timer_set_skew(q->timer, ev->data.queue.param.skew.value, ev->data.queue.param.skew.base) == 0) { queue_broadcast_event(q, ev, atomic, hop); } break; } } /* * Queue control via timer control port: * this function is exported as a callback of timer port. */ int snd_seq_control_queue(struct snd_seq_event *ev, int atomic, int hop) { struct snd_seq_queue *q __free(snd_seq_queue) = NULL; if (snd_BUG_ON(!ev)) return -EINVAL; q = queueptr(ev->data.queue.queue); if (q == NULL) return -EINVAL; if (!queue_access_lock(q, ev->source.client)) return -EPERM; snd_seq_queue_process_event(q, ev, atomic, hop); queue_access_unlock(q); return 0; } /*----------------------------------------------------------------*/ #ifdef CONFIG_SND_PROC_FS /* exported to seq_info.c */ void snd_seq_info_queues_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { int i, bpm; struct snd_seq_timer *tmr; bool locked; int owner; for (i = 0; i < SNDRV_SEQ_MAX_QUEUES; i++) { struct snd_seq_queue *q __free(snd_seq_queue) = queueptr(i); if (!q) continue; tmr = q->timer; if (tmr->tempo) bpm = (60000 * tmr->tempo_base) / tmr->tempo; else bpm = 0; scoped_guard(spinlock_irq, &q->owner_lock) { locked = q->locked; owner = q->owner; } snd_iprintf(buffer, "queue %d: [%s]\n", q->queue, q->name); snd_iprintf(buffer, "owned by client : %d\n", owner); snd_iprintf(buffer, "lock status : %s\n", locked ? "Locked" : "Free"); snd_iprintf(buffer, "queued time events : %d\n", snd_seq_prioq_avail(q->timeq)); snd_iprintf(buffer, "queued tick events : %d\n", snd_seq_prioq_avail(q->tickq)); snd_iprintf(buffer, "timer state : %s\n", tmr->running ? "Running" : "Stopped"); snd_iprintf(buffer, "timer PPQ : %d\n", tmr->ppq); snd_iprintf(buffer, "current tempo : %d\n", tmr->tempo); snd_iprintf(buffer, "tempo base : %d ns\n", tmr->tempo_base); snd_iprintf(buffer, "current BPM : %d\n", bpm); snd_iprintf(buffer, "current time : %d.%09d s\n", tmr->cur_time.tv_sec, tmr->cur_time.tv_nsec); snd_iprintf(buffer, "current tick : %d\n", tmr->tick.cur_tick); snd_iprintf(buffer, "\n"); } } #endif /* CONFIG_SND_PROC_FS */
317 93 258 3 11 3 9 9 9 31 56 257 86 8 218 26 86 86 86 84 255 2332 5 12 41 82 83 29 148 2254 2257 148 126 27 2312 2311 2266 2308 12 2309 309 257 309 2220 109 3 3 35 15 22 2 2038 5 43 2001 35 9 158 156 228 223 5 158 86 11 1 12 32 227 84 113 2085 2090 1 2 470 471 242 1 2086 2086 2090 311 312 70 30 30 14 12 17 2 2 74 30 48 71 70 2 70 1 69 1 71 70 67 6 4 90 17 1 73 74 74 20 71 9 322 1 1 1 1 12 6 6 20 20 20 5 5 5 75 6 5 76 76 75 65 8 5 75 75 12 12 12 12 9 9 9 9 11 11 10 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk> */ #include <linux/mm.h> #include <linux/swap.h> #include <linux/bio-integrity.h> #include <linux/blkdev.h> #include <linux/uio.h> #include <linux/iocontext.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/mempool.h> #include <linux/workqueue.h> #include <linux/cgroup.h> #include <linux/highmem.h> #include <linux/blk-crypto.h> #include <linux/xarray.h> #include <trace/events/block.h> #include "blk.h" #include "blk-rq-qos.h" #include "blk-cgroup.h" #define ALLOC_CACHE_THRESHOLD 16 #define ALLOC_CACHE_MAX 256 struct bio_alloc_cache { struct bio *free_list; struct bio *free_list_irq; unsigned int nr; unsigned int nr_irq; }; static struct biovec_slab { int nr_vecs; char *name; struct kmem_cache *slab; } bvec_slabs[] __read_mostly = { { .nr_vecs = 16, .name = "biovec-16" }, { .nr_vecs = 64, .name = "biovec-64" }, { .nr_vecs = 128, .name = "biovec-128" }, { .nr_vecs = BIO_MAX_VECS, .name = "biovec-max" }, }; static struct biovec_slab *biovec_slab(unsigned short nr_vecs) { switch (nr_vecs) { /* smaller bios use inline vecs */ case 5 ... 16: return &bvec_slabs[0]; case 17 ... 64: return &bvec_slabs[1]; case 65 ... 128: return &bvec_slabs[2]; case 129 ... BIO_MAX_VECS: return &bvec_slabs[3]; default: BUG(); return NULL; } } /* * fs_bio_set is the bio_set containing bio and iovec memory pools used by * IO code that does not need private memory pools. */ struct bio_set fs_bio_set; EXPORT_SYMBOL(fs_bio_set); /* * Our slab pool management */ struct bio_slab { struct kmem_cache *slab; unsigned int slab_ref; unsigned int slab_size; char name[12]; }; static DEFINE_MUTEX(bio_slab_lock); static DEFINE_XARRAY(bio_slabs); static struct bio_slab *create_bio_slab(unsigned int size) { struct bio_slab *bslab = kzalloc(sizeof(*bslab), GFP_KERNEL); if (!bslab) return NULL; snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size); bslab->slab = kmem_cache_create(bslab->name, size, ARCH_KMALLOC_MINALIGN, SLAB_HWCACHE_ALIGN | SLAB_TYPESAFE_BY_RCU, NULL); if (!bslab->slab) goto fail_alloc_slab; bslab->slab_ref = 1; bslab->slab_size = size; if (!xa_err(xa_store(&bio_slabs, size, bslab, GFP_KERNEL))) return bslab; kmem_cache_destroy(bslab->slab); fail_alloc_slab: kfree(bslab); return NULL; } static inline unsigned int bs_bio_slab_size(struct bio_set *bs) { return bs->front_pad + sizeof(struct bio) + bs->back_pad; } static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs) { unsigned int size = bs_bio_slab_size(bs); struct bio_slab *bslab; mutex_lock(&bio_slab_lock); bslab = xa_load(&bio_slabs, size); if (bslab) bslab->slab_ref++; else bslab = create_bio_slab(size); mutex_unlock(&bio_slab_lock); if (bslab) return bslab->slab; return NULL; } static void bio_put_slab(struct bio_set *bs) { struct bio_slab *bslab = NULL; unsigned int slab_size = bs_bio_slab_size(bs); mutex_lock(&bio_slab_lock); bslab = xa_load(&bio_slabs, slab_size); if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n")) goto out; WARN_ON_ONCE(bslab->slab != bs->bio_slab); WARN_ON(!bslab->slab_ref); if (--bslab->slab_ref) goto out; xa_erase(&bio_slabs, slab_size); kmem_cache_destroy(bslab->slab); kfree(bslab); out: mutex_unlock(&bio_slab_lock); } void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs) { BUG_ON(nr_vecs > BIO_MAX_VECS); if (nr_vecs == BIO_MAX_VECS) mempool_free(bv, pool); else if (nr_vecs > BIO_INLINE_VECS) kmem_cache_free(biovec_slab(nr_vecs)->slab, bv); } /* * Make the first allocation restricted and don't dump info on allocation * failures, since we'll fall back to the mempool in case of failure. */ static inline gfp_t bvec_alloc_gfp(gfp_t gfp) { return (gfp & ~(__GFP_DIRECT_RECLAIM | __GFP_IO)) | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; } struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, gfp_t gfp_mask) { struct biovec_slab *bvs = biovec_slab(*nr_vecs); if (WARN_ON_ONCE(!bvs)) return NULL; /* * Upgrade the nr_vecs request to take full advantage of the allocation. * We also rely on this in the bvec_free path. */ *nr_vecs = bvs->nr_vecs; /* * Try a slab allocation first for all smaller allocations. If that * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool. * The mempool is sized to handle up to BIO_MAX_VECS entries. */ if (*nr_vecs < BIO_MAX_VECS) { struct bio_vec *bvl; bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask)); if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM)) return bvl; *nr_vecs = BIO_MAX_VECS; } return mempool_alloc(pool, gfp_mask); } void bio_uninit(struct bio *bio) { #ifdef CONFIG_BLK_CGROUP if (bio->bi_blkg) { blkg_put(bio->bi_blkg); bio->bi_blkg = NULL; } #endif if (bio_integrity(bio)) bio_integrity_free(bio); bio_crypt_free_ctx(bio); } EXPORT_SYMBOL(bio_uninit); static void bio_free(struct bio *bio) { struct bio_set *bs = bio->bi_pool; void *p = bio; WARN_ON_ONCE(!bs); bio_uninit(bio); bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs); mempool_free(p - bs->front_pad, &bs->bio_pool); } /* * Users of this function have their own bio allocation. Subsequently, * they must remember to pair any call to bio_init() with bio_uninit() * when IO has completed, or when the bio is released. */ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, unsigned short max_vecs, blk_opf_t opf) { bio->bi_next = NULL; bio->bi_bdev = bdev; bio->bi_opf = opf; bio->bi_flags = 0; bio->bi_ioprio = 0; bio->bi_write_hint = 0; bio->bi_write_stream = 0; bio->bi_status = 0; bio->bi_bvec_gap_bit = 0; bio->bi_iter.bi_sector = 0; bio->bi_iter.bi_size = 0; bio->bi_iter.bi_idx = 0; bio->bi_iter.bi_bvec_done = 0; bio->bi_end_io = NULL; bio->bi_private = NULL; #ifdef CONFIG_BLK_CGROUP bio->bi_blkg = NULL; bio->issue_time_ns = 0; if (bdev) bio_associate_blkg(bio); #ifdef CONFIG_BLK_CGROUP_IOCOST bio->bi_iocost_cost = 0; #endif #endif #ifdef CONFIG_BLK_INLINE_ENCRYPTION bio->bi_crypt_context = NULL; #endif #ifdef CONFIG_BLK_DEV_INTEGRITY bio->bi_integrity = NULL; #endif bio->bi_vcnt = 0; atomic_set(&bio->__bi_remaining, 1); atomic_set(&bio->__bi_cnt, 1); bio->bi_cookie = BLK_QC_T_NONE; bio->bi_max_vecs = max_vecs; bio->bi_io_vec = table; bio->bi_pool = NULL; } EXPORT_SYMBOL(bio_init); /** * bio_reset - reinitialize a bio * @bio: bio to reset * @bdev: block device to use the bio for * @opf: operation and flags for bio * * Description: * After calling bio_reset(), @bio will be in the same state as a freshly * allocated bio returned bio bio_alloc_bioset() - the only fields that are * preserved are the ones that are initialized by bio_alloc_bioset(). See * comment in struct bio. */ void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf) { bio_uninit(bio); memset(bio, 0, BIO_RESET_BYTES); atomic_set(&bio->__bi_remaining, 1); bio->bi_bdev = bdev; if (bio->bi_bdev) bio_associate_blkg(bio); bio->bi_opf = opf; } EXPORT_SYMBOL(bio_reset); static struct bio *__bio_chain_endio(struct bio *bio) { struct bio *parent = bio->bi_private; if (bio->bi_status && !parent->bi_status) parent->bi_status = bio->bi_status; bio_put(bio); return parent; } /* * This function should only be used as a flag and must never be called. * If execution reaches here, it indicates a serious programming error. */ static void bio_chain_endio(struct bio *bio) { BUG(); } /** * bio_chain - chain bio completions * @bio: the target bio * @parent: the parent bio of @bio * * The caller won't have a bi_end_io called when @bio completes - instead, * @parent's bi_end_io won't be called until both @parent and @bio have * completed; the chained bio will also be freed when it completes. * * The caller must not set bi_private or bi_end_io in @bio. */ void bio_chain(struct bio *bio, struct bio *parent) { BUG_ON(bio->bi_private || bio->bi_end_io); bio->bi_private = parent; bio->bi_end_io = bio_chain_endio; bio_inc_remaining(parent); } EXPORT_SYMBOL(bio_chain); /** * bio_chain_and_submit - submit a bio after chaining it to another one * @prev: bio to chain and submit * @new: bio to chain to * * If @prev is non-NULL, chain it to @new and submit it. * * Return: @new. */ struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new) { if (prev) { bio_chain(prev, new); submit_bio(prev); } return new; } struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, unsigned int nr_pages, blk_opf_t opf, gfp_t gfp) { return bio_chain_and_submit(bio, bio_alloc(bdev, nr_pages, opf, gfp)); } EXPORT_SYMBOL_GPL(blk_next_bio); static void bio_alloc_rescue(struct work_struct *work) { struct bio_set *bs = container_of(work, struct bio_set, rescue_work); struct bio *bio; while (1) { spin_lock(&bs->rescue_lock); bio = bio_list_pop(&bs->rescue_list); spin_unlock(&bs->rescue_lock); if (!bio) break; submit_bio_noacct(bio); } } static void punt_bios_to_rescuer(struct bio_set *bs) { struct bio_list punt, nopunt; struct bio *bio; if (WARN_ON_ONCE(!bs->rescue_workqueue)) return; /* * In order to guarantee forward progress we must punt only bios that * were allocated from this bio_set; otherwise, if there was a bio on * there for a stacking driver higher up in the stack, processing it * could require allocating bios from this bio_set, and doing that from * our own rescuer would be bad. * * Since bio lists are singly linked, pop them all instead of trying to * remove from the middle of the list: */ bio_list_init(&punt); bio_list_init(&nopunt); while ((bio = bio_list_pop(&current->bio_list[0]))) bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); current->bio_list[0] = nopunt; bio_list_init(&nopunt); while ((bio = bio_list_pop(&current->bio_list[1]))) bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); current->bio_list[1] = nopunt; spin_lock(&bs->rescue_lock); bio_list_merge(&bs->rescue_list, &punt); spin_unlock(&bs->rescue_lock); queue_work(bs->rescue_workqueue, &bs->rescue_work); } static void bio_alloc_irq_cache_splice(struct bio_alloc_cache *cache) { unsigned long flags; /* cache->free_list must be empty */ if (WARN_ON_ONCE(cache->free_list)) return; local_irq_save(flags); cache->free_list = cache->free_list_irq; cache->free_list_irq = NULL; cache->nr += cache->nr_irq; cache->nr_irq = 0; local_irq_restore(flags); } static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp, struct bio_set *bs) { struct bio_alloc_cache *cache; struct bio *bio; cache = per_cpu_ptr(bs->cache, get_cpu()); if (!cache->free_list) { if (READ_ONCE(cache->nr_irq) >= ALLOC_CACHE_THRESHOLD) bio_alloc_irq_cache_splice(cache); if (!cache->free_list) { put_cpu(); return NULL; } } bio = cache->free_list; cache->free_list = bio->bi_next; cache->nr--; put_cpu(); if (nr_vecs) bio_init_inline(bio, bdev, nr_vecs, opf); else bio_init(bio, bdev, NULL, nr_vecs, opf); bio->bi_pool = bs; return bio; } /** * bio_alloc_bioset - allocate a bio for I/O * @bdev: block device to allocate the bio for (can be %NULL) * @nr_vecs: number of bvecs to pre-allocate * @opf: operation and flags for bio * @gfp_mask: the GFP_* mask given to the slab allocator * @bs: the bio_set to allocate from. * * Allocate a bio from the mempools in @bs. * * If %__GFP_DIRECT_RECLAIM is set then bio_alloc will always be able to * allocate a bio. This is due to the mempool guarantees. To make this work, * callers must never allocate more than 1 bio at a time from the general pool. * Callers that need to allocate more than 1 bio must always submit the * previously allocated bio for IO before attempting to allocate a new one. * Failure to do so can cause deadlocks under memory pressure. * * Note that when running under submit_bio_noacct() (i.e. any block driver), * bios are not submitted until after you return - see the code in * submit_bio_noacct() that converts recursion into iteration, to prevent * stack overflows. * * This would normally mean allocating multiple bios under submit_bio_noacct() * would be susceptible to deadlocks, but we have * deadlock avoidance code that resubmits any blocked bios from a rescuer * thread. * * However, we do not guarantee forward progress for allocations from other * mempools. Doing multiple allocations from the same mempool under * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad * for per bio allocations. * * Returns: Pointer to new bio on success, NULL on failure. */ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp_mask, struct bio_set *bs) { gfp_t saved_gfp = gfp_mask; struct bio *bio; void *p; /* should not use nobvec bioset for nr_vecs > 0 */ if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_vecs > 0)) return NULL; if (bs->cache && nr_vecs <= BIO_INLINE_VECS) { opf |= REQ_ALLOC_CACHE; bio = bio_alloc_percpu_cache(bdev, nr_vecs, opf, gfp_mask, bs); if (bio) return bio; /* * No cached bio available, bio returned below marked with * REQ_ALLOC_CACHE to participate in per-cpu alloc cache. */ } else opf &= ~REQ_ALLOC_CACHE; /* * submit_bio_noacct() converts recursion to iteration; this means if * we're running beneath it, any bios we allocate and submit will not be * submitted (and thus freed) until after we return. * * This exposes us to a potential deadlock if we allocate multiple bios * from the same bio_set() while running underneath submit_bio_noacct(). * If we were to allocate multiple bios (say a stacking block driver * that was splitting bios), we would deadlock if we exhausted the * mempool's reserve. * * We solve this, and guarantee forward progress, with a rescuer * workqueue per bio_set. If we go to allocate and there are bios on * current->bio_list, we first try the allocation without * __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be * blocking to the rescuer workqueue before we retry with the original * gfp_flags. */ if (current->bio_list && (!bio_list_empty(&current->bio_list[0]) || !bio_list_empty(&current->bio_list[1])) && bs->rescue_workqueue) gfp_mask &= ~__GFP_DIRECT_RECLAIM; p = mempool_alloc(&bs->bio_pool, gfp_mask); if (!p && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs); gfp_mask = saved_gfp; p = mempool_alloc(&bs->bio_pool, gfp_mask); } if (unlikely(!p)) return NULL; if (!mempool_is_saturated(&bs->bio_pool)) opf &= ~REQ_ALLOC_CACHE; bio = p + bs->front_pad; if (nr_vecs > BIO_INLINE_VECS) { struct bio_vec *bvl = NULL; bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask); if (!bvl && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs); gfp_mask = saved_gfp; bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask); } if (unlikely(!bvl)) goto err_free; bio_init(bio, bdev, bvl, nr_vecs, opf); } else if (nr_vecs) { bio_init_inline(bio, bdev, BIO_INLINE_VECS, opf); } else { bio_init(bio, bdev, NULL, 0, opf); } bio->bi_pool = bs; return bio; err_free: mempool_free(p, &bs->bio_pool); return NULL; } EXPORT_SYMBOL(bio_alloc_bioset); /** * bio_kmalloc - kmalloc a bio * @nr_vecs: number of bio_vecs to allocate * @gfp_mask: the GFP_* mask given to the slab allocator * * Use kmalloc to allocate a bio (including bvecs). The bio must be initialized * using bio_init() before use. To free a bio returned from this function use * kfree() after calling bio_uninit(). A bio returned from this function can * be reused by calling bio_uninit() before calling bio_init() again. * * Note that unlike bio_alloc() or bio_alloc_bioset() allocations from this * function are not backed by a mempool can fail. Do not use this function * for allocations in the file system I/O path. * * Returns: Pointer to new bio on success, NULL on failure. */ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) { struct bio *bio; if (nr_vecs > BIO_MAX_INLINE_VECS) return NULL; return kmalloc(sizeof(*bio) + nr_vecs * sizeof(struct bio_vec), gfp_mask); } EXPORT_SYMBOL(bio_kmalloc); void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) { struct bio_vec bv; struct bvec_iter iter; __bio_for_each_segment(bv, bio, iter, start) memzero_bvec(&bv); } EXPORT_SYMBOL(zero_fill_bio_iter); /** * bio_truncate - truncate the bio to small size of @new_size * @bio: the bio to be truncated * @new_size: new size for truncating the bio * * Description: * Truncate the bio to new size of @new_size. If bio_op(bio) is * REQ_OP_READ, zero the truncated part. This function should only * be used for handling corner cases, such as bio eod. */ static void bio_truncate(struct bio *bio, unsigned new_size) { struct bio_vec bv; struct bvec_iter iter; unsigned int done = 0; bool truncated = false; if (new_size >= bio->bi_iter.bi_size) return; if (bio_op(bio) != REQ_OP_READ) goto exit; bio_for_each_segment(bv, bio, iter) { if (done + bv.bv_len > new_size) { size_t offset; if (!truncated) offset = new_size - done; else offset = 0; memzero_page(bv.bv_page, bv.bv_offset + offset, bv.bv_len - offset); truncated = true; } done += bv.bv_len; } exit: /* * Don't touch bvec table here and make it really immutable, since * fs bio user has to retrieve all pages via bio_for_each_segment_all * in its .end_bio() callback. * * It is enough to truncate bio by updating .bi_size since we can make * correct bvec with the updated .bi_size for drivers. */ bio->bi_iter.bi_size = new_size; } /** * guard_bio_eod - truncate a BIO to fit the block device * @bio: bio to truncate * * This allows us to do IO even on the odd last sectors of a device, even if the * block size is some multiple of the physical sector size. * * We'll just truncate the bio to the size of the device, and clear the end of * the buffer head manually. Truly out-of-range accesses will turn into actual * I/O errors, this only handles the "we need to be able to do I/O at the final * sector" case. */ void guard_bio_eod(struct bio *bio) { sector_t maxsector = bdev_nr_sectors(bio->bi_bdev); if (!maxsector) return; /* * If the *whole* IO is past the end of the device, * let it through, and the IO layer will turn it into * an EIO. */ if (unlikely(bio->bi_iter.bi_sector >= maxsector)) return; maxsector -= bio->bi_iter.bi_sector; if (likely((bio->bi_iter.bi_size >> 9) <= maxsector)) return; bio_truncate(bio, maxsector << 9); } static int __bio_alloc_cache_prune(struct bio_alloc_cache *cache, unsigned int nr) { unsigned int i = 0; struct bio *bio; while ((bio = cache->free_list) != NULL) { cache->free_list = bio->bi_next; cache->nr--; bio_free(bio); if (++i == nr) break; } return i; } static void bio_alloc_cache_prune(struct bio_alloc_cache *cache, unsigned int nr) { nr -= __bio_alloc_cache_prune(cache, nr); if (!READ_ONCE(cache->free_list)) { bio_alloc_irq_cache_splice(cache); __bio_alloc_cache_prune(cache, nr); } } static int bio_cpu_dead(unsigned int cpu, struct hlist_node *node) { struct bio_set *bs; bs = hlist_entry_safe(node, struct bio_set, cpuhp_dead); if (bs->cache) { struct bio_alloc_cache *cache = per_cpu_ptr(bs->cache, cpu); bio_alloc_cache_prune(cache, -1U); } return 0; } static void bio_alloc_cache_destroy(struct bio_set *bs) { int cpu; if (!bs->cache) return; cpuhp_state_remove_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead); for_each_possible_cpu(cpu) { struct bio_alloc_cache *cache; cache = per_cpu_ptr(bs->cache, cpu); bio_alloc_cache_prune(cache, -1U); } free_percpu(bs->cache); bs->cache = NULL; } static inline void bio_put_percpu_cache(struct bio *bio) { struct bio_alloc_cache *cache; cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu()); if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX) goto out_free; if (in_task()) { bio_uninit(bio); bio->bi_next = cache->free_list; /* Not necessary but helps not to iopoll already freed bios */ bio->bi_bdev = NULL; cache->free_list = bio; cache->nr++; } else if (in_hardirq()) { lockdep_assert_irqs_disabled(); bio_uninit(bio); bio->bi_next = cache->free_list_irq; cache->free_list_irq = bio; cache->nr_irq++; } else { goto out_free; } put_cpu(); return; out_free: put_cpu(); bio_free(bio); } /** * bio_put - release a reference to a bio * @bio: bio to release reference to * * Description: * Put a reference to a &struct bio, either one you have gotten with * bio_alloc, bio_get or bio_clone_*. The last put of a bio will free it. **/ void bio_put(struct bio *bio) { if (unlikely(bio_flagged(bio, BIO_REFFED))) { BUG_ON(!atomic_read(&bio->__bi_cnt)); if (!atomic_dec_and_test(&bio->__bi_cnt)) return; } if (bio->bi_opf & REQ_ALLOC_CACHE) bio_put_percpu_cache(bio); else bio_free(bio); } EXPORT_SYMBOL(bio_put); static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) { bio_set_flag(bio, BIO_CLONED); bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_write_stream = bio_src->bi_write_stream; bio->bi_iter = bio_src->bi_iter; if (bio->bi_bdev) { if (bio->bi_bdev == bio_src->bi_bdev && bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); bio_clone_blkg_association(bio, bio_src); } if (bio_crypt_clone(bio, bio_src, gfp) < 0) return -ENOMEM; if (bio_integrity(bio_src) && bio_integrity_clone(bio, bio_src, gfp) < 0) return -ENOMEM; return 0; } /** * bio_alloc_clone - clone a bio that shares the original bio's biovec * @bdev: block_device to clone onto * @bio_src: bio to clone from * @gfp: allocation priority * @bs: bio_set to allocate from * * Allocate a new bio that is a clone of @bio_src. The caller owns the returned * bio, but not the actual data it points to. * * The caller must ensure that the return bio is not freed before @bio_src. */ struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src, gfp_t gfp, struct bio_set *bs) { struct bio *bio; bio = bio_alloc_bioset(bdev, 0, bio_src->bi_opf, gfp, bs); if (!bio) return NULL; if (__bio_clone(bio, bio_src, gfp) < 0) { bio_put(bio); return NULL; } bio->bi_io_vec = bio_src->bi_io_vec; return bio; } EXPORT_SYMBOL(bio_alloc_clone); /** * bio_init_clone - clone a bio that shares the original bio's biovec * @bdev: block_device to clone onto * @bio: bio to clone into * @bio_src: bio to clone from * @gfp: allocation priority * * Initialize a new bio in caller provided memory that is a clone of @bio_src. * The caller owns the returned bio, but not the actual data it points to. * * The caller must ensure that @bio_src is not freed before @bio. */ int bio_init_clone(struct block_device *bdev, struct bio *bio, struct bio *bio_src, gfp_t gfp) { int ret; bio_init(bio, bdev, bio_src->bi_io_vec, 0, bio_src->bi_opf); ret = __bio_clone(bio, bio_src, gfp); if (ret) bio_uninit(bio); return ret; } EXPORT_SYMBOL(bio_init_clone); /** * bio_full - check if the bio is full * @bio: bio to check * @len: length of one segment to be added * * Return true if @bio is full and one segment with @len bytes can't be * added to the bio, otherwise return false */ static inline bool bio_full(struct bio *bio, unsigned len) { if (bio->bi_vcnt >= bio->bi_max_vecs) return true; if (bio->bi_iter.bi_size > UINT_MAX - len) return true; return false; } static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, unsigned int len, unsigned int off) { size_t bv_end = bv->bv_offset + bv->bv_len; phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1; phys_addr_t page_addr = page_to_phys(page); if (vec_end_addr + 1 != page_addr + off) return false; if (xen_domain() && !xen_biovec_phys_mergeable(bv, page)) return false; if ((vec_end_addr & PAGE_MASK) != ((page_addr + off) & PAGE_MASK)) { if (IS_ENABLED(CONFIG_KMSAN)) return false; if (bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE) return false; } bv->bv_len += len; return true; } /* * Try to merge a page into a segment, while obeying the hardware segment * size limit. * * This is kept around for the integrity metadata, which is still tries * to build the initial bio to the hardware limit and doesn't have proper * helpers to split. Hopefully this will go away soon. */ bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, struct page *page, unsigned len, unsigned offset) { unsigned long mask = queue_segment_boundary(q); phys_addr_t addr1 = bvec_phys(bv); phys_addr_t addr2 = page_to_phys(page) + offset + len - 1; if ((addr1 | mask) != (addr2 | mask)) return false; if (len > queue_max_segment_size(q) - bv->bv_len) return false; return bvec_try_merge_page(bv, page, len, offset); } /** * __bio_add_page - add page(s) to a bio in a new segment * @bio: destination bio * @page: start page to add * @len: length of the data to add, may cross pages * @off: offset of the data relative to @page, may cross pages * * Add the data at @page + @off to @bio as a new bvec. The caller must ensure * that @bio has space for another bvec. */ void __bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off) { WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); WARN_ON_ONCE(bio_full(bio, len)); if (is_pci_p2pdma_page(page)) bio->bi_opf |= REQ_NOMERGE; bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off); bio->bi_iter.bi_size += len; bio->bi_vcnt++; } EXPORT_SYMBOL_GPL(__bio_add_page); /** * bio_add_virt_nofail - add data in the direct kernel mapping to a bio * @bio: destination bio * @vaddr: data to add * @len: length of the data to add, may cross pages * * Add the data at @vaddr to @bio. The caller must have ensure a segment * is available for the added data. No merging into an existing segment * will be performed. */ void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len) { __bio_add_page(bio, virt_to_page(vaddr), len, offset_in_page(vaddr)); } EXPORT_SYMBOL_GPL(bio_add_virt_nofail); /** * bio_add_page - attempt to add page(s) to bio * @bio: destination bio * @page: start page to add * @len: vec entry length, may cross pages * @offset: vec entry offset relative to @page, may cross pages * * Attempt to add page(s) to the bio_vec maplist. This will only fail * if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio. */ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; if (bio->bi_iter.bi_size > UINT_MAX - len) return 0; if (bio->bi_vcnt > 0) { struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; if (!zone_device_pages_have_same_pgmap(bv->bv_page, page)) return 0; if (bvec_try_merge_page(bv, page, len, offset)) { bio->bi_iter.bi_size += len; return len; } } if (bio->bi_vcnt >= bio->bi_max_vecs) return 0; __bio_add_page(bio, page, len, offset); return len; } EXPORT_SYMBOL(bio_add_page); void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, size_t off) { unsigned long nr = off / PAGE_SIZE; WARN_ON_ONCE(len > UINT_MAX); __bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE); } EXPORT_SYMBOL_GPL(bio_add_folio_nofail); /** * bio_add_folio - Attempt to add part of a folio to a bio. * @bio: BIO to add to. * @folio: Folio to add. * @len: How many bytes from the folio to add. * @off: First byte in this folio to add. * * Filesystems that use folios can call this function instead of calling * bio_add_page() for each page in the folio. If @off is bigger than * PAGE_SIZE, this function can create a bio_vec that starts in a page * after the bv_page. BIOs do not support folios that are 4GiB or larger. * * Return: Whether the addition was successful. */ bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len, size_t off) { unsigned long nr = off / PAGE_SIZE; if (len > UINT_MAX) return false; return bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE) > 0; } EXPORT_SYMBOL(bio_add_folio); /** * bio_add_vmalloc_chunk - add a vmalloc chunk to a bio * @bio: destination bio * @vaddr: vmalloc address to add * @len: total length in bytes of the data to add * * Add data starting at @vaddr to @bio and return how many bytes were added. * This may be less than the amount originally asked. Returns 0 if no data * could be added to @bio. * * This helper calls flush_kernel_vmap_range() for the range added. For reads * the caller still needs to manually call invalidate_kernel_vmap_range() in * the completion handler. */ unsigned int bio_add_vmalloc_chunk(struct bio *bio, void *vaddr, unsigned len) { unsigned int offset = offset_in_page(vaddr); len = min(len, PAGE_SIZE - offset); if (bio_add_page(bio, vmalloc_to_page(vaddr), len, offset) < len) return 0; if (op_is_write(bio_op(bio))) flush_kernel_vmap_range(vaddr, len); return len; } EXPORT_SYMBOL_GPL(bio_add_vmalloc_chunk); /** * bio_add_vmalloc - add a vmalloc region to a bio * @bio: destination bio * @vaddr: vmalloc address to add * @len: total length in bytes of the data to add * * Add data starting at @vaddr to @bio. Return %true on success or %false if * @bio does not have enough space for the payload. * * This helper calls flush_kernel_vmap_range() for the range added. For reads * the caller still needs to manually call invalidate_kernel_vmap_range() in * the completion handler. */ bool bio_add_vmalloc(struct bio *bio, void *vaddr, unsigned int len) { do { unsigned int added = bio_add_vmalloc_chunk(bio, vaddr, len); if (!added) return false; vaddr += added; len -= added; } while (len); return true; } EXPORT_SYMBOL_GPL(bio_add_vmalloc); void __bio_release_pages(struct bio *bio, bool mark_dirty) { struct folio_iter fi; bio_for_each_folio_all(fi, bio) { size_t nr_pages; if (mark_dirty) { folio_lock(fi.folio); folio_mark_dirty(fi.folio); folio_unlock(fi.folio); } nr_pages = (fi.offset + fi.length - 1) / PAGE_SIZE - fi.offset / PAGE_SIZE + 1; unpin_user_folio(fi.folio, nr_pages); } } EXPORT_SYMBOL_GPL(__bio_release_pages); void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter) { WARN_ON_ONCE(bio->bi_max_vecs); bio->bi_vcnt = iter->nr_segs; bio->bi_io_vec = (struct bio_vec *)iter->bvec; bio->bi_iter.bi_bvec_done = iter->iov_offset; bio->bi_iter.bi_size = iov_iter_count(iter); bio_set_flag(bio, BIO_CLONED); } static unsigned int get_contig_folio_len(unsigned int *num_pages, struct page **pages, unsigned int i, struct folio *folio, size_t left, size_t offset) { size_t bytes = left; size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, bytes); unsigned int j; /* * We might COW a single page in the middle of * a large folio, so we have to check that all * pages belong to the same folio. */ bytes -= contig_sz; for (j = i + 1; j < i + *num_pages; j++) { size_t next = min_t(size_t, PAGE_SIZE, bytes); if (page_folio(pages[j]) != folio || pages[j] != pages[j - 1] + 1) { break; } contig_sz += next; bytes -= next; } *num_pages = j - i; return contig_sz; } #define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *)) /** * __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio * @bio: bio to add pages to * @iter: iov iterator describing the region to be mapped * * Extracts pages from *iter and appends them to @bio's bvec array. The pages * will have to be cleaned up in the way indicated by the BIO_PAGE_PINNED flag. * For a multi-segment *iter, this function only adds pages from the next * non-empty segment of the iov iterator. */ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { iov_iter_extraction_t extraction_flags = 0; unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct page **pages = (struct page **)bv; ssize_t size; unsigned int num_pages, i = 0; size_t offset, folio_offset, left, len; int ret = 0; /* * Move page array up in the allocated memory for the bio vecs as far as * possible so that we can start filling biovecs from the beginning * without overwriting the temporary page array. */ BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue)) extraction_flags |= ITER_ALLOW_P2PDMA; size = iov_iter_extract_pages(iter, &pages, UINT_MAX - bio->bi_iter.bi_size, nr_pages, extraction_flags, &offset); if (unlikely(size <= 0)) return size ? size : -EFAULT; nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); for (left = size, i = 0; left > 0; left -= len, i += num_pages) { struct page *page = pages[i]; struct folio *folio = page_folio(page); unsigned int old_vcnt = bio->bi_vcnt; folio_offset = ((size_t)folio_page_idx(folio, page) << PAGE_SHIFT) + offset; len = min(folio_size(folio) - folio_offset, left); num_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); if (num_pages > 1) len = get_contig_folio_len(&num_pages, pages, i, folio, left, offset); if (!bio_add_folio(bio, folio, len, folio_offset)) { WARN_ON_ONCE(1); ret = -EINVAL; goto out; } if (bio_flagged(bio, BIO_PAGE_PINNED)) { /* * We're adding another fragment of a page that already * was part of the last segment. Undo our pin as the * page was pinned when an earlier fragment of it was * added to the bio and __bio_release_pages expects a * single pin per page. */ if (offset && bio->bi_vcnt == old_vcnt) unpin_user_folio(folio, 1); } offset = 0; } iov_iter_revert(iter, left); out: while (i < nr_pages) bio_release_page(bio, pages[i++]); return ret; } /* * Aligns the bio size to the len_align_mask, releasing excessive bio vecs that * __bio_iov_iter_get_pages may have inserted, and reverts the trimmed length * for the next iteration. */ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter, unsigned len_align_mask) { size_t nbytes = bio->bi_iter.bi_size & len_align_mask; if (!nbytes) return 0; iov_iter_revert(iter, nbytes); bio->bi_iter.bi_size -= nbytes; do { struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; if (nbytes < bv->bv_len) { bv->bv_len -= nbytes; break; } bio_release_page(bio, bv->bv_page); bio->bi_vcnt--; nbytes -= bv->bv_len; } while (nbytes); if (!bio->bi_vcnt) return -EFAULT; return 0; } /** * bio_iov_iter_get_pages - add user or kernel pages to a bio * @bio: bio to add pages to * @iter: iov iterator describing the region to be added * @len_align_mask: the mask to align the total size to, 0 for any length * * This takes either an iterator pointing to user memory, or one pointing to * kernel pages (BVEC iterator). If we're adding user pages, we pin them and * map them into the kernel. On IO completion, the caller should put those * pages. For bvec based iterators bio_iov_iter_get_pages() uses the provided * bvecs rather than copying them. Hence anyone issuing kiocb based IO needs * to ensure the bvecs and pages stay referenced until the submitted I/O is * completed by a call to ->ki_complete() or returns with an error other than * -EIOCBQUEUED. The caller needs to check if the bio is flagged BIO_NO_PAGE_REF * on IO completion. If it isn't, then pages should be released. * * The function tries, but does not guarantee, to pin as many pages as * fit into the bio, or are requested in @iter, whatever is smaller. If * MM encounters an error pinning the requested pages, it stops. Error * is returned only if 0 pages could be pinned. */ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, unsigned len_align_mask) { int ret = 0; if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return -EIO; if (iov_iter_is_bvec(iter)) { bio_iov_bvec_set(bio, iter); iov_iter_advance(iter, bio->bi_iter.bi_size); return 0; } if (iov_iter_extract_will_pin(iter)) bio_set_flag(bio, BIO_PAGE_PINNED); do { ret = __bio_iov_iter_get_pages(bio, iter); } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); if (bio->bi_vcnt) return bio_iov_iter_align_down(bio, iter, len_align_mask); return ret; } static void submit_bio_wait_endio(struct bio *bio) { complete(bio->bi_private); } /** * submit_bio_wait - submit a bio, and wait until it completes * @bio: The &struct bio which describes the I/O * * Simple wrapper around submit_bio(). Returns 0 on success, or the error from * bio_endio() on failure. * * WARNING: Unlike to how submit_bio() is usually used, this function does not * result in bio reference to be consumed. The caller must drop the reference * on his own. */ int submit_bio_wait(struct bio *bio) { DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_bdev->bd_disk->lockdep_map); bio->bi_private = &done; bio->bi_end_io = submit_bio_wait_endio; bio->bi_opf |= REQ_SYNC; submit_bio(bio); blk_wait_io(&done); return blk_status_to_errno(bio->bi_status); } EXPORT_SYMBOL(submit_bio_wait); /** * bdev_rw_virt - synchronously read into / write from kernel mapping * @bdev: block device to access * @sector: sector to access * @data: data to read/write * @len: length in byte to read/write * @op: operation (e.g. REQ_OP_READ/REQ_OP_WRITE) * * Performs synchronous I/O to @bdev for @data/@len. @data must be in * the kernel direct mapping and not a vmalloc address. */ int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data, size_t len, enum req_op op) { struct bio_vec bv; struct bio bio; int error; if (WARN_ON_ONCE(is_vmalloc_addr(data))) return -EIO; bio_init(&bio, bdev, &bv, 1, op); bio.bi_iter.bi_sector = sector; bio_add_virt_nofail(&bio, data, len); error = submit_bio_wait(&bio); bio_uninit(&bio); return error; } EXPORT_SYMBOL_GPL(bdev_rw_virt); static void bio_wait_end_io(struct bio *bio) { complete(bio->bi_private); bio_put(bio); } /* * bio_await_chain - ends @bio and waits for every chained bio to complete */ void bio_await_chain(struct bio *bio) { DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_bdev->bd_disk->lockdep_map); bio->bi_private = &done; bio->bi_end_io = bio_wait_end_io; bio_endio(bio); blk_wait_io(&done); } void __bio_advance(struct bio *bio, unsigned bytes) { if (bio_integrity(bio)) bio_integrity_advance(bio, bytes); bio_crypt_advance(bio, bytes); bio_advance_iter(bio, &bio->bi_iter, bytes); } EXPORT_SYMBOL(__bio_advance); void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter) { while (src_iter->bi_size && dst_iter->bi_size) { struct bio_vec src_bv = bio_iter_iovec(src, *src_iter); struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter); unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len); void *src_buf = bvec_kmap_local(&src_bv); void *dst_buf = bvec_kmap_local(&dst_bv); memcpy(dst_buf, src_buf, bytes); kunmap_local(dst_buf); kunmap_local(src_buf); bio_advance_iter_single(src, src_iter, bytes); bio_advance_iter_single(dst, dst_iter, bytes); } } EXPORT_SYMBOL(bio_copy_data_iter); /** * bio_copy_data - copy contents of data buffers from one bio to another * @src: source bio * @dst: destination bio * * Stops when it reaches the end of either @src or @dst - that is, copies * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). */ void bio_copy_data(struct bio *dst, struct bio *src) { struct bvec_iter src_iter = src->bi_iter; struct bvec_iter dst_iter = dst->bi_iter; bio_copy_data_iter(dst, &dst_iter, src, &src_iter); } EXPORT_SYMBOL(bio_copy_data); void bio_free_pages(struct bio *bio) { struct bio_vec *bvec; struct bvec_iter_all iter_all; bio_for_each_segment_all(bvec, bio, iter_all) __free_page(bvec->bv_page); } EXPORT_SYMBOL(bio_free_pages); /* * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions * for performing direct-IO in BIOs. * * The problem is that we cannot run folio_mark_dirty() from interrupt context * because the required locks are not interrupt-safe. So what we can do is to * mark the pages dirty _before_ performing IO. And in interrupt context, * check that the pages are still dirty. If so, fine. If not, redirty them * in process context. * * Note that this code is very hard to test under normal circumstances because * direct-io pins the pages with get_user_pages(). This makes * is_page_cache_freeable return false, and the VM will not clean the pages. * But other code (eg, flusher threads) could clean the pages if they are mapped * pagecache. * * Simply disabling the call to bio_set_pages_dirty() is a good way to test the * deferred bio dirtying paths. */ /* * bio_set_pages_dirty() will mark all the bio's pages as dirty. */ void bio_set_pages_dirty(struct bio *bio) { struct folio_iter fi; bio_for_each_folio_all(fi, bio) { folio_lock(fi.folio); folio_mark_dirty(fi.folio); folio_unlock(fi.folio); } } EXPORT_SYMBOL_GPL(bio_set_pages_dirty); /* * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. * If they are, then fine. If, however, some pages are clean then they must * have been written out during the direct-IO read. So we take another ref on * the BIO and re-dirty the pages in process context. * * It is expected that bio_check_pages_dirty() will wholly own the BIO from * here on. It will unpin each page and will run one bio_put() against the * BIO. */ static void bio_dirty_fn(struct work_struct *work); static DECLARE_WORK(bio_dirty_work, bio_dirty_fn); static DEFINE_SPINLOCK(bio_dirty_lock); static struct bio *bio_dirty_list; /* * This runs in process context */ static void bio_dirty_fn(struct work_struct *work) { struct bio *bio, *next; spin_lock_irq(&bio_dirty_lock); next = bio_dirty_list; bio_dirty_list = NULL; spin_unlock_irq(&bio_dirty_lock); while ((bio = next) != NULL) { next = bio->bi_private; bio_release_pages(bio, true); bio_put(bio); } } void bio_check_pages_dirty(struct bio *bio) { struct folio_iter fi; unsigned long flags; bio_for_each_folio_all(fi, bio) { if (!folio_test_dirty(fi.folio)) goto defer; } bio_release_pages(bio, false); bio_put(bio); return; defer: spin_lock_irqsave(&bio_dirty_lock, flags); bio->bi_private = bio_dirty_list; bio_dirty_list = bio; spin_unlock_irqrestore(&bio_dirty_lock, flags); schedule_work(&bio_dirty_work); } EXPORT_SYMBOL_GPL(bio_check_pages_dirty); static inline bool bio_remaining_done(struct bio *bio) { /* * If we're not chaining, then ->__bi_remaining is always 1 and * we always end io on the first invocation. */ if (!bio_flagged(bio, BIO_CHAIN)) return true; BUG_ON(atomic_read(&bio->__bi_remaining) <= 0); if (atomic_dec_and_test(&bio->__bi_remaining)) { bio_clear_flag(bio, BIO_CHAIN); return true; } return false; } /** * bio_endio - end I/O on a bio * @bio: bio * * Description: * bio_endio() will end I/O on the whole bio. bio_endio() is the preferred * way to end I/O on a bio. No one should call bi_end_io() directly on a * bio unless they own it and thus know that it has an end_io function. * * bio_endio() can be called several times on a bio that has been chained * using bio_chain(). The ->bi_end_io() function will only be called the * last time. **/ void bio_endio(struct bio *bio) { again: if (!bio_remaining_done(bio)) return; if (!bio_integrity_endio(bio)) return; blk_zone_bio_endio(bio); rq_qos_done_bio(bio); if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio); bio_clear_flag(bio, BIO_TRACE_COMPLETION); } /* * Need to have a real endio function for chained bios, otherwise * various corner cases will break (like stacking block devices that * save/restore bi_end_io) - however, we want to avoid unbounded * recursion and blowing the stack. Tail call optimization would * handle this, but compiling with frame pointers also disables * gcc's sibling call optimization. */ if (bio->bi_end_io == bio_chain_endio) { bio = __bio_chain_endio(bio); goto again; } #ifdef CONFIG_BLK_CGROUP /* * Release cgroup info. We shouldn't have to do this here, but quite * a few callers of bio_init fail to call bio_uninit, so we cover up * for that here at least for now. */ if (bio->bi_blkg) { blkg_put(bio->bi_blkg); bio->bi_blkg = NULL; } #endif if (bio->bi_end_io) bio->bi_end_io(bio); } EXPORT_SYMBOL(bio_endio); /** * bio_split - split a bio * @bio: bio to split * @sectors: number of sectors to split from the front of @bio * @gfp: gfp mask * @bs: bio set to allocate from * * Allocates and returns a new bio which represents @sectors from the start of * @bio, and updates @bio to represent the remaining sectors. * * Unless this is a discard request the newly allocated bio will point * to @bio's bi_io_vec. It is the caller's responsibility to ensure that * neither @bio nor @bs are freed before the split bio. */ struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs) { struct bio *split; if (WARN_ON_ONCE(sectors <= 0)) return ERR_PTR(-EINVAL); if (WARN_ON_ONCE(sectors >= bio_sectors(bio))) return ERR_PTR(-EINVAL); /* Zone append commands cannot be split */ if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND)) return ERR_PTR(-EINVAL); /* atomic writes cannot be split */ if (bio->bi_opf & REQ_ATOMIC) return ERR_PTR(-EINVAL); split = bio_alloc_clone(bio->bi_bdev, bio, gfp, bs); if (!split) return ERR_PTR(-ENOMEM); split->bi_iter.bi_size = sectors << 9; if (bio_integrity(split)) bio_integrity_trim(split); bio_advance(bio, split->bi_iter.bi_size); if (bio_flagged(bio, BIO_TRACE_COMPLETION)) bio_set_flag(split, BIO_TRACE_COMPLETION); return split; } EXPORT_SYMBOL(bio_split); /** * bio_trim - trim a bio * @bio: bio to trim * @offset: number of sectors to trim from the front of @bio * @size: size we want to trim @bio to, in sectors * * This function is typically used for bios that are cloned and submitted * to the underlying device in parts. */ void bio_trim(struct bio *bio, sector_t offset, sector_t size) { /* We should never trim an atomic write */ if (WARN_ON_ONCE(bio->bi_opf & REQ_ATOMIC && size)) return; if (WARN_ON_ONCE(offset > BIO_MAX_SECTORS || size > BIO_MAX_SECTORS || offset + size > bio_sectors(bio))) return; size <<= 9; if (offset == 0 && size == bio->bi_iter.bi_size) return; bio_advance(bio, offset << 9); bio->bi_iter.bi_size = size; if (bio_integrity(bio)) bio_integrity_trim(bio); } EXPORT_SYMBOL_GPL(bio_trim); /* * create memory pools for biovec's in a bio_set. * use the global biovec slabs created for general use. */ int biovec_init_pool(mempool_t *pool, int pool_entries) { struct biovec_slab *bp = bvec_slabs + ARRAY_SIZE(bvec_slabs) - 1; return mempool_init_slab_pool(pool, pool_entries, bp->slab); } /* * bioset_exit - exit a bioset initialized with bioset_init() * * May be called on a zeroed but uninitialized bioset (i.e. allocated with * kzalloc()). */ void bioset_exit(struct bio_set *bs) { bio_alloc_cache_destroy(bs); if (bs->rescue_workqueue) destroy_workqueue(bs->rescue_workqueue); bs->rescue_workqueue = NULL; mempool_exit(&bs->bio_pool); mempool_exit(&bs->bvec_pool); if (bs->bio_slab) bio_put_slab(bs); bs->bio_slab = NULL; } EXPORT_SYMBOL(bioset_exit); /** * bioset_init - Initialize a bio_set * @bs: pool to initialize * @pool_size: Number of bio and bio_vecs to cache in the mempool * @front_pad: Number of bytes to allocate in front of the returned bio * @flags: Flags to modify behavior, currently %BIOSET_NEED_BVECS * and %BIOSET_NEED_RESCUER * * Description: * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller * to ask for a number of bytes to be allocated in front of the bio. * Front pad allocation is useful for embedding the bio inside * another structure, to avoid allocating extra data to go with the bio. * Note that the bio must be embedded at the END of that structure always, * or things will break badly. * If %BIOSET_NEED_BVECS is set in @flags, a separate pool will be allocated * for allocating iovecs. This pool is not needed e.g. for bio_init_clone(). * If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used * to dispatch queued requests when the mempool runs out of space. * */ int bioset_init(struct bio_set *bs, unsigned int pool_size, unsigned int front_pad, int flags) { bs->front_pad = front_pad; if (flags & BIOSET_NEED_BVECS) bs->back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); else bs->back_pad = 0; spin_lock_init(&bs->rescue_lock); bio_list_init(&bs->rescue_list); INIT_WORK(&bs->rescue_work, bio_alloc_rescue); bs->bio_slab = bio_find_or_create_slab(bs); if (!bs->bio_slab) return -ENOMEM; if (mempool_init_slab_pool(&bs->bio_pool, pool_size, bs->bio_slab)) goto bad; if ((flags & BIOSET_NEED_BVECS) && biovec_init_pool(&bs->bvec_pool, pool_size)) goto bad; if (flags & BIOSET_NEED_RESCUER) { bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0); if (!bs->rescue_workqueue) goto bad; } if (flags & BIOSET_PERCPU_CACHE) { bs->cache = alloc_percpu(struct bio_alloc_cache); if (!bs->cache) goto bad; cpuhp_state_add_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead); } return 0; bad: bioset_exit(bs); return -ENOMEM; } EXPORT_SYMBOL(bioset_init); static int __init init_bio(void) { int i; BUILD_BUG_ON(BIO_FLAG_LAST > 8 * sizeof_field(struct bio, bi_flags)); for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) { struct biovec_slab *bvs = bvec_slabs + i; bvs->slab = kmem_cache_create(bvs->name, bvs->nr_vecs * sizeof(struct bio_vec), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); } cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL, bio_cpu_dead); if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE)) panic("bio: can't allocate bios\n"); return 0; } subsys_initcall(init_bio);
42 2 1 7 34 200 2 2 12 3 156 10 162 179 6 3 3 2 2 1 40 2 382 343 39 37 149 19 19 168 168 1 5 1 1 6 6 97 1 42 8 32 2 32 25 2 15 5 10 9 24 24 24 12 3 4 1 3 29 4 25 4 25 25 13 24 25 45 3 33 28 8 8 1 1 1 1 1 1 1 6 1 1 6 6 6 5 5 5 5 5 5 5 5 5 98 493 509 502 221 499 4 342 502 354 15 10 10 543 517 56 20 62 511 10 481 35 35 16 488 31 489 9 38 480 209 369 354 3 478 329 203 3 31 206 482 17 26 5 7 7 9 522 3 22 245 312 2 5 300 5 8 1 1 6 5 5 5 5 5 5 5 4 4 5 7 2 6 5 5 5 5 5 5 5 4 1 4 2 5 1 1 134 134 123 17 125 124 125 102 113 113 113 22 95 95 113 113 113 122 13 27 91 102 102 9 1 1 7 9 7 1 6 6 6 5 1 6 6 6 1 5 6 6 6 146 126 8 12 125 7 5 2 13 1 2 104 2 9 5 8 1 8 127 2 2 3 2 1 5 5 5 69 55 6 51 51 51 51 12 55 55 69 56 5 51 5 8 50 4 40 90 55 11 25 94 5 90 15 21 54 9 81 70 21 90 13 1 12 5 2 4 5 7 10 2 12 12 1 11 5 6 3 8 5 6 28 28 28 3 25 26 2 28 28 40 21 19 2 25 60 60 3 56 17 2 5 5 2 33 33 28 5 11 39 56 25 16 2 2 1 1 2 10 10 1 2 1 4 6 5 2 1 1 1 1 1 1 5 20 2 1 10 7 5 2 1 6 6 7 34 34 34 26 8 2 32 31 1 31 3 29 1 31 38 2 36 1 1 34 36 1 7 7 23 2 2 1 18 4 2 8 8 4 11 1 18 11 11 1 7 3 10 7 3 10 12 12 1 11 11 4 1 1 8 8 1 1 7 3 26 12 14 10 17 9 5 3 8 40 6 34 10 35 3 3 1 28 1 27 25 3 10 5 5 5 12 3 9 1 1 2 10 51 51 1 1 20 29 1 18 28 27 20 47 35 10 2 45 7 25 5 12 3 15 16 21 29 8 15 22 31 4 7 28 13 22 24 11 3 8 1 10 5 19 24 13 1 3 17 7 3 35 48 48 48 16 16 1 15 15 2 8 1 7 2 14 3 2 1 1 7 3 6 4 15 15 15 15 15 69 2 1 16 51 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/namei.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/namei.c * * Copyright (C) 1991, 1992 Linus Torvalds * * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 * Directory entry file type support and forward compatibility hooks * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 * Hash Tree Directory indexing (c) * Daniel Phillips, 2001 * Hash Tree Directory indexing porting * Christopher Li, 2002 * Hash Tree Directory indexing cleanup * Theodore Ts'o, 2002 */ #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/time.h> #include <linux/fcntl.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/bio.h> #include <linux/iversion.h> #include <linux/unicode.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include <trace/events/ext4.h> /* * define how far ahead to read directories while searching them. */ #define NAMEI_RA_CHUNKS 2 #define NAMEI_RA_BLOCKS 4 #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) static struct buffer_head *ext4_append(handle_t *handle, struct inode *inode, ext4_lblk_t *block) { struct ext4_map_blocks map; struct buffer_head *bh; int err; if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb && ((inode->i_size >> 10) >= EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) return ERR_PTR(-ENOSPC); *block = inode->i_size >> inode->i_sb->s_blocksize_bits; map.m_lblk = *block; map.m_len = 1; /* * We're appending new directory block. Make sure the block is not * allocated yet, otherwise we will end up corrupting the * directory. */ err = ext4_map_blocks(NULL, inode, &map, 0); if (err < 0) return ERR_PTR(err); if (err) { EXT4_ERROR_INODE(inode, "Logical block already allocated"); return ERR_PTR(-EFSCORRUPTED); } bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE); if (IS_ERR(bh)) return bh; inode->i_size += inode->i_sb->s_blocksize; EXT4_I(inode)->i_disksize = inode->i_size; err = ext4_mark_inode_dirty(handle, inode); if (err) goto out; BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) goto out; return bh; out: brelse(bh); ext4_std_error(inode->i_sb, err); return ERR_PTR(err); } static int ext4_dx_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent); /* * Hints to ext4_read_dirblock regarding whether we expect a directory * block being read to be an index block, or a block containing * directory entries (and if the latter, whether it was found via a * logical block in an htree index block). This is used to control * what sort of sanity checkinig ext4_read_dirblock() will do on the * directory block read from the storage device. EITHER will means * the caller doesn't know what kind of directory block will be read, * so no specific verification will be done. */ typedef enum { EITHER, INDEX, DIRENT, DIRENT_HTREE } dirblock_type_t; #define ext4_read_dirblock(inode, block, type) \ __ext4_read_dirblock((inode), (block), (type), __func__, __LINE__) static struct buffer_head *__ext4_read_dirblock(struct inode *inode, ext4_lblk_t block, dirblock_type_t type, const char *func, unsigned int line) { struct buffer_head *bh; struct ext4_dir_entry *dirent; int is_dx_block = 0; if (block >= inode->i_size >> inode->i_blkbits) { ext4_error_inode(inode, func, line, block, "Attempting to read directory block (%u) that is past i_size (%llu)", block, inode->i_size); return ERR_PTR(-EFSCORRUPTED); } if (ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_EIO)) bh = ERR_PTR(-EIO); else bh = ext4_bread(NULL, inode, block, 0); if (IS_ERR(bh)) { __ext4_warning(inode->i_sb, func, line, "inode #%lu: lblock %lu: comm %s: " "error %ld reading directory block", inode->i_ino, (unsigned long)block, current->comm, PTR_ERR(bh)); return bh; } /* The first directory block must not be a hole. */ if (!bh && (type == INDEX || type == DIRENT_HTREE || block == 0)) { ext4_error_inode(inode, func, line, block, "Directory hole found for htree %s block %u", (type == INDEX) ? "index" : "leaf", block); return ERR_PTR(-EFSCORRUPTED); } if (!bh) return NULL; dirent = (struct ext4_dir_entry *) bh->b_data; /* Determine whether or not we have an index block */ if (is_dx(inode)) { if (block == 0) is_dx_block = 1; else if (ext4_rec_len_from_disk(dirent->rec_len, inode->i_sb->s_blocksize) == inode->i_sb->s_blocksize) is_dx_block = 1; } if (!is_dx_block && type == INDEX) { ext4_error_inode(inode, func, line, block, "directory leaf block found instead of index block"); brelse(bh); return ERR_PTR(-EFSCORRUPTED); } if (!ext4_has_feature_metadata_csum(inode->i_sb) || buffer_verified(bh)) return bh; /* * An empty leaf block can get mistaken for a index block; for * this reason, we can only check the index checksum when the * caller is sure it should be an index block. */ if (is_dx_block && type == INDEX) { if (ext4_dx_csum_verify(inode, dirent) && !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC)) set_buffer_verified(bh); else { ext4_error_inode_err(inode, func, line, block, EFSBADCRC, "Directory index failed checksum"); brelse(bh); return ERR_PTR(-EFSBADCRC); } } if (!is_dx_block) { if (ext4_dirblock_csum_verify(inode, bh) && !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC)) set_buffer_verified(bh); else { ext4_error_inode_err(inode, func, line, block, EFSBADCRC, "Directory block failed checksum"); brelse(bh); return ERR_PTR(-EFSBADCRC); } } return bh; } #ifdef DX_DEBUG #define dxtrace(command) command #else #define dxtrace(command) #endif struct fake_dirent { __le32 inode; __le16 rec_len; u8 name_len; u8 file_type; }; struct dx_countlimit { __le16 limit; __le16 count; }; struct dx_entry { __le32 hash; __le32 block; }; /* * dx_root_info is laid out so that if it should somehow get overlaid by a * dirent the two low bits of the hash version will be zero. Therefore, the * hash version mod 4 should never be 0. Sincerely, the paranoia department. */ struct dx_root { struct fake_dirent dot; char dot_name[4]; struct fake_dirent dotdot; char dotdot_name[4]; struct dx_root_info { __le32 reserved_zero; u8 hash_version; u8 info_length; /* 8 */ u8 indirect_levels; u8 unused_flags; } info; struct dx_entry entries[]; }; struct dx_node { struct fake_dirent fake; struct dx_entry entries[]; }; struct dx_frame { struct buffer_head *bh; struct dx_entry *entries; struct dx_entry *at; }; struct dx_map_entry { u32 hash; u16 offs; u16 size; }; /* * This goes at the end of each htree block. */ struct dx_tail { u32 dt_reserved; __le32 dt_checksum; /* crc32c(uuid+inum+dirblock) */ }; static struct buffer_head * ext4_dx_find_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir); static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode); /* checksumming functions */ void ext4_initialize_dirent_tail(struct buffer_head *bh, unsigned int blocksize) { struct ext4_dir_entry_tail *t = EXT4_DIRENT_TAIL(bh->b_data, blocksize); memset(t, 0, sizeof(struct ext4_dir_entry_tail)); t->det_rec_len = ext4_rec_len_to_disk( sizeof(struct ext4_dir_entry_tail), blocksize); t->det_reserved_ft = EXT4_FT_DIR_CSUM; } /* Walk through a dirent block to find a checksum "dirent" at the tail */ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode, struct buffer_head *bh) { struct ext4_dir_entry_tail *t; int blocksize = EXT4_BLOCK_SIZE(inode->i_sb); #ifdef PARANOID struct ext4_dir_entry *d, *top; d = (struct ext4_dir_entry *)bh->b_data; top = (struct ext4_dir_entry *)(bh->b_data + (blocksize - sizeof(struct ext4_dir_entry_tail))); while (d < top && ext4_rec_len_from_disk(d->rec_len, blocksize)) d = (struct ext4_dir_entry *)(((void *)d) + ext4_rec_len_from_disk(d->rec_len, blocksize)); if (d != top) return NULL; t = (struct ext4_dir_entry_tail *)d; #else t = EXT4_DIRENT_TAIL(bh->b_data, EXT4_BLOCK_SIZE(inode->i_sb)); #endif if (t->det_reserved_zero1 || (ext4_rec_len_from_disk(t->det_rec_len, blocksize) != sizeof(struct ext4_dir_entry_tail)) || t->det_reserved_zero2 || t->det_reserved_ft != EXT4_FT_DIR_CSUM) return NULL; return t; } static __le32 ext4_dirblock_csum(struct inode *inode, void *dirent, int size) { struct ext4_inode_info *ei = EXT4_I(inode); __u32 csum; csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size); return cpu_to_le32(csum); } #define warn_no_space_for_csum(inode) \ __warn_no_space_for_csum((inode), __func__, __LINE__) static void __warn_no_space_for_csum(struct inode *inode, const char *func, unsigned int line) { __ext4_warning_inode(inode, func, line, "No space for directory leaf checksum. Please run e2fsck -D."); } int ext4_dirblock_csum_verify(struct inode *inode, struct buffer_head *bh) { struct ext4_dir_entry_tail *t; if (!ext4_has_feature_metadata_csum(inode->i_sb)) return 1; t = get_dirent_tail(inode, bh); if (!t) { warn_no_space_for_csum(inode); return 0; } if (t->det_checksum != ext4_dirblock_csum(inode, bh->b_data, (char *)t - bh->b_data)) return 0; return 1; } static void ext4_dirblock_csum_set(struct inode *inode, struct buffer_head *bh) { struct ext4_dir_entry_tail *t; if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; t = get_dirent_tail(inode, bh); if (!t) { warn_no_space_for_csum(inode); return; } t->det_checksum = ext4_dirblock_csum(inode, bh->b_data, (char *)t - bh->b_data); } int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, struct buffer_head *bh) { ext4_dirblock_csum_set(inode, bh); return ext4_handle_dirty_metadata(handle, inode, bh); } static struct dx_countlimit *get_dx_countlimit(struct inode *inode, struct ext4_dir_entry *dirent, int *offset) { struct ext4_dir_entry *dp; struct dx_root_info *root; int count_offset; int blocksize = EXT4_BLOCK_SIZE(inode->i_sb); unsigned int rlen = ext4_rec_len_from_disk(dirent->rec_len, blocksize); if (rlen == blocksize) count_offset = 8; else if (rlen == 12) { dp = (struct ext4_dir_entry *)(((void *)dirent) + 12); if (ext4_rec_len_from_disk(dp->rec_len, blocksize) != blocksize - 12) return NULL; root = (struct dx_root_info *)(((void *)dp + 12)); if (root->reserved_zero || root->info_length != sizeof(struct dx_root_info)) return NULL; count_offset = 32; } else return NULL; if (offset) *offset = count_offset; return (struct dx_countlimit *)(((void *)dirent) + count_offset); } static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, int count_offset, int count, struct dx_tail *t) { struct ext4_inode_info *ei = EXT4_I(inode); __u32 csum; int size; __u32 dummy_csum = 0; int offset = offsetof(struct dx_tail, dt_checksum); size = count_offset + (count * sizeof(struct dx_entry)); csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size); csum = ext4_chksum(csum, (__u8 *)t, offset); csum = ext4_chksum(csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); return cpu_to_le32(csum); } static int ext4_dx_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) { struct dx_countlimit *c; struct dx_tail *t; int count_offset, limit, count; if (!ext4_has_feature_metadata_csum(inode->i_sb)) return 1; c = get_dx_countlimit(inode, dirent, &count_offset); if (!c) { EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D."); return 0; } limit = le16_to_cpu(c->limit); count = le16_to_cpu(c->count); if (count_offset + (limit * sizeof(struct dx_entry)) > EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { warn_no_space_for_csum(inode); return 0; } t = (struct dx_tail *)(((struct dx_entry *)c) + limit); if (t->dt_checksum != ext4_dx_csum(inode, dirent, count_offset, count, t)) return 0; return 1; } static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent) { struct dx_countlimit *c; struct dx_tail *t; int count_offset, limit, count; if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; c = get_dx_countlimit(inode, dirent, &count_offset); if (!c) { EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D."); return; } limit = le16_to_cpu(c->limit); count = le16_to_cpu(c->count); if (count_offset + (limit * sizeof(struct dx_entry)) > EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { warn_no_space_for_csum(inode); return; } t = (struct dx_tail *)(((struct dx_entry *)c) + limit); t->dt_checksum = ext4_dx_csum(inode, dirent, count_offset, count, t); } static inline int ext4_handle_dirty_dx_node(handle_t *handle, struct inode *inode, struct buffer_head *bh) { ext4_dx_csum_set(inode, (struct ext4_dir_entry *)bh->b_data); return ext4_handle_dirty_metadata(handle, inode, bh); } /* * p is at least 6 bytes before the end of page */ static inline struct ext4_dir_entry_2 * ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize) { return (struct ext4_dir_entry_2 *)((char *)p + ext4_rec_len_from_disk(p->rec_len, blocksize)); } /* * Future: use high four bits of block for coalesce-on-delete flags * Mask them off for now. */ static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) { return le32_to_cpu(entry->block) & 0x0fffffff; } static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) { entry->block = cpu_to_le32(value); } static inline unsigned dx_get_hash(struct dx_entry *entry) { return le32_to_cpu(entry->hash); } static inline void dx_set_hash(struct dx_entry *entry, unsigned value) { entry->hash = cpu_to_le32(value); } static inline unsigned dx_get_count(struct dx_entry *entries) { return le16_to_cpu(((struct dx_countlimit *) entries)->count); } static inline unsigned dx_get_limit(struct dx_entry *entries) { return le16_to_cpu(((struct dx_countlimit *) entries)->limit); } static inline void dx_set_count(struct dx_entry *entries, unsigned value) { ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); } static inline void dx_set_limit(struct dx_entry *entries, unsigned value) { ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); } static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) { unsigned int entry_space = dir->i_sb->s_blocksize - ext4_dir_rec_len(1, NULL) - ext4_dir_rec_len(2, NULL) - infosize; if (ext4_has_feature_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } static inline unsigned dx_node_limit(struct inode *dir) { unsigned int entry_space = dir->i_sb->s_blocksize - ext4_dir_rec_len(0, dir); if (ext4_has_feature_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } /* * Debug */ #ifdef DX_DEBUG static void dx_show_index(char * label, struct dx_entry *entries) { int i, n = dx_get_count (entries); printk(KERN_DEBUG "%s index", label); for (i = 0; i < n; i++) { printk(KERN_CONT " %x->%lu", i ? dx_get_hash(entries + i) : 0, (unsigned long)dx_get_block(entries + i)); } printk(KERN_CONT "\n"); } struct stats { unsigned names; unsigned space; unsigned bcount; }; static struct stats dx_show_leaf(struct inode *dir, struct dx_hash_info *hinfo, struct ext4_dir_entry_2 *de, int size, int show_names) { unsigned names = 0, space = 0; char *base = (char *) de; struct dx_hash_info h = *hinfo; printk("names: "); while ((char *) de < base + size) { if (de->inode) { if (show_names) { #ifdef CONFIG_FS_ENCRYPTION int len; char *name; struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0); int res = 0; name = de->name; len = de->name_len; if (!IS_ENCRYPTED(dir)) { /* Directory is not encrypted */ (void) ext4fs_dirhash(dir, de->name, de->name_len, &h); printk("%*.s:(U)%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); } else { struct fscrypt_str de_name = FSTR_INIT(name, len); /* Directory is encrypted */ res = fscrypt_fname_alloc_buffer( len, &fname_crypto_str); if (res) printk(KERN_WARNING "Error " "allocating crypto " "buffer--skipping " "crypto\n"); res = fscrypt_fname_disk_to_usr(dir, 0, 0, &de_name, &fname_crypto_str); if (res) { printk(KERN_WARNING "Error " "converting filename " "from disk to usr" "\n"); name = "??"; len = 2; } else { name = fname_crypto_str.name; len = fname_crypto_str.len; } if (IS_CASEFOLDED(dir)) h.hash = EXT4_DIRENT_HASH(de); else (void) ext4fs_dirhash(dir, de->name, de->name_len, &h); printk("%*.s:(E)%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); fscrypt_fname_free_buffer( &fname_crypto_str); } #else int len = de->name_len; char *name = de->name; (void) ext4fs_dirhash(dir, de->name, de->name_len, &h); printk("%*.s:%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); #endif } space += ext4_dir_rec_len(de->name_len, dir); names++; } de = ext4_next_entry(de, size); } printk(KERN_CONT "(%i)\n", names); return (struct stats) { names, space, 1 }; } struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, struct dx_entry *entries, int levels) { unsigned blocksize = dir->i_sb->s_blocksize; unsigned count = dx_get_count(entries), names = 0, space = 0, i; unsigned bcount = 0; struct buffer_head *bh; printk("%i indexed blocks...\n", count); for (i = 0; i < count; i++, entries++) { ext4_lblk_t block = dx_get_block(entries); ext4_lblk_t hash = i ? dx_get_hash(entries): 0; u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; struct stats stats; printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); bh = ext4_bread(NULL,dir, block, 0); if (!bh || IS_ERR(bh)) continue; stats = levels? dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0); names += stats.names; space += stats.space; bcount += stats.bcount; brelse(bh); } if (bcount) printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", levels ? "" : " ", names, space/bcount, (space/bcount)*100/blocksize); return (struct stats) { names, space, bcount}; } /* * Linear search cross check */ static inline void htree_rep_invariant_check(struct dx_entry *at, struct dx_entry *target, u32 hash, unsigned int n) { while (n--) { dxtrace(printk(KERN_CONT ",")); if (dx_get_hash(++at) > hash) { at--; break; } } ASSERT(at == target - 1); } #else /* DX_DEBUG */ static inline void htree_rep_invariant_check(struct dx_entry *at, struct dx_entry *target, u32 hash, unsigned int n) { } #endif /* DX_DEBUG */ /* * Probe for a directory leaf block to search. * * dx_probe can return ERR_BAD_DX_DIR, which means there was a format * error in the directory index, and the caller should fall back to * searching the directory normally. The callers of dx_probe **MUST** * check for this error code, and make sure it never gets reflected * back to userspace. */ static struct dx_frame * dx_probe(struct ext4_filename *fname, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame_in) { unsigned count, indirect, level, i; struct dx_entry *at, *entries, *p, *q, *m; struct dx_root *root; struct dx_frame *frame = frame_in; struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); u32 hash; ext4_lblk_t block; ext4_lblk_t blocks[EXT4_HTREE_LEVEL]; memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); frame->bh = ext4_read_dirblock(dir, 0, INDEX); if (IS_ERR(frame->bh)) return (struct dx_frame *) frame->bh; root = (struct dx_root *) frame->bh->b_data; if (root->info.hash_version != DX_HASH_TEA && root->info.hash_version != DX_HASH_HALF_MD4 && root->info.hash_version != DX_HASH_LEGACY && root->info.hash_version != DX_HASH_SIPHASH) { ext4_warning_inode(dir, "Unrecognised inode hash code %u", root->info.hash_version); goto fail; } if (ext4_hash_in_dirent(dir)) { if (root->info.hash_version != DX_HASH_SIPHASH) { ext4_warning_inode(dir, "Hash in dirent, but hash is not SIPHASH"); goto fail; } } else { if (root->info.hash_version == DX_HASH_SIPHASH) { ext4_warning_inode(dir, "Hash code is SIPHASH, but hash not in dirent"); goto fail; } } if (fname) hinfo = &fname->hinfo; hinfo->hash_version = root->info.hash_version; if (hinfo->hash_version <= DX_HASH_TEA) hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; /* hash is already computed for encrypted casefolded directory */ if (fname && fname_name(fname) && !(IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir))) { int ret = ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), hinfo); if (ret < 0) { ret_err = ERR_PTR(ret); goto fail; } } hash = hinfo->hash; if (root->info.unused_flags & 1) { ext4_warning_inode(dir, "Unimplemented hash flags: %#06x", root->info.unused_flags); goto fail; } indirect = root->info.indirect_levels; if (indirect >= ext4_dir_htree_level(dir->i_sb)) { ext4_warning(dir->i_sb, "Directory (ino: %lu) htree depth %#06x exceed" "supported value", dir->i_ino, ext4_dir_htree_level(dir->i_sb)); if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) { ext4_warning(dir->i_sb, "Enable large directory " "feature to access it"); } goto fail; } entries = (struct dx_entry *)(((char *)&root->info) + root->info.info_length); if (dx_get_limit(entries) != dx_root_limit(dir, root->info.info_length)) { ext4_warning_inode(dir, "dx entry: limit %u != root limit %u", dx_get_limit(entries), dx_root_limit(dir, root->info.info_length)); goto fail; } dxtrace(printk("Look up %x", hash)); level = 0; blocks[0] = 0; while (1) { count = dx_get_count(entries); if (!count || count > dx_get_limit(entries)) { ext4_warning_inode(dir, "dx entry: count %u beyond limit %u", count, dx_get_limit(entries)); goto fail; } p = entries + 1; q = entries + count - 1; while (p <= q) { m = p + (q - p) / 2; dxtrace(printk(KERN_CONT ".")); if (dx_get_hash(m) > hash) q = m - 1; else p = m + 1; } htree_rep_invariant_check(entries, p, hash, count - 1); at = p - 1; dxtrace(printk(KERN_CONT " %x->%u\n", at == entries ? 0 : dx_get_hash(at), dx_get_block(at))); frame->entries = entries; frame->at = at; block = dx_get_block(at); for (i = 0; i <= level; i++) { if (blocks[i] == block) { ext4_warning_inode(dir, "dx entry: tree cycle block %u points back to block %u", blocks[level], block); goto fail; } } if (++level > indirect) return frame; blocks[level] = block; frame++; frame->bh = ext4_read_dirblock(dir, block, INDEX); if (IS_ERR(frame->bh)) { ret_err = (struct dx_frame *) frame->bh; frame->bh = NULL; goto fail; } entries = ((struct dx_node *) frame->bh->b_data)->entries; if (dx_get_limit(entries) != dx_node_limit(dir)) { ext4_warning_inode(dir, "dx entry: limit %u != node limit %u", dx_get_limit(entries), dx_node_limit(dir)); goto fail; } } fail: while (frame >= frame_in) { brelse(frame->bh); frame--; } if (ret_err == ERR_PTR(ERR_BAD_DX_DIR)) ext4_warning_inode(dir, "Corrupt directory, running e2fsck is recommended"); return ret_err; } static void dx_release(struct dx_frame *frames) { struct dx_root_info *info; int i; unsigned int indirect_levels; if (frames[0].bh == NULL) return; info = &((struct dx_root *)frames[0].bh->b_data)->info; /* save local copy, "info" may be freed after brelse() */ indirect_levels = info->indirect_levels; for (i = 0; i <= indirect_levels; i++) { if (frames[i].bh == NULL) break; brelse(frames[i].bh); frames[i].bh = NULL; } } /* * This function increments the frame pointer to search the next leaf * block, and reads in the necessary intervening nodes if the search * should be necessary. Whether or not the search is necessary is * controlled by the hash parameter. If the hash value is even, then * the search is only continued if the next block starts with that * hash value. This is used if we are searching for a specific file. * * If the hash value is HASH_NB_ALWAYS, then always go to the next block. * * This function returns 1 if the caller should continue to search, * or 0 if it should not. If there is an error reading one of the * index blocks, it will a negative error code. * * If start_hash is non-null, it will be filled in with the starting * hash of the next page. */ static int ext4_htree_next_block(struct inode *dir, __u32 hash, struct dx_frame *frame, struct dx_frame *frames, __u32 *start_hash) { struct dx_frame *p; struct buffer_head *bh; int num_frames = 0; __u32 bhash; p = frame; /* * Find the next leaf page by incrementing the frame pointer. * If we run out of entries in the interior node, loop around and * increment pointer in the parent node. When we break out of * this loop, num_frames indicates the number of interior * nodes need to be read. */ while (1) { if (++(p->at) < p->entries + dx_get_count(p->entries)) break; if (p == frames) return 0; num_frames++; p--; } /* * If the hash is 1, then continue only if the next page has a * continuation hash of any value. This is used for readdir * handling. Otherwise, check to see if the hash matches the * desired continuation hash. If it doesn't, return since * there's no point to read in the successive index pages. */ bhash = dx_get_hash(p->at); if (start_hash) *start_hash = bhash; if ((hash & 1) == 0) { if ((bhash & ~1) != hash) return 0; } /* * If the hash is HASH_NB_ALWAYS, we always go to the next * block so no check is necessary */ while (num_frames--) { bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX); if (IS_ERR(bh)) return PTR_ERR(bh); p++; brelse(p->bh); p->bh = bh; p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; } return 1; } /* * This function fills a red-black tree with information from a * directory block. It returns the number directory entries loaded * into the tree. If there is an error it is returned in err. */ static int htree_dirblock_to_tree(struct file *dir_file, struct inode *dir, ext4_lblk_t block, struct dx_hash_info *hinfo, __u32 start_hash, __u32 start_minor_hash) { struct buffer_head *bh; struct ext4_dir_entry_2 *de, *top; int err = 0, count = 0; struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0), tmp_str; int csum = ext4_has_feature_metadata_csum(dir->i_sb); dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", (unsigned long)block)); bh = ext4_read_dirblock(dir, block, DIRENT_HTREE); if (IS_ERR(bh)) return PTR_ERR(bh); de = (struct ext4_dir_entry_2 *) bh->b_data; /* csum entries are not larger in the casefolded encrypted case */ top = (struct ext4_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - ext4_dir_rec_len(0, csum ? NULL : dir)); /* Check if the directory is encrypted */ if (IS_ENCRYPTED(dir)) { err = fscrypt_prepare_readdir(dir); if (err < 0) { brelse(bh); return err; } err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, &fname_crypto_str); if (err < 0) { brelse(bh); return err; } } for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, bh->b_size, EXT4_LBLK_TO_B(dir, block) + ((char *)de - bh->b_data))) { /* silently ignore the rest of the block */ break; } if (ext4_hash_in_dirent(dir)) { if (de->name_len && de->inode) { hinfo->hash = EXT4_DIRENT_HASH(de); hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de); } else { hinfo->hash = 0; hinfo->minor_hash = 0; } } else { err = ext4fs_dirhash(dir, de->name, de->name_len, hinfo); if (err < 0) { count = err; goto errout; } } if ((hinfo->hash < start_hash) || ((hinfo->hash == start_hash) && (hinfo->minor_hash < start_minor_hash))) continue; if (de->inode == 0) continue; if (!IS_ENCRYPTED(dir)) { tmp_str.name = de->name; tmp_str.len = de->name_len; err = ext4_htree_store_dirent(dir_file, hinfo->hash, hinfo->minor_hash, de, &tmp_str); } else { int save_len = fname_crypto_str.len; struct fscrypt_str de_name = FSTR_INIT(de->name, de->name_len); /* Directory is encrypted */ err = fscrypt_fname_disk_to_usr(dir, hinfo->hash, hinfo->minor_hash, &de_name, &fname_crypto_str); if (err) { count = err; goto errout; } err = ext4_htree_store_dirent(dir_file, hinfo->hash, hinfo->minor_hash, de, &fname_crypto_str); fname_crypto_str.len = save_len; } if (err != 0) { count = err; goto errout; } count++; } errout: brelse(bh); fscrypt_fname_free_buffer(&fname_crypto_str); return count; } /* * This function fills a red-black tree with information from a * directory. We start scanning the directory in hash order, starting * at start_hash and start_minor_hash. * * This function returns the number of entries inserted into the tree, * or a negative error code. */ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, __u32 *next_hash) { struct dx_hash_info hinfo; struct ext4_dir_entry_2 *de; struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct inode *dir; ext4_lblk_t block; int count = 0; int ret, err; __u32 hashval; struct fscrypt_str tmp_str; dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", start_hash, start_minor_hash)); dir = file_inode(dir_file); if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) { if (ext4_hash_in_dirent(dir)) hinfo.hash_version = DX_HASH_SIPHASH; else hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; if (hinfo.hash_version <= DX_HASH_TEA) hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; if (ext4_has_inline_data(dir)) { int has_inline_data = 1; count = ext4_inlinedir_to_tree(dir_file, dir, 0, &hinfo, start_hash, start_minor_hash, &has_inline_data); if (has_inline_data) { *next_hash = ~0; return count; } } count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, start_hash, start_minor_hash); *next_hash = ~0; return count; } hinfo.hash = start_hash; hinfo.minor_hash = 0; frame = dx_probe(NULL, dir, &hinfo, frames); if (IS_ERR(frame)) return PTR_ERR(frame); /* Add '.' and '..' from the htree header */ if (!start_hash && !start_minor_hash) { de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; tmp_str.name = de->name; tmp_str.len = de->name_len; err = ext4_htree_store_dirent(dir_file, 0, 0, de, &tmp_str); if (err != 0) goto errout; count++; } if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; de = ext4_next_entry(de, dir->i_sb->s_blocksize); tmp_str.name = de->name; tmp_str.len = de->name_len; err = ext4_htree_store_dirent(dir_file, 2, 0, de, &tmp_str); if (err != 0) goto errout; count++; } while (1) { if (fatal_signal_pending(current)) { err = -ERESTARTSYS; goto errout; } cond_resched(); block = dx_get_block(frame->at); ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, start_hash, start_minor_hash); if (ret < 0) { err = ret; goto errout; } count += ret; hashval = ~0; ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS, frame, frames, &hashval); *next_hash = hashval; if (ret < 0) { err = ret; goto errout; } /* * Stop if: (a) there are no more entries, or * (b) we have inserted at least one entry and the * next hash value is not a continuation */ if ((ret == 0) || (count && ((hashval & 1) == 0))) break; } dx_release(frames); dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, " "next hash: %x\n", count, *next_hash)); return count; errout: dx_release(frames); return (err); } static inline int search_dirblock(struct buffer_head *bh, struct inode *dir, struct ext4_filename *fname, unsigned int offset, struct ext4_dir_entry_2 **res_dir) { return ext4_search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir, fname, offset, res_dir); } /* * Directory block splitting, compacting */ /* * Create map of hash values, offsets, and sizes, stored at end of block. * Returns number of entries mapped. */ static int dx_make_map(struct inode *dir, struct buffer_head *bh, struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) { int count = 0; struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)bh->b_data; unsigned int buflen = bh->b_size; char *base = bh->b_data; struct dx_hash_info h = *hinfo; int blocksize = EXT4_BLOCK_SIZE(dir->i_sb); if (ext4_has_feature_metadata_csum(dir->i_sb)) buflen -= sizeof(struct ext4_dir_entry_tail); while ((char *) de < base + buflen) { if (ext4_check_dir_entry(dir, NULL, de, bh, base, buflen, ((char *)de) - base)) return -EFSCORRUPTED; if (de->name_len && de->inode) { if (ext4_hash_in_dirent(dir)) h.hash = EXT4_DIRENT_HASH(de); else { int err = ext4fs_dirhash(dir, de->name, de->name_len, &h); if (err < 0) return err; } map_tail--; map_tail->hash = h.hash; map_tail->offs = ((char *) de - base)>>2; map_tail->size = ext4_rec_len_from_disk(de->rec_len, blocksize); count++; cond_resched(); } de = ext4_next_entry(de, blocksize); } return count; } /* Sort map by hash value */ static void dx_sort_map (struct dx_map_entry *map, unsigned count) { struct dx_map_entry *p, *q, *top = map + count - 1; int more; /* Combsort until bubble sort doesn't suck */ while (count > 2) { count = count*10/13; if (count - 9 < 2) /* 9, 10 -> 11 */ count = 11; for (p = top, q = p - count; q >= map; p--, q--) if (p->hash < q->hash) swap(*p, *q); } /* Garden variety bubble sort */ do { more = 0; q = top; while (q-- > map) { if (q[1].hash >= q[0].hash) continue; swap(*(q+1), *q); more = 1; } } while(more); } static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) { struct dx_entry *entries = frame->entries; struct dx_entry *old = frame->at, *new = old + 1; int count = dx_get_count(entries); ASSERT(count < dx_get_limit(entries)); ASSERT(old < entries + count); memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); dx_set_hash(new, hash); dx_set_block(new, block); dx_set_count(entries, count + 1); } #if IS_ENABLED(CONFIG_UNICODE) int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, struct ext4_filename *name) { struct qstr *cf_name = &name->cf_name; unsigned char *buf; struct dx_hash_info *hinfo = &name->hinfo; int len; if (!IS_CASEFOLDED(dir) || (IS_ENCRYPTED(dir) && !fscrypt_has_encryption_key(dir))) { cf_name->name = NULL; return 0; } buf = kmalloc(EXT4_NAME_LEN, GFP_NOFS); if (!buf) return -ENOMEM; len = utf8_casefold(dir->i_sb->s_encoding, iname, buf, EXT4_NAME_LEN); if (len <= 0) { kfree(buf); buf = NULL; } cf_name->name = buf; cf_name->len = (unsigned) len; if (!IS_ENCRYPTED(dir)) return 0; hinfo->hash_version = DX_HASH_SIPHASH; hinfo->seed = NULL; if (cf_name->name) return ext4fs_dirhash(dir, cf_name->name, cf_name->len, hinfo); else return ext4fs_dirhash(dir, iname->name, iname->len, hinfo); } #endif /* * Test whether a directory entry matches the filename being searched for. * * Return: %true if the directory entry matches, otherwise %false. */ static bool ext4_match(struct inode *parent, const struct ext4_filename *fname, struct ext4_dir_entry_2 *de) { struct fscrypt_name f; if (!de->inode) return false; f.usr_fname = fname->usr_fname; f.disk_name = fname->disk_name; #ifdef CONFIG_FS_ENCRYPTION f.crypto_buf = fname->crypto_buf; #endif #if IS_ENABLED(CONFIG_UNICODE) if (IS_CASEFOLDED(parent) && (!IS_ENCRYPTED(parent) || fscrypt_has_encryption_key(parent))) { /* * Just checking IS_ENCRYPTED(parent) below is not * sufficient to decide whether one can use the hash for * skipping the string comparison, because the key might * have been added right after * ext4_fname_setup_ci_filename(). In this case, a hash * mismatch will be a false negative. Therefore, make * sure cf_name was properly initialized before * considering the calculated hash. */ if (sb_no_casefold_compat_fallback(parent->i_sb) && IS_ENCRYPTED(parent) && fname->cf_name.name && (fname->hinfo.hash != EXT4_DIRENT_HASH(de) || fname->hinfo.minor_hash != EXT4_DIRENT_MINOR_HASH(de))) return false; /* * Treat comparison errors as not a match. The * only case where it happens is on a disk * corruption or ENOMEM. */ return generic_ci_match(parent, fname->usr_fname, &fname->cf_name, de->name, de->name_len) > 0; } #endif return fscrypt_match_name(&f, de->name, de->name_len); } /* * Returns 0 if not found, -EFSCORRUPTED on failure, and 1 on success */ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, struct inode *dir, struct ext4_filename *fname, unsigned int offset, struct ext4_dir_entry_2 **res_dir) { struct ext4_dir_entry_2 * de; char * dlimit; int de_len; de = (struct ext4_dir_entry_2 *)search_buf; dlimit = search_buf + buf_size; while ((char *) de < dlimit - EXT4_BASE_DIR_LEN) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ if (de->name + de->name_len <= dlimit && ext4_match(dir, fname, de)) { /* found a match - just to be sure, do * a full check */ if (ext4_check_dir_entry(dir, NULL, de, bh, search_buf, buf_size, offset)) return -EFSCORRUPTED; *res_dir = de; return 1; } /* prevent looping on a bad block */ de_len = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); if (de_len <= 0) return -EFSCORRUPTED; offset += de_len; de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); } return 0; } static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, struct ext4_dir_entry *de) { struct super_block *sb = dir->i_sb; if (!is_dx(dir)) return 0; if (block == 0) return 1; if (de->inode == 0 && ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) == sb->s_blocksize) return 1; return 0; } /* * __ext4_find_entry() * * finds an entry in the specified directory with the wanted name. It * returns the cache buffer in which the entry was found, and the entry * itself (as a parameter - res_dir). It does NOT read the inode of the * entry - you'll have to do that yourself if you want to. * * The returned buffer_head has ->b_count elevated. The caller is expected * to brelse() it when appropriate. */ static struct buffer_head *__ext4_find_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir, int *inlined) { struct super_block *sb; struct buffer_head *bh_use[NAMEI_RA_SIZE]; struct buffer_head *bh, *ret = NULL; ext4_lblk_t start, block; const u8 *name = fname->usr_fname->name; size_t ra_max = 0; /* Number of bh's in the readahead buffer, bh_use[] */ size_t ra_ptr = 0; /* Current index into readahead buffer */ ext4_lblk_t nblocks; int i, namelen, retval; *res_dir = NULL; sb = dir->i_sb; namelen = fname->usr_fname->len; if (namelen > EXT4_NAME_LEN) return NULL; if (ext4_has_inline_data(dir)) { int has_inline_data = 1; ret = ext4_find_inline_entry(dir, fname, res_dir, &has_inline_data); if (inlined) *inlined = has_inline_data; if (has_inline_data || IS_ERR(ret)) goto cleanup_and_exit; } if ((namelen <= 2) && (name[0] == '.') && (name[1] == '.' || name[1] == '\0')) { /* * "." or ".." will only be in the first block * NFS may look up ".."; "." should be handled by the VFS */ block = start = 0; nblocks = 1; goto restart; } if (is_dx(dir)) { ret = ext4_dx_find_entry(dir, fname, res_dir); /* * On success, or if the error was file not found, * return. Otherwise, fall back to doing a search the * old fashioned way. */ if (IS_ERR(ret) && PTR_ERR(ret) == ERR_BAD_DX_DIR) dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " "falling back\n")); else if (!sb_no_casefold_compat_fallback(dir->i_sb) && *res_dir == NULL && IS_CASEFOLDED(dir)) dxtrace(printk(KERN_DEBUG "ext4_find_entry: casefold " "failed, falling back\n")); else goto cleanup_and_exit; ret = NULL; } nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); if (!nblocks) { ret = NULL; goto cleanup_and_exit; } start = EXT4_I(dir)->i_dir_start_lookup; if (start >= nblocks) start = 0; block = start; restart: do { /* * We deal with the read-ahead logic here. */ cond_resched(); if (ra_ptr >= ra_max) { /* Refill the readahead buffer */ ra_ptr = 0; if (block < start) ra_max = start - block; else ra_max = nblocks - block; ra_max = min(ra_max, ARRAY_SIZE(bh_use)); retval = ext4_bread_batch(dir, block, ra_max, false /* wait */, bh_use); if (retval) { ret = ERR_PTR(retval); ra_max = 0; goto cleanup_and_exit; } } if ((bh = bh_use[ra_ptr++]) == NULL) goto next; wait_on_buffer(bh); if (!buffer_uptodate(bh)) { EXT4_ERROR_INODE_ERR(dir, EIO, "reading directory lblock %lu", (unsigned long) block); brelse(bh); ret = ERR_PTR(-EIO); goto cleanup_and_exit; } if (!buffer_verified(bh) && !is_dx_internal_node(dir, block, (struct ext4_dir_entry *)bh->b_data) && !ext4_dirblock_csum_verify(dir, bh)) { EXT4_ERROR_INODE_ERR(dir, EFSBADCRC, "checksumming directory " "block %lu", (unsigned long)block); brelse(bh); ret = ERR_PTR(-EFSBADCRC); goto cleanup_and_exit; } set_buffer_verified(bh); i = search_dirblock(bh, dir, fname, EXT4_LBLK_TO_B(dir, block), res_dir); if (i == 1) { EXT4_I(dir)->i_dir_start_lookup = block; ret = bh; goto cleanup_and_exit; } else { brelse(bh); if (i < 0) { ret = ERR_PTR(i); goto cleanup_and_exit; } } next: if (++block >= nblocks) block = 0; } while (block != start); /* * If the directory has grown while we were searching, then * search the last part of the directory before giving up. */ block = nblocks; nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); if (block < nblocks) { start = 0; goto restart; } cleanup_and_exit: /* Clean up the read-ahead blocks */ for (; ra_ptr < ra_max; ra_ptr++) brelse(bh_use[ra_ptr]); return ret; } static struct buffer_head *ext4_find_entry(struct inode *dir, const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, int *inlined) { int err; struct ext4_filename fname; struct buffer_head *bh; err = ext4_fname_setup_filename(dir, d_name, 1, &fname); if (err == -ENOENT) return NULL; if (err) return ERR_PTR(err); bh = __ext4_find_entry(dir, &fname, res_dir, inlined); ext4_fname_free_filename(&fname); return bh; } static struct buffer_head *ext4_lookup_entry(struct inode *dir, struct dentry *dentry, struct ext4_dir_entry_2 **res_dir) { int err; struct ext4_filename fname; struct buffer_head *bh; err = ext4_fname_prepare_lookup(dir, dentry, &fname); if (err == -ENOENT) return NULL; if (err) return ERR_PTR(err); bh = __ext4_find_entry(dir, &fname, res_dir, NULL); ext4_fname_free_filename(&fname); return bh; } static struct buffer_head * ext4_dx_find_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir) { struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct buffer_head *bh; ext4_lblk_t block; int retval; #ifdef CONFIG_FS_ENCRYPTION *res_dir = NULL; #endif frame = dx_probe(fname, dir, NULL, frames); if (IS_ERR(frame)) return ERR_CAST(frame); do { block = dx_get_block(frame->at); bh = ext4_read_dirblock(dir, block, DIRENT_HTREE); if (IS_ERR(bh)) goto errout; retval = search_dirblock(bh, dir, fname, EXT4_LBLK_TO_B(dir, block), res_dir); if (retval == 1) goto success; brelse(bh); if (retval < 0) { bh = ERR_PTR(ERR_BAD_DX_DIR); goto errout; } /* Check to see if we should continue to search */ retval = ext4_htree_next_block(dir, fname->hinfo.hash, frame, frames, NULL); if (retval < 0) { ext4_warning_inode(dir, "error %d reading directory index block", retval); bh = ERR_PTR(retval); goto errout; } } while (retval == 1); bh = NULL; errout: dxtrace(printk(KERN_DEBUG "%s not found\n", fname->usr_fname->name)); success: dx_release(frames); return bh; } static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode; struct ext4_dir_entry_2 *de = NULL; struct buffer_head *bh; if (dentry->d_name.len > EXT4_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); bh = ext4_lookup_entry(dir, dentry, &de); if (IS_ERR(bh)) return ERR_CAST(bh); inode = NULL; if (bh) { __u32 ino = le32_to_cpu(de->inode); brelse(bh); if (!ext4_valid_inum(dir->i_sb, ino)) { EXT4_ERROR_INODE(dir, "bad inode number: %u", ino); return ERR_PTR(-EFSCORRUPTED); } if (unlikely(ino == dir->i_ino)) { EXT4_ERROR_INODE(dir, "'%pd' linked to parent dir", dentry); return ERR_PTR(-EFSCORRUPTED); } inode = ext4_iget(dir->i_sb, ino, EXT4_IGET_NORMAL); if (inode == ERR_PTR(-ESTALE)) { EXT4_ERROR_INODE(dir, "deleted inode referenced: %u", ino); return ERR_PTR(-EFSCORRUPTED); } if (!IS_ERR(inode) && IS_ENCRYPTED(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { ext4_warning(inode->i_sb, "Inconsistent encryption contexts: %lu/%lu", dir->i_ino, inode->i_ino); iput(inode); return ERR_PTR(-EPERM); } } if (IS_ENABLED(CONFIG_UNICODE) && !inode && IS_CASEFOLDED(dir)) { /* Eventually we want to call d_add_ci(dentry, NULL) * for negative dentries in the encoding case as * well. For now, prevent the negative dentry * from being cached. */ return NULL; } return d_splice_alias(inode, dentry); } struct dentry *ext4_get_parent(struct dentry *child) { __u32 ino; struct ext4_dir_entry_2 * de = NULL; struct buffer_head *bh; bh = ext4_find_entry(d_inode(child), &dotdot_name, &de, NULL); if (IS_ERR(bh)) return ERR_CAST(bh); if (!bh) return ERR_PTR(-ENOENT); ino = le32_to_cpu(de->inode); brelse(bh); if (!ext4_valid_inum(child->d_sb, ino)) { EXT4_ERROR_INODE(d_inode(child), "bad parent inode number: %u", ino); return ERR_PTR(-EFSCORRUPTED); } return d_obtain_alias(ext4_iget(child->d_sb, ino, EXT4_IGET_NORMAL)); } /* * Move count entries from end of map between two memory locations. * Returns pointer to last entry moved. */ static struct ext4_dir_entry_2 * dx_move_dirents(struct inode *dir, char *from, char *to, struct dx_map_entry *map, int count, unsigned blocksize) { unsigned rec_len = 0; while (count--) { struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) (from + (map->offs<<2)); rec_len = ext4_dir_rec_len(de->name_len, dir); memcpy (to, de, rec_len); ((struct ext4_dir_entry_2 *) to)->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); /* wipe dir_entry excluding the rec_len field */ de->inode = 0; memset(&de->name_len, 0, ext4_rec_len_from_disk(de->rec_len, blocksize) - offsetof(struct ext4_dir_entry_2, name_len)); map++; to += rec_len; } return (struct ext4_dir_entry_2 *) (to - rec_len); } /* * Compact each dir entry in the range to the minimal rec_len. * Returns pointer to last entry in range. */ static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base, unsigned int blocksize) { struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base; unsigned rec_len = 0; prev = to = de; while ((char*)de < base + blocksize) { next = ext4_next_entry(de, blocksize); if (de->inode && de->name_len) { rec_len = ext4_dir_rec_len(de->name_len, dir); if (de > to) memmove(to, de, rec_len); to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); prev = to; to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len); } de = next; } return prev; } /* * Split a full leaf block to make room for a new dir entry. * Allocate a new block, and move entries so that they are approx. equally full. * Returns pointer to de in block into which the new entry will be inserted. */ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, struct buffer_head **bh,struct dx_frame *frame, struct dx_hash_info *hinfo) { unsigned blocksize = dir->i_sb->s_blocksize; unsigned continued; int count; struct buffer_head *bh2; ext4_lblk_t newblock; u32 hash2; struct dx_map_entry *map; char *data1 = (*bh)->b_data, *data2; unsigned split, move, size; struct ext4_dir_entry_2 *de = NULL, *de2; int csum_size = 0; int err = 0, i; if (ext4_has_feature_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); bh2 = ext4_append(handle, dir, &newblock); if (IS_ERR(bh2)) { brelse(*bh); *bh = NULL; return ERR_CAST(bh2); } BUFFER_TRACE(*bh, "get_write_access"); err = ext4_journal_get_write_access(handle, dir->i_sb, *bh, EXT4_JTR_NONE); if (err) goto journal_error; BUFFER_TRACE(frame->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, dir->i_sb, frame->bh, EXT4_JTR_NONE); if (err) goto journal_error; data2 = bh2->b_data; /* create map in the end of data2 block */ map = (struct dx_map_entry *) (data2 + blocksize); count = dx_make_map(dir, *bh, hinfo, map); if (count < 0) { err = count; goto journal_error; } map -= count; dx_sort_map(map, count); /* Ensure that neither split block is over half full */ size = 0; move = 0; for (i = count-1; i >= 0; i--) { /* is more than half of this entry in 2nd half of the block? */ if (size + map[i].size/2 > blocksize/2) break; size += map[i].size; move++; } /* * map index at which we will split * * If the sum of active entries didn't exceed half the block size, just * split it in half by count; each resulting block will have at least * half the space free. */ if (i >= 0) split = count - move; else split = count/2; if (WARN_ON_ONCE(split == 0)) { /* Should never happen, but avoid out-of-bounds access below */ ext4_error_inode_block(dir, (*bh)->b_blocknr, 0, "bad indexed directory? hash=%08x:%08x count=%d move=%u", hinfo->hash, hinfo->minor_hash, count, move); err = -EFSCORRUPTED; goto out; } hash2 = map[split].hash; continued = hash2 == map[split - 1].hash; dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n", (unsigned long)dx_get_block(frame->at), hash2, split, count-split)); /* Fancy dance to stay within two buffers */ de2 = dx_move_dirents(dir, data1, data2, map + split, count - split, blocksize); de = dx_pack_dirents(dir, data1, blocksize); de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - (char *) de, blocksize); de2->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) - (char *) de2, blocksize); if (csum_size) { ext4_initialize_dirent_tail(*bh, blocksize); ext4_initialize_dirent_tail(bh2, blocksize); } dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); /* Which block gets the new entry? */ if (hinfo->hash >= hash2) { swap(*bh, bh2); de = de2; } dx_insert_block(frame, hash2 + continued, newblock); err = ext4_handle_dirty_dirblock(handle, dir, bh2); if (err) goto journal_error; err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (err) goto journal_error; brelse(bh2); dxtrace(dx_show_index("frame", frame->entries)); return de; journal_error: ext4_std_error(dir->i_sb, err); out: brelse(*bh); brelse(bh2); *bh = NULL; return ERR_PTR(err); } int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size, struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de) { struct ext4_dir_entry_2 *de; unsigned short reclen = ext4_dir_rec_len(fname_len(fname), dir); int nlen, rlen; unsigned int offset = 0; char *top; de = buf; top = buf + buf_size - reclen; while ((char *) de <= top) { if (ext4_check_dir_entry(dir, NULL, de, bh, buf, buf_size, offset)) return -EFSCORRUPTED; if (ext4_match(dir, fname, de)) return -EEXIST; nlen = ext4_dir_rec_len(de->name_len, dir); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); if ((de->inode ? rlen - nlen : rlen) >= reclen) break; de = (struct ext4_dir_entry_2 *)((char *)de + rlen); offset += rlen; } if ((char *) de > top) return -ENOSPC; *dest_de = de; return 0; } void ext4_insert_dentry(struct inode *dir, struct inode *inode, struct ext4_dir_entry_2 *de, int buf_size, struct ext4_filename *fname) { int nlen, rlen; nlen = ext4_dir_rec_len(de->name_len, dir); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); if (de->inode) { struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size); de->rec_len = ext4_rec_len_to_disk(nlen, buf_size); de = de1; } de->file_type = EXT4_FT_UNKNOWN; de->inode = cpu_to_le32(inode->i_ino); ext4_set_de_type(inode->i_sb, de, inode->i_mode); de->name_len = fname_len(fname); memcpy(de->name, fname_name(fname), fname_len(fname)); if (ext4_hash_in_dirent(dir)) { struct dx_hash_info *hinfo = &fname->hinfo; EXT4_DIRENT_HASHES(de)->hash = cpu_to_le32(hinfo->hash); EXT4_DIRENT_HASHES(de)->minor_hash = cpu_to_le32(hinfo->minor_hash); } } /* * Add a new entry into a directory (leaf) block. If de is non-NULL, * it points to a directory entry which is guaranteed to be large * enough for new directory entry. If de is NULL, then * add_dirent_to_buf will attempt search the directory block for * space. It will return -ENOSPC if no space is available, and -EIO * and -EEXIST if directory entry already exists. */ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode, struct ext4_dir_entry_2 *de, struct buffer_head *bh) { unsigned int blocksize = dir->i_sb->s_blocksize; int csum_size = 0; int err, err2; if (ext4_has_feature_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); if (!de) { err = ext4_find_dest_de(dir, bh, bh->b_data, blocksize - csum_size, fname, &de); if (err) return err; } BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, dir->i_sb, bh, EXT4_JTR_NONE); if (err) { ext4_std_error(dir->i_sb, err); return err; } /* By now the buffer is marked for journaling */ ext4_insert_dentry(dir, inode, de, blocksize, fname); /* * XXX shouldn't update any times until successful * completion of syscall, but too many callers depend * on this. * * XXX similarly, too many callers depend on * ext4_new_inode() setting the times, but error * recovery deletes the inode, so the worst that can * happen is that the times are slightly out of date * and/or different from the directory change time. */ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); ext4_update_dx_flag(dir); inode_inc_iversion(dir); err2 = ext4_mark_inode_dirty(handle, dir); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_dirblock(handle, dir, bh); if (err) ext4_std_error(dir->i_sb, err); return err ? err : err2; } static bool ext4_check_dx_root(struct inode *dir, struct dx_root *root) { struct fake_dirent *fde; const char *error_msg; unsigned int rlen; unsigned int blocksize = dir->i_sb->s_blocksize; char *blockend = (char *)root + dir->i_sb->s_blocksize; fde = &root->dot; if (unlikely(fde->name_len != 1)) { error_msg = "invalid name_len for '.'"; goto corrupted; } if (unlikely(strncmp(root->dot_name, ".", fde->name_len))) { error_msg = "invalid name for '.'"; goto corrupted; } rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize); if (unlikely((char *)fde + rlen >= blockend)) { error_msg = "invalid rec_len for '.'"; goto corrupted; } fde = &root->dotdot; if (unlikely(fde->name_len != 2)) { error_msg = "invalid name_len for '..'"; goto corrupted; } if (unlikely(strncmp(root->dotdot_name, "..", fde->name_len))) { error_msg = "invalid name for '..'"; goto corrupted; } rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize); if (unlikely((char *)fde + rlen >= blockend)) { error_msg = "invalid rec_len for '..'"; goto corrupted; } return true; corrupted: EXT4_ERROR_INODE(dir, "Corrupt dir, %s, running e2fsck is recommended", error_msg); return false; } /* * This converts a one block unindexed directory to a 3 block indexed * directory, and adds the dentry to the indexed directory. */ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode, struct buffer_head *bh) { struct buffer_head *bh2; struct dx_root *root; struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct dx_entry *entries; struct ext4_dir_entry_2 *de, *de2; char *data2, *top; unsigned len; int retval; unsigned blocksize; ext4_lblk_t block; struct fake_dirent *fde; int csum_size = 0; if (ext4_has_feature_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); blocksize = dir->i_sb->s_blocksize; dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); BUFFER_TRACE(bh, "get_write_access"); retval = ext4_journal_get_write_access(handle, dir->i_sb, bh, EXT4_JTR_NONE); if (retval) { ext4_std_error(dir->i_sb, retval); brelse(bh); return retval; } root = (struct dx_root *) bh->b_data; if (!ext4_check_dx_root(dir, root)) { brelse(bh); return -EFSCORRUPTED; } /* The 0th block becomes the root, move the dirents out */ fde = &root->dotdot; de = (struct ext4_dir_entry_2 *)((char *)fde + ext4_rec_len_from_disk(fde->rec_len, blocksize)); len = ((char *) root) + (blocksize - csum_size) - (char *) de; /* Allocate new block for the 0th block's dirents */ bh2 = ext4_append(handle, dir, &block); if (IS_ERR(bh2)) { brelse(bh); return PTR_ERR(bh2); } ext4_set_inode_flag(dir, EXT4_INODE_INDEX); data2 = bh2->b_data; memcpy(data2, de, len); memset(de, 0, len); /* wipe old data */ de = (struct ext4_dir_entry_2 *) data2; top = data2 + len; while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) { if (ext4_check_dir_entry(dir, NULL, de, bh2, data2, len, (char *)de - data2)) { brelse(bh2); brelse(bh); return -EFSCORRUPTED; } de = de2; } de->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) - (char *) de, blocksize); if (csum_size) ext4_initialize_dirent_tail(bh2, blocksize); /* Initialize the root; the dot dirents already exist */ de = (struct ext4_dir_entry_2 *) (&root->dotdot); de->rec_len = ext4_rec_len_to_disk( blocksize - ext4_dir_rec_len(2, NULL), blocksize); memset (&root->info, 0, sizeof(root->info)); root->info.info_length = sizeof(root->info); if (ext4_hash_in_dirent(dir)) root->info.hash_version = DX_HASH_SIPHASH; else root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; entries = root->entries; dx_set_block(entries, 1); dx_set_count(entries, 1); dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info))); /* Initialize as for dx_probe */ fname->hinfo.hash_version = root->info.hash_version; if (fname->hinfo.hash_version <= DX_HASH_TEA) fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; /* casefolded encrypted hashes are computed on fname setup */ if (!ext4_hash_in_dirent(dir)) { int err = ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), &fname->hinfo); if (err < 0) { brelse(bh2); brelse(bh); return err; } } memset(frames, 0, sizeof(frames)); frame = frames; frame->entries = entries; frame->at = entries; frame->bh = bh; retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (retval) goto out_frames; retval = ext4_handle_dirty_dirblock(handle, dir, bh2); if (retval) goto out_frames; de = do_split(handle,dir, &bh2, frame, &fname->hinfo); if (IS_ERR(de)) { retval = PTR_ERR(de); goto out_frames; } retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh2); out_frames: /* * Even if the block split failed, we have to properly write * out all the changes we did so far. Otherwise we can end up * with corrupted filesystem. */ if (retval) ext4_mark_inode_dirty(handle, dir); dx_release(frames); brelse(bh2); return retval; } /* * ext4_add_entry() * * adds a file entry to the specified directory, using the same * semantics as ext4_find_entry(). It returns NULL if it failed. * * NOTE!! The inode part of 'de' is left at 0 - which means you * may not sleep between calling this and putting something into * the entry, as someone else might have used it while you slept. */ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode) { struct inode *dir = d_inode(dentry->d_parent); struct buffer_head *bh = NULL; struct ext4_dir_entry_2 *de; struct super_block *sb; struct ext4_filename fname; int retval; int dx_fallback=0; unsigned blocksize; ext4_lblk_t block, blocks; int csum_size = 0; if (ext4_has_feature_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); sb = dir->i_sb; blocksize = sb->s_blocksize; if (fscrypt_is_nokey_name(dentry)) return -ENOKEY; if (!generic_ci_validate_strict_name(dir, &dentry->d_name)) return -EINVAL; retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname); if (retval) return retval; if (ext4_has_inline_data(dir)) { retval = ext4_try_add_inline_entry(handle, &fname, dir, inode); if (retval < 0) goto out; if (retval == 1) { retval = 0; goto out; } } if (is_dx(dir)) { retval = ext4_dx_add_entry(handle, &fname, dir, inode); if (!retval || (retval != ERR_BAD_DX_DIR)) goto out; /* Can we just ignore htree data? */ if (ext4_has_feature_metadata_csum(sb)) { EXT4_ERROR_INODE(dir, "Directory has corrupted htree index."); retval = -EFSCORRUPTED; goto out; } ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); dx_fallback++; retval = ext4_mark_inode_dirty(handle, dir); if (unlikely(retval)) goto out; } blocks = dir->i_size >> sb->s_blocksize_bits; for (block = 0; block < blocks; block++) { bh = ext4_read_dirblock(dir, block, DIRENT); if (bh == NULL) { bh = ext4_bread(handle, dir, block, EXT4_GET_BLOCKS_CREATE); goto add_to_new_block; } if (IS_ERR(bh)) { retval = PTR_ERR(bh); bh = NULL; goto out; } retval = add_dirent_to_buf(handle, &fname, dir, inode, NULL, bh); if (retval != -ENOSPC) goto out; if (blocks == 1 && !dx_fallback && ext4_has_feature_dir_index(sb)) { retval = make_indexed_dir(handle, &fname, dir, inode, bh); bh = NULL; /* make_indexed_dir releases bh */ goto out; } brelse(bh); } bh = ext4_append(handle, dir, &block); add_to_new_block: if (IS_ERR(bh)) { retval = PTR_ERR(bh); bh = NULL; goto out; } de = (struct ext4_dir_entry_2 *) bh->b_data; de->inode = 0; de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize); if (csum_size) ext4_initialize_dirent_tail(bh, blocksize); retval = add_dirent_to_buf(handle, &fname, dir, inode, de, bh); out: ext4_fname_free_filename(&fname); brelse(bh); if (retval == 0) ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); return retval; } /* * Returns 0 for success, or a negative error value */ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode) { struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct dx_entry *entries, *at; struct buffer_head *bh; struct super_block *sb = dir->i_sb; struct ext4_dir_entry_2 *de; int restart; int err; again: restart = 0; frame = dx_probe(fname, dir, NULL, frames); if (IS_ERR(frame)) return PTR_ERR(frame); entries = frame->entries; at = frame->at; bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT_HTREE); if (IS_ERR(bh)) { err = PTR_ERR(bh); bh = NULL; goto cleanup; } BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); if (err) goto journal_error; err = add_dirent_to_buf(handle, fname, dir, inode, NULL, bh); if (err != -ENOSPC) goto cleanup; err = 0; /* Block full, should compress but for now just split */ dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", dx_get_count(entries), dx_get_limit(entries))); /* Need to split index? */ if (dx_get_count(entries) == dx_get_limit(entries)) { ext4_lblk_t newblock; int levels = frame - frames + 1; unsigned int icount; int add_level = 1; struct dx_entry *entries2; struct dx_node *node2; struct buffer_head *bh2; while (frame > frames) { if (dx_get_count((frame - 1)->entries) < dx_get_limit((frame - 1)->entries)) { add_level = 0; break; } frame--; /* split higher index block */ at = frame->at; entries = frame->entries; restart = 1; } if (add_level && levels == ext4_dir_htree_level(sb)) { ext4_warning(sb, "Directory (ino: %lu) index full, " "reach max htree level :%d", dir->i_ino, levels); if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) { ext4_warning(sb, "Large directory feature is " "not enabled on this " "filesystem"); } err = -ENOSPC; goto cleanup; } icount = dx_get_count(entries); bh2 = ext4_append(handle, dir, &newblock); if (IS_ERR(bh2)) { err = PTR_ERR(bh2); goto cleanup; } node2 = (struct dx_node *)(bh2->b_data); entries2 = node2->entries; memset(&node2->fake, 0, sizeof(struct fake_dirent)); node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize, sb->s_blocksize); BUFFER_TRACE(frame->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, frame->bh, EXT4_JTR_NONE); if (err) { brelse(bh2); goto journal_error; } if (!add_level) { unsigned icount1 = icount/2, icount2 = icount - icount1; unsigned hash2 = dx_get_hash(entries + icount1); dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", icount1, icount2)); BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ err = ext4_journal_get_write_access(handle, sb, (frame - 1)->bh, EXT4_JTR_NONE); if (err) { brelse(bh2); goto journal_error; } memcpy((char *) entries2, (char *) (entries + icount1), icount2 * sizeof(struct dx_entry)); dx_set_count(entries, icount1); dx_set_count(entries2, icount2); dx_set_limit(entries2, dx_node_limit(dir)); /* Which index block gets the new entry? */ if (at - entries >= icount1) { frame->at = at - entries - icount1 + entries2; frame->entries = entries = entries2; swap(frame->bh, bh2); } dx_insert_block((frame - 1), hash2, newblock); dxtrace(dx_show_index("node", frame->entries)); dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); err = ext4_handle_dirty_dx_node(handle, dir, bh2); if (err) { brelse(bh2); goto journal_error; } brelse (bh2); err = ext4_handle_dirty_dx_node(handle, dir, (frame - 1)->bh); if (err) goto journal_error; err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (restart || err) goto journal_error; } else { struct dx_root *dxroot; memcpy((char *) entries2, (char *) entries, icount * sizeof(struct dx_entry)); dx_set_limit(entries2, dx_node_limit(dir)); /* Set up root */ dx_set_count(entries, 1); dx_set_block(entries + 0, newblock); dxroot = (struct dx_root *)frames[0].bh->b_data; dxroot->info.indirect_levels += 1; dxtrace(printk(KERN_DEBUG "Creating %d level index...\n", dxroot->info.indirect_levels)); err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (err) { brelse(bh2); goto journal_error; } err = ext4_handle_dirty_dx_node(handle, dir, bh2); brelse(bh2); restart = 1; goto journal_error; } } de = do_split(handle, dir, &bh, frame, &fname->hinfo); if (IS_ERR(de)) { err = PTR_ERR(de); goto cleanup; } err = add_dirent_to_buf(handle, fname, dir, inode, de, bh); goto cleanup; journal_error: ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */ cleanup: brelse(bh); dx_release(frames); /* @restart is true means htree-path has been changed, we need to * repeat dx_probe() to find out valid htree-path */ if (restart && err == 0) goto again; return err; } /* * ext4_generic_delete_entry deletes a directory entry by merging it * with the previous entry */ int ext4_generic_delete_entry(struct inode *dir, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh, void *entry_buf, int buf_size, int csum_size) { struct ext4_dir_entry_2 *de, *pde; unsigned int blocksize = dir->i_sb->s_blocksize; int i; i = 0; pde = NULL; de = entry_buf; while (i < buf_size - csum_size) { if (ext4_check_dir_entry(dir, NULL, de, bh, entry_buf, buf_size, i)) return -EFSCORRUPTED; if (de == de_del) { if (pde) { pde->rec_len = ext4_rec_len_to_disk( ext4_rec_len_from_disk(pde->rec_len, blocksize) + ext4_rec_len_from_disk(de->rec_len, blocksize), blocksize); /* wipe entire dir_entry */ memset(de, 0, ext4_rec_len_from_disk(de->rec_len, blocksize)); } else { /* wipe dir_entry excluding the rec_len field */ de->inode = 0; memset(&de->name_len, 0, ext4_rec_len_from_disk(de->rec_len, blocksize) - offsetof(struct ext4_dir_entry_2, name_len)); } inode_inc_iversion(dir); return 0; } i += ext4_rec_len_from_disk(de->rec_len, blocksize); pde = de; de = ext4_next_entry(de, blocksize); } return -ENOENT; } static int ext4_delete_entry(handle_t *handle, struct inode *dir, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh) { int err, csum_size = 0; if (ext4_has_inline_data(dir)) { int has_inline_data = 1; err = ext4_delete_inline_entry(handle, dir, de_del, bh, &has_inline_data); if (has_inline_data) return err; } if (ext4_has_feature_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, dir->i_sb, bh, EXT4_JTR_NONE); if (unlikely(err)) goto out; err = ext4_generic_delete_entry(dir, de_del, bh, bh->b_data, dir->i_sb->s_blocksize, csum_size); if (err) goto out; BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_dirblock(handle, dir, bh); if (unlikely(err)) goto out; return 0; out: if (err != -ENOENT) ext4_std_error(dir->i_sb, err); return err; } /* * Set directory link count to 1 if nlinks > EXT4_LINK_MAX, or if nlinks == 2 * since this indicates that nlinks count was previously 1 to avoid overflowing * the 16-bit i_links_count field on disk. Directories with i_nlink == 1 mean * that subdirectory link counts are not being maintained accurately. * * The caller has already checked for i_nlink overflow in case the DIR_LINK * feature is not enabled and returned -EMLINK. The is_dx() check is a proxy * for checking S_ISDIR(inode) (since the INODE_INDEX feature will not be set * on regular files) and to avoid creating huge/slow non-HTREE directories. */ static void ext4_inc_count(struct inode *inode) { inc_nlink(inode); if (is_dx(inode) && (inode->i_nlink > EXT4_LINK_MAX || inode->i_nlink == 2)) set_nlink(inode, 1); } /* * If a directory had nlink == 1, then we should let it be 1. This indicates * directory has >EXT4_LINK_MAX subdirs. */ static void ext4_dec_count(struct inode *inode) { if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) drop_nlink(inode); } /* * Add non-directory inode to a directory. On success, the inode reference is * consumed by dentry is instantiation. This is also indicated by clearing of * *inodep pointer. On failure, the caller is responsible for dropping the * inode reference in the safe context. */ static int ext4_add_nondir(handle_t *handle, struct dentry *dentry, struct inode **inodep) { struct inode *dir = d_inode(dentry->d_parent); struct inode *inode = *inodep; int err = ext4_add_entry(handle, dentry, inode); if (!err) { err = ext4_mark_inode_dirty(handle, inode); if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); d_instantiate_new(dentry, inode); *inodep = NULL; return err; } drop_nlink(inode); ext4_mark_inode_dirty(handle, inode); ext4_orphan_add(handle, inode); unlock_new_inode(inode); return err; } /* * By the time this is called, we already have created * the directory cache entry for the new file, but it * is so far negative - it has no inode. * * If the create succeeds, we fill in the inode information * with d_instantiate(). */ static int ext4_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { handle_t *handle; struct inode *inode; int err, credits, retries = 0; err = dquot_initialize(dir); if (err) return err; credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: inode = ext4_new_inode_start_handle(idmap, dir, mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); err = ext4_add_nondir(handle, dentry, &inode); if (!err) ext4_fc_track_create(handle, dentry); } if (handle) ext4_journal_stop(handle); if (!IS_ERR_OR_NULL(inode)) iput(inode); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; } static int ext4_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { handle_t *handle; struct inode *inode; int err, credits, retries = 0; err = dquot_initialize(dir); if (err) return err; credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: inode = ext4_new_inode_start_handle(idmap, dir, mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &ext4_special_inode_operations; err = ext4_add_nondir(handle, dentry, &inode); if (!err) ext4_fc_track_create(handle, dentry); } if (handle) ext4_journal_stop(handle); if (!IS_ERR_OR_NULL(inode)) iput(inode); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; } static int ext4_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { handle_t *handle; struct inode *inode; int err, retries = 0; err = dquot_initialize(dir); if (err) return err; retry: inode = ext4_new_inode_start_handle(idmap, dir, mode, NULL, 0, NULL, EXT4_HT_DIR, EXT4_MAXQUOTAS_TRANS_BLOCKS(dir->i_sb) + 4 + EXT4_XATTR_TRANS_BLOCKS); handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); d_tmpfile(file, inode); err = ext4_orphan_add(handle, inode); if (err) goto err_unlock_inode; mark_inode_dirty(inode); unlock_new_inode(inode); } if (handle) ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return finish_open_simple(file, err); err_unlock_inode: ext4_journal_stop(handle); unlock_new_inode(inode); return err; } int ext4_init_dirblock(handle_t *handle, struct inode *inode, struct buffer_head *bh, unsigned int parent_ino, void *inline_buf, int inline_size) { struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) bh->b_data; size_t blocksize = bh->b_size; int csum_size = 0, header_size; if (ext4_has_feature_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); de->inode = cpu_to_le32(inode->i_ino); de->name_len = 1; de->rec_len = ext4_rec_len_to_disk(ext4_dir_rec_len(de->name_len, NULL), blocksize); memcpy(de->name, ".", 2); ext4_set_de_type(inode->i_sb, de, S_IFDIR); de = ext4_next_entry(de, blocksize); de->inode = cpu_to_le32(parent_ino); de->name_len = 2; memcpy(de->name, "..", 3); ext4_set_de_type(inode->i_sb, de, S_IFDIR); if (inline_buf) { de->rec_len = ext4_rec_len_to_disk( ext4_dir_rec_len(de->name_len, NULL), blocksize); de = ext4_next_entry(de, blocksize); header_size = (char *)de - bh->b_data; memcpy((void *)de, inline_buf, inline_size); ext4_update_final_de(bh->b_data, inline_size + header_size, blocksize - csum_size); } else { de->rec_len = ext4_rec_len_to_disk(blocksize - (csum_size + ext4_dir_rec_len(1, NULL)), blocksize); } if (csum_size) ext4_initialize_dirent_tail(bh, blocksize); BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); set_buffer_uptodate(bh); set_buffer_verified(bh); return ext4_handle_dirty_dirblock(handle, inode, bh); } int ext4_init_new_dir(handle_t *handle, struct inode *dir, struct inode *inode) { struct buffer_head *dir_block = NULL; ext4_lblk_t block = 0; int err; if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { err = ext4_try_create_inline_dir(handle, dir, inode); if (err < 0 && err != -ENOSPC) goto out; if (!err) goto out; } set_nlink(inode, 2); inode->i_size = 0; dir_block = ext4_append(handle, inode, &block); if (IS_ERR(dir_block)) return PTR_ERR(dir_block); err = ext4_init_dirblock(handle, inode, dir_block, dir->i_ino, NULL, 0); out: brelse(dir_block); return err; } static struct dentry *ext4_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { handle_t *handle; struct inode *inode; int err, err2 = 0, credits, retries = 0; if (EXT4_DIR_LINK_MAX(dir)) return ERR_PTR(-EMLINK); err = dquot_initialize(dir); if (err) return ERR_PTR(err); credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: inode = ext4_new_inode_start_handle(idmap, dir, S_IFDIR | mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; err = ext4_init_new_dir(handle, dir, inode); if (err) goto out_clear_inode; err = ext4_mark_inode_dirty(handle, inode); if (!err) err = ext4_add_entry(handle, dentry, inode); if (err) { out_clear_inode: clear_nlink(inode); ext4_orphan_add(handle, inode); unlock_new_inode(inode); err2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(err2)) err = err2; ext4_journal_stop(handle); iput(inode); goto out_retry; } ext4_inc_count(dir); ext4_update_dx_flag(dir); err = ext4_mark_inode_dirty(handle, dir); if (err) goto out_clear_inode; d_instantiate_new(dentry, inode); ext4_fc_track_create(handle, dentry); if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); out_stop: if (handle) ext4_journal_stop(handle); out_retry: if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return ERR_PTR(err); } /* * routine to check that the specified directory is empty (for rmdir) */ bool ext4_empty_dir(struct inode *inode) { unsigned int offset; struct buffer_head *bh; struct ext4_dir_entry_2 *de; struct super_block *sb; if (ext4_has_inline_data(inode)) { int has_inline_data = 1; int ret; ret = empty_inline_dir(inode, &has_inline_data); if (has_inline_data) return ret; } sb = inode->i_sb; if (inode->i_size < ext4_dir_rec_len(1, NULL) + ext4_dir_rec_len(2, NULL)) { EXT4_ERROR_INODE(inode, "invalid size"); return false; } bh = ext4_read_dirblock(inode, 0, EITHER); if (IS_ERR(bh)) return false; de = (struct ext4_dir_entry_2 *) bh->b_data; if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, 0) || le32_to_cpu(de->inode) != inode->i_ino || de->name_len != 1 || de->name[0] != '.') { ext4_warning_inode(inode, "directory missing '.'"); brelse(bh); return false; } offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); de = ext4_next_entry(de, sb->s_blocksize); if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, offset) || le32_to_cpu(de->inode) == 0 || de->name_len != 2 || de->name[0] != '.' || de->name[1] != '.') { ext4_warning_inode(inode, "directory missing '..'"); brelse(bh); return false; } offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); while (offset < inode->i_size) { if (!(offset & (sb->s_blocksize - 1))) { unsigned int lblock; brelse(bh); lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb); bh = ext4_read_dirblock(inode, lblock, EITHER); if (bh == NULL) { offset += sb->s_blocksize; continue; } if (IS_ERR(bh)) return false; } de = (struct ext4_dir_entry_2 *) (bh->b_data + (offset & (sb->s_blocksize - 1))); if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, offset) || le32_to_cpu(de->inode)) { brelse(bh); return false; } offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); } brelse(bh); return true; } static int ext4_rmdir(struct inode *dir, struct dentry *dentry) { int retval; struct inode *inode; struct buffer_head *bh; struct ext4_dir_entry_2 *de = NULL; handle_t *handle = NULL; retval = ext4_emergency_state(dir->i_sb); if (unlikely(retval)) return retval; /* Initialize quotas before so that eventual writes go in * separate transaction */ retval = dquot_initialize(dir); if (retval) return retval; retval = dquot_initialize(d_inode(dentry)); if (retval) return retval; retval = -ENOENT; bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); if (IS_ERR(bh)) return PTR_ERR(bh); if (!bh) goto end_rmdir; inode = d_inode(dentry); retval = -EFSCORRUPTED; if (le32_to_cpu(de->inode) != inode->i_ino) goto end_rmdir; retval = -ENOTEMPTY; if (!ext4_empty_dir(inode)) goto end_rmdir; handle = ext4_journal_start(dir, EXT4_HT_DIR, EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) { retval = PTR_ERR(handle); handle = NULL; goto end_rmdir; } if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto end_rmdir; if (!EXT4_DIR_LINK_EMPTY(inode)) ext4_warning_inode(inode, "empty directory '%.*s' has too many links (%u)", dentry->d_name.len, dentry->d_name.name, inode->i_nlink); inode_inc_iversion(inode); clear_nlink(inode); /* There's no need to set i_disksize: the fact that i_nlink is * zero will ensure that the right thing happens during any * recovery. */ inode->i_size = 0; ext4_orphan_add(handle, inode); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_set_ctime_current(inode); retval = ext4_mark_inode_dirty(handle, inode); if (retval) goto end_rmdir; ext4_dec_count(dir); ext4_update_dx_flag(dir); ext4_fc_track_unlink(handle, dentry); retval = ext4_mark_inode_dirty(handle, dir); /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid * invalidating the dentries here, alongside with returning the * negative dentries at ext4_lookup(), when it is better * supported by the VFS for the CI case. */ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) d_invalidate(dentry); end_rmdir: brelse(bh); if (handle) ext4_journal_stop(handle); return retval; } int __ext4_unlink(struct inode *dir, const struct qstr *d_name, struct inode *inode, struct dentry *dentry /* NULL during fast_commit recovery */) { int retval = -ENOENT; struct buffer_head *bh; struct ext4_dir_entry_2 *de = NULL; handle_t *handle; int skip_remove_dentry = 0; /* * Keep this outside the transaction; it may have to set up the * directory's encryption key, which isn't GFP_NOFS-safe. */ bh = ext4_find_entry(dir, d_name, &de, NULL); if (IS_ERR(bh)) return PTR_ERR(bh); if (!bh) return -ENOENT; if (le32_to_cpu(de->inode) != inode->i_ino) { /* * It's okay if we find dont find dentry which matches * the inode. That's because it might have gotten * renamed to a different inode number */ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) skip_remove_dentry = 1; else goto out_bh; } handle = ext4_journal_start(dir, EXT4_HT_DIR, EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) { retval = PTR_ERR(handle); goto out_bh; } if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); if (!skip_remove_dentry) { retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto out_handle; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); ext4_update_dx_flag(dir); retval = ext4_mark_inode_dirty(handle, dir); if (retval) goto out_handle; } else { retval = 0; } if (inode->i_nlink == 0) ext4_warning_inode(inode, "Deleting file '%.*s' with no links", d_name->len, d_name->name); else drop_nlink(inode); if (!inode->i_nlink) ext4_orphan_add(handle, inode); inode_set_ctime_current(inode); retval = ext4_mark_inode_dirty(handle, inode); if (dentry && !retval) ext4_fc_track_unlink(handle, dentry); out_handle: ext4_journal_stop(handle); out_bh: brelse(bh); return retval; } static int ext4_unlink(struct inode *dir, struct dentry *dentry) { int retval; retval = ext4_emergency_state(dir->i_sb); if (unlikely(retval)) return retval; trace_ext4_unlink_enter(dir, dentry); /* * Initialize quotas before so that eventual writes go * in separate transaction */ retval = dquot_initialize(dir); if (retval) goto out_trace; retval = dquot_initialize(d_inode(dentry)); if (retval) goto out_trace; retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry), dentry); /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid * invalidating the dentries here, alongside with returning the * negative dentries at ext4_lookup(), when it is better * supported by the VFS for the CI case. */ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) d_invalidate(dentry); out_trace: trace_ext4_unlink_exit(dentry, retval); return retval; } static int ext4_init_symlink_block(handle_t *handle, struct inode *inode, struct fscrypt_str *disk_link) { struct buffer_head *bh; char *kaddr; int err = 0; bh = ext4_bread(handle, inode, 0, EXT4_GET_BLOCKS_CREATE); if (IS_ERR(bh)) return PTR_ERR(bh); BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) goto out; kaddr = (char *)bh->b_data; memcpy(kaddr, disk_link->name, disk_link->len); inode->i_size = disk_link->len - 1; EXT4_I(inode)->i_disksize = inode->i_size; err = ext4_handle_dirty_metadata(handle, inode, bh); out: brelse(bh); return err; } static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { handle_t *handle; struct inode *inode; int err, len = strlen(symname); int credits; struct fscrypt_str disk_link; int retries = 0; err = ext4_emergency_state(dir->i_sb); if (unlikely(err)) return err; err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize, &disk_link); if (err) return err; err = dquot_initialize(dir); if (err) return err; /* * EXT4_INDEX_EXTRA_TRANS_BLOCKS for addition of entry into the * directory. +3 for inode, inode bitmap, group descriptor allocation. * EXT4_DATA_TRANS_BLOCKS for the data block allocation and * modification. */ credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3; retry: inode = ext4_new_inode_start_handle(idmap, dir, S_IFLNK|S_IRWXUGO, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); if (IS_ERR(inode)) { if (handle) ext4_journal_stop(handle); err = PTR_ERR(inode); goto out_retry; } if (IS_ENCRYPTED(inode)) { err = fscrypt_encrypt_symlink(inode, symname, len, &disk_link); if (err) goto err_drop_inode; inode->i_op = &ext4_encrypted_symlink_inode_operations; } else { if ((disk_link.len > EXT4_N_BLOCKS * 4)) { inode->i_op = &ext4_symlink_inode_operations; } else { inode->i_op = &ext4_fast_symlink_inode_operations; } } if ((disk_link.len > EXT4_N_BLOCKS * 4)) { /* alloc symlink block and fill it */ err = ext4_init_symlink_block(handle, inode, &disk_link); if (err) goto err_drop_inode; } else { /* clear the extent format for fast symlink */ ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); memcpy((char *)&EXT4_I(inode)->i_data, disk_link.name, disk_link.len); inode->i_size = disk_link.len - 1; EXT4_I(inode)->i_disksize = inode->i_size; if (!IS_ENCRYPTED(inode)) inode_set_cached_link(inode, (char *)&EXT4_I(inode)->i_data, inode->i_size); } err = ext4_add_nondir(handle, dentry, &inode); if (handle) ext4_journal_stop(handle); iput(inode); goto out_retry; err_drop_inode: clear_nlink(inode); ext4_mark_inode_dirty(handle, inode); ext4_orphan_add(handle, inode); unlock_new_inode(inode); if (handle) ext4_journal_stop(handle); iput(inode); out_retry: if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; if (disk_link.name != (unsigned char *)symname) kfree(disk_link.name); return err; } int __ext4_link(struct inode *dir, struct inode *inode, struct dentry *dentry) { handle_t *handle; int err, retries = 0; retry: handle = ext4_journal_start(dir, EXT4_HT_DIR, (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS) + 1); if (IS_ERR(handle)) return PTR_ERR(handle); if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); inode_set_ctime_current(inode); ext4_inc_count(inode); ihold(inode); err = ext4_add_entry(handle, dentry, inode); if (!err) { err = ext4_mark_inode_dirty(handle, inode); /* this can happen only for tmpfile being * linked the first time */ if (inode->i_nlink == 1) ext4_orphan_del(handle, inode); d_instantiate(dentry, inode); ext4_fc_track_link(handle, dentry); } else { drop_nlink(inode); iput(inode); } ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; } static int ext4_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); int err; if (inode->i_nlink >= EXT4_LINK_MAX) return -EMLINK; err = fscrypt_prepare_link(old_dentry, dir, dentry); if (err) return err; if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) && (!projid_eq(EXT4_I(dir)->i_projid, EXT4_I(old_dentry->d_inode)->i_projid))) return -EXDEV; err = dquot_initialize(dir); if (err) return err; return __ext4_link(dir, inode, dentry); } /* * Try to find buffer head where contains the parent block. * It should be the inode block if it is inlined or the 1st block * if it is a normal dir. */ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, struct inode *inode, int *retval, struct ext4_dir_entry_2 **parent_de, int *inlined) { struct buffer_head *bh; if (!ext4_has_inline_data(inode)) { struct ext4_dir_entry_2 *de; unsigned int offset; bh = ext4_read_dirblock(inode, 0, EITHER); if (IS_ERR(bh)) { *retval = PTR_ERR(bh); return NULL; } de = (struct ext4_dir_entry_2 *) bh->b_data; if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, 0) || le32_to_cpu(de->inode) != inode->i_ino || de->name_len != 1 || de->name[0] != '.') { EXT4_ERROR_INODE(inode, "directory missing '.'"); brelse(bh); *retval = -EFSCORRUPTED; return NULL; } offset = ext4_rec_len_from_disk(de->rec_len, inode->i_sb->s_blocksize); de = ext4_next_entry(de, inode->i_sb->s_blocksize); if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, offset) || le32_to_cpu(de->inode) == 0 || de->name_len != 2 || de->name[0] != '.' || de->name[1] != '.') { EXT4_ERROR_INODE(inode, "directory missing '..'"); brelse(bh); *retval = -EFSCORRUPTED; return NULL; } *parent_de = de; return bh; } *inlined = 1; return ext4_get_first_inline_block(inode, parent_de, retval); } struct ext4_renament { struct inode *dir; struct dentry *dentry; struct inode *inode; bool is_dir; int dir_nlink_delta; /* entry for "dentry" */ struct buffer_head *bh; struct ext4_dir_entry_2 *de; int inlined; /* entry for ".." in inode if it's a directory */ struct buffer_head *dir_bh; struct ext4_dir_entry_2 *parent_de; int dir_inlined; }; static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent, bool is_cross) { int retval; ent->is_dir = true; if (!is_cross) return 0; ent->dir_bh = ext4_get_first_dir_block(handle, ent->inode, &retval, &ent->parent_de, &ent->dir_inlined); if (!ent->dir_bh) return retval; if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino) return -EFSCORRUPTED; BUFFER_TRACE(ent->dir_bh, "get_write_access"); return ext4_journal_get_write_access(handle, ent->dir->i_sb, ent->dir_bh, EXT4_JTR_NONE); } static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent, unsigned dir_ino) { int retval; if (!ent->dir_bh) return 0; ent->parent_de->inode = cpu_to_le32(dir_ino); BUFFER_TRACE(ent->dir_bh, "call ext4_handle_dirty_metadata"); if (!ent->dir_inlined) { if (is_dx(ent->inode)) { retval = ext4_handle_dirty_dx_node(handle, ent->inode, ent->dir_bh); } else { retval = ext4_handle_dirty_dirblock(handle, ent->inode, ent->dir_bh); } } else { retval = ext4_mark_inode_dirty(handle, ent->inode); } if (retval) { ext4_std_error(ent->dir->i_sb, retval); return retval; } return 0; } static int ext4_setent(handle_t *handle, struct ext4_renament *ent, unsigned ino, unsigned file_type) { int retval, retval2; BUFFER_TRACE(ent->bh, "get write access"); retval = ext4_journal_get_write_access(handle, ent->dir->i_sb, ent->bh, EXT4_JTR_NONE); if (retval) return retval; ent->de->inode = cpu_to_le32(ino); if (ext4_has_feature_filetype(ent->dir->i_sb)) ent->de->file_type = file_type; inode_inc_iversion(ent->dir); inode_set_mtime_to_ts(ent->dir, inode_set_ctime_current(ent->dir)); retval = ext4_mark_inode_dirty(handle, ent->dir); BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata"); if (!ent->inlined) { retval2 = ext4_handle_dirty_dirblock(handle, ent->dir, ent->bh); if (unlikely(retval2)) { ext4_std_error(ent->dir->i_sb, retval2); return retval2; } } return retval; } static void ext4_resetent(handle_t *handle, struct ext4_renament *ent, unsigned ino, unsigned file_type) { struct ext4_renament old = *ent; int retval = 0; /* * old->de could have moved from under us during make indexed dir, * so the old->de may no longer valid and need to find it again * before reset old inode info. */ old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, &old.inlined); if (IS_ERR(old.bh)) retval = PTR_ERR(old.bh); if (!old.bh) retval = -ENOENT; if (retval) { ext4_std_error(old.dir->i_sb, retval); return; } ext4_setent(handle, &old, ino, file_type); brelse(old.bh); } static int ext4_find_delete_entry(handle_t *handle, struct inode *dir, const struct qstr *d_name) { int retval = -ENOENT; struct buffer_head *bh; struct ext4_dir_entry_2 *de = NULL; bh = ext4_find_entry(dir, d_name, &de, NULL); if (IS_ERR(bh)) return PTR_ERR(bh); if (bh) { retval = ext4_delete_entry(handle, dir, de, bh); brelse(bh); } return retval; } static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent, int force_reread) { int retval; /* * ent->de could have moved from under us during htree split, so make * sure that we are deleting the right entry. We might also be pointing * to a stale entry in the unused part of ent->bh so just checking inum * and the name isn't enough. */ if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino || ent->de->name_len != ent->dentry->d_name.len || strncmp(ent->de->name, ent->dentry->d_name.name, ent->de->name_len) || force_reread) { retval = ext4_find_delete_entry(handle, ent->dir, &ent->dentry->d_name); } else { retval = ext4_delete_entry(handle, ent->dir, ent->de, ent->bh); if (retval == -ENOENT) { retval = ext4_find_delete_entry(handle, ent->dir, &ent->dentry->d_name); } } if (retval) { ext4_warning_inode(ent->dir, "Deleting old file: nlink %d, error=%d", ent->dir->i_nlink, retval); } } static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent) { if (ent->dir_nlink_delta) { if (ent->dir_nlink_delta == -1) ext4_dec_count(ent->dir); else ext4_inc_count(ent->dir); ext4_mark_inode_dirty(handle, ent->dir); } } static struct inode *ext4_whiteout_for_rename(struct mnt_idmap *idmap, struct ext4_renament *ent, int credits, handle_t **h) { struct inode *wh; handle_t *handle; int retries = 0; /* * for inode block, sb block, group summaries, * and inode bitmap */ credits += (EXT4_MAXQUOTAS_TRANS_BLOCKS(ent->dir->i_sb) + EXT4_XATTR_TRANS_BLOCKS + 4); retry: wh = ext4_new_inode_start_handle(idmap, ent->dir, S_IFCHR | WHITEOUT_MODE, &ent->dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); if (IS_ERR(wh)) { if (handle) ext4_journal_stop(handle); if (PTR_ERR(wh) == -ENOSPC && ext4_should_retry_alloc(ent->dir->i_sb, &retries)) goto retry; } else { *h = handle; init_special_inode(wh, wh->i_mode, WHITEOUT_DEV); wh->i_op = &ext4_special_inode_operations; } return wh; } /* * Anybody can rename anything with this: the permission checks are left to the * higher-level routines. * * n.b. old_{dentry,inode) refers to the source dentry/inode * while new_{dentry,inode) refers to the destination dentry/inode * This comes from rename(const char *oldpath, const char *newpath) */ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { handle_t *handle = NULL; struct ext4_renament old = { .dir = old_dir, .dentry = old_dentry, .inode = d_inode(old_dentry), }; struct ext4_renament new = { .dir = new_dir, .dentry = new_dentry, .inode = d_inode(new_dentry), }; int force_reread; int retval; struct inode *whiteout = NULL; int credits; u8 old_file_type; if (new.inode && new.inode->i_nlink == 0) { EXT4_ERROR_INODE(new.inode, "target of rename is already freed"); return -EFSCORRUPTED; } if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT)) && (!projid_eq(EXT4_I(new_dir)->i_projid, EXT4_I(old_dentry->d_inode)->i_projid))) return -EXDEV; retval = dquot_initialize(old.dir); if (retval) return retval; retval = dquot_initialize(old.inode); if (retval) return retval; retval = dquot_initialize(new.dir); if (retval) return retval; /* Initialize quotas before so that eventual writes go * in separate transaction */ if (new.inode) { retval = dquot_initialize(new.inode); if (retval) return retval; } old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, &old.inlined); if (IS_ERR(old.bh)) return PTR_ERR(old.bh); /* * Check for inode number is _not_ due to possible IO errors. * We might rmdir the source, keep it as pwd of some process * and merrily kill the link to whatever was created under the * same name. Goodbye sticky bit ;-< */ retval = -ENOENT; if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) goto release_bh; new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, &new.de, &new.inlined); if (IS_ERR(new.bh)) { retval = PTR_ERR(new.bh); new.bh = NULL; goto release_bh; } if (new.bh) { if (!new.inode) { brelse(new.bh); new.bh = NULL; } } if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC)) ext4_alloc_da_blocks(old.inode); credits = (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); if (!(flags & RENAME_WHITEOUT)) { handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits); if (IS_ERR(handle)) { retval = PTR_ERR(handle); goto release_bh; } } else { whiteout = ext4_whiteout_for_rename(idmap, &old, credits, &handle); if (IS_ERR(whiteout)) { retval = PTR_ERR(whiteout); goto release_bh; } } old_file_type = old.de->file_type; if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) ext4_handle_sync(handle); if (S_ISDIR(old.inode->i_mode)) { if (new.inode) { retval = -ENOTEMPTY; if (!ext4_empty_dir(new.inode)) goto end_rename; } else { retval = -EMLINK; if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir)) goto end_rename; } retval = ext4_rename_dir_prepare(handle, &old, new.dir != old.dir); if (retval) goto end_rename; } /* * If we're renaming a file within an inline_data dir and adding or * setting the new dirent causes a conversion from inline_data to * extents/blockmap, we need to force the dirent delete code to * re-read the directory, or else we end up trying to delete a dirent * from what is now the extent tree root (or a block map). */ force_reread = (new.dir->i_ino == old.dir->i_ino && ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA)); if (whiteout) { /* * Do this before adding a new entry, so the old entry is sure * to be still pointing to the valid old entry. */ retval = ext4_setent(handle, &old, whiteout->i_ino, EXT4_FT_CHRDEV); if (retval) goto end_rename; retval = ext4_mark_inode_dirty(handle, whiteout); if (unlikely(retval)) goto end_rename; } if (!new.bh) { retval = ext4_add_entry(handle, new.dentry, old.inode); if (retval) goto end_rename; } else { retval = ext4_setent(handle, &new, old.inode->i_ino, old_file_type); if (retval) goto end_rename; } if (force_reread) force_reread = !ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA); /* * Like most other Unix systems, set the ctime for inodes on a * rename. */ inode_set_ctime_current(old.inode); retval = ext4_mark_inode_dirty(handle, old.inode); if (unlikely(retval)) goto end_rename; if (!whiteout) { /* * ok, that's it */ ext4_rename_delete(handle, &old, force_reread); } if (new.inode) { ext4_dec_count(new.inode); inode_set_ctime_current(new.inode); } inode_set_mtime_to_ts(old.dir, inode_set_ctime_current(old.dir)); ext4_update_dx_flag(old.dir); if (old.is_dir) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); if (retval) goto end_rename; ext4_dec_count(old.dir); if (new.inode) { /* checked ext4_empty_dir above, can't have another * parent, ext4_dec_count() won't work for many-linked * dirs */ clear_nlink(new.inode); } else { ext4_inc_count(new.dir); ext4_update_dx_flag(new.dir); retval = ext4_mark_inode_dirty(handle, new.dir); if (unlikely(retval)) goto end_rename; } } retval = ext4_mark_inode_dirty(handle, old.dir); if (unlikely(retval)) goto end_rename; if (old.is_dir) { /* * We disable fast commits here that's because the * replay code is not yet capable of changing dot dot * dirents in directories. */ ext4_fc_mark_ineligible(old.inode->i_sb, EXT4_FC_REASON_RENAME_DIR, handle); } else { struct super_block *sb = old.inode->i_sb; if (new.inode) ext4_fc_track_unlink(handle, new.dentry); if (test_opt2(sb, JOURNAL_FAST_COMMIT) && !(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) && !(ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE))) { __ext4_fc_track_link(handle, old.inode, new.dentry); __ext4_fc_track_unlink(handle, old.inode, old.dentry); if (whiteout) __ext4_fc_track_create(handle, whiteout, old.dentry); } } if (new.inode) { retval = ext4_mark_inode_dirty(handle, new.inode); if (unlikely(retval)) goto end_rename; if (!new.inode->i_nlink) ext4_orphan_add(handle, new.inode); } retval = 0; end_rename: if (whiteout) { if (retval) { ext4_resetent(handle, &old, old.inode->i_ino, old_file_type); drop_nlink(whiteout); ext4_mark_inode_dirty(handle, whiteout); ext4_orphan_add(handle, whiteout); } unlock_new_inode(whiteout); ext4_journal_stop(handle); iput(whiteout); } else { ext4_journal_stop(handle); } release_bh: brelse(old.dir_bh); brelse(old.bh); brelse(new.bh); return retval; } static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { handle_t *handle = NULL; struct ext4_renament old = { .dir = old_dir, .dentry = old_dentry, .inode = d_inode(old_dentry), }; struct ext4_renament new = { .dir = new_dir, .dentry = new_dentry, .inode = d_inode(new_dentry), }; u8 new_file_type; int retval; if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) && !projid_eq(EXT4_I(new_dir)->i_projid, EXT4_I(old_dentry->d_inode)->i_projid)) || (ext4_test_inode_flag(old_dir, EXT4_INODE_PROJINHERIT) && !projid_eq(EXT4_I(old_dir)->i_projid, EXT4_I(new_dentry->d_inode)->i_projid))) return -EXDEV; retval = dquot_initialize(old.dir); if (retval) return retval; retval = dquot_initialize(new.dir); if (retval) return retval; old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, &old.inlined); if (IS_ERR(old.bh)) return PTR_ERR(old.bh); /* * Check for inode number is _not_ due to possible IO errors. * We might rmdir the source, keep it as pwd of some process * and merrily kill the link to whatever was created under the * same name. Goodbye sticky bit ;-< */ retval = -ENOENT; if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) goto end_rename; new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, &new.de, &new.inlined); if (IS_ERR(new.bh)) { retval = PTR_ERR(new.bh); new.bh = NULL; goto end_rename; } /* RENAME_EXCHANGE case: old *and* new must both exist */ if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino) goto end_rename; handle = ext4_journal_start(old.dir, EXT4_HT_DIR, (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + 2 * EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); if (IS_ERR(handle)) { retval = PTR_ERR(handle); handle = NULL; goto end_rename; } if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) ext4_handle_sync(handle); if (S_ISDIR(old.inode->i_mode)) { retval = ext4_rename_dir_prepare(handle, &old, new.dir != old.dir); if (retval) goto end_rename; } if (S_ISDIR(new.inode->i_mode)) { retval = ext4_rename_dir_prepare(handle, &new, new.dir != old.dir); if (retval) goto end_rename; } /* * Other than the special case of overwriting a directory, parents' * nlink only needs to be modified if this is a cross directory rename. */ if (old.dir != new.dir && old.is_dir != new.is_dir) { old.dir_nlink_delta = old.is_dir ? -1 : 1; new.dir_nlink_delta = -old.dir_nlink_delta; retval = -EMLINK; if ((old.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(old.dir)) || (new.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(new.dir))) goto end_rename; } new_file_type = new.de->file_type; retval = ext4_setent(handle, &new, old.inode->i_ino, old.de->file_type); if (retval) goto end_rename; retval = ext4_setent(handle, &old, new.inode->i_ino, new_file_type); if (retval) goto end_rename; /* * Like most other Unix systems, set the ctime for inodes on a * rename. */ inode_set_ctime_current(old.inode); inode_set_ctime_current(new.inode); retval = ext4_mark_inode_dirty(handle, old.inode); if (unlikely(retval)) goto end_rename; retval = ext4_mark_inode_dirty(handle, new.inode); if (unlikely(retval)) goto end_rename; ext4_fc_mark_ineligible(new.inode->i_sb, EXT4_FC_REASON_CROSS_RENAME, handle); if (old.dir_bh) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); if (retval) goto end_rename; } if (new.dir_bh) { retval = ext4_rename_dir_finish(handle, &new, old.dir->i_ino); if (retval) goto end_rename; } ext4_update_dir_count(handle, &old); ext4_update_dir_count(handle, &new); retval = 0; end_rename: brelse(old.dir_bh); brelse(new.dir_bh); brelse(old.bh); brelse(new.bh); if (handle) ext4_journal_stop(handle); return retval; } static int ext4_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { int err; err = ext4_emergency_state(old_dir->i_sb); if (unlikely(err)) return err; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry, flags); if (err) return err; if (flags & RENAME_EXCHANGE) { return ext4_cross_rename(old_dir, old_dentry, new_dir, new_dentry); } return ext4_rename(idmap, old_dir, old_dentry, new_dir, new_dentry, flags); } /* * directories can handle most operations... */ const struct inode_operations ext4_dir_inode_operations = { .create = ext4_create, .lookup = ext4_lookup, .link = ext4_link, .unlink = ext4_unlink, .symlink = ext4_symlink, .mkdir = ext4_mkdir, .rmdir = ext4_rmdir, .mknod = ext4_mknod, .tmpfile = ext4_tmpfile, .rename = ext4_rename2, .setattr = ext4_setattr, .getattr = ext4_getattr, .listxattr = ext4_listxattr, .get_inode_acl = ext4_get_acl, .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, .fileattr_get = ext4_fileattr_get, .fileattr_set = ext4_fileattr_set, }; const struct inode_operations ext4_special_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_getattr, .listxattr = ext4_listxattr, .get_inode_acl = ext4_get_acl, .set_acl = ext4_set_acl, };
2068 238 30 2200 12 158 139 1705 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 /** * css_get - obtain a reference on the specified css * @css: target css * * The caller must already have a reference. */ CGROUP_REF_FN_ATTRS void css_get(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) percpu_ref_get(&css->refcnt); } CGROUP_REF_EXPORT(css_get) /** * css_get_many - obtain references on the specified css * @css: target css * @n: number of references to get * * The caller must already have a reference. */ CGROUP_REF_FN_ATTRS void css_get_many(struct cgroup_subsys_state *css, unsigned int n) { if (!(css->flags & CSS_NO_REF)) percpu_ref_get_many(&css->refcnt, n); } CGROUP_REF_EXPORT(css_get_many) /** * css_tryget - try to obtain a reference on the specified css * @css: target css * * Obtain a reference on @css unless it already has reached zero and is * being released. This function doesn't care whether @css is on or * offline. The caller naturally needs to ensure that @css is accessible * but doesn't have to be holding a reference on it - IOW, RCU protected * access is good enough for this function. Returns %true if a reference * count was successfully obtained; %false otherwise. */ CGROUP_REF_FN_ATTRS bool css_tryget(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) return percpu_ref_tryget(&css->refcnt); return true; } CGROUP_REF_EXPORT(css_tryget) /** * css_tryget_online - try to obtain a reference on the specified css if online * @css: target css * * Obtain a reference on @css if it's online. The caller naturally needs * to ensure that @css is accessible but doesn't have to be holding a * reference on it - IOW, RCU protected access is good enough for this * function. Returns %true if a reference count was successfully obtained; * %false otherwise. */ CGROUP_REF_FN_ATTRS bool css_tryget_online(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) return percpu_ref_tryget_live(&css->refcnt); return true; } CGROUP_REF_EXPORT(css_tryget_online) /** * css_put - put a css reference * @css: target css * * Put a reference obtained via css_get() and css_tryget_online(). */ CGROUP_REF_FN_ATTRS void css_put(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) percpu_ref_put(&css->refcnt); } CGROUP_REF_EXPORT(css_put) /** * css_put_many - put css references * @css: target css * @n: number of references to put * * Put references obtained via css_get() and css_tryget_online(). */ CGROUP_REF_FN_ATTRS void css_put_many(struct cgroup_subsys_state *css, unsigned int n) { if (!(css->flags & CSS_NO_REF)) percpu_ref_put_many(&css->refcnt, n); } CGROUP_REF_EXPORT(css_put_many)
4 7 7 1 1 14 1 1 2 1 1 2 7 1 5 5 5 119 119 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/act_simple.c Simple example of an action * * Authors: Jamal Hadi Salim (2005-8) */ #include <linux/module.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/tc_wrapper.h> #include <linux/tc_act/tc_defact.h> #include <net/tc_act/tc_defact.h> static struct tc_action_ops act_simp_ops; #define SIMP_MAX_DATA 32 TC_INDIRECT_SCOPE int tcf_simp_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_defact *d = to_defact(a); spin_lock(&d->tcf_lock); tcf_lastuse_update(&d->tcf_tm); bstats_update(&d->tcf_bstats, skb); /* print policy string followed by _ then packet count * Example if this was the 3rd packet and the string was "hello" * then it would look like "hello_3" (without quotes) */ pr_info("simple: %s_%llu\n", (char *)d->tcfd_defdata, u64_stats_read(&d->tcf_bstats.packets)); spin_unlock(&d->tcf_lock); return d->tcf_action; } static void tcf_simp_release(struct tc_action *a) { struct tcf_defact *d = to_defact(a); kfree(d->tcfd_defdata); } static int alloc_defdata(struct tcf_defact *d, const struct nlattr *defdata) { d->tcfd_defdata = kzalloc(SIMP_MAX_DATA, GFP_KERNEL); if (unlikely(!d->tcfd_defdata)) return -ENOMEM; nla_strscpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); return 0; } static int reset_policy(struct tc_action *a, const struct nlattr *defdata, struct tc_defact *p, struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct tcf_chain *goto_ch = NULL; struct tcf_defact *d; int err; err = tcf_action_check_ctrlact(p->action, tp, &goto_ch, extack); if (err < 0) return err; d = to_defact(a); spin_lock_bh(&d->tcf_lock); goto_ch = tcf_action_set_ctrlact(a, p->action, goto_ch); nla_strscpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); spin_unlock_bh(&d->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); return 0; } static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { [TCA_DEF_PARMS] = { .len = sizeof(struct tc_defact) }, [TCA_DEF_DATA] = { .type = NLA_STRING, .len = SIMP_MAX_DATA }, }; static int tcf_simp_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_simp_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_DEF_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tc_defact *parm; struct tcf_defact *d; bool exists = false; int ret = 0, err; u32 index; if (nla == NULL) return -EINVAL; err = nla_parse_nested_deprecated(tb, TCA_DEF_MAX, nla, simple_policy, NULL); if (err < 0) return err; if (tb[TCA_DEF_PARMS] == NULL) return -EINVAL; parm = nla_data(tb[TCA_DEF_PARMS]); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) return err; exists = err; if (exists && bind) return ACT_P_BOUND; if (tb[TCA_DEF_DATA] == NULL) { if (exists) tcf_idr_release(*a, bind); else tcf_idr_cleanup(tn, index); return -EINVAL; } if (!exists) { ret = tcf_idr_create(tn, index, est, a, &act_simp_ops, bind, false, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } d = to_defact(*a); err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; err = alloc_defdata(d, tb[TCA_DEF_DATA]); if (err < 0) goto put_chain; tcf_action_set_ctrlact(*a, parm->action, goto_ch); ret = ACT_P_CREATED; } else { if (!(flags & TCA_ACT_FLAGS_REPLACE)) { err = -EEXIST; goto release_idr; } err = reset_policy(*a, tb[TCA_DEF_DATA], parm, tp, extack); if (err) goto release_idr; } return ret; put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: tcf_idr_release(*a, bind); return err; } static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_defact *d = to_defact(a); struct tc_defact opt = { .index = d->tcf_index, .refcnt = refcount_read(&d->tcf_refcnt) - ref, .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, }; struct tcf_t t; spin_lock_bh(&d->tcf_lock); opt.action = d->tcf_action; if (nla_put(skb, TCA_DEF_PARMS, sizeof(opt), &opt) || nla_put_string(skb, TCA_DEF_DATA, d->tcfd_defdata)) goto nla_put_failure; tcf_tm_dump(&t, &d->tcf_tm); if (nla_put_64bit(skb, TCA_DEF_TM, sizeof(t), &t, TCA_DEF_PAD)) goto nla_put_failure; spin_unlock_bh(&d->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&d->tcf_lock); nlmsg_trim(skb, b); return -1; } static struct tc_action_ops act_simp_ops = { .kind = "simple", .id = TCA_ID_SIMP, .owner = THIS_MODULE, .act = tcf_simp_act, .dump = tcf_simp_dump, .cleanup = tcf_simp_release, .init = tcf_simp_init, .size = sizeof(struct tcf_defact), }; MODULE_ALIAS_NET_ACT("simple"); static __net_init int simp_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_simp_ops.net_id); return tc_action_net_init(net, tn, &act_simp_ops); } static void __net_exit simp_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_simp_ops.net_id); } static struct pernet_operations simp_net_ops = { .init = simp_init_net, .exit_batch = simp_exit_net, .id = &act_simp_ops.net_id, .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2005)"); MODULE_DESCRIPTION("Simple example action"); MODULE_LICENSE("GPL"); static int __init simp_init_module(void) { int ret = tcf_register_action(&act_simp_ops, &simp_net_ops); if (!ret) pr_info("Simple TC action Loaded\n"); return ret; } static void __exit simp_cleanup_module(void) { tcf_unregister_action(&act_simp_ops, &simp_net_ops); } module_init(simp_init_module); module_exit(simp_cleanup_module);
1 48 23 35 11 687 3 684 1 11 1 11 11 92 114 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2004 Infinicon Corporation. All rights reserved. * Copyright (c) 2004, 2020 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004 Voltaire Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved. */ #ifndef IB_VERBS_H #define IB_VERBS_H #include <linux/ethtool.h> #include <linux/types.h> #include <linux/device.h> #include <linux/dma-mapping.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/rwsem.h> #include <linux/workqueue.h> #include <linux/irq_poll.h> #include <uapi/linux/if_ether.h> #include <net/ipv6.h> #include <net/ip.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/refcount.h> #include <linux/if_link.h> #include <linux/atomic.h> #include <linux/mmu_notifier.h> #include <linux/uaccess.h> #include <linux/cgroup_rdma.h> #include <linux/irqflags.h> #include <linux/preempt.h> #include <linux/dim.h> #include <uapi/rdma/ib_user_verbs.h> #include <rdma/rdma_counter.h> #include <rdma/restrack.h> #include <rdma/signature.h> #include <uapi/rdma/rdma_user_ioctl.h> #include <uapi/rdma/ib_user_ioctl_verbs.h> #include <linux/pci-tph.h> #define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN struct ib_umem_odp; struct ib_uqp_object; struct ib_usrq_object; struct ib_uwq_object; struct rdma_cm_id; struct ib_port; struct hw_stats_device_data; extern struct workqueue_struct *ib_wq; extern struct workqueue_struct *ib_comp_wq; extern struct workqueue_struct *ib_comp_unbound_wq; struct ib_ucq_object; __printf(2, 3) __cold void ibdev_emerg(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_alert(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_crit(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_err(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_warn(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_notice(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_info(const struct ib_device *ibdev, const char *format, ...); #if defined(CONFIG_DYNAMIC_DEBUG) || \ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define ibdev_dbg(__dev, format, args...) \ dynamic_ibdev_dbg(__dev, format, ##args) #else __printf(2, 3) __cold static inline void ibdev_dbg(const struct ib_device *ibdev, const char *format, ...) {} #endif #define ibdev_level_ratelimited(ibdev_level, ibdev, fmt, ...) \ do { \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ if (__ratelimit(&_rs)) \ ibdev_level(ibdev, fmt, ##__VA_ARGS__); \ } while (0) #define ibdev_emerg_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_emerg, ibdev, fmt, ##__VA_ARGS__) #define ibdev_alert_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_alert, ibdev, fmt, ##__VA_ARGS__) #define ibdev_crit_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_crit, ibdev, fmt, ##__VA_ARGS__) #define ibdev_err_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_err, ibdev, fmt, ##__VA_ARGS__) #define ibdev_warn_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_warn, ibdev, fmt, ##__VA_ARGS__) #define ibdev_notice_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_notice, ibdev, fmt, ##__VA_ARGS__) #define ibdev_info_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_info, ibdev, fmt, ##__VA_ARGS__) #if defined(CONFIG_DYNAMIC_DEBUG) || \ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define ibdev_dbg_ratelimited(ibdev, fmt, ...) \ do { \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ if (DYNAMIC_DEBUG_BRANCH(descriptor) && __ratelimit(&_rs)) \ __dynamic_ibdev_dbg(&descriptor, ibdev, fmt, \ ##__VA_ARGS__); \ } while (0) #else __printf(2, 3) __cold static inline void ibdev_dbg_ratelimited(const struct ib_device *ibdev, const char *format, ...) {} #endif union ib_gid { u8 raw[16]; struct { __be64 subnet_prefix; __be64 interface_id; } global; }; extern union ib_gid zgid; enum ib_gid_type { IB_GID_TYPE_IB = IB_UVERBS_GID_TYPE_IB, IB_GID_TYPE_ROCE = IB_UVERBS_GID_TYPE_ROCE_V1, IB_GID_TYPE_ROCE_UDP_ENCAP = IB_UVERBS_GID_TYPE_ROCE_V2, IB_GID_TYPE_SIZE }; #define ROCE_V2_UDP_DPORT 4791 struct ib_gid_attr { struct net_device __rcu *ndev; struct ib_device *device; union ib_gid gid; enum ib_gid_type gid_type; u16 index; u32 port_num; }; enum { /* set the local administered indication */ IB_SA_WELL_KNOWN_GUID = BIT_ULL(57) | 2, }; enum rdma_transport_type { RDMA_TRANSPORT_IB, RDMA_TRANSPORT_IWARP, RDMA_TRANSPORT_USNIC, RDMA_TRANSPORT_USNIC_UDP, RDMA_TRANSPORT_UNSPECIFIED, }; enum rdma_protocol_type { RDMA_PROTOCOL_IB, RDMA_PROTOCOL_IBOE, RDMA_PROTOCOL_IWARP, RDMA_PROTOCOL_USNIC_UDP }; __attribute_const__ enum rdma_transport_type rdma_node_get_transport(unsigned int node_type); enum rdma_network_type { RDMA_NETWORK_IB, RDMA_NETWORK_ROCE_V1, RDMA_NETWORK_IPV4, RDMA_NETWORK_IPV6 }; static inline enum ib_gid_type ib_network_to_gid_type(enum rdma_network_type network_type) { if (network_type == RDMA_NETWORK_IPV4 || network_type == RDMA_NETWORK_IPV6) return IB_GID_TYPE_ROCE_UDP_ENCAP; else if (network_type == RDMA_NETWORK_ROCE_V1) return IB_GID_TYPE_ROCE; else return IB_GID_TYPE_IB; } static inline enum rdma_network_type rdma_gid_attr_network_type(const struct ib_gid_attr *attr) { if (attr->gid_type == IB_GID_TYPE_IB) return RDMA_NETWORK_IB; if (attr->gid_type == IB_GID_TYPE_ROCE) return RDMA_NETWORK_ROCE_V1; if (ipv6_addr_v4mapped((struct in6_addr *)&attr->gid)) return RDMA_NETWORK_IPV4; else return RDMA_NETWORK_IPV6; } enum rdma_link_layer { IB_LINK_LAYER_UNSPECIFIED, IB_LINK_LAYER_INFINIBAND, IB_LINK_LAYER_ETHERNET, }; enum ib_device_cap_flags { IB_DEVICE_RESIZE_MAX_WR = IB_UVERBS_DEVICE_RESIZE_MAX_WR, IB_DEVICE_BAD_PKEY_CNTR = IB_UVERBS_DEVICE_BAD_PKEY_CNTR, IB_DEVICE_BAD_QKEY_CNTR = IB_UVERBS_DEVICE_BAD_QKEY_CNTR, IB_DEVICE_RAW_MULTI = IB_UVERBS_DEVICE_RAW_MULTI, IB_DEVICE_AUTO_PATH_MIG = IB_UVERBS_DEVICE_AUTO_PATH_MIG, IB_DEVICE_CHANGE_PHY_PORT = IB_UVERBS_DEVICE_CHANGE_PHY_PORT, IB_DEVICE_UD_AV_PORT_ENFORCE = IB_UVERBS_DEVICE_UD_AV_PORT_ENFORCE, IB_DEVICE_CURR_QP_STATE_MOD = IB_UVERBS_DEVICE_CURR_QP_STATE_MOD, IB_DEVICE_SHUTDOWN_PORT = IB_UVERBS_DEVICE_SHUTDOWN_PORT, /* IB_DEVICE_INIT_TYPE = IB_UVERBS_DEVICE_INIT_TYPE, (not in use) */ IB_DEVICE_PORT_ACTIVE_EVENT = IB_UVERBS_DEVICE_PORT_ACTIVE_EVENT, IB_DEVICE_SYS_IMAGE_GUID = IB_UVERBS_DEVICE_SYS_IMAGE_GUID, IB_DEVICE_RC_RNR_NAK_GEN = IB_UVERBS_DEVICE_RC_RNR_NAK_GEN, IB_DEVICE_SRQ_RESIZE = IB_UVERBS_DEVICE_SRQ_RESIZE, IB_DEVICE_N_NOTIFY_CQ = IB_UVERBS_DEVICE_N_NOTIFY_CQ, /* Reserved, old SEND_W_INV = 1 << 16,*/ IB_DEVICE_MEM_WINDOW = IB_UVERBS_DEVICE_MEM_WINDOW, /* * Devices should set IB_DEVICE_UD_IP_SUM if they support * insertion of UDP and TCP checksum on outgoing UD IPoIB * messages and can verify the validity of checksum for * incoming messages. Setting this flag implies that the * IPoIB driver may set NETIF_F_IP_CSUM for datagram mode. */ IB_DEVICE_UD_IP_CSUM = IB_UVERBS_DEVICE_UD_IP_CSUM, IB_DEVICE_XRC = IB_UVERBS_DEVICE_XRC, /* * This device supports the IB "base memory management extension", * which includes support for fast registrations (IB_WR_REG_MR, * IB_WR_LOCAL_INV and IB_WR_SEND_WITH_INV verbs). This flag should * also be set by any iWarp device which must support FRs to comply * to the iWarp verbs spec. iWarp devices also support the * IB_WR_RDMA_READ_WITH_INV verb for RDMA READs that invalidate the * stag. */ IB_DEVICE_MEM_MGT_EXTENSIONS = IB_UVERBS_DEVICE_MEM_MGT_EXTENSIONS, IB_DEVICE_MEM_WINDOW_TYPE_2A = IB_UVERBS_DEVICE_MEM_WINDOW_TYPE_2A, IB_DEVICE_MEM_WINDOW_TYPE_2B = IB_UVERBS_DEVICE_MEM_WINDOW_TYPE_2B, IB_DEVICE_RC_IP_CSUM = IB_UVERBS_DEVICE_RC_IP_CSUM, /* Deprecated. Please use IB_RAW_PACKET_CAP_IP_CSUM. */ IB_DEVICE_RAW_IP_CSUM = IB_UVERBS_DEVICE_RAW_IP_CSUM, IB_DEVICE_MANAGED_FLOW_STEERING = IB_UVERBS_DEVICE_MANAGED_FLOW_STEERING, /* Deprecated. Please use IB_RAW_PACKET_CAP_SCATTER_FCS. */ IB_DEVICE_RAW_SCATTER_FCS = IB_UVERBS_DEVICE_RAW_SCATTER_FCS, /* The device supports padding incoming writes to cacheline. */ IB_DEVICE_PCI_WRITE_END_PADDING = IB_UVERBS_DEVICE_PCI_WRITE_END_PADDING, /* Placement type attributes */ IB_DEVICE_FLUSH_GLOBAL = IB_UVERBS_DEVICE_FLUSH_GLOBAL, IB_DEVICE_FLUSH_PERSISTENT = IB_UVERBS_DEVICE_FLUSH_PERSISTENT, IB_DEVICE_ATOMIC_WRITE = IB_UVERBS_DEVICE_ATOMIC_WRITE, }; enum ib_kernel_cap_flags { /* * This device supports a per-device lkey or stag that can be * used without performing a memory registration for the local * memory. Note that ULPs should never check this flag, but * instead of use the local_dma_lkey flag in the ib_pd structure, * which will always contain a usable lkey. */ IBK_LOCAL_DMA_LKEY = 1 << 0, /* IB_QP_CREATE_INTEGRITY_EN is supported to implement T10-PI */ IBK_INTEGRITY_HANDOVER = 1 << 1, /* IB_ACCESS_ON_DEMAND is supported during reg_user_mr() */ IBK_ON_DEMAND_PAGING = 1 << 2, /* IB_MR_TYPE_SG_GAPS is supported */ IBK_SG_GAPS_REG = 1 << 3, /* Driver supports RDMA_NLDEV_CMD_DELLINK */ IBK_ALLOW_USER_UNREG = 1 << 4, /* ipoib will use IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK */ IBK_BLOCK_MULTICAST_LOOPBACK = 1 << 5, /* iopib will use IB_QP_CREATE_IPOIB_UD_LSO for its QPs */ IBK_UD_TSO = 1 << 6, /* iopib will use the device ops: * get_vf_config * get_vf_guid * get_vf_stats * set_vf_guid * set_vf_link_state */ IBK_VIRTUAL_FUNCTION = 1 << 7, /* ipoib will use IB_QP_CREATE_NETDEV_USE for its QPs */ IBK_RDMA_NETDEV_OPA = 1 << 8, }; enum ib_atomic_cap { IB_ATOMIC_NONE, IB_ATOMIC_HCA, IB_ATOMIC_GLOB }; enum ib_odp_general_cap_bits { IB_ODP_SUPPORT = IB_UVERBS_ODP_SUPPORT, IB_ODP_SUPPORT_IMPLICIT = IB_UVERBS_ODP_SUPPORT_IMPLICIT, }; enum ib_odp_transport_cap_bits { IB_ODP_SUPPORT_SEND = IB_UVERBS_ODP_SUPPORT_SEND, IB_ODP_SUPPORT_RECV = IB_UVERBS_ODP_SUPPORT_RECV, IB_ODP_SUPPORT_WRITE = IB_UVERBS_ODP_SUPPORT_WRITE, IB_ODP_SUPPORT_READ = IB_UVERBS_ODP_SUPPORT_READ, IB_ODP_SUPPORT_ATOMIC = IB_UVERBS_ODP_SUPPORT_ATOMIC, IB_ODP_SUPPORT_SRQ_RECV = IB_UVERBS_ODP_SUPPORT_SRQ_RECV, IB_ODP_SUPPORT_FLUSH = IB_UVERBS_ODP_SUPPORT_FLUSH, IB_ODP_SUPPORT_ATOMIC_WRITE = IB_UVERBS_ODP_SUPPORT_ATOMIC_WRITE, }; struct ib_odp_caps { uint64_t general_caps; struct { uint32_t rc_odp_caps; uint32_t uc_odp_caps; uint32_t ud_odp_caps; uint32_t xrc_odp_caps; } per_transport_caps; }; struct ib_rss_caps { /* Corresponding bit will be set if qp type from * 'enum ib_qp_type' is supported, e.g. * supported_qpts |= 1 << IB_QPT_UD */ u32 supported_qpts; u32 max_rwq_indirection_tables; u32 max_rwq_indirection_table_size; }; enum ib_tm_cap_flags { /* Support tag matching with rendezvous offload for RC transport */ IB_TM_CAP_RNDV_RC = 1 << 0, }; struct ib_tm_caps { /* Max size of RNDV header */ u32 max_rndv_hdr_size; /* Max number of entries in tag matching list */ u32 max_num_tags; /* From enum ib_tm_cap_flags */ u32 flags; /* Max number of outstanding list operations */ u32 max_ops; /* Max number of SGE in tag matching entry */ u32 max_sge; }; struct ib_cq_init_attr { unsigned int cqe; u32 comp_vector; u32 flags; }; enum ib_cq_attr_mask { IB_CQ_MODERATE = 1 << 0, }; struct ib_cq_caps { u16 max_cq_moderation_count; u16 max_cq_moderation_period; }; struct ib_dm_mr_attr { u64 length; u64 offset; u32 access_flags; }; struct ib_dm_alloc_attr { u64 length; u32 alignment; u32 flags; }; struct ib_device_attr { u64 fw_ver; __be64 sys_image_guid; u64 max_mr_size; u64 page_size_cap; u32 vendor_id; u32 vendor_part_id; u32 hw_ver; int max_qp; int max_qp_wr; u64 device_cap_flags; u64 kernel_cap_flags; int max_send_sge; int max_recv_sge; int max_sge_rd; int max_cq; int max_cqe; int max_mr; int max_pd; int max_qp_rd_atom; int max_ee_rd_atom; int max_res_rd_atom; int max_qp_init_rd_atom; int max_ee_init_rd_atom; enum ib_atomic_cap atomic_cap; enum ib_atomic_cap masked_atomic_cap; int max_ee; int max_rdd; int max_mw; int max_raw_ipv6_qp; int max_raw_ethy_qp; int max_mcast_grp; int max_mcast_qp_attach; int max_total_mcast_qp_attach; int max_ah; int max_srq; int max_srq_wr; int max_srq_sge; unsigned int max_fast_reg_page_list_len; unsigned int max_pi_fast_reg_page_list_len; u16 max_pkeys; u8 local_ca_ack_delay; int sig_prot_cap; int sig_guard_cap; struct ib_odp_caps odp_caps; uint64_t timestamp_mask; uint64_t hca_core_clock; /* in KHZ */ struct ib_rss_caps rss_caps; u32 max_wq_type_rq; u32 raw_packet_caps; /* Use ib_raw_packet_caps enum */ struct ib_tm_caps tm_caps; struct ib_cq_caps cq_caps; u64 max_dm_size; /* Max entries for sgl for optimized performance per READ */ u32 max_sgl_rd; }; enum ib_mtu { IB_MTU_256 = 1, IB_MTU_512 = 2, IB_MTU_1024 = 3, IB_MTU_2048 = 4, IB_MTU_4096 = 5 }; enum opa_mtu { OPA_MTU_8192 = 6, OPA_MTU_10240 = 7 }; static inline int ib_mtu_enum_to_int(enum ib_mtu mtu) { switch (mtu) { case IB_MTU_256: return 256; case IB_MTU_512: return 512; case IB_MTU_1024: return 1024; case IB_MTU_2048: return 2048; case IB_MTU_4096: return 4096; default: return -1; } } static inline enum ib_mtu ib_mtu_int_to_enum(int mtu) { if (mtu >= 4096) return IB_MTU_4096; else if (mtu >= 2048) return IB_MTU_2048; else if (mtu >= 1024) return IB_MTU_1024; else if (mtu >= 512) return IB_MTU_512; else return IB_MTU_256; } static inline int opa_mtu_enum_to_int(enum opa_mtu mtu) { switch (mtu) { case OPA_MTU_8192: return 8192; case OPA_MTU_10240: return 10240; default: return(ib_mtu_enum_to_int((enum ib_mtu)mtu)); } } static inline enum opa_mtu opa_mtu_int_to_enum(int mtu) { if (mtu >= 10240) return OPA_MTU_10240; else if (mtu >= 8192) return OPA_MTU_8192; else return ((enum opa_mtu)ib_mtu_int_to_enum(mtu)); } enum ib_port_state { IB_PORT_NOP = 0, IB_PORT_DOWN = 1, IB_PORT_INIT = 2, IB_PORT_ARMED = 3, IB_PORT_ACTIVE = 4, IB_PORT_ACTIVE_DEFER = 5 }; static inline const char *__attribute_const__ ib_port_state_to_str(enum ib_port_state state) { const char * const states[] = { [IB_PORT_NOP] = "NOP", [IB_PORT_DOWN] = "DOWN", [IB_PORT_INIT] = "INIT", [IB_PORT_ARMED] = "ARMED", [IB_PORT_ACTIVE] = "ACTIVE", [IB_PORT_ACTIVE_DEFER] = "ACTIVE_DEFER", }; if (state < ARRAY_SIZE(states)) return states[state]; return "UNKNOWN"; } enum ib_port_phys_state { IB_PORT_PHYS_STATE_SLEEP = 1, IB_PORT_PHYS_STATE_POLLING = 2, IB_PORT_PHYS_STATE_DISABLED = 3, IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING = 4, IB_PORT_PHYS_STATE_LINK_UP = 5, IB_PORT_PHYS_STATE_LINK_ERROR_RECOVERY = 6, IB_PORT_PHYS_STATE_PHY_TEST = 7, }; enum ib_port_width { IB_WIDTH_1X = 1, IB_WIDTH_2X = 16, IB_WIDTH_4X = 2, IB_WIDTH_8X = 4, IB_WIDTH_12X = 8 }; static inline int ib_width_enum_to_int(enum ib_port_width width) { switch (width) { case IB_WIDTH_1X: return 1; case IB_WIDTH_2X: return 2; case IB_WIDTH_4X: return 4; case IB_WIDTH_8X: return 8; case IB_WIDTH_12X: return 12; default: return -1; } } enum ib_port_speed { IB_SPEED_SDR = 1, IB_SPEED_DDR = 2, IB_SPEED_QDR = 4, IB_SPEED_FDR10 = 8, IB_SPEED_FDR = 16, IB_SPEED_EDR = 32, IB_SPEED_HDR = 64, IB_SPEED_NDR = 128, IB_SPEED_XDR = 256, }; enum ib_stat_flag { IB_STAT_FLAG_OPTIONAL = 1 << 0, }; /** * struct rdma_stat_desc - description of one rdma stat/counter * @name: The name of the counter * @flags: Flags of the counter; For example, IB_STAT_FLAG_OPTIONAL * @priv: Driver private information; Core code should not use */ struct rdma_stat_desc { const char *name; unsigned int flags; const void *priv; }; /** * struct rdma_hw_stats - collection of hardware stats and their management * @lock: Mutex to protect parallel write access to lifespan and values * of counters, which are 64bits and not guaranteed to be written * atomicaly on 32bits systems. * @timestamp: Used by the core code to track when the last update was * @lifespan: Used by the core code to determine how old the counters * should be before being updated again. Stored in jiffies, defaults * to 10 milliseconds, drivers can override the default be specifying * their own value during their allocation routine. * @descs: Array of pointers to static descriptors used for the counters * in directory. * @is_disabled: A bitmap to indicate each counter is currently disabled * or not. * @num_counters: How many hardware counters there are. If name is * shorter than this number, a kernel oops will result. Driver authors * are encouraged to leave BUILD_BUG_ON(ARRAY_SIZE(@name) < num_counters) * in their code to prevent this. * @value: Array of u64 counters that are accessed by the sysfs code and * filled in by the drivers get_stats routine */ struct rdma_hw_stats { struct mutex lock; /* Protect lifespan and values[] */ unsigned long timestamp; unsigned long lifespan; const struct rdma_stat_desc *descs; unsigned long *is_disabled; int num_counters; u64 value[] __counted_by(num_counters); }; #define RDMA_HW_STATS_DEFAULT_LIFESPAN 10 struct rdma_hw_stats *rdma_alloc_hw_stats_struct( const struct rdma_stat_desc *descs, int num_counters, unsigned long lifespan); void rdma_free_hw_stats_struct(struct rdma_hw_stats *stats); /* Define bits for the various functionality this port needs to be supported by * the core. */ /* Management 0x00000FFF */ #define RDMA_CORE_CAP_IB_MAD 0x00000001 #define RDMA_CORE_CAP_IB_SMI 0x00000002 #define RDMA_CORE_CAP_IB_CM 0x00000004 #define RDMA_CORE_CAP_IW_CM 0x00000008 #define RDMA_CORE_CAP_IB_SA 0x00000010 #define RDMA_CORE_CAP_OPA_MAD 0x00000020 /* Address format 0x000FF000 */ #define RDMA_CORE_CAP_AF_IB 0x00001000 #define RDMA_CORE_CAP_ETH_AH 0x00002000 #define RDMA_CORE_CAP_OPA_AH 0x00004000 #define RDMA_CORE_CAP_IB_GRH_REQUIRED 0x00008000 /* Protocol 0xFFF00000 */ #define RDMA_CORE_CAP_PROT_IB 0x00100000 #define RDMA_CORE_CAP_PROT_ROCE 0x00200000 #define RDMA_CORE_CAP_PROT_IWARP 0x00400000 #define RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP 0x00800000 #define RDMA_CORE_CAP_PROT_RAW_PACKET 0x01000000 #define RDMA_CORE_CAP_PROT_USNIC 0x02000000 #define RDMA_CORE_PORT_IB_GRH_REQUIRED (RDMA_CORE_CAP_IB_GRH_REQUIRED \ | RDMA_CORE_CAP_PROT_ROCE \ | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP) #define RDMA_CORE_PORT_IBA_IB (RDMA_CORE_CAP_PROT_IB \ | RDMA_CORE_CAP_IB_MAD \ | RDMA_CORE_CAP_IB_SMI \ | RDMA_CORE_CAP_IB_CM \ | RDMA_CORE_CAP_IB_SA \ | RDMA_CORE_CAP_AF_IB) #define RDMA_CORE_PORT_IBA_ROCE (RDMA_CORE_CAP_PROT_ROCE \ | RDMA_CORE_CAP_IB_MAD \ | RDMA_CORE_CAP_IB_CM \ | RDMA_CORE_CAP_AF_IB \ | RDMA_CORE_CAP_ETH_AH) #define RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP \ (RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP \ | RDMA_CORE_CAP_IB_MAD \ | RDMA_CORE_CAP_IB_CM \ | RDMA_CORE_CAP_AF_IB \ | RDMA_CORE_CAP_ETH_AH) #define RDMA_CORE_PORT_IWARP (RDMA_CORE_CAP_PROT_IWARP \ | RDMA_CORE_CAP_IW_CM) #define RDMA_CORE_PORT_INTEL_OPA (RDMA_CORE_PORT_IBA_IB \ | RDMA_CORE_CAP_OPA_MAD) #define RDMA_CORE_PORT_RAW_PACKET (RDMA_CORE_CAP_PROT_RAW_PACKET) #define RDMA_CORE_PORT_USNIC (RDMA_CORE_CAP_PROT_USNIC) struct ib_port_attr { u64 subnet_prefix; enum ib_port_state state; enum ib_mtu max_mtu; enum ib_mtu active_mtu; u32 phys_mtu; int gid_tbl_len; unsigned int ip_gids:1; /* This is the value from PortInfo CapabilityMask, defined by IBA */ u32 port_cap_flags; u32 max_msg_sz; u32 bad_pkey_cntr; u32 qkey_viol_cntr; u16 pkey_tbl_len; u32 sm_lid; u32 lid; u8 lmc; u8 max_vl_num; u8 sm_sl; u8 subnet_timeout; u8 init_type_reply; u8 active_width; u16 active_speed; u8 phys_state; u16 port_cap_flags2; }; enum ib_device_modify_flags { IB_DEVICE_MODIFY_SYS_IMAGE_GUID = 1 << 0, IB_DEVICE_MODIFY_NODE_DESC = 1 << 1 }; #define IB_DEVICE_NODE_DESC_MAX 64 struct ib_device_modify { u64 sys_image_guid; char node_desc[IB_DEVICE_NODE_DESC_MAX]; }; enum ib_port_modify_flags { IB_PORT_SHUTDOWN = 1, IB_PORT_INIT_TYPE = (1<<2), IB_PORT_RESET_QKEY_CNTR = (1<<3), IB_PORT_OPA_MASK_CHG = (1<<4) }; struct ib_port_modify { u32 set_port_cap_mask; u32 clr_port_cap_mask; u8 init_type; }; enum ib_event_type { IB_EVENT_CQ_ERR, IB_EVENT_QP_FATAL, IB_EVENT_QP_REQ_ERR, IB_EVENT_QP_ACCESS_ERR, IB_EVENT_COMM_EST, IB_EVENT_SQ_DRAINED, IB_EVENT_PATH_MIG, IB_EVENT_PATH_MIG_ERR, IB_EVENT_DEVICE_FATAL, IB_EVENT_PORT_ACTIVE, IB_EVENT_PORT_ERR, IB_EVENT_LID_CHANGE, IB_EVENT_PKEY_CHANGE, IB_EVENT_SM_CHANGE, IB_EVENT_SRQ_ERR, IB_EVENT_SRQ_LIMIT_REACHED, IB_EVENT_QP_LAST_WQE_REACHED, IB_EVENT_CLIENT_REREGISTER, IB_EVENT_GID_CHANGE, IB_EVENT_WQ_FATAL, }; const char *__attribute_const__ ib_event_msg(enum ib_event_type event); struct ib_event { struct ib_device *device; union { struct ib_cq *cq; struct ib_qp *qp; struct ib_srq *srq; struct ib_wq *wq; u32 port_num; } element; enum ib_event_type event; }; struct ib_event_handler { struct ib_device *device; void (*handler)(struct ib_event_handler *, struct ib_event *); struct list_head list; }; #define INIT_IB_EVENT_HANDLER(_ptr, _device, _handler) \ do { \ (_ptr)->device = _device; \ (_ptr)->handler = _handler; \ INIT_LIST_HEAD(&(_ptr)->list); \ } while (0) struct ib_global_route { const struct ib_gid_attr *sgid_attr; union ib_gid dgid; u32 flow_label; u8 sgid_index; u8 hop_limit; u8 traffic_class; }; struct ib_grh { __be32 version_tclass_flow; __be16 paylen; u8 next_hdr; u8 hop_limit; union ib_gid sgid; union ib_gid dgid; }; union rdma_network_hdr { struct ib_grh ibgrh; struct { /* The IB spec states that if it's IPv4, the header * is located in the last 20 bytes of the header. */ u8 reserved[20]; struct iphdr roce4grh; }; }; #define IB_QPN_MASK 0xFFFFFF enum { IB_MULTICAST_QPN = 0xffffff }; #define IB_LID_PERMISSIVE cpu_to_be16(0xFFFF) #define IB_MULTICAST_LID_BASE cpu_to_be16(0xC000) enum ib_ah_flags { IB_AH_GRH = 1 }; enum ib_rate { IB_RATE_PORT_CURRENT = 0, IB_RATE_2_5_GBPS = 2, IB_RATE_5_GBPS = 5, IB_RATE_10_GBPS = 3, IB_RATE_20_GBPS = 6, IB_RATE_30_GBPS = 4, IB_RATE_40_GBPS = 7, IB_RATE_60_GBPS = 8, IB_RATE_80_GBPS = 9, IB_RATE_120_GBPS = 10, IB_RATE_14_GBPS = 11, IB_RATE_56_GBPS = 12, IB_RATE_112_GBPS = 13, IB_RATE_168_GBPS = 14, IB_RATE_25_GBPS = 15, IB_RATE_100_GBPS = 16, IB_RATE_200_GBPS = 17, IB_RATE_300_GBPS = 18, IB_RATE_28_GBPS = 19, IB_RATE_50_GBPS = 20, IB_RATE_400_GBPS = 21, IB_RATE_600_GBPS = 22, IB_RATE_800_GBPS = 23, IB_RATE_1600_GBPS = 25, }; /** * ib_rate_to_mult - Convert the IB rate enum to a multiple of the * base rate of 2.5 Gbit/sec. For example, IB_RATE_5_GBPS will be * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec. * @rate: rate to convert. */ __attribute_const__ int ib_rate_to_mult(enum ib_rate rate); /** * ib_rate_to_mbps - Convert the IB rate enum to Mbps. * For example, IB_RATE_2_5_GBPS will be converted to 2500. * @rate: rate to convert. */ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate); /** * enum ib_mr_type - memory region type * @IB_MR_TYPE_MEM_REG: memory region that is used for * normal registration * @IB_MR_TYPE_SG_GAPS: memory region that is capable to * register any arbitrary sg lists (without * the normal mr constraints - see * ib_map_mr_sg) * @IB_MR_TYPE_DM: memory region that is used for device * memory registration * @IB_MR_TYPE_USER: memory region that is used for the user-space * application * @IB_MR_TYPE_DMA: memory region that is used for DMA operations * without address translations (VA=PA) * @IB_MR_TYPE_INTEGRITY: memory region that is used for * data integrity operations */ enum ib_mr_type { IB_MR_TYPE_MEM_REG, IB_MR_TYPE_SG_GAPS, IB_MR_TYPE_DM, IB_MR_TYPE_USER, IB_MR_TYPE_DMA, IB_MR_TYPE_INTEGRITY, }; enum ib_mr_status_check { IB_MR_CHECK_SIG_STATUS = 1, }; /** * struct ib_mr_status - Memory region status container * * @fail_status: Bitmask of MR checks status. For each * failed check a corresponding status bit is set. * @sig_err: Additional info for IB_MR_CEHCK_SIG_STATUS * failure. */ struct ib_mr_status { u32 fail_status; struct ib_sig_err sig_err; }; /** * mult_to_ib_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate * enum. * @mult: multiple to convert. */ __attribute_const__ enum ib_rate mult_to_ib_rate(int mult); struct rdma_ah_init_attr { struct rdma_ah_attr *ah_attr; u32 flags; struct net_device *xmit_slave; }; enum rdma_ah_attr_type { RDMA_AH_ATTR_TYPE_UNDEFINED, RDMA_AH_ATTR_TYPE_IB, RDMA_AH_ATTR_TYPE_ROCE, RDMA_AH_ATTR_TYPE_OPA, }; struct ib_ah_attr { u16 dlid; u8 src_path_bits; }; struct roce_ah_attr { u8 dmac[ETH_ALEN]; }; struct opa_ah_attr { u32 dlid; u8 src_path_bits; bool make_grd; }; struct rdma_ah_attr { struct ib_global_route grh; u8 sl; u8 static_rate; u32 port_num; u8 ah_flags; enum rdma_ah_attr_type type; union { struct ib_ah_attr ib; struct roce_ah_attr roce; struct opa_ah_attr opa; }; }; enum ib_wc_status { IB_WC_SUCCESS, IB_WC_LOC_LEN_ERR, IB_WC_LOC_QP_OP_ERR, IB_WC_LOC_EEC_OP_ERR, IB_WC_LOC_PROT_ERR, IB_WC_WR_FLUSH_ERR, IB_WC_MW_BIND_ERR, IB_WC_BAD_RESP_ERR, IB_WC_LOC_ACCESS_ERR, IB_WC_REM_INV_REQ_ERR, IB_WC_REM_ACCESS_ERR, IB_WC_REM_OP_ERR, IB_WC_RETRY_EXC_ERR, IB_WC_RNR_RETRY_EXC_ERR, IB_WC_LOC_RDD_VIOL_ERR, IB_WC_REM_INV_RD_REQ_ERR, IB_WC_REM_ABORT_ERR, IB_WC_INV_EECN_ERR, IB_WC_INV_EEC_STATE_ERR, IB_WC_FATAL_ERR, IB_WC_RESP_TIMEOUT_ERR, IB_WC_GENERAL_ERR }; const char *__attribute_const__ ib_wc_status_msg(enum ib_wc_status status); enum ib_wc_opcode { IB_WC_SEND = IB_UVERBS_WC_SEND, IB_WC_RDMA_WRITE = IB_UVERBS_WC_RDMA_WRITE, IB_WC_RDMA_READ = IB_UVERBS_WC_RDMA_READ, IB_WC_COMP_SWAP = IB_UVERBS_WC_COMP_SWAP, IB_WC_FETCH_ADD = IB_UVERBS_WC_FETCH_ADD, IB_WC_BIND_MW = IB_UVERBS_WC_BIND_MW, IB_WC_LOCAL_INV = IB_UVERBS_WC_LOCAL_INV, IB_WC_LSO = IB_UVERBS_WC_TSO, IB_WC_ATOMIC_WRITE = IB_UVERBS_WC_ATOMIC_WRITE, IB_WC_REG_MR, IB_WC_MASKED_COMP_SWAP, IB_WC_MASKED_FETCH_ADD, IB_WC_FLUSH = IB_UVERBS_WC_FLUSH, /* * Set value of IB_WC_RECV so consumers can test if a completion is a * receive by testing (opcode & IB_WC_RECV). */ IB_WC_RECV = 1 << 7, IB_WC_RECV_RDMA_WITH_IMM }; enum ib_wc_flags { IB_WC_GRH = 1, IB_WC_WITH_IMM = (1<<1), IB_WC_WITH_INVALIDATE = (1<<2), IB_WC_IP_CSUM_OK = (1<<3), IB_WC_WITH_SMAC = (1<<4), IB_WC_WITH_VLAN = (1<<5), IB_WC_WITH_NETWORK_HDR_TYPE = (1<<6), }; struct ib_wc { union { u64 wr_id; struct ib_cqe *wr_cqe; }; enum ib_wc_status status; enum ib_wc_opcode opcode; u32 vendor_err; u32 byte_len; struct ib_qp *qp; union { __be32 imm_data; u32 invalidate_rkey; } ex; u32 src_qp; u32 slid; int wc_flags; u16 pkey_index; u8 sl; u8 dlid_path_bits; u32 port_num; /* valid only for DR SMPs on switches */ u8 smac[ETH_ALEN]; u16 vlan_id; u8 network_hdr_type; }; enum ib_cq_notify_flags { IB_CQ_SOLICITED = 1 << 0, IB_CQ_NEXT_COMP = 1 << 1, IB_CQ_SOLICITED_MASK = IB_CQ_SOLICITED | IB_CQ_NEXT_COMP, IB_CQ_REPORT_MISSED_EVENTS = 1 << 2, }; enum ib_srq_type { IB_SRQT_BASIC = IB_UVERBS_SRQT_BASIC, IB_SRQT_XRC = IB_UVERBS_SRQT_XRC, IB_SRQT_TM = IB_UVERBS_SRQT_TM, }; static inline bool ib_srq_has_cq(enum ib_srq_type srq_type) { return srq_type == IB_SRQT_XRC || srq_type == IB_SRQT_TM; } enum ib_srq_attr_mask { IB_SRQ_MAX_WR = 1 << 0, IB_SRQ_LIMIT = 1 << 1, }; struct ib_srq_attr { u32 max_wr; u32 max_sge; u32 srq_limit; }; struct ib_srq_init_attr { void (*event_handler)(struct ib_event *, void *); void *srq_context; struct ib_srq_attr attr; enum ib_srq_type srq_type; struct { struct ib_cq *cq; union { struct { struct ib_xrcd *xrcd; } xrc; struct { u32 max_num_tags; } tag_matching; }; } ext; }; struct ib_qp_cap { u32 max_send_wr; u32 max_recv_wr; u32 max_send_sge; u32 max_recv_sge; u32 max_inline_data; /* * Maximum number of rdma_rw_ctx structures in flight at a time. * ib_create_qp() will calculate the right amount of needed WRs * and MRs based on this. */ u32 max_rdma_ctxs; }; enum ib_sig_type { IB_SIGNAL_ALL_WR, IB_SIGNAL_REQ_WR }; enum ib_qp_type { /* * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries * here (and in that order) since the MAD layer uses them as * indices into a 2-entry table. */ IB_QPT_SMI, IB_QPT_GSI, IB_QPT_RC = IB_UVERBS_QPT_RC, IB_QPT_UC = IB_UVERBS_QPT_UC, IB_QPT_UD = IB_UVERBS_QPT_UD, IB_QPT_RAW_IPV6, IB_QPT_RAW_ETHERTYPE, IB_QPT_RAW_PACKET = IB_UVERBS_QPT_RAW_PACKET, IB_QPT_XRC_INI = IB_UVERBS_QPT_XRC_INI, IB_QPT_XRC_TGT = IB_UVERBS_QPT_XRC_TGT, IB_QPT_MAX, IB_QPT_DRIVER = IB_UVERBS_QPT_DRIVER, /* Reserve a range for qp types internal to the low level driver. * These qp types will not be visible at the IB core layer, so the * IB_QPT_MAX usages should not be affected in the core layer */ IB_QPT_RESERVED1 = 0x1000, IB_QPT_RESERVED2, IB_QPT_RESERVED3, IB_QPT_RESERVED4, IB_QPT_RESERVED5, IB_QPT_RESERVED6, IB_QPT_RESERVED7, IB_QPT_RESERVED8, IB_QPT_RESERVED9, IB_QPT_RESERVED10, }; enum ib_qp_create_flags { IB_QP_CREATE_IPOIB_UD_LSO = 1 << 0, IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = IB_UVERBS_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, IB_QP_CREATE_CROSS_CHANNEL = 1 << 2, IB_QP_CREATE_MANAGED_SEND = 1 << 3, IB_QP_CREATE_MANAGED_RECV = 1 << 4, IB_QP_CREATE_NETIF_QP = 1 << 5, IB_QP_CREATE_INTEGRITY_EN = 1 << 6, IB_QP_CREATE_NETDEV_USE = 1 << 7, IB_QP_CREATE_SCATTER_FCS = IB_UVERBS_QP_CREATE_SCATTER_FCS, IB_QP_CREATE_CVLAN_STRIPPING = IB_UVERBS_QP_CREATE_CVLAN_STRIPPING, IB_QP_CREATE_SOURCE_QPN = 1 << 10, IB_QP_CREATE_PCI_WRITE_END_PADDING = IB_UVERBS_QP_CREATE_PCI_WRITE_END_PADDING, /* reserve bits 26-31 for low level drivers' internal use */ IB_QP_CREATE_RESERVED_START = 1 << 26, IB_QP_CREATE_RESERVED_END = 1 << 31, }; /* * Note: users may not call ib_close_qp or ib_destroy_qp from the event_handler * callback to destroy the passed in QP. */ struct ib_qp_init_attr { /* This callback occurs in workqueue context */ void (*event_handler)(struct ib_event *, void *); void *qp_context; struct ib_cq *send_cq; struct ib_cq *recv_cq; struct ib_srq *srq; struct ib_xrcd *xrcd; /* XRC TGT QPs only */ struct ib_qp_cap cap; enum ib_sig_type sq_sig_type; enum ib_qp_type qp_type; u32 create_flags; /* * Only needed for special QP types, or when using the RW API. */ u32 port_num; struct ib_rwq_ind_table *rwq_ind_tbl; u32 source_qpn; }; struct ib_qp_open_attr { void (*event_handler)(struct ib_event *, void *); void *qp_context; u32 qp_num; enum ib_qp_type qp_type; }; enum ib_rnr_timeout { IB_RNR_TIMER_655_36 = 0, IB_RNR_TIMER_000_01 = 1, IB_RNR_TIMER_000_02 = 2, IB_RNR_TIMER_000_03 = 3, IB_RNR_TIMER_000_04 = 4, IB_RNR_TIMER_000_06 = 5, IB_RNR_TIMER_000_08 = 6, IB_RNR_TIMER_000_12 = 7, IB_RNR_TIMER_000_16 = 8, IB_RNR_TIMER_000_24 = 9, IB_RNR_TIMER_000_32 = 10, IB_RNR_TIMER_000_48 = 11, IB_RNR_TIMER_000_64 = 12, IB_RNR_TIMER_000_96 = 13, IB_RNR_TIMER_001_28 = 14, IB_RNR_TIMER_001_92 = 15, IB_RNR_TIMER_002_56 = 16, IB_RNR_TIMER_003_84 = 17, IB_RNR_TIMER_005_12 = 18, IB_RNR_TIMER_007_68 = 19, IB_RNR_TIMER_010_24 = 20, IB_RNR_TIMER_015_36 = 21, IB_RNR_TIMER_020_48 = 22, IB_RNR_TIMER_030_72 = 23, IB_RNR_TIMER_040_96 = 24, IB_RNR_TIMER_061_44 = 25, IB_RNR_TIMER_081_92 = 26, IB_RNR_TIMER_122_88 = 27, IB_RNR_TIMER_163_84 = 28, IB_RNR_TIMER_245_76 = 29, IB_RNR_TIMER_327_68 = 30, IB_RNR_TIMER_491_52 = 31 }; enum ib_qp_attr_mask { IB_QP_STATE = 1, IB_QP_CUR_STATE = (1<<1), IB_QP_EN_SQD_ASYNC_NOTIFY = (1<<2), IB_QP_ACCESS_FLAGS = (1<<3), IB_QP_PKEY_INDEX = (1<<4), IB_QP_PORT = (1<<5), IB_QP_QKEY = (1<<6), IB_QP_AV = (1<<7), IB_QP_PATH_MTU = (1<<8), IB_QP_TIMEOUT = (1<<9), IB_QP_RETRY_CNT = (1<<10), IB_QP_RNR_RETRY = (1<<11), IB_QP_RQ_PSN = (1<<12), IB_QP_MAX_QP_RD_ATOMIC = (1<<13), IB_QP_ALT_PATH = (1<<14), IB_QP_MIN_RNR_TIMER = (1<<15), IB_QP_SQ_PSN = (1<<16), IB_QP_MAX_DEST_RD_ATOMIC = (1<<17), IB_QP_PATH_MIG_STATE = (1<<18), IB_QP_CAP = (1<<19), IB_QP_DEST_QPN = (1<<20), IB_QP_RESERVED1 = (1<<21), IB_QP_RESERVED2 = (1<<22), IB_QP_RESERVED3 = (1<<23), IB_QP_RESERVED4 = (1<<24), IB_QP_RATE_LIMIT = (1<<25), IB_QP_ATTR_STANDARD_BITS = GENMASK(20, 0), }; enum ib_qp_state { IB_QPS_RESET, IB_QPS_INIT, IB_QPS_RTR, IB_QPS_RTS, IB_QPS_SQD, IB_QPS_SQE, IB_QPS_ERR }; enum ib_mig_state { IB_MIG_MIGRATED, IB_MIG_REARM, IB_MIG_ARMED }; enum ib_mw_type { IB_MW_TYPE_1 = 1, IB_MW_TYPE_2 = 2 }; struct ib_qp_attr { enum ib_qp_state qp_state; enum ib_qp_state cur_qp_state; enum ib_mtu path_mtu; enum ib_mig_state path_mig_state; u32 qkey; u32 rq_psn; u32 sq_psn; u32 dest_qp_num; int qp_access_flags; struct ib_qp_cap cap; struct rdma_ah_attr ah_attr; struct rdma_ah_attr alt_ah_attr; u16 pkey_index; u16 alt_pkey_index; u8 en_sqd_async_notify; u8 sq_draining; u8 max_rd_atomic; u8 max_dest_rd_atomic; u8 min_rnr_timer; u32 port_num; u8 timeout; u8 retry_cnt; u8 rnr_retry; u32 alt_port_num; u8 alt_timeout; u32 rate_limit; struct net_device *xmit_slave; }; enum ib_wr_opcode { /* These are shared with userspace */ IB_WR_RDMA_WRITE = IB_UVERBS_WR_RDMA_WRITE, IB_WR_RDMA_WRITE_WITH_IMM = IB_UVERBS_WR_RDMA_WRITE_WITH_IMM, IB_WR_SEND = IB_UVERBS_WR_SEND, IB_WR_SEND_WITH_IMM = IB_UVERBS_WR_SEND_WITH_IMM, IB_WR_RDMA_READ = IB_UVERBS_WR_RDMA_READ, IB_WR_ATOMIC_CMP_AND_SWP = IB_UVERBS_WR_ATOMIC_CMP_AND_SWP, IB_WR_ATOMIC_FETCH_AND_ADD = IB_UVERBS_WR_ATOMIC_FETCH_AND_ADD, IB_WR_BIND_MW = IB_UVERBS_WR_BIND_MW, IB_WR_LSO = IB_UVERBS_WR_TSO, IB_WR_SEND_WITH_INV = IB_UVERBS_WR_SEND_WITH_INV, IB_WR_RDMA_READ_WITH_INV = IB_UVERBS_WR_RDMA_READ_WITH_INV, IB_WR_LOCAL_INV = IB_UVERBS_WR_LOCAL_INV, IB_WR_MASKED_ATOMIC_CMP_AND_SWP = IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP, IB_WR_MASKED_ATOMIC_FETCH_AND_ADD = IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD, IB_WR_FLUSH = IB_UVERBS_WR_FLUSH, IB_WR_ATOMIC_WRITE = IB_UVERBS_WR_ATOMIC_WRITE, /* These are kernel only and can not be issued by userspace */ IB_WR_REG_MR = 0x20, IB_WR_REG_MR_INTEGRITY, /* reserve values for low level drivers' internal use. * These values will not be used at all in the ib core layer. */ IB_WR_RESERVED1 = 0xf0, IB_WR_RESERVED2, IB_WR_RESERVED3, IB_WR_RESERVED4, IB_WR_RESERVED5, IB_WR_RESERVED6, IB_WR_RESERVED7, IB_WR_RESERVED8, IB_WR_RESERVED9, IB_WR_RESERVED10, }; enum ib_send_flags { IB_SEND_FENCE = 1, IB_SEND_SIGNALED = (1<<1), IB_SEND_SOLICITED = (1<<2), IB_SEND_INLINE = (1<<3), IB_SEND_IP_CSUM = (1<<4), /* reserve bits 26-31 for low level drivers' internal use */ IB_SEND_RESERVED_START = (1 << 26), IB_SEND_RESERVED_END = (1 << 31), }; struct ib_sge { u64 addr; u32 length; u32 lkey; }; struct ib_cqe { void (*done)(struct ib_cq *cq, struct ib_wc *wc); }; struct ib_send_wr { struct ib_send_wr *next; union { u64 wr_id; struct ib_cqe *wr_cqe; }; struct ib_sge *sg_list; int num_sge; enum ib_wr_opcode opcode; int send_flags; union { __be32 imm_data; u32 invalidate_rkey; } ex; }; struct ib_rdma_wr { struct ib_send_wr wr; u64 remote_addr; u32 rkey; }; static inline const struct ib_rdma_wr *rdma_wr(const struct ib_send_wr *wr) { return container_of(wr, struct ib_rdma_wr, wr); } struct ib_atomic_wr { struct ib_send_wr wr; u64 remote_addr; u64 compare_add; u64 swap; u64 compare_add_mask; u64 swap_mask; u32 rkey; }; static inline const struct ib_atomic_wr *atomic_wr(const struct ib_send_wr *wr) { return container_of(wr, struct ib_atomic_wr, wr); } struct ib_ud_wr { struct ib_send_wr wr; struct ib_ah *ah; void *header; int hlen; int mss; u32 remote_qpn; u32 remote_qkey; u16 pkey_index; /* valid for GSI only */ u32 port_num; /* valid for DR SMPs on switch only */ }; static inline const struct ib_ud_wr *ud_wr(const struct ib_send_wr *wr) { return container_of(wr, struct ib_ud_wr, wr); } struct ib_reg_wr { struct ib_send_wr wr; struct ib_mr *mr; u32 key; int access; }; static inline const struct ib_reg_wr *reg_wr(const struct ib_send_wr *wr) { return container_of(wr, struct ib_reg_wr, wr); } struct ib_recv_wr { struct ib_recv_wr *next; union { u64 wr_id; struct ib_cqe *wr_cqe; }; struct ib_sge *sg_list; int num_sge; }; enum ib_access_flags { IB_ACCESS_LOCAL_WRITE = IB_UVERBS_ACCESS_LOCAL_WRITE, IB_ACCESS_REMOTE_WRITE = IB_UVERBS_ACCESS_REMOTE_WRITE, IB_ACCESS_REMOTE_READ = IB_UVERBS_ACCESS_REMOTE_READ, IB_ACCESS_REMOTE_ATOMIC = IB_UVERBS_ACCESS_REMOTE_ATOMIC, IB_ACCESS_MW_BIND = IB_UVERBS_ACCESS_MW_BIND, IB_ZERO_BASED = IB_UVERBS_ACCESS_ZERO_BASED, IB_ACCESS_ON_DEMAND = IB_UVERBS_ACCESS_ON_DEMAND, IB_ACCESS_HUGETLB = IB_UVERBS_ACCESS_HUGETLB, IB_ACCESS_RELAXED_ORDERING = IB_UVERBS_ACCESS_RELAXED_ORDERING, IB_ACCESS_FLUSH_GLOBAL = IB_UVERBS_ACCESS_FLUSH_GLOBAL, IB_ACCESS_FLUSH_PERSISTENT = IB_UVERBS_ACCESS_FLUSH_PERSISTENT, IB_ACCESS_OPTIONAL = IB_UVERBS_ACCESS_OPTIONAL_RANGE, IB_ACCESS_SUPPORTED = ((IB_ACCESS_FLUSH_PERSISTENT << 1) - 1) | IB_ACCESS_OPTIONAL, }; /* * XXX: these are apparently used for ->rereg_user_mr, no idea why they * are hidden here instead of a uapi header! */ enum ib_mr_rereg_flags { IB_MR_REREG_TRANS = 1, IB_MR_REREG_PD = (1<<1), IB_MR_REREG_ACCESS = (1<<2), IB_MR_REREG_SUPPORTED = ((IB_MR_REREG_ACCESS << 1) - 1) }; struct ib_umem; enum rdma_remove_reason { /* * Userspace requested uobject deletion or initial try * to remove uobject via cleanup. Call could fail */ RDMA_REMOVE_DESTROY, /* Context deletion. This call should delete the actual object itself */ RDMA_REMOVE_CLOSE, /* Driver is being hot-unplugged. This call should delete the actual object itself */ RDMA_REMOVE_DRIVER_REMOVE, /* uobj is being cleaned-up before being committed */ RDMA_REMOVE_ABORT, /* The driver failed to destroy the uobject and is being disconnected */ RDMA_REMOVE_DRIVER_FAILURE, }; struct ib_rdmacg_object { #ifdef CONFIG_CGROUP_RDMA struct rdma_cgroup *cg; /* owner rdma cgroup */ #endif }; struct ib_ucontext { struct ib_device *device; struct ib_uverbs_file *ufile; struct ib_rdmacg_object cg_obj; u64 enabled_caps; /* * Implementation details of the RDMA core, don't use in drivers: */ struct rdma_restrack_entry res; struct xarray mmap_xa; }; struct ib_uobject { u64 user_handle; /* handle given to us by userspace */ /* ufile & ucontext owning this object */ struct ib_uverbs_file *ufile; /* FIXME, save memory: ufile->context == context */ struct ib_ucontext *context; /* associated user context */ void *object; /* containing object */ struct list_head list; /* link to context's list */ struct ib_rdmacg_object cg_obj; /* rdmacg object */ int id; /* index into kernel idr */ struct kref ref; atomic_t usecnt; /* protects exclusive access */ struct rcu_head rcu; /* kfree_rcu() overhead */ const struct uverbs_api_object *uapi_object; }; struct ib_udata { const void __user *inbuf; void __user *outbuf; size_t inlen; size_t outlen; }; struct ib_pd { u32 local_dma_lkey; u32 flags; struct ib_device *device; struct ib_uobject *uobject; atomic_t usecnt; /* count all resources */ u32 unsafe_global_rkey; /* * Implementation details of the RDMA core, don't use in drivers: */ struct ib_mr *__internal_mr; struct rdma_restrack_entry res; }; struct ib_xrcd { struct ib_device *device; atomic_t usecnt; /* count all exposed resources */ struct inode *inode; struct rw_semaphore tgt_qps_rwsem; struct xarray tgt_qps; }; struct ib_ah { struct ib_device *device; struct ib_pd *pd; struct ib_uobject *uobject; const struct ib_gid_attr *sgid_attr; enum rdma_ah_attr_type type; }; typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context); enum ib_poll_context { IB_POLL_SOFTIRQ, /* poll from softirq context */ IB_POLL_WORKQUEUE, /* poll from workqueue */ IB_POLL_UNBOUND_WORKQUEUE, /* poll from unbound workqueue */ IB_POLL_LAST_POOL_TYPE = IB_POLL_UNBOUND_WORKQUEUE, IB_POLL_DIRECT, /* caller context, no hw completions */ }; struct ib_cq { struct ib_device *device; struct ib_ucq_object *uobject; ib_comp_handler comp_handler; void (*event_handler)(struct ib_event *, void *); void *cq_context; int cqe; unsigned int cqe_used; atomic_t usecnt; /* count number of work queues */ enum ib_poll_context poll_ctx; struct ib_wc *wc; struct list_head pool_entry; union { struct irq_poll iop; struct work_struct work; }; struct workqueue_struct *comp_wq; struct dim *dim; /* updated only by trace points */ ktime_t timestamp; u8 interrupt:1; u8 shared:1; unsigned int comp_vector; /* * Implementation details of the RDMA core, don't use in drivers: */ struct rdma_restrack_entry res; }; struct ib_srq { struct ib_device *device; struct ib_pd *pd; struct ib_usrq_object *uobject; void (*event_handler)(struct ib_event *, void *); void *srq_context; enum ib_srq_type srq_type; atomic_t usecnt; struct { struct ib_cq *cq; union { struct { struct ib_xrcd *xrcd; u32 srq_num; } xrc; }; } ext; /* * Implementation details of the RDMA core, don't use in drivers: */ struct rdma_restrack_entry res; }; enum ib_raw_packet_caps { /* * Strip cvlan from incoming packet and report it in the matching work * completion is supported. */ IB_RAW_PACKET_CAP_CVLAN_STRIPPING = IB_UVERBS_RAW_PACKET_CAP_CVLAN_STRIPPING, /* * Scatter FCS field of an incoming packet to host memory is supported. */ IB_RAW_PACKET_CAP_SCATTER_FCS = IB_UVERBS_RAW_PACKET_CAP_SCATTER_FCS, /* Checksum offloads are supported (for both send and receive). */ IB_RAW_PACKET_CAP_IP_CSUM = IB_UVERBS_RAW_PACKET_CAP_IP_CSUM, /* * When a packet is received for an RQ with no receive WQEs, the * packet processing is delayed. */ IB_RAW_PACKET_CAP_DELAY_DROP = IB_UVERBS_RAW_PACKET_CAP_DELAY_DROP, }; enum ib_wq_type { IB_WQT_RQ = IB_UVERBS_WQT_RQ, }; enum ib_wq_state { IB_WQS_RESET, IB_WQS_RDY, IB_WQS_ERR }; struct ib_wq { struct ib_device *device; struct ib_uwq_object *uobject; void *wq_context; void (*event_handler)(struct ib_event *, void *); struct ib_pd *pd; struct ib_cq *cq; u32 wq_num; enum ib_wq_state state; enum ib_wq_type wq_type; atomic_t usecnt; }; enum ib_wq_flags { IB_WQ_FLAGS_CVLAN_STRIPPING = IB_UVERBS_WQ_FLAGS_CVLAN_STRIPPING, IB_WQ_FLAGS_SCATTER_FCS = IB_UVERBS_WQ_FLAGS_SCATTER_FCS, IB_WQ_FLAGS_DELAY_DROP = IB_UVERBS_WQ_FLAGS_DELAY_DROP, IB_WQ_FLAGS_PCI_WRITE_END_PADDING = IB_UVERBS_WQ_FLAGS_PCI_WRITE_END_PADDING, }; struct ib_wq_init_attr { void *wq_context; enum ib_wq_type wq_type; u32 max_wr; u32 max_sge; struct ib_cq *cq; void (*event_handler)(struct ib_event *, void *); u32 create_flags; /* Use enum ib_wq_flags */ }; enum ib_wq_attr_mask { IB_WQ_STATE = 1 << 0, IB_WQ_CUR_STATE = 1 << 1, IB_WQ_FLAGS = 1 << 2, }; struct ib_wq_attr { enum ib_wq_state wq_state; enum ib_wq_state curr_wq_state; u32 flags; /* Use enum ib_wq_flags */ u32 flags_mask; /* Use enum ib_wq_flags */ }; struct ib_rwq_ind_table { struct ib_device *device; struct ib_uobject *uobject; atomic_t usecnt; u32 ind_tbl_num; u32 log_ind_tbl_size; struct ib_wq **ind_tbl; }; struct ib_rwq_ind_table_init_attr { u32 log_ind_tbl_size; /* Each entry is a pointer to Receive Work Queue */ struct ib_wq **ind_tbl; }; enum port_pkey_state { IB_PORT_PKEY_NOT_VALID = 0, IB_PORT_PKEY_VALID = 1, IB_PORT_PKEY_LISTED = 2, }; struct ib_qp_security; struct ib_port_pkey { enum port_pkey_state state; u16 pkey_index; u32 port_num; struct list_head qp_list; struct list_head to_error_list; struct ib_qp_security *sec; }; struct ib_ports_pkeys { struct ib_port_pkey main; struct ib_port_pkey alt; }; struct ib_qp_security { struct ib_qp *qp; struct ib_device *dev; /* Hold this mutex when changing port and pkey settings. */ struct mutex mutex; struct ib_ports_pkeys *ports_pkeys; /* A list of all open shared QP handles. Required to enforce security * properly for all users of a shared QP. */ struct list_head shared_qp_list; void *security; bool destroying; atomic_t error_list_count; struct completion error_complete; int error_comps_pending; }; /* * @max_write_sge: Maximum SGE elements per RDMA WRITE request. * @max_read_sge: Maximum SGE elements per RDMA READ request. */ struct ib_qp { struct ib_device *device; struct ib_pd *pd; struct ib_cq *send_cq; struct ib_cq *recv_cq; spinlock_t mr_lock; int mrs_used; struct list_head rdma_mrs; struct list_head sig_mrs; struct ib_srq *srq; struct completion srq_completion; struct ib_xrcd *xrcd; /* XRC TGT QPs only */ struct list_head xrcd_list; /* count times opened, mcast attaches, flow attaches */ atomic_t usecnt; struct list_head open_list; struct ib_qp *real_qp; struct ib_uqp_object *uobject; void (*event_handler)(struct ib_event *, void *); void (*registered_event_handler)(struct ib_event *, void *); void *qp_context; /* sgid_attrs associated with the AV's */ const struct ib_gid_attr *av_sgid_attr; const struct ib_gid_attr *alt_path_sgid_attr; u32 qp_num; u32 max_write_sge; u32 max_read_sge; enum ib_qp_type qp_type; struct ib_rwq_ind_table *rwq_ind_tbl; struct ib_qp_security *qp_sec; u32 port; bool integrity_en; /* * Implementation details of the RDMA core, don't use in drivers: */ struct rdma_restrack_entry res; /* The counter the qp is bind to */ struct rdma_counter *counter; }; struct ib_dm { struct ib_device *device; u32 length; u32 flags; struct ib_uobject *uobject; atomic_t usecnt; }; /* bit values to mark existence of ib_dmah fields */ enum { IB_DMAH_CPU_ID_EXISTS, IB_DMAH_MEM_TYPE_EXISTS, IB_DMAH_PH_EXISTS, }; struct ib_dmah { struct ib_device *device; struct ib_uobject *uobject; /* * Implementation details of the RDMA core, don't use in drivers: */ struct rdma_restrack_entry res; u32 cpu_id; enum tph_mem_type mem_type; atomic_t usecnt; u8 ph; u8 valid_fields; /* use IB_DMAH_XXX_EXISTS */ }; struct ib_mr { struct ib_device *device; struct ib_pd *pd; u32 lkey; u32 rkey; u64 iova; u64 length; unsigned int page_size; enum ib_mr_type type; bool need_inval; union { struct ib_uobject *uobject; /* user */ struct list_head qp_entry; /* FR */ }; struct ib_dm *dm; struct ib_sig_attrs *sig_attrs; /* only for IB_MR_TYPE_INTEGRITY MRs */ struct ib_dmah *dmah; /* * Implementation details of the RDMA core, don't use in drivers: */ struct rdma_restrack_entry res; }; struct ib_mw { struct ib_device *device; struct ib_pd *pd; struct ib_uobject *uobject; u32 rkey; enum ib_mw_type type; }; /* Supported steering options */ enum ib_flow_attr_type { /* steering according to rule specifications */ IB_FLOW_ATTR_NORMAL = 0x0, /* default unicast and multicast rule - * receive all Eth traffic which isn't steered to any QP */ IB_FLOW_ATTR_ALL_DEFAULT = 0x1, /* default multicast rule - * receive all Eth multicast traffic which isn't steered to any QP */ IB_FLOW_ATTR_MC_DEFAULT = 0x2, /* sniffer rule - receive all port traffic */ IB_FLOW_ATTR_SNIFFER = 0x3 }; /* Supported steering header types */ enum ib_flow_spec_type { /* L2 headers*/ IB_FLOW_SPEC_ETH = 0x20, IB_FLOW_SPEC_IB = 0x22, /* L3 header*/ IB_FLOW_SPEC_IPV4 = 0x30, IB_FLOW_SPEC_IPV6 = 0x31, IB_FLOW_SPEC_ESP = 0x34, /* L4 headers*/ IB_FLOW_SPEC_TCP = 0x40, IB_FLOW_SPEC_UDP = 0x41, IB_FLOW_SPEC_VXLAN_TUNNEL = 0x50, IB_FLOW_SPEC_GRE = 0x51, IB_FLOW_SPEC_MPLS = 0x60, IB_FLOW_SPEC_INNER = 0x100, /* Actions */ IB_FLOW_SPEC_ACTION_TAG = 0x1000, IB_FLOW_SPEC_ACTION_DROP = 0x1001, IB_FLOW_SPEC_ACTION_HANDLE = 0x1002, IB_FLOW_SPEC_ACTION_COUNT = 0x1003, }; #define IB_FLOW_SPEC_LAYER_MASK 0xF0 #define IB_FLOW_SPEC_SUPPORT_LAYERS 10 enum ib_flow_flags { IB_FLOW_ATTR_FLAGS_DONT_TRAP = 1UL << 1, /* Continue match, no steal */ IB_FLOW_ATTR_FLAGS_EGRESS = 1UL << 2, /* Egress flow */ IB_FLOW_ATTR_FLAGS_RESERVED = 1UL << 3 /* Must be last */ }; struct ib_flow_eth_filter { u8 dst_mac[6]; u8 src_mac[6]; __be16 ether_type; __be16 vlan_tag; }; struct ib_flow_spec_eth { u32 type; u16 size; struct ib_flow_eth_filter val; struct ib_flow_eth_filter mask; }; struct ib_flow_ib_filter { __be16 dlid; __u8 sl; }; struct ib_flow_spec_ib { u32 type; u16 size; struct ib_flow_ib_filter val; struct ib_flow_ib_filter mask; }; /* IPv4 header flags */ enum ib_ipv4_flags { IB_IPV4_DONT_FRAG = 0x2, /* Don't enable packet fragmentation */ IB_IPV4_MORE_FRAG = 0X4 /* For All fragmented packets except the last have this flag set */ }; struct ib_flow_ipv4_filter { __be32 src_ip; __be32 dst_ip; u8 proto; u8 tos; u8 ttl; u8 flags; }; struct ib_flow_spec_ipv4 { u32 type; u16 size; struct ib_flow_ipv4_filter val; struct ib_flow_ipv4_filter mask; }; struct ib_flow_ipv6_filter { u8 src_ip[16]; u8 dst_ip[16]; __be32 flow_label; u8 next_hdr; u8 traffic_class; u8 hop_limit; } __packed; struct ib_flow_spec_ipv6 { u32 type; u16 size; struct ib_flow_ipv6_filter val; struct ib_flow_ipv6_filter mask; }; struct ib_flow_tcp_udp_filter { __be16 dst_port; __be16 src_port; }; struct ib_flow_spec_tcp_udp { u32 type; u16 size; struct ib_flow_tcp_udp_filter val; struct ib_flow_tcp_udp_filter mask; }; struct ib_flow_tunnel_filter { __be32 tunnel_id; }; /* ib_flow_spec_tunnel describes the Vxlan tunnel * the tunnel_id from val has the vni value */ struct ib_flow_spec_tunnel { u32 type; u16 size; struct ib_flow_tunnel_filter val; struct ib_flow_tunnel_filter mask; }; struct ib_flow_esp_filter { __be32 spi; __be32 seq; }; struct ib_flow_spec_esp { u32 type; u16 size; struct ib_flow_esp_filter val; struct ib_flow_esp_filter mask; }; struct ib_flow_gre_filter { __be16 c_ks_res0_ver; __be16 protocol; __be32 key; }; struct ib_flow_spec_gre { u32 type; u16 size; struct ib_flow_gre_filter val; struct ib_flow_gre_filter mask; }; struct ib_flow_mpls_filter { __be32 tag; }; struct ib_flow_spec_mpls { u32 type; u16 size; struct ib_flow_mpls_filter val; struct ib_flow_mpls_filter mask; }; struct ib_flow_spec_action_tag { enum ib_flow_spec_type type; u16 size; u32 tag_id; }; struct ib_flow_spec_action_drop { enum ib_flow_spec_type type; u16 size; }; struct ib_flow_spec_action_handle { enum ib_flow_spec_type type; u16 size; struct ib_flow_action *act; }; enum ib_counters_description { IB_COUNTER_PACKETS, IB_COUNTER_BYTES, }; struct ib_flow_spec_action_count { enum ib_flow_spec_type type; u16 size; struct ib_counters *counters; }; union ib_flow_spec { struct { u32 type; u16 size; }; struct ib_flow_spec_eth eth; struct ib_flow_spec_ib ib; struct ib_flow_spec_ipv4 ipv4; struct ib_flow_spec_tcp_udp tcp_udp; struct ib_flow_spec_ipv6 ipv6; struct ib_flow_spec_tunnel tunnel; struct ib_flow_spec_esp esp; struct ib_flow_spec_gre gre; struct ib_flow_spec_mpls mpls; struct ib_flow_spec_action_tag flow_tag; struct ib_flow_spec_action_drop drop; struct ib_flow_spec_action_handle action; struct ib_flow_spec_action_count flow_count; }; struct ib_flow_attr { enum ib_flow_attr_type type; u16 size; u16 priority; u32 flags; u8 num_of_specs; u32 port; union ib_flow_spec flows[]; }; struct ib_flow { struct ib_qp *qp; struct ib_device *device; struct ib_uobject *uobject; }; enum ib_flow_action_type { IB_FLOW_ACTION_UNSPECIFIED, IB_FLOW_ACTION_ESP = 1, }; struct ib_flow_action_attrs_esp_keymats { enum ib_uverbs_flow_action_esp_keymat protocol; union { struct ib_uverbs_flow_action_esp_keymat_aes_gcm aes_gcm; } keymat; }; struct ib_flow_action_attrs_esp_replays { enum ib_uverbs_flow_action_esp_replay protocol; union { struct ib_uverbs_flow_action_esp_replay_bmp bmp; } replay; }; enum ib_flow_action_attrs_esp_flags { /* All user-space flags at the top: Use enum ib_uverbs_flow_action_esp_flags * This is done in order to share the same flags between user-space and * kernel and spare an unnecessary translation. */ /* Kernel flags */ IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED = 1ULL << 32, IB_FLOW_ACTION_ESP_FLAGS_MOD_ESP_ATTRS = 1ULL << 33, }; struct ib_flow_spec_list { struct ib_flow_spec_list *next; union ib_flow_spec spec; }; struct ib_flow_action_attrs_esp { struct ib_flow_action_attrs_esp_keymats *keymat; struct ib_flow_action_attrs_esp_replays *replay; struct ib_flow_spec_list *encap; /* Used only if IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED is enabled. * Value of 0 is a valid value. */ u32 esn; u32 spi; u32 seq; u32 tfc_pad; /* Use enum ib_flow_action_attrs_esp_flags */ u64 flags; u64 hard_limit_pkts; }; struct ib_flow_action { struct ib_device *device; struct ib_uobject *uobject; enum ib_flow_action_type type; atomic_t usecnt; }; struct ib_mad; enum ib_process_mad_flags { IB_MAD_IGNORE_MKEY = 1, IB_MAD_IGNORE_BKEY = 2, IB_MAD_IGNORE_ALL = IB_MAD_IGNORE_MKEY | IB_MAD_IGNORE_BKEY }; enum ib_mad_result { IB_MAD_RESULT_FAILURE = 0, /* (!SUCCESS is the important flag) */ IB_MAD_RESULT_SUCCESS = 1 << 0, /* MAD was successfully processed */ IB_MAD_RESULT_REPLY = 1 << 1, /* Reply packet needs to be sent */ IB_MAD_RESULT_CONSUMED = 1 << 2 /* Packet consumed: stop processing */ }; struct ib_port_cache { u64 subnet_prefix; struct ib_pkey_cache *pkey; struct ib_gid_table *gid; u8 lmc; enum ib_port_state port_state; enum ib_port_state last_port_state; }; struct ib_port_immutable { int pkey_tbl_len; int gid_tbl_len; u32 core_cap_flags; u32 max_mad_size; }; struct ib_port_data { struct ib_device *ib_dev; struct ib_port_immutable immutable; spinlock_t pkey_list_lock; spinlock_t netdev_lock; struct list_head pkey_list; struct ib_port_cache cache; struct net_device __rcu *netdev; netdevice_tracker netdev_tracker; struct hlist_node ndev_hash_link; struct rdma_port_counter port_counter; struct ib_port *sysfs; }; /* rdma netdev type - specifies protocol type */ enum rdma_netdev_t { RDMA_NETDEV_OPA_VNIC, RDMA_NETDEV_IPOIB, }; /** * struct rdma_netdev - rdma netdev * For cases where netstack interfacing is required. */ struct rdma_netdev { void *clnt_priv; struct ib_device *hca; u32 port_num; int mtu; /* * cleanup function must be specified. * FIXME: This is only used for OPA_VNIC and that usage should be * removed too. */ void (*free_rdma_netdev)(struct net_device *netdev); /* control functions */ void (*set_id)(struct net_device *netdev, int id); /* send packet */ int (*send)(struct net_device *dev, struct sk_buff *skb, struct ib_ah *address, u32 dqpn); /* multicast */ int (*attach_mcast)(struct net_device *dev, struct ib_device *hca, union ib_gid *gid, u16 mlid, int set_qkey, u32 qkey); int (*detach_mcast)(struct net_device *dev, struct ib_device *hca, union ib_gid *gid, u16 mlid); /* timeout */ void (*tx_timeout)(struct net_device *dev, unsigned int txqueue); }; struct rdma_netdev_alloc_params { size_t sizeof_priv; unsigned int txqs; unsigned int rxqs; void *param; int (*initialize_rdma_netdev)(struct ib_device *device, u32 port_num, struct net_device *netdev, void *param); }; struct ib_odp_counters { atomic64_t faults; atomic64_t faults_handled; atomic64_t invalidations; atomic64_t invalidations_handled; atomic64_t prefetch; }; struct ib_counters { struct ib_device *device; struct ib_uobject *uobject; /* num of objects attached */ atomic_t usecnt; }; struct ib_counters_read_attr { u64 *counters_buff; u32 ncounters; u32 flags; /* use enum ib_read_counters_flags */ }; struct uverbs_attr_bundle; struct iw_cm_id; struct iw_cm_conn_param; #define INIT_RDMA_OBJ_SIZE(ib_struct, drv_struct, member) \ .size_##ib_struct = \ (sizeof(struct drv_struct) + \ BUILD_BUG_ON_ZERO(offsetof(struct drv_struct, member)) + \ BUILD_BUG_ON_ZERO( \ !__same_type(((struct drv_struct *)NULL)->member, \ struct ib_struct))) #define rdma_zalloc_drv_obj_gfp(ib_dev, ib_type, gfp) \ ((struct ib_type *)rdma_zalloc_obj(ib_dev, ib_dev->ops.size_##ib_type, \ gfp, false)) #define rdma_zalloc_drv_obj_numa(ib_dev, ib_type) \ ((struct ib_type *)rdma_zalloc_obj(ib_dev, ib_dev->ops.size_##ib_type, \ GFP_KERNEL, true)) #define rdma_zalloc_drv_obj(ib_dev, ib_type) \ rdma_zalloc_drv_obj_gfp(ib_dev, ib_type, GFP_KERNEL) #define DECLARE_RDMA_OBJ_SIZE(ib_struct) size_t size_##ib_struct struct rdma_user_mmap_entry { struct kref ref; struct ib_ucontext *ucontext; unsigned long start_pgoff; size_t npages; bool driver_removed; }; /* Return the offset (in bytes) the user should pass to libc's mmap() */ static inline u64 rdma_user_mmap_get_offset(const struct rdma_user_mmap_entry *entry) { return (u64)entry->start_pgoff << PAGE_SHIFT; } /** * struct ib_device_ops - InfiniBand device operations * This structure defines all the InfiniBand device operations, providers will * need to define the supported operations, otherwise they will be set to null. */ struct ib_device_ops { struct module *owner; enum rdma_driver_id driver_id; u32 uverbs_abi_ver; unsigned int uverbs_no_driver_id_binding:1; /* * NOTE: New drivers should not make use of device_group; instead new * device parameter should be exposed via netlink command. This * mechanism exists only for existing drivers. */ const struct attribute_group *device_group; const struct attribute_group **port_groups; int (*post_send)(struct ib_qp *qp, const struct ib_send_wr *send_wr, const struct ib_send_wr **bad_send_wr); int (*post_recv)(struct ib_qp *qp, const struct ib_recv_wr *recv_wr, const struct ib_recv_wr **bad_recv_wr); void (*drain_rq)(struct ib_qp *qp); void (*drain_sq)(struct ib_qp *qp); int (*poll_cq)(struct ib_cq *cq, int num_entries, struct ib_wc *wc); int (*peek_cq)(struct ib_cq *cq, int wc_cnt); int (*req_notify_cq)(struct ib_cq *cq, enum ib_cq_notify_flags flags); int (*post_srq_recv)(struct ib_srq *srq, const struct ib_recv_wr *recv_wr, const struct ib_recv_wr **bad_recv_wr); int (*process_mad)(struct ib_device *device, int process_mad_flags, u32 port_num, const struct ib_wc *in_wc, const struct ib_grh *in_grh, const struct ib_mad *in_mad, struct ib_mad *out_mad, size_t *out_mad_size, u16 *out_mad_pkey_index); int (*query_device)(struct ib_device *device, struct ib_device_attr *device_attr, struct ib_udata *udata); int (*modify_device)(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify); void (*get_dev_fw_str)(struct ib_device *device, char *str); const struct cpumask *(*get_vector_affinity)(struct ib_device *ibdev, int comp_vector); int (*query_port)(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr); int (*modify_port)(struct ib_device *device, u32 port_num, int port_modify_mask, struct ib_port_modify *port_modify); /* * The following mandatory functions are used only at device * registration. Keep functions such as these at the end of this * structure to avoid cache line misses when accessing struct ib_device * in fast paths. */ int (*get_port_immutable)(struct ib_device *device, u32 port_num, struct ib_port_immutable *immutable); enum rdma_link_layer (*get_link_layer)(struct ib_device *device, u32 port_num); /* * When calling get_netdev, the HW vendor's driver should return the * net device of device @device at port @port_num or NULL if such * a net device doesn't exist. The vendor driver should call dev_hold * on this net device. The HW vendor's device driver must guarantee * that this function returns NULL before the net device has finished * NETDEV_UNREGISTER state. */ struct net_device *(*get_netdev)(struct ib_device *device, u32 port_num); /* * rdma netdev operation * * Driver implementing alloc_rdma_netdev or rdma_netdev_get_params * must return -EOPNOTSUPP if it doesn't support the specified type. */ struct net_device *(*alloc_rdma_netdev)( struct ib_device *device, u32 port_num, enum rdma_netdev_t type, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *)); int (*rdma_netdev_get_params)(struct ib_device *device, u32 port_num, enum rdma_netdev_t type, struct rdma_netdev_alloc_params *params); /* * query_gid should be return GID value for @device, when @port_num * link layer is either IB or iWarp. It is no-op if @port_num port * is RoCE link layer. */ int (*query_gid)(struct ib_device *device, u32 port_num, int index, union ib_gid *gid); /* * When calling add_gid, the HW vendor's driver should add the gid * of device of port at gid index available at @attr. Meta-info of * that gid (for example, the network device related to this gid) is * available at @attr. @context allows the HW vendor driver to store * extra information together with a GID entry. The HW vendor driver may * allocate memory to contain this information and store it in @context * when a new GID entry is written to. Params are consistent until the * next call of add_gid or delete_gid. The function should return 0 on * success or error otherwise. The function could be called * concurrently for different ports. This function is only called when * roce_gid_table is used. */ int (*add_gid)(const struct ib_gid_attr *attr, void **context); /* * When calling del_gid, the HW vendor's driver should delete the * gid of device @device at gid index gid_index of port port_num * available in @attr. * Upon the deletion of a GID entry, the HW vendor must free any * allocated memory. The caller will clear @context afterwards. * This function is only called when roce_gid_table is used. */ int (*del_gid)(const struct ib_gid_attr *attr, void **context); int (*query_pkey)(struct ib_device *device, u32 port_num, u16 index, u16 *pkey); int (*alloc_ucontext)(struct ib_ucontext *context, struct ib_udata *udata); void (*dealloc_ucontext)(struct ib_ucontext *context); int (*mmap)(struct ib_ucontext *context, struct vm_area_struct *vma); /* * This will be called once refcount of an entry in mmap_xa reaches * zero. The type of the memory that was mapped may differ between * entries and is opaque to the rdma_user_mmap interface. * Therefore needs to be implemented by the driver in mmap_free. */ void (*mmap_free)(struct rdma_user_mmap_entry *entry); void (*disassociate_ucontext)(struct ib_ucontext *ibcontext); int (*alloc_pd)(struct ib_pd *pd, struct ib_udata *udata); int (*dealloc_pd)(struct ib_pd *pd, struct ib_udata *udata); int (*create_ah)(struct ib_ah *ah, struct rdma_ah_init_attr *attr, struct ib_udata *udata); int (*create_user_ah)(struct ib_ah *ah, struct rdma_ah_init_attr *attr, struct ib_udata *udata); int (*modify_ah)(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); int (*query_ah)(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); int (*destroy_ah)(struct ib_ah *ah, u32 flags); int (*create_srq)(struct ib_srq *srq, struct ib_srq_init_attr *srq_init_attr, struct ib_udata *udata); int (*modify_srq)(struct ib_srq *srq, struct ib_srq_attr *srq_attr, enum ib_srq_attr_mask srq_attr_mask, struct ib_udata *udata); int (*query_srq)(struct ib_srq *srq, struct ib_srq_attr *srq_attr); int (*destroy_srq)(struct ib_srq *srq, struct ib_udata *udata); int (*create_qp)(struct ib_qp *qp, struct ib_qp_init_attr *qp_init_attr, struct ib_udata *udata); int (*modify_qp)(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_udata *udata); int (*query_qp)(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); int (*destroy_qp)(struct ib_qp *qp, struct ib_udata *udata); int (*create_cq)(struct ib_cq *cq, const struct ib_cq_init_attr *attr, struct uverbs_attr_bundle *attrs); int (*create_cq_umem)(struct ib_cq *cq, const struct ib_cq_init_attr *attr, struct ib_umem *umem, struct uverbs_attr_bundle *attrs); int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period); int (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata); int (*resize_cq)(struct ib_cq *cq, int cqe, struct ib_udata *udata); /* * pre_destroy_cq - Prevent a cq from generating any new work * completions, but not free any kernel resources */ int (*pre_destroy_cq)(struct ib_cq *cq); /* * post_destroy_cq - Free all kernel resources */ void (*post_destroy_cq)(struct ib_cq *cq); struct ib_mr *(*get_dma_mr)(struct ib_pd *pd, int mr_access_flags); struct ib_mr *(*reg_user_mr)(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags, struct ib_dmah *dmah, struct ib_udata *udata); struct ib_mr *(*reg_user_mr_dmabuf)(struct ib_pd *pd, u64 offset, u64 length, u64 virt_addr, int fd, int mr_access_flags, struct ib_dmah *dmah, struct uverbs_attr_bundle *attrs); struct ib_mr *(*rereg_user_mr)(struct ib_mr *mr, int flags, u64 start, u64 length, u64 virt_addr, int mr_access_flags, struct ib_pd *pd, struct ib_udata *udata); int (*dereg_mr)(struct ib_mr *mr, struct ib_udata *udata); struct ib_mr *(*alloc_mr)(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); struct ib_mr *(*alloc_mr_integrity)(struct ib_pd *pd, u32 max_num_data_sg, u32 max_num_meta_sg); int (*advise_mr)(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, u32 flags, struct ib_sge *sg_list, u32 num_sge, struct uverbs_attr_bundle *attrs); /* * Kernel users should universally support relaxed ordering (RO), as * they are designed to read data only after observing the CQE and use * the DMA API correctly. * * Some drivers implicitly enable RO if platform supports it. */ int (*map_mr_sg)(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset); int (*check_mr_status)(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status); int (*alloc_mw)(struct ib_mw *mw, struct ib_udata *udata); int (*dealloc_mw)(struct ib_mw *mw); int (*attach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); int (*detach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); int (*alloc_xrcd)(struct ib_xrcd *xrcd, struct ib_udata *udata); int (*dealloc_xrcd)(struct ib_xrcd *xrcd, struct ib_udata *udata); struct ib_flow *(*create_flow)(struct ib_qp *qp, struct ib_flow_attr *flow_attr, struct ib_udata *udata); int (*destroy_flow)(struct ib_flow *flow_id); int (*destroy_flow_action)(struct ib_flow_action *action); int (*set_vf_link_state)(struct ib_device *device, int vf, u32 port, int state); int (*get_vf_config)(struct ib_device *device, int vf, u32 port, struct ifla_vf_info *ivf); int (*get_vf_stats)(struct ib_device *device, int vf, u32 port, struct ifla_vf_stats *stats); int (*get_vf_guid)(struct ib_device *device, int vf, u32 port, struct ifla_vf_guid *node_guid, struct ifla_vf_guid *port_guid); int (*set_vf_guid)(struct ib_device *device, int vf, u32 port, u64 guid, int type); struct ib_wq *(*create_wq)(struct ib_pd *pd, struct ib_wq_init_attr *init_attr, struct ib_udata *udata); int (*destroy_wq)(struct ib_wq *wq, struct ib_udata *udata); int (*modify_wq)(struct ib_wq *wq, struct ib_wq_attr *attr, u32 wq_attr_mask, struct ib_udata *udata); int (*create_rwq_ind_table)(struct ib_rwq_ind_table *ib_rwq_ind_table, struct ib_rwq_ind_table_init_attr *init_attr, struct ib_udata *udata); int (*destroy_rwq_ind_table)(struct ib_rwq_ind_table *wq_ind_table); struct ib_dm *(*alloc_dm)(struct ib_device *device, struct ib_ucontext *context, struct ib_dm_alloc_attr *attr, struct uverbs_attr_bundle *attrs); int (*dealloc_dm)(struct ib_dm *dm, struct uverbs_attr_bundle *attrs); int (*alloc_dmah)(struct ib_dmah *ibdmah, struct uverbs_attr_bundle *attrs); int (*dealloc_dmah)(struct ib_dmah *dmah, struct uverbs_attr_bundle *attrs); struct ib_mr *(*reg_dm_mr)(struct ib_pd *pd, struct ib_dm *dm, struct ib_dm_mr_attr *attr, struct uverbs_attr_bundle *attrs); int (*create_counters)(struct ib_counters *counters, struct uverbs_attr_bundle *attrs); int (*destroy_counters)(struct ib_counters *counters); int (*read_counters)(struct ib_counters *counters, struct ib_counters_read_attr *counters_read_attr, struct uverbs_attr_bundle *attrs); int (*map_mr_sg_pi)(struct ib_mr *mr, struct scatterlist *data_sg, int data_sg_nents, unsigned int *data_sg_offset, struct scatterlist *meta_sg, int meta_sg_nents, unsigned int *meta_sg_offset); /* * alloc_hw_[device,port]_stats - Allocate a struct rdma_hw_stats and * fill in the driver initialized data. The struct is kfree()'ed by * the sysfs core when the device is removed. A lifespan of -1 in the * return struct tells the core to set a default lifespan. */ struct rdma_hw_stats *(*alloc_hw_device_stats)(struct ib_device *device); struct rdma_hw_stats *(*alloc_hw_port_stats)(struct ib_device *device, u32 port_num); /* * get_hw_stats - Fill in the counter value(s) in the stats struct. * @index - The index in the value array we wish to have updated, or * num_counters if we want all stats updated * Return codes - * < 0 - Error, no counters updated * index - Updated the single counter pointed to by index * num_counters - Updated all counters (will reset the timestamp * and prevent further calls for lifespan milliseconds) * Drivers are allowed to update all counters in leiu of just the * one given in index at their option */ int (*get_hw_stats)(struct ib_device *device, struct rdma_hw_stats *stats, u32 port, int index); /* * modify_hw_stat - Modify the counter configuration * @enable: true/false when enable/disable a counter * Return codes - 0 on success or error code otherwise. */ int (*modify_hw_stat)(struct ib_device *device, u32 port, unsigned int counter_index, bool enable); /* * Allows rdma drivers to add their own restrack attributes. */ int (*fill_res_mr_entry)(struct sk_buff *msg, struct ib_mr *ibmr); int (*fill_res_mr_entry_raw)(struct sk_buff *msg, struct ib_mr *ibmr); int (*fill_res_cq_entry)(struct sk_buff *msg, struct ib_cq *ibcq); int (*fill_res_cq_entry_raw)(struct sk_buff *msg, struct ib_cq *ibcq); int (*fill_res_qp_entry)(struct sk_buff *msg, struct ib_qp *ibqp); int (*fill_res_qp_entry_raw)(struct sk_buff *msg, struct ib_qp *ibqp); int (*fill_res_cm_id_entry)(struct sk_buff *msg, struct rdma_cm_id *id); int (*fill_res_srq_entry)(struct sk_buff *msg, struct ib_srq *ib_srq); int (*fill_res_srq_entry_raw)(struct sk_buff *msg, struct ib_srq *ib_srq); /* Device lifecycle callbacks */ /* * Called after the device becomes registered, before clients are * attached */ int (*enable_driver)(struct ib_device *dev); /* * This is called as part of ib_dealloc_device(). */ void (*dealloc_driver)(struct ib_device *dev); /* iWarp CM callbacks */ void (*iw_add_ref)(struct ib_qp *qp); void (*iw_rem_ref)(struct ib_qp *qp); struct ib_qp *(*iw_get_qp)(struct ib_device *device, int qpn); int (*iw_connect)(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); int (*iw_accept)(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); int (*iw_reject)(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len); int (*iw_create_listen)(struct iw_cm_id *cm_id, int backlog); int (*iw_destroy_listen)(struct iw_cm_id *cm_id); /* * counter_bind_qp - Bind a QP to a counter. * @counter - The counter to be bound. If counter->id is zero then * the driver needs to allocate a new counter and set counter->id */ int (*counter_bind_qp)(struct rdma_counter *counter, struct ib_qp *qp, u32 port); /* * counter_unbind_qp - Unbind the qp from the dynamically-allocated * counter and bind it onto the default one */ int (*counter_unbind_qp)(struct ib_qp *qp, u32 port); /* * counter_dealloc -De-allocate the hw counter */ int (*counter_dealloc)(struct rdma_counter *counter); /* * counter_alloc_stats - Allocate a struct rdma_hw_stats and fill in * the driver initialized data. */ struct rdma_hw_stats *(*counter_alloc_stats)( struct rdma_counter *counter); /* * counter_update_stats - Query the stats value of this counter */ int (*counter_update_stats)(struct rdma_counter *counter); /* * counter_init - Initialize the driver specific rdma counter struct. */ void (*counter_init)(struct rdma_counter *counter); /* * Allows rdma drivers to add their own restrack attributes * dumped via 'rdma stat' iproute2 command. */ int (*fill_stat_mr_entry)(struct sk_buff *msg, struct ib_mr *ibmr); /* query driver for its ucontext properties */ int (*query_ucontext)(struct ib_ucontext *context, struct uverbs_attr_bundle *attrs); /* * Provide NUMA node. This API exists for rdmavt/hfi1 only. * Everyone else relies on Linux memory management model. */ int (*get_numa_node)(struct ib_device *dev); /* * add_sub_dev - Add a sub IB device */ struct ib_device *(*add_sub_dev)(struct ib_device *parent, enum rdma_nl_dev_type type, const char *name); /* * del_sub_dev - Delete a sub IB device */ void (*del_sub_dev)(struct ib_device *sub_dev); /* * ufile_cleanup - Attempt to cleanup ubojects HW resources inside * the ufile. */ void (*ufile_hw_cleanup)(struct ib_uverbs_file *ufile); /* * report_port_event - Drivers need to implement this if they have * some private stuff to handle when link status changes. */ void (*report_port_event)(struct ib_device *ibdev, struct net_device *ndev, unsigned long event); DECLARE_RDMA_OBJ_SIZE(ib_ah); DECLARE_RDMA_OBJ_SIZE(ib_counters); DECLARE_RDMA_OBJ_SIZE(ib_cq); DECLARE_RDMA_OBJ_SIZE(ib_dmah); DECLARE_RDMA_OBJ_SIZE(ib_mw); DECLARE_RDMA_OBJ_SIZE(ib_pd); DECLARE_RDMA_OBJ_SIZE(ib_qp); DECLARE_RDMA_OBJ_SIZE(ib_rwq_ind_table); DECLARE_RDMA_OBJ_SIZE(ib_srq); DECLARE_RDMA_OBJ_SIZE(ib_ucontext); DECLARE_RDMA_OBJ_SIZE(ib_xrcd); DECLARE_RDMA_OBJ_SIZE(rdma_counter); }; struct ib_core_device { /* device must be the first element in structure until, * union of ib_core_device and device exists in ib_device. */ struct device dev; possible_net_t rdma_net; struct kobject *ports_kobj; struct list_head port_list; struct ib_device *owner; /* reach back to owner ib_device */ }; struct rdma_restrack_root; struct ib_device { /* Do not access @dma_device directly from ULP nor from HW drivers. */ struct device *dma_device; struct ib_device_ops ops; char name[IB_DEVICE_NAME_MAX]; struct rcu_head rcu_head; struct list_head event_handler_list; /* Protects event_handler_list */ struct rw_semaphore event_handler_rwsem; /* Protects QP's event_handler calls and open_qp list */ spinlock_t qp_open_list_lock; struct rw_semaphore client_data_rwsem; struct xarray client_data; struct mutex unregistration_lock; /* Synchronize GID, Pkey cache entries, subnet prefix, LMC */ rwlock_t cache_lock; /** * port_data is indexed by port number */ struct ib_port_data *port_data; int num_comp_vectors; union { struct device dev; struct ib_core_device coredev; }; /* First group is for device attributes, * Second group is for driver provided attributes (optional). * Third group is for the hw_stats * It is a NULL terminated array. */ const struct attribute_group *groups[4]; u8 hw_stats_attr_index; u64 uverbs_cmd_mask; char node_desc[IB_DEVICE_NODE_DESC_MAX]; __be64 node_guid; u32 local_dma_lkey; u16 is_switch:1; /* Indicates kernel verbs support, should not be used in drivers */ u16 kverbs_provider:1; /* CQ adaptive moderation (RDMA DIM) */ u16 use_cq_dim:1; u8 node_type; u32 phys_port_cnt; struct ib_device_attr attrs; struct hw_stats_device_data *hw_stats_data; #ifdef CONFIG_CGROUP_RDMA struct rdmacg_device cg_device; #endif u32 index; spinlock_t cq_pools_lock; struct list_head cq_pools[IB_POLL_LAST_POOL_TYPE + 1]; struct rdma_restrack_root *res; const struct uapi_definition *driver_def; /* * Positive refcount indicates that the device is currently * registered and cannot be unregistered. */ refcount_t refcount; struct completion unreg_completion; struct work_struct unregistration_work; const struct rdma_link_ops *link_ops; /* Protects compat_devs xarray modifications */ struct mutex compat_devs_mutex; /* Maintains compat devices for each net namespace */ struct xarray compat_devs; /* Used by iWarp CM */ char iw_ifname[IFNAMSIZ]; u32 iw_driver_flags; u32 lag_flags; /* A parent device has a list of sub-devices */ struct mutex subdev_lock; struct list_head subdev_list_head; /* A sub device has a type and a parent */ enum rdma_nl_dev_type type; struct ib_device *parent; struct list_head subdev_list; enum rdma_nl_name_assign_type name_assign_type; }; static inline void *rdma_zalloc_obj(struct ib_device *dev, size_t size, gfp_t gfp, bool is_numa_aware) { if (is_numa_aware && dev->ops.get_numa_node) return kzalloc_node(size, gfp, dev->ops.get_numa_node(dev)); return kzalloc(size, gfp); } struct ib_client_nl_info; struct ib_client { const char *name; int (*add)(struct ib_device *ibdev); void (*remove)(struct ib_device *, void *client_data); void (*rename)(struct ib_device *dev, void *client_data); int (*get_nl_info)(struct ib_device *ibdev, void *client_data, struct ib_client_nl_info *res); int (*get_global_nl_info)(struct ib_client_nl_info *res); /* Returns the net_dev belonging to this ib_client and matching the * given parameters. * @dev: An RDMA device that the net_dev use for communication. * @port: A physical port number on the RDMA device. * @pkey: P_Key that the net_dev uses if applicable. * @gid: A GID that the net_dev uses to communicate. * @addr: An IP address the net_dev is configured with. * @client_data: The device's client data set by ib_set_client_data(). * * An ib_client that implements a net_dev on top of RDMA devices * (such as IP over IB) should implement this callback, allowing the * rdma_cm module to find the right net_dev for a given request. * * The caller is responsible for calling dev_put on the returned * netdev. */ struct net_device *(*get_net_dev_by_params)( struct ib_device *dev, u32 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr, void *client_data); refcount_t uses; struct completion uses_zero; u32 client_id; /* kverbs are not required by the client */ u8 no_kverbs_req:1; }; /* * IB block DMA iterator * * Iterates the DMA-mapped SGL in contiguous memory blocks aligned * to a HW supported page size. */ struct ib_block_iter { /* internal states */ struct scatterlist *__sg; /* sg holding the current aligned block */ dma_addr_t __dma_addr; /* unaligned DMA address of this block */ size_t __sg_numblocks; /* ib_umem_num_dma_blocks() */ unsigned int __sg_nents; /* number of SG entries */ unsigned int __sg_advance; /* number of bytes to advance in sg in next step */ unsigned int __pg_bit; /* alignment of current block */ }; struct ib_device *_ib_alloc_device(size_t size, struct net *net); #define ib_alloc_device(drv_struct, member) \ container_of(_ib_alloc_device(sizeof(struct drv_struct) + \ BUILD_BUG_ON_ZERO(offsetof( \ struct drv_struct, member)), \ &init_net), \ struct drv_struct, member) #define ib_alloc_device_with_net(drv_struct, member, net) \ container_of(_ib_alloc_device(sizeof(struct drv_struct) + \ BUILD_BUG_ON_ZERO(offsetof( \ struct drv_struct, member)), net), \ struct drv_struct, member) void ib_dealloc_device(struct ib_device *device); void ib_get_device_fw_str(struct ib_device *device, char *str); int ib_register_device(struct ib_device *device, const char *name, struct device *dma_device); void ib_unregister_device(struct ib_device *device); void ib_unregister_driver(enum rdma_driver_id driver_id); void ib_unregister_device_and_put(struct ib_device *device); void ib_unregister_device_queued(struct ib_device *ib_dev); int ib_register_client (struct ib_client *client); void ib_unregister_client(struct ib_client *client); void __rdma_block_iter_start(struct ib_block_iter *biter, struct scatterlist *sglist, unsigned int nents, unsigned long pgsz); bool __rdma_block_iter_next(struct ib_block_iter *biter); /** * rdma_block_iter_dma_address - get the aligned dma address of the current * block held by the block iterator. * @biter: block iterator holding the memory block */ static inline dma_addr_t rdma_block_iter_dma_address(struct ib_block_iter *biter) { return biter->__dma_addr & ~(BIT_ULL(biter->__pg_bit) - 1); } /** * rdma_for_each_block - iterate over contiguous memory blocks of the sg list * @sglist: sglist to iterate over * @biter: block iterator holding the memory block * @nents: maximum number of sg entries to iterate over * @pgsz: best HW supported page size to use * * Callers may use rdma_block_iter_dma_address() to get each * blocks aligned DMA address. */ #define rdma_for_each_block(sglist, biter, nents, pgsz) \ for (__rdma_block_iter_start(biter, sglist, nents, \ pgsz); \ __rdma_block_iter_next(biter);) /** * ib_get_client_data - Get IB client context * @device:Device to get context for * @client:Client to get context for * * ib_get_client_data() returns the client context data set with * ib_set_client_data(). This can only be called while the client is * registered to the device, once the ib_client remove() callback returns this * cannot be called. */ static inline void *ib_get_client_data(struct ib_device *device, struct ib_client *client) { return xa_load(&device->client_data, client->client_id); } void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data); void ib_set_device_ops(struct ib_device *device, const struct ib_device_ops *ops); int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma, unsigned long pfn, unsigned long size, pgprot_t prot, struct rdma_user_mmap_entry *entry); int rdma_user_mmap_entry_insert(struct ib_ucontext *ucontext, struct rdma_user_mmap_entry *entry, size_t length); int rdma_user_mmap_entry_insert_range(struct ib_ucontext *ucontext, struct rdma_user_mmap_entry *entry, size_t length, u32 min_pgoff, u32 max_pgoff); #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) void rdma_user_mmap_disassociate(struct ib_device *device); #else static inline void rdma_user_mmap_disassociate(struct ib_device *device) { } #endif static inline int rdma_user_mmap_entry_insert_exact(struct ib_ucontext *ucontext, struct rdma_user_mmap_entry *entry, size_t length, u32 pgoff) { return rdma_user_mmap_entry_insert_range(ucontext, entry, length, pgoff, pgoff); } struct rdma_user_mmap_entry * rdma_user_mmap_entry_get_pgoff(struct ib_ucontext *ucontext, unsigned long pgoff); struct rdma_user_mmap_entry * rdma_user_mmap_entry_get(struct ib_ucontext *ucontext, struct vm_area_struct *vma); void rdma_user_mmap_entry_put(struct rdma_user_mmap_entry *entry); void rdma_user_mmap_entry_remove(struct rdma_user_mmap_entry *entry); static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len) { return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0; } static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len) { return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0; } static inline bool ib_is_buffer_cleared(const void __user *p, size_t len) { bool ret; u8 *buf; if (len > USHRT_MAX) return false; buf = memdup_user(p, len); if (IS_ERR(buf)) return false; ret = !memchr_inv(buf, 0, len); kfree(buf); return ret; } static inline bool ib_is_udata_cleared(struct ib_udata *udata, size_t offset, size_t len) { return ib_is_buffer_cleared(udata->inbuf + offset, len); } /** * ib_modify_qp_is_ok - Check that the supplied attribute mask * contains all required attributes and no attributes not allowed for * the given QP state transition. * @cur_state: Current QP state * @next_state: Next QP state * @type: QP type * @mask: Mask of supplied QP attributes * * This function is a helper function that a low-level driver's * modify_qp method can use to validate the consumer's input. It * checks that cur_state and next_state are valid QP states, that a * transition from cur_state to next_state is allowed by the IB spec, * and that the attribute mask supplied is allowed for the transition. */ bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, enum ib_qp_type type, enum ib_qp_attr_mask mask); void ib_register_event_handler(struct ib_event_handler *event_handler); void ib_unregister_event_handler(struct ib_event_handler *event_handler); void ib_dispatch_event(const struct ib_event *event); int ib_query_port(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr); enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u32 port_num); /** * rdma_cap_ib_switch - Check if the device is IB switch * @device: Device to check * * Device driver is responsible for setting is_switch bit on * in ib_device structure at init time. * * Return: true if the device is IB switch. */ static inline bool rdma_cap_ib_switch(const struct ib_device *device) { return device->is_switch; } /** * rdma_start_port - Return the first valid port number for the device * specified * * @device: Device to be checked * * Return start port number */ static inline u32 rdma_start_port(const struct ib_device *device) { return rdma_cap_ib_switch(device) ? 0 : 1; } /** * rdma_for_each_port - Iterate over all valid port numbers of the IB device * @device: The struct ib_device * to iterate over * @iter: The unsigned int to store the port number */ #define rdma_for_each_port(device, iter) \ for (iter = rdma_start_port(device + \ BUILD_BUG_ON_ZERO(!__same_type(u32, \ iter))); \ iter <= rdma_end_port(device); iter++) /** * rdma_end_port - Return the last valid port number for the device * specified * * @device: Device to be checked * * Return last port number */ static inline u32 rdma_end_port(const struct ib_device *device) { return rdma_cap_ib_switch(device) ? 0 : device->phys_port_cnt; } static inline int rdma_is_port_valid(const struct ib_device *device, unsigned int port) { return (port >= rdma_start_port(device) && port <= rdma_end_port(device)); } static inline bool rdma_is_grh_required(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_PORT_IB_GRH_REQUIRED; } static inline bool rdma_protocol_ib(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_PROT_IB; } static inline bool rdma_protocol_roce(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & (RDMA_CORE_CAP_PROT_ROCE | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP); } static inline bool rdma_protocol_roce_udp_encap(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP; } static inline bool rdma_protocol_roce_eth_encap(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_PROT_ROCE; } static inline bool rdma_protocol_iwarp(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_PROT_IWARP; } static inline bool rdma_ib_or_roce(const struct ib_device *device, u32 port_num) { return rdma_protocol_ib(device, port_num) || rdma_protocol_roce(device, port_num); } static inline bool rdma_protocol_raw_packet(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_PROT_RAW_PACKET; } static inline bool rdma_protocol_usnic(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_PROT_USNIC; } /** * rdma_cap_ib_mad - Check if the port of a device supports Infiniband * Management Datagrams. * @device: Device to check * @port_num: Port number to check * * Management Datagrams (MAD) are a required part of the InfiniBand * specification and are supported on all InfiniBand devices. A slightly * extended version are also supported on OPA interfaces. * * Return: true if the port supports sending/receiving of MAD packets. */ static inline bool rdma_cap_ib_mad(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_IB_MAD; } /** * rdma_cap_opa_mad - Check if the port of device provides support for OPA * Management Datagrams. * @device: Device to check * @port_num: Port number to check * * Intel OmniPath devices extend and/or replace the InfiniBand Management * datagrams with their own versions. These OPA MADs share many but not all of * the characteristics of InfiniBand MADs. * * OPA MADs differ in the following ways: * * 1) MADs are variable size up to 2K * IBTA defined MADs remain fixed at 256 bytes * 2) OPA SMPs must carry valid PKeys * 3) OPA SMP packets are a different format * * Return: true if the port supports OPA MAD packet formats. */ static inline bool rdma_cap_opa_mad(struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_OPA_MAD; } /** * rdma_cap_ib_smi - Check if the port of a device provides an Infiniband * Subnet Management Agent (SMA) on the Subnet Management Interface (SMI). * @device: Device to check * @port_num: Port number to check * * Each InfiniBand node is required to provide a Subnet Management Agent * that the subnet manager can access. Prior to the fabric being fully * configured by the subnet manager, the SMA is accessed via a well known * interface called the Subnet Management Interface (SMI). This interface * uses directed route packets to communicate with the SM to get around the * chicken and egg problem of the SM needing to know what's on the fabric * in order to configure the fabric, and needing to configure the fabric in * order to send packets to the devices on the fabric. These directed * route packets do not need the fabric fully configured in order to reach * their destination. The SMI is the only method allowed to send * directed route packets on an InfiniBand fabric. * * Return: true if the port provides an SMI. */ static inline bool rdma_cap_ib_smi(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_IB_SMI; } /** * rdma_cap_ib_cm - Check if the port of device has the capability Infiniband * Communication Manager. * @device: Device to check * @port_num: Port number to check * * The InfiniBand Communication Manager is one of many pre-defined General * Service Agents (GSA) that are accessed via the General Service * Interface (GSI). It's role is to facilitate establishment of connections * between nodes as well as other management related tasks for established * connections. * * Return: true if the port supports an IB CM (this does not guarantee that * a CM is actually running however). */ static inline bool rdma_cap_ib_cm(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_IB_CM; } /** * rdma_cap_iw_cm - Check if the port of device has the capability IWARP * Communication Manager. * @device: Device to check * @port_num: Port number to check * * Similar to above, but specific to iWARP connections which have a different * managment protocol than InfiniBand. * * Return: true if the port supports an iWARP CM (this does not guarantee that * a CM is actually running however). */ static inline bool rdma_cap_iw_cm(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_IW_CM; } /** * rdma_cap_ib_sa - Check if the port of device has the capability Infiniband * Subnet Administration. * @device: Device to check * @port_num: Port number to check * * An InfiniBand Subnet Administration (SA) service is a pre-defined General * Service Agent (GSA) provided by the Subnet Manager (SM). On InfiniBand * fabrics, devices should resolve routes to other hosts by contacting the * SA to query the proper route. * * Return: true if the port should act as a client to the fabric Subnet * Administration interface. This does not imply that the SA service is * running locally. */ static inline bool rdma_cap_ib_sa(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_IB_SA; } /** * rdma_cap_ib_mcast - Check if the port of device has the capability Infiniband * Multicast. * @device: Device to check * @port_num: Port number to check * * InfiniBand multicast registration is more complex than normal IPv4 or * IPv6 multicast registration. Each Host Channel Adapter must register * with the Subnet Manager when it wishes to join a multicast group. It * should do so only once regardless of how many queue pairs it subscribes * to this group. And it should leave the group only after all queue pairs * attached to the group have been detached. * * Return: true if the port must undertake the additional adminstrative * overhead of registering/unregistering with the SM and tracking of the * total number of queue pairs attached to the multicast group. */ static inline bool rdma_cap_ib_mcast(const struct ib_device *device, u32 port_num) { return rdma_cap_ib_sa(device, port_num); } /** * rdma_cap_af_ib - Check if the port of device has the capability * Native Infiniband Address. * @device: Device to check * @port_num: Port number to check * * InfiniBand addressing uses a port's GUID + Subnet Prefix to make a default * GID. RoCE uses a different mechanism, but still generates a GID via * a prescribed mechanism and port specific data. * * Return: true if the port uses a GID address to identify devices on the * network. */ static inline bool rdma_cap_af_ib(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_AF_IB; } /** * rdma_cap_eth_ah - Check if the port of device has the capability * Ethernet Address Handle. * @device: Device to check * @port_num: Port number to check * * RoCE is InfiniBand over Ethernet, and it uses a well defined technique * to fabricate GIDs over Ethernet/IP specific addresses native to the * port. Normally, packet headers are generated by the sending host * adapter, but when sending connectionless datagrams, we must manually * inject the proper headers for the fabric we are communicating over. * * Return: true if we are running as a RoCE port and must force the * addition of a Global Route Header built from our Ethernet Address * Handle into our header list for connectionless packets. */ static inline bool rdma_cap_eth_ah(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_ETH_AH; } /** * rdma_cap_opa_ah - Check if the port of device supports * OPA Address handles * @device: Device to check * @port_num: Port number to check * * Return: true if we are running on an OPA device which supports * the extended OPA addressing. */ static inline bool rdma_cap_opa_ah(struct ib_device *device, u32 port_num) { return (device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_OPA_AH) == RDMA_CORE_CAP_OPA_AH; } /** * rdma_max_mad_size - Return the max MAD size required by this RDMA Port. * * @device: Device * @port_num: Port number * * This MAD size includes the MAD headers and MAD payload. No other headers * are included. * * Return the max MAD size required by the Port. Will return 0 if the port * does not support MADs */ static inline size_t rdma_max_mad_size(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.max_mad_size; } /** * rdma_cap_roce_gid_table - Check if the port of device uses roce_gid_table * @device: Device to check * @port_num: Port number to check * * RoCE GID table mechanism manages the various GIDs for a device. * * NOTE: if allocating the port's GID table has failed, this call will still * return true, but any RoCE GID table API will fail. * * Return: true if the port uses RoCE GID table mechanism in order to manage * its GIDs. */ static inline bool rdma_cap_roce_gid_table(const struct ib_device *device, u32 port_num) { return rdma_protocol_roce(device, port_num) && device->ops.add_gid && device->ops.del_gid; } /* * Check if the device supports READ W/ INVALIDATE. */ static inline bool rdma_cap_read_inv(struct ib_device *dev, u32 port_num) { /* * iWarp drivers must support READ W/ INVALIDATE. No other protocol * has support for it yet. */ return rdma_protocol_iwarp(dev, port_num); } /** * rdma_core_cap_opa_port - Return whether the RDMA Port is OPA or not. * @device: Device * @port_num: 1 based Port number * * Return true if port is an Intel OPA port , false if not */ static inline bool rdma_core_cap_opa_port(struct ib_device *device, u32 port_num) { return (device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_PORT_INTEL_OPA) == RDMA_CORE_PORT_INTEL_OPA; } /** * rdma_mtu_enum_to_int - Return the mtu of the port as an integer value. * @device: Device * @port: Port number * @mtu: enum value of MTU * * Return the MTU size supported by the port as an integer value. Will return * -1 if enum value of mtu is not supported. */ static inline int rdma_mtu_enum_to_int(struct ib_device *device, u32 port, int mtu) { if (rdma_core_cap_opa_port(device, port)) return opa_mtu_enum_to_int((enum opa_mtu)mtu); else return ib_mtu_enum_to_int((enum ib_mtu)mtu); } /** * rdma_mtu_from_attr - Return the mtu of the port from the port attribute. * @device: Device * @port: Port number * @attr: port attribute * * Return the MTU size supported by the port as an integer value. */ static inline int rdma_mtu_from_attr(struct ib_device *device, u32 port, struct ib_port_attr *attr) { if (rdma_core_cap_opa_port(device, port)) return attr->phys_mtu; else return ib_mtu_enum_to_int(attr->max_mtu); } int ib_set_vf_link_state(struct ib_device *device, int vf, u32 port, int state); int ib_get_vf_config(struct ib_device *device, int vf, u32 port, struct ifla_vf_info *info); int ib_get_vf_stats(struct ib_device *device, int vf, u32 port, struct ifla_vf_stats *stats); int ib_get_vf_guid(struct ib_device *device, int vf, u32 port, struct ifla_vf_guid *node_guid, struct ifla_vf_guid *port_guid); int ib_set_vf_guid(struct ib_device *device, int vf, u32 port, u64 guid, int type); int ib_query_pkey(struct ib_device *device, u32 port_num, u16 index, u16 *pkey); int ib_modify_device(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify); int ib_modify_port(struct ib_device *device, u32 port_num, int port_modify_mask, struct ib_port_modify *port_modify); int ib_find_gid(struct ib_device *device, union ib_gid *gid, u32 *port_num, u16 *index); int ib_find_pkey(struct ib_device *device, u32 port_num, u16 pkey, u16 *index); enum ib_pd_flags { /* * Create a memory registration for all memory in the system and place * the rkey for it into pd->unsafe_global_rkey. This can be used by * ULPs to avoid the overhead of dynamic MRs. * * This flag is generally considered unsafe and must only be used in * extremly trusted environments. Every use of it will log a warning * in the kernel log. */ IB_PD_UNSAFE_GLOBAL_RKEY = 0x01, }; struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, const char *caller); /** * ib_alloc_pd - Allocates an unused protection domain. * @device: The device on which to allocate the protection domain. * @flags: protection domain flags * * A protection domain object provides an association between QPs, shared * receive queues, address handles, memory regions, and memory windows. * * Every PD has a local_dma_lkey which can be used as the lkey value for local * memory operations. */ #define ib_alloc_pd(device, flags) \ __ib_alloc_pd((device), (flags), KBUILD_MODNAME) int ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata); /** * ib_dealloc_pd - Deallocate kernel PD * @pd: The protection domain * * NOTE: for user PD use ib_dealloc_pd_user with valid udata! */ static inline void ib_dealloc_pd(struct ib_pd *pd) { int ret = ib_dealloc_pd_user(pd, NULL); WARN_ONCE(ret, "Destroy of kernel PD shouldn't fail"); } enum rdma_create_ah_flags { /* In a sleepable context */ RDMA_CREATE_AH_SLEEPABLE = BIT(0), }; /** * rdma_create_ah - Creates an address handle for the given address vector. * @pd: The protection domain associated with the address handle. * @ah_attr: The attributes of the address vector. * @flags: Create address handle flags (see enum rdma_create_ah_flags). * * The address handle is used to reference a local or global destination * in all UD QP post sends. */ struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, u32 flags); /** * rdma_create_user_ah - Creates an address handle for the given address vector. * It resolves destination mac address for ah attribute of RoCE type. * @pd: The protection domain associated with the address handle. * @ah_attr: The attributes of the address vector. * @udata: pointer to user's input output buffer information need by * provider driver. * * It returns 0 on success and returns appropriate error code on error. * The address handle is used to reference a local or global destination * in all UD QP post sends. */ struct ib_ah *rdma_create_user_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, struct ib_udata *udata); /** * ib_get_gids_from_rdma_hdr - Get sgid and dgid from GRH or IPv4 header * work completion. * @hdr: the L3 header to parse * @net_type: type of header to parse * @sgid: place to store source gid * @dgid: place to store destination gid */ int ib_get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr, enum rdma_network_type net_type, union ib_gid *sgid, union ib_gid *dgid); /** * ib_get_rdma_header_version - Get the header version * @hdr: the L3 header to parse */ int ib_get_rdma_header_version(const union rdma_network_hdr *hdr); /** * ib_init_ah_attr_from_wc - Initializes address handle attributes from a * work completion. * @device: Device on which the received message arrived. * @port_num: Port on which the received message arrived. * @wc: Work completion associated with the received message. * @grh: References the received global route header. This parameter is * ignored unless the work completion indicates that the GRH is valid. * @ah_attr: Returned attributes that can be used when creating an address * handle for replying to the message. * When ib_init_ah_attr_from_wc() returns success, * (a) for IB link layer it optionally contains a reference to SGID attribute * when GRH is present for IB link layer. * (b) for RoCE link layer it contains a reference to SGID attribute. * User must invoke rdma_cleanup_ah_attr_gid_attr() to release reference to SGID * attributes which are initialized using ib_init_ah_attr_from_wc(). * */ int ib_init_ah_attr_from_wc(struct ib_device *device, u32 port_num, const struct ib_wc *wc, const struct ib_grh *grh, struct rdma_ah_attr *ah_attr); /** * ib_create_ah_from_wc - Creates an address handle associated with the * sender of the specified work completion. * @pd: The protection domain associated with the address handle. * @wc: Work completion information associated with a received message. * @grh: References the received global route header. This parameter is * ignored unless the work completion indicates that the GRH is valid. * @port_num: The outbound port number to associate with the address. * * The address handle is used to reference a local or global destination * in all UD QP post sends. */ struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc, const struct ib_grh *grh, u32 port_num); /** * rdma_modify_ah - Modifies the address vector associated with an address * handle. * @ah: The address handle to modify. * @ah_attr: The new address vector attributes to associate with the * address handle. */ int rdma_modify_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); /** * rdma_query_ah - Queries the address vector associated with an address * handle. * @ah: The address handle to query. * @ah_attr: The address vector attributes associated with the address * handle. */ int rdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); enum rdma_destroy_ah_flags { /* In a sleepable context */ RDMA_DESTROY_AH_SLEEPABLE = BIT(0), }; /** * rdma_destroy_ah_user - Destroys an address handle. * @ah: The address handle to destroy. * @flags: Destroy address handle flags (see enum rdma_destroy_ah_flags). * @udata: Valid user data or NULL for kernel objects */ int rdma_destroy_ah_user(struct ib_ah *ah, u32 flags, struct ib_udata *udata); /** * rdma_destroy_ah - Destroys an kernel address handle. * @ah: The address handle to destroy. * @flags: Destroy address handle flags (see enum rdma_destroy_ah_flags). * * NOTE: for user ah use rdma_destroy_ah_user with valid udata! */ static inline void rdma_destroy_ah(struct ib_ah *ah, u32 flags) { int ret = rdma_destroy_ah_user(ah, flags, NULL); WARN_ONCE(ret, "Destroy of kernel AH shouldn't fail"); } struct ib_srq *ib_create_srq_user(struct ib_pd *pd, struct ib_srq_init_attr *srq_init_attr, struct ib_usrq_object *uobject, struct ib_udata *udata); static inline struct ib_srq * ib_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *srq_init_attr) { if (!pd->device->ops.create_srq) return ERR_PTR(-EOPNOTSUPP); return ib_create_srq_user(pd, srq_init_attr, NULL, NULL); } /** * ib_modify_srq - Modifies the attributes for the specified SRQ. * @srq: The SRQ to modify. * @srq_attr: On input, specifies the SRQ attributes to modify. On output, * the current values of selected SRQ attributes are returned. * @srq_attr_mask: A bit-mask used to specify which attributes of the SRQ * are being modified. * * The mask may contain IB_SRQ_MAX_WR to resize the SRQ and/or * IB_SRQ_LIMIT to set the SRQ's limit and request notification when * the number of receives queued drops below the limit. */ int ib_modify_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr, enum ib_srq_attr_mask srq_attr_mask); /** * ib_query_srq - Returns the attribute list and current values for the * specified SRQ. * @srq: The SRQ to query. * @srq_attr: The attributes of the specified SRQ. */ int ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); /** * ib_destroy_srq_user - Destroys the specified SRQ. * @srq: The SRQ to destroy. * @udata: Valid user data or NULL for kernel objects */ int ib_destroy_srq_user(struct ib_srq *srq, struct ib_udata *udata); /** * ib_destroy_srq - Destroys the specified kernel SRQ. * @srq: The SRQ to destroy. * * NOTE: for user srq use ib_destroy_srq_user with valid udata! */ static inline void ib_destroy_srq(struct ib_srq *srq) { int ret = ib_destroy_srq_user(srq, NULL); WARN_ONCE(ret, "Destroy of kernel SRQ shouldn't fail"); } /** * ib_post_srq_recv - Posts a list of work requests to the specified SRQ. * @srq: The SRQ to post the work request on. * @recv_wr: A list of work requests to post on the receive queue. * @bad_recv_wr: On an immediate failure, this parameter will reference * the work request that failed to be posted on the QP. */ static inline int ib_post_srq_recv(struct ib_srq *srq, const struct ib_recv_wr *recv_wr, const struct ib_recv_wr **bad_recv_wr) { const struct ib_recv_wr *dummy; return srq->device->ops.post_srq_recv(srq, recv_wr, bad_recv_wr ? : &dummy); } struct ib_qp *ib_create_qp_kernel(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr, const char *caller); /** * ib_create_qp - Creates a kernel QP associated with the specific protection * domain. * @pd: The protection domain associated with the QP. * @init_attr: A list of initial attributes required to create the * QP. If QP creation succeeds, then the attributes are updated to * the actual capabilities of the created QP. */ static inline struct ib_qp *ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr) { return ib_create_qp_kernel(pd, init_attr, KBUILD_MODNAME); } /** * ib_modify_qp_with_udata - Modifies the attributes for the specified QP. * @qp: The QP to modify. * @attr: On input, specifies the QP attributes to modify. On output, * the current values of selected QP attributes are returned. * @attr_mask: A bit-mask used to specify which attributes of the QP * are being modified. * @udata: pointer to user's input output buffer information * are being modified. * It returns 0 on success and returns appropriate error code on error. */ int ib_modify_qp_with_udata(struct ib_qp *qp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); /** * ib_modify_qp - Modifies the attributes for the specified QP and then * transitions the QP to the given state. * @qp: The QP to modify. * @qp_attr: On input, specifies the QP attributes to modify. On output, * the current values of selected QP attributes are returned. * @qp_attr_mask: A bit-mask used to specify which attributes of the QP * are being modified. */ int ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask); /** * ib_query_qp - Returns the attribute list and current values for the * specified QP. * @qp: The QP to query. * @qp_attr: The attributes of the specified QP. * @qp_attr_mask: A bit-mask used to select specific attributes to query. * @qp_init_attr: Additional attributes of the selected QP. * * The qp_attr_mask may be used to limit the query to gathering only the * selected attributes. */ int ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); /** * ib_destroy_qp - Destroys the specified QP. * @qp: The QP to destroy. * @udata: Valid udata or NULL for kernel objects */ int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata); /** * ib_destroy_qp - Destroys the specified kernel QP. * @qp: The QP to destroy. * * NOTE: for user qp use ib_destroy_qp_user with valid udata! */ static inline int ib_destroy_qp(struct ib_qp *qp) { return ib_destroy_qp_user(qp, NULL); } /** * ib_open_qp - Obtain a reference to an existing sharable QP. * @xrcd: XRC domain * @qp_open_attr: Attributes identifying the QP to open. * * Returns a reference to a sharable QP. */ struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, struct ib_qp_open_attr *qp_open_attr); /** * ib_close_qp - Release an external reference to a QP. * @qp: The QP handle to release * * The opened QP handle is released by the caller. The underlying * shared QP is not destroyed until all internal references are released. */ int ib_close_qp(struct ib_qp *qp); /** * ib_post_send - Posts a list of work requests to the send queue of * the specified QP. * @qp: The QP to post the work request on. * @send_wr: A list of work requests to post on the send queue. * @bad_send_wr: On an immediate failure, this parameter will reference * the work request that failed to be posted on the QP. * * While IBA Vol. 1 section 11.4.1.1 specifies that if an immediate * error is returned, the QP state shall not be affected, * ib_post_send() will return an immediate error after queueing any * earlier work requests in the list. */ static inline int ib_post_send(struct ib_qp *qp, const struct ib_send_wr *send_wr, const struct ib_send_wr **bad_send_wr) { const struct ib_send_wr *dummy; return qp->device->ops.post_send(qp, send_wr, bad_send_wr ? : &dummy); } /** * ib_post_recv - Posts a list of work requests to the receive queue of * the specified QP. * @qp: The QP to post the work request on. * @recv_wr: A list of work requests to post on the receive queue. * @bad_recv_wr: On an immediate failure, this parameter will reference * the work request that failed to be posted on the QP. */ static inline int ib_post_recv(struct ib_qp *qp, const struct ib_recv_wr *recv_wr, const struct ib_recv_wr **bad_recv_wr) { const struct ib_recv_wr *dummy; return qp->device->ops.post_recv(qp, recv_wr, bad_recv_wr ? : &dummy); } struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx, const char *caller); static inline struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private, int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx) { return __ib_alloc_cq(dev, private, nr_cqe, comp_vector, poll_ctx, KBUILD_MODNAME); } struct ib_cq *__ib_alloc_cq_any(struct ib_device *dev, void *private, int nr_cqe, enum ib_poll_context poll_ctx, const char *caller); /** * ib_alloc_cq_any: Allocate kernel CQ * @dev: The IB device * @private: Private data attached to the CQE * @nr_cqe: Number of CQEs in the CQ * @poll_ctx: Context used for polling the CQ */ static inline struct ib_cq *ib_alloc_cq_any(struct ib_device *dev, void *private, int nr_cqe, enum ib_poll_context poll_ctx) { return __ib_alloc_cq_any(dev, private, nr_cqe, poll_ctx, KBUILD_MODNAME); } void ib_free_cq(struct ib_cq *cq); int ib_process_cq_direct(struct ib_cq *cq, int budget); /** * ib_create_cq - Creates a CQ on the specified device. * @device: The device on which to create the CQ. * @comp_handler: A user-specified callback that is invoked when a * completion event occurs on the CQ. * @event_handler: A user-specified callback that is invoked when an * asynchronous event not associated with a completion occurs on the CQ. * @cq_context: Context associated with the CQ returned to the user via * the associated completion and event handlers. * @cq_attr: The attributes the CQ should be created upon. * * Users can examine the cq structure to determine the actual CQ size. */ struct ib_cq *__ib_create_cq(struct ib_device *device, ib_comp_handler comp_handler, void (*event_handler)(struct ib_event *, void *), void *cq_context, const struct ib_cq_init_attr *cq_attr, const char *caller); #define ib_create_cq(device, cmp_hndlr, evt_hndlr, cq_ctxt, cq_attr) \ __ib_create_cq((device), (cmp_hndlr), (evt_hndlr), (cq_ctxt), (cq_attr), KBUILD_MODNAME) /** * ib_resize_cq - Modifies the capacity of the CQ. * @cq: The CQ to resize. * @cqe: The minimum size of the CQ. * * Users can examine the cq structure to determine the actual CQ size. */ int ib_resize_cq(struct ib_cq *cq, int cqe); /** * rdma_set_cq_moderation - Modifies moderation params of the CQ * @cq: The CQ to modify. * @cq_count: number of CQEs that will trigger an event * @cq_period: max period of time in usec before triggering an event * */ int rdma_set_cq_moderation(struct ib_cq *cq, u16 cq_count, u16 cq_period); /** * ib_destroy_cq_user - Destroys the specified CQ. * @cq: The CQ to destroy. * @udata: Valid user data or NULL for kernel objects */ int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata); /** * ib_destroy_cq - Destroys the specified kernel CQ. * @cq: The CQ to destroy. * * NOTE: for user cq use ib_destroy_cq_user with valid udata! */ static inline void ib_destroy_cq(struct ib_cq *cq) { int ret = ib_destroy_cq_user(cq, NULL); WARN_ONCE(ret, "Destroy of kernel CQ shouldn't fail"); } /** * ib_poll_cq - poll a CQ for completion(s) * @cq:the CQ being polled * @num_entries:maximum number of completions to return * @wc:array of at least @num_entries &struct ib_wc where completions * will be returned * * Poll a CQ for (possibly multiple) completions. If the return value * is < 0, an error occurred. If the return value is >= 0, it is the * number of completions returned. If the return value is * non-negative and < num_entries, then the CQ was emptied. */ static inline int ib_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc) { return cq->device->ops.poll_cq(cq, num_entries, wc); } /** * ib_req_notify_cq - Request completion notification on a CQ. * @cq: The CQ to generate an event for. * @flags: * Must contain exactly one of %IB_CQ_SOLICITED or %IB_CQ_NEXT_COMP * to request an event on the next solicited event or next work * completion at any type, respectively. %IB_CQ_REPORT_MISSED_EVENTS * may also be |ed in to request a hint about missed events, as * described below. * * Return Value: * < 0 means an error occurred while requesting notification * == 0 means notification was requested successfully, and if * IB_CQ_REPORT_MISSED_EVENTS was passed in, then no events * were missed and it is safe to wait for another event. In * this case is it guaranteed that any work completions added * to the CQ since the last CQ poll will trigger a completion * notification event. * > 0 is only returned if IB_CQ_REPORT_MISSED_EVENTS was passed * in. It means that the consumer must poll the CQ again to * make sure it is empty to avoid missing an event because of a * race between requesting notification and an entry being * added to the CQ. This return value means it is possible * (but not guaranteed) that a work completion has been added * to the CQ since the last poll without triggering a * completion notification event. */ static inline int ib_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags) { return cq->device->ops.req_notify_cq(cq, flags); } struct ib_cq *ib_cq_pool_get(struct ib_device *dev, unsigned int nr_cqe, int comp_vector_hint, enum ib_poll_context poll_ctx); void ib_cq_pool_put(struct ib_cq *cq, unsigned int nr_cqe); /* * Drivers that don't need a DMA mapping at the RDMA layer, set dma_device to * NULL. This causes the ib_dma* helpers to just stash the kernel virtual * address into the dma address. */ static inline bool ib_uses_virt_dma(struct ib_device *dev) { return IS_ENABLED(CONFIG_INFINIBAND_VIRT_DMA) && !dev->dma_device; } /* * Check if a IB device's underlying DMA mapping supports P2PDMA transfers. */ static inline bool ib_dma_pci_p2p_dma_supported(struct ib_device *dev) { if (ib_uses_virt_dma(dev)) return false; return dma_pci_p2pdma_supported(dev->dma_device); } /** * ib_virt_dma_to_ptr - Convert a dma_addr to a kernel pointer * @dma_addr: The DMA address * * Used by ib_uses_virt_dma() devices to get back to the kernel pointer after * going through the dma_addr marshalling. */ static inline void *ib_virt_dma_to_ptr(u64 dma_addr) { /* virt_dma mode maps the kvs's directly into the dma addr */ return (void *)(uintptr_t)dma_addr; } /** * ib_virt_dma_to_page - Convert a dma_addr to a struct page * @dma_addr: The DMA address * * Used by ib_uses_virt_dma() device to get back to the struct page after going * through the dma_addr marshalling. */ static inline struct page *ib_virt_dma_to_page(u64 dma_addr) { return virt_to_page(ib_virt_dma_to_ptr(dma_addr)); } /** * ib_dma_mapping_error - check a DMA addr for error * @dev: The device for which the dma_addr was created * @dma_addr: The DMA address to check */ static inline int ib_dma_mapping_error(struct ib_device *dev, u64 dma_addr) { if (ib_uses_virt_dma(dev)) return 0; return dma_mapping_error(dev->dma_device, dma_addr); } /** * ib_dma_map_single - Map a kernel virtual address to DMA address * @dev: The device for which the dma_addr is to be created * @cpu_addr: The kernel virtual address * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline u64 ib_dma_map_single(struct ib_device *dev, void *cpu_addr, size_t size, enum dma_data_direction direction) { if (ib_uses_virt_dma(dev)) return (uintptr_t)cpu_addr; return dma_map_single(dev->dma_device, cpu_addr, size, direction); } /** * ib_dma_unmap_single - Destroy a mapping created by ib_dma_map_single() * @dev: The device for which the DMA address was created * @addr: The DMA address * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline void ib_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction) { if (!ib_uses_virt_dma(dev)) dma_unmap_single(dev->dma_device, addr, size, direction); } /** * ib_dma_map_page - Map a physical page to DMA address * @dev: The device for which the dma_addr is to be created * @page: The page to be mapped * @offset: The offset within the page * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline u64 ib_dma_map_page(struct ib_device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction direction) { if (ib_uses_virt_dma(dev)) return (uintptr_t)(page_address(page) + offset); return dma_map_page(dev->dma_device, page, offset, size, direction); } /** * ib_dma_unmap_page - Destroy a mapping created by ib_dma_map_page() * @dev: The device for which the DMA address was created * @addr: The DMA address * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline void ib_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction) { if (!ib_uses_virt_dma(dev)) dma_unmap_page(dev->dma_device, addr, size, direction); } int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents); static inline int ib_dma_map_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, unsigned long dma_attrs) { if (ib_uses_virt_dma(dev)) return ib_dma_virt_map_sg(dev, sg, nents); return dma_map_sg_attrs(dev->dma_device, sg, nents, direction, dma_attrs); } static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, unsigned long dma_attrs) { if (!ib_uses_virt_dma(dev)) dma_unmap_sg_attrs(dev->dma_device, sg, nents, direction, dma_attrs); } /** * ib_dma_map_sgtable_attrs - Map a scatter/gather table to DMA addresses * @dev: The device for which the DMA addresses are to be created * @sgt: The sg_table object describing the buffer * @direction: The direction of the DMA * @dma_attrs: Optional DMA attributes for the map operation */ static inline int ib_dma_map_sgtable_attrs(struct ib_device *dev, struct sg_table *sgt, enum dma_data_direction direction, unsigned long dma_attrs) { int nents; if (ib_uses_virt_dma(dev)) { nents = ib_dma_virt_map_sg(dev, sgt->sgl, sgt->orig_nents); if (!nents) return -EIO; sgt->nents = nents; return 0; } return dma_map_sgtable(dev->dma_device, sgt, direction, dma_attrs); } static inline void ib_dma_unmap_sgtable_attrs(struct ib_device *dev, struct sg_table *sgt, enum dma_data_direction direction, unsigned long dma_attrs) { if (!ib_uses_virt_dma(dev)) dma_unmap_sgtable(dev->dma_device, sgt, direction, dma_attrs); } /** * ib_dma_map_sg - Map a scatter/gather list to DMA addresses * @dev: The device for which the DMA addresses are to be created * @sg: The array of scatter/gather entries * @nents: The number of scatter/gather entries * @direction: The direction of the DMA */ static inline int ib_dma_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction) { return ib_dma_map_sg_attrs(dev, sg, nents, direction, 0); } /** * ib_dma_unmap_sg - Unmap a scatter/gather list of DMA addresses * @dev: The device for which the DMA addresses were created * @sg: The array of scatter/gather entries * @nents: The number of scatter/gather entries * @direction: The direction of the DMA */ static inline void ib_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction) { ib_dma_unmap_sg_attrs(dev, sg, nents, direction, 0); } /** * ib_dma_max_seg_size - Return the size limit of a single DMA transfer * @dev: The device to query * * The returned value represents a size in bytes. */ static inline unsigned int ib_dma_max_seg_size(struct ib_device *dev) { if (ib_uses_virt_dma(dev)) return UINT_MAX; return dma_get_max_seg_size(dev->dma_device); } /** * ib_dma_sync_single_for_cpu - Prepare DMA region to be accessed by CPU * @dev: The device for which the DMA address was created * @addr: The DMA address * @size: The size of the region in bytes * @dir: The direction of the DMA */ static inline void ib_dma_sync_single_for_cpu(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction dir) { if (!ib_uses_virt_dma(dev)) dma_sync_single_for_cpu(dev->dma_device, addr, size, dir); } /** * ib_dma_sync_single_for_device - Prepare DMA region to be accessed by device * @dev: The device for which the DMA address was created * @addr: The DMA address * @size: The size of the region in bytes * @dir: The direction of the DMA */ static inline void ib_dma_sync_single_for_device(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction dir) { if (!ib_uses_virt_dma(dev)) dma_sync_single_for_device(dev->dma_device, addr, size, dir); } /* ib_reg_user_mr - register a memory region for virtual addresses from kernel * space. This function should be called when 'current' is the owning MM. */ struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags); /* ib_advise_mr - give an advice about an address range in a memory region */ int ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, u32 flags, struct ib_sge *sg_list, u32 num_sge); /** * ib_dereg_mr_user - Deregisters a memory region and removes it from the * HCA translation table. * @mr: The memory region to deregister. * @udata: Valid user data or NULL for kernel object * * This function can fail, if the memory region has memory windows bound to it. */ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata); /** * ib_dereg_mr - Deregisters a kernel memory region and removes it from the * HCA translation table. * @mr: The memory region to deregister. * * This function can fail, if the memory region has memory windows bound to it. * * NOTE: for user mr use ib_dereg_mr_user with valid udata! */ static inline int ib_dereg_mr(struct ib_mr *mr) { return ib_dereg_mr_user(mr, NULL); } struct ib_mr *ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd, u32 max_num_data_sg, u32 max_num_meta_sg); /** * ib_update_fast_reg_key - updates the key portion of the fast_reg MR * R_Key and L_Key. * @mr: struct ib_mr pointer to be updated. * @newkey: new key to be used. */ static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey) { mr->lkey = (mr->lkey & 0xffffff00) | newkey; mr->rkey = (mr->rkey & 0xffffff00) | newkey; } /** * ib_inc_rkey - increments the key portion of the given rkey. Can be used * for calculating a new rkey for type 2 memory windows. * @rkey: the rkey to increment. */ static inline u32 ib_inc_rkey(u32 rkey) { const u32 mask = 0x000000ff; return ((rkey + 1) & mask) | (rkey & ~mask); } /** * ib_attach_mcast - Attaches the specified QP to a multicast group. * @qp: QP to attach to the multicast group. The QP must be type * IB_QPT_UD. * @gid: Multicast group GID. * @lid: Multicast group LID in host byte order. * * In order to send and receive multicast packets, subnet * administration must have created the multicast group and configured * the fabric appropriately. The port associated with the specified * QP must also be a member of the multicast group. */ int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); /** * ib_detach_mcast - Detaches the specified QP from a multicast group. * @qp: QP to detach from the multicast group. * @gid: Multicast group GID. * @lid: Multicast group LID in host byte order. */ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); struct ib_xrcd *ib_alloc_xrcd_user(struct ib_device *device, struct inode *inode, struct ib_udata *udata); int ib_dealloc_xrcd_user(struct ib_xrcd *xrcd, struct ib_udata *udata); static inline int ib_check_mr_access(struct ib_device *ib_dev, unsigned int flags) { u64 device_cap = ib_dev->attrs.device_cap_flags; /* * Local write permission is required if remote write or * remote atomic permission is also requested. */ if (flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) && !(flags & IB_ACCESS_LOCAL_WRITE)) return -EINVAL; if (flags & ~IB_ACCESS_SUPPORTED) return -EINVAL; if (flags & IB_ACCESS_ON_DEMAND && !(ib_dev->attrs.kernel_cap_flags & IBK_ON_DEMAND_PAGING)) return -EOPNOTSUPP; if ((flags & IB_ACCESS_FLUSH_GLOBAL && !(device_cap & IB_DEVICE_FLUSH_GLOBAL)) || (flags & IB_ACCESS_FLUSH_PERSISTENT && !(device_cap & IB_DEVICE_FLUSH_PERSISTENT))) return -EOPNOTSUPP; return 0; } static inline bool ib_access_writable(int access_flags) { /* * We have writable memory backing the MR if any of the following * access flags are set. "Local write" and "remote write" obviously * require write access. "Remote atomic" can do things like fetch and * add, which will modify memory, and "MW bind" can change permissions * by binding a window. */ return access_flags & (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND); } /** * ib_check_mr_status: lightweight check of MR status. * This routine may provide status checks on a selected * ib_mr. first use is for signature status check. * * @mr: A memory region. * @check_mask: Bitmask of which checks to perform from * ib_mr_status_check enumeration. * @mr_status: The container of relevant status checks. * failed checks will be indicated in the status bitmask * and the relevant info shall be in the error item. */ int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status); /** * ib_device_try_get: Hold a registration lock * @dev: The device to lock * * A device under an active registration lock cannot become unregistered. It * is only possible to obtain a registration lock on a device that is fully * registered, otherwise this function returns false. * * The registration lock is only necessary for actions which require the * device to still be registered. Uses that only require the device pointer to * be valid should use get_device(&ibdev->dev) to hold the memory. * */ static inline bool ib_device_try_get(struct ib_device *dev) { return refcount_inc_not_zero(&dev->refcount); } void ib_device_put(struct ib_device *device); struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, enum rdma_driver_id driver_id); struct ib_device *ib_device_get_by_name(const char *name, enum rdma_driver_id driver_id); struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u32 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr); int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, unsigned int port); struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, u32 port); int ib_query_netdev_port(struct ib_device *ibdev, struct net_device *ndev, u32 *port); static inline enum ib_port_state ib_get_curr_port_state(struct net_device *net_dev) { return (netif_running(net_dev) && netif_carrier_ok(net_dev)) ? IB_PORT_ACTIVE : IB_PORT_DOWN; } void ib_dispatch_port_state_event(struct ib_device *ibdev, struct net_device *ndev); struct ib_wq *ib_create_wq(struct ib_pd *pd, struct ib_wq_init_attr *init_attr); int ib_destroy_wq_user(struct ib_wq *wq, struct ib_udata *udata); int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset, unsigned int page_size); int ib_map_mr_sg_pi(struct ib_mr *mr, struct scatterlist *data_sg, int data_sg_nents, unsigned int *data_sg_offset, struct scatterlist *meta_sg, int meta_sg_nents, unsigned int *meta_sg_offset, unsigned int page_size); static inline int ib_map_mr_sg_zbva(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset, unsigned int page_size) { int n; n = ib_map_mr_sg(mr, sg, sg_nents, sg_offset, page_size); mr->iova = 0; return n; } int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents, unsigned int *sg_offset, int (*set_page)(struct ib_mr *, u64)); void ib_drain_rq(struct ib_qp *qp); void ib_drain_sq(struct ib_qp *qp); void ib_drain_qp(struct ib_qp *qp); int ib_get_eth_speed(struct ib_device *dev, u32 port_num, u16 *speed, u8 *width); static inline u8 *rdma_ah_retrieve_dmac(struct rdma_ah_attr *attr) { if (attr->type == RDMA_AH_ATTR_TYPE_ROCE) return attr->roce.dmac; return NULL; } static inline void rdma_ah_set_dlid(struct rdma_ah_attr *attr, u32 dlid) { if (attr->type == RDMA_AH_ATTR_TYPE_IB) attr->ib.dlid = (u16)dlid; else if (attr->type == RDMA_AH_ATTR_TYPE_OPA) attr->opa.dlid = dlid; } static inline u32 rdma_ah_get_dlid(const struct rdma_ah_attr *attr) { if (attr->type == RDMA_AH_ATTR_TYPE_IB) return attr->ib.dlid; else if (attr->type == RDMA_AH_ATTR_TYPE_OPA) return attr->opa.dlid; return 0; } static inline void rdma_ah_set_sl(struct rdma_ah_attr *attr, u8 sl) { attr->sl = sl; } static inline u8 rdma_ah_get_sl(const struct rdma_ah_attr *attr) { return attr->sl; } static inline void rdma_ah_set_path_bits(struct rdma_ah_attr *attr, u8 src_path_bits) { if (attr->type == RDMA_AH_ATTR_TYPE_IB) attr->ib.src_path_bits = src_path_bits; else if (attr->type == RDMA_AH_ATTR_TYPE_OPA) attr->opa.src_path_bits = src_path_bits; } static inline u8 rdma_ah_get_path_bits(const struct rdma_ah_attr *attr) { if (attr->type == RDMA_AH_ATTR_TYPE_IB) return attr->ib.src_path_bits; else if (attr->type == RDMA_AH_ATTR_TYPE_OPA) return attr->opa.src_path_bits; return 0; } static inline void rdma_ah_set_make_grd(struct rdma_ah_attr *attr, bool make_grd) { if (attr->type == RDMA_AH_ATTR_TYPE_OPA) attr->opa.make_grd = make_grd; } static inline bool rdma_ah_get_make_grd(const struct rdma_ah_attr *attr) { if (attr->type == RDMA_AH_ATTR_TYPE_OPA) return attr->opa.make_grd; return false; } static inline void rdma_ah_set_port_num(struct rdma_ah_attr *attr, u32 port_num) { attr->port_num = port_num; } static inline u32 rdma_ah_get_port_num(const struct rdma_ah_attr *attr) { return attr->port_num; } static inline void rdma_ah_set_static_rate(struct rdma_ah_attr *attr, u8 static_rate) { attr->static_rate = static_rate; } static inline u8 rdma_ah_get_static_rate(const struct rdma_ah_attr *attr) { return attr->static_rate; } static inline void rdma_ah_set_ah_flags(struct rdma_ah_attr *attr, enum ib_ah_flags flag) { attr->ah_flags = flag; } static inline enum ib_ah_flags rdma_ah_get_ah_flags(const struct rdma_ah_attr *attr) { return attr->ah_flags; } static inline const struct ib_global_route *rdma_ah_read_grh(const struct rdma_ah_attr *attr) { return &attr->grh; } /*To retrieve and modify the grh */ static inline struct ib_global_route *rdma_ah_retrieve_grh(struct rdma_ah_attr *attr) { return &attr->grh; } static inline void rdma_ah_set_dgid_raw(struct rdma_ah_attr *attr, void *dgid) { struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); memcpy(grh->dgid.raw, dgid, sizeof(grh->dgid)); } static inline void rdma_ah_set_subnet_prefix(struct rdma_ah_attr *attr, __be64 prefix) { struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); grh->dgid.global.subnet_prefix = prefix; } static inline void rdma_ah_set_interface_id(struct rdma_ah_attr *attr, __be64 if_id) { struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); grh->dgid.global.interface_id = if_id; } static inline void rdma_ah_set_grh(struct rdma_ah_attr *attr, union ib_gid *dgid, u32 flow_label, u8 sgid_index, u8 hop_limit, u8 traffic_class) { struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); attr->ah_flags = IB_AH_GRH; if (dgid) grh->dgid = *dgid; grh->flow_label = flow_label; grh->sgid_index = sgid_index; grh->hop_limit = hop_limit; grh->traffic_class = traffic_class; grh->sgid_attr = NULL; } void rdma_destroy_ah_attr(struct rdma_ah_attr *ah_attr); void rdma_move_grh_sgid_attr(struct rdma_ah_attr *attr, union ib_gid *dgid, u32 flow_label, u8 hop_limit, u8 traffic_class, const struct ib_gid_attr *sgid_attr); void rdma_copy_ah_attr(struct rdma_ah_attr *dest, const struct rdma_ah_attr *src); void rdma_replace_ah_attr(struct rdma_ah_attr *old, const struct rdma_ah_attr *new); void rdma_move_ah_attr(struct rdma_ah_attr *dest, struct rdma_ah_attr *src); /** * rdma_ah_find_type - Return address handle type. * * @dev: Device to be checked * @port_num: Port number */ static inline enum rdma_ah_attr_type rdma_ah_find_type(struct ib_device *dev, u32 port_num) { if (rdma_protocol_roce(dev, port_num)) return RDMA_AH_ATTR_TYPE_ROCE; if (rdma_protocol_ib(dev, port_num)) { if (rdma_cap_opa_ah(dev, port_num)) return RDMA_AH_ATTR_TYPE_OPA; return RDMA_AH_ATTR_TYPE_IB; } if (dev->type == RDMA_DEVICE_TYPE_SMI) return RDMA_AH_ATTR_TYPE_IB; return RDMA_AH_ATTR_TYPE_UNDEFINED; } /** * ib_lid_cpu16 - Return lid in 16bit CPU encoding. * In the current implementation the only way to * get the 32bit lid is from other sources for OPA. * For IB, lids will always be 16bits so cast the * value accordingly. * * @lid: A 32bit LID */ static inline u16 ib_lid_cpu16(u32 lid) { WARN_ON_ONCE(lid & 0xFFFF0000); return (u16)lid; } /** * ib_lid_be16 - Return lid in 16bit BE encoding. * * @lid: A 32bit LID */ static inline __be16 ib_lid_be16(u32 lid) { WARN_ON_ONCE(lid & 0xFFFF0000); return cpu_to_be16((u16)lid); } /** * ib_get_vector_affinity - Get the affinity mappings of a given completion * vector * @device: the rdma device * @comp_vector: index of completion vector * * Returns NULL on failure, otherwise a corresponding cpu map of the * completion vector (returns all-cpus map if the device driver doesn't * implement get_vector_affinity). */ static inline const struct cpumask * ib_get_vector_affinity(struct ib_device *device, int comp_vector) { if (comp_vector < 0 || comp_vector >= device->num_comp_vectors || !device->ops.get_vector_affinity) return NULL; return device->ops.get_vector_affinity(device, comp_vector); } /** * rdma_roce_rescan_device - Rescan all of the network devices in the system * and add their gids, as needed, to the relevant RoCE devices. * * @ibdev: the rdma device */ void rdma_roce_rescan_device(struct ib_device *ibdev); void rdma_roce_rescan_port(struct ib_device *ib_dev, u32 port); void roce_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, struct net_device *ndev); struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile); #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs); bool rdma_uattrs_has_raw_cap(const struct uverbs_attr_bundle *attrs); #else static inline int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs) { return 0; } static inline bool rdma_uattrs_has_raw_cap(const struct uverbs_attr_bundle *attrs) { return false; } #endif struct net_device *rdma_alloc_netdev(struct ib_device *device, u32 port_num, enum rdma_netdev_t type, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *)); int rdma_init_netdev(struct ib_device *device, u32 port_num, enum rdma_netdev_t type, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *), struct net_device *netdev); /** * rdma_device_to_ibdev - Get ib_device pointer from device pointer * * @device: device pointer for which ib_device pointer to retrieve * * rdma_device_to_ibdev() retrieves ib_device pointer from device. * */ static inline struct ib_device *rdma_device_to_ibdev(struct device *device) { struct ib_core_device *coredev = container_of(device, struct ib_core_device, dev); return coredev->owner; } /** * ibdev_to_node - return the NUMA node for a given ib_device * @ibdev: device to get the NUMA node for. */ static inline int ibdev_to_node(struct ib_device *ibdev) { struct device *parent = ibdev->dev.parent; if (!parent) return NUMA_NO_NODE; return dev_to_node(parent); } /** * rdma_device_to_drv_device - Helper macro to reach back to driver's * ib_device holder structure from device pointer. * * NOTE: New drivers should not make use of this API; This API is only for * existing drivers who have exposed sysfs entries using * ops->device_group. */ #define rdma_device_to_drv_device(dev, drv_dev_struct, ibdev_member) \ container_of(rdma_device_to_ibdev(dev), drv_dev_struct, ibdev_member) bool rdma_dev_access_netns(const struct ib_device *device, const struct net *net); bool rdma_dev_has_raw_cap(const struct ib_device *dev); static inline struct net *rdma_dev_net(struct ib_device *device) { return read_pnet(&device->coredev.rdma_net); } #define IB_ROCE_UDP_ENCAP_VALID_PORT_MIN (0xC000) #define IB_ROCE_UDP_ENCAP_VALID_PORT_MAX (0xFFFF) #define IB_GRH_FLOWLABEL_MASK (0x000FFFFF) /** * rdma_flow_label_to_udp_sport - generate a RoCE v2 UDP src port value based * on the flow_label * @fl: flow_label value * * This function will convert the 20 bit flow_label input to a valid RoCE v2 * UDP src port 14 bit value. All RoCE V2 drivers should use this same * convention. */ static inline u16 rdma_flow_label_to_udp_sport(u32 fl) { u32 fl_low = fl & 0x03fff, fl_high = fl & 0xFC000; fl_low ^= fl_high >> 14; return (u16)(fl_low | IB_ROCE_UDP_ENCAP_VALID_PORT_MIN); } /** * rdma_calc_flow_label - generate a RDMA symmetric flow label value based on * local and remote qpn values * * This function folded the multiplication results of two qpns, 24 bit each, * fields, and converts it to a 20 bit results. * * This function will create symmetric flow_label value based on the local * and remote qpn values. this will allow both the requester and responder * to calculate the same flow_label for a given connection. * * This helper function should be used by driver in case the upper layer * provide a zero flow_label value. This is to improve entropy of RDMA * traffic in the network. */ static inline u32 rdma_calc_flow_label(u32 lqpn, u32 rqpn) { u64 v = (u64)lqpn * rqpn; v ^= v >> 20; v ^= v >> 40; return (u32)(v & IB_GRH_FLOWLABEL_MASK); } /** * rdma_get_udp_sport - Calculate and set UDP source port based on the flow * label. If flow label is not defined in GRH then * calculate it based on lqpn/rqpn. * * @fl: flow label from GRH * @lqpn: local qp number * @rqpn: remote qp number */ static inline u16 rdma_get_udp_sport(u32 fl, u32 lqpn, u32 rqpn) { if (!fl) fl = rdma_calc_flow_label(lqpn, rqpn); return rdma_flow_label_to_udp_sport(fl); } const struct ib_port_immutable* ib_port_immutable_read(struct ib_device *dev, unsigned int port); /** ib_add_sub_device - Add a sub IB device on an existing one * * @parent: The IB device that needs to add a sub device * @type: The type of the new sub device * @name: The name of the new sub device * * * Return 0 on success, an error code otherwise */ int ib_add_sub_device(struct ib_device *parent, enum rdma_nl_dev_type type, const char *name); /** ib_del_sub_device_and_put - Delect an IB sub device while holding a 'get' * * @sub: The sub device that is going to be deleted * * Return 0 on success, an error code otherwise */ int ib_del_sub_device_and_put(struct ib_device *sub); static inline void ib_mark_name_assigned_by_user(struct ib_device *ibdev) { ibdev->name_assign_type = RDMA_NAME_ASSIGN_TYPE_USER; } #endif /* IB_VERBS_H */
63 59 22 59 2 4 63 61 6 6 5 36 61 61 58 29 1 2 59 24 10 14 13 10 14 13 18 18 18 105 108 106 24 32 32 32 32 31 26 26 30 30 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 // SPDX-License-Identifier: GPL-2.0-or-later /* * ALSA sequencer Memory Manager * Copyright (c) 1998 by Frank van de Pol <fvdpol@coil.demon.nl> * Jaroslav Kysela <perex@perex.cz> * 2000 by Takashi Iwai <tiwai@suse.de> */ #include <linux/init.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/mm.h> #include <sound/core.h> #include <sound/seq_kernel.h> #include "seq_memory.h" #include "seq_queue.h" #include "seq_info.h" #include "seq_lock.h" static inline int snd_seq_pool_available(struct snd_seq_pool *pool) { return pool->total_elements - atomic_read(&pool->counter); } static inline int snd_seq_output_ok(struct snd_seq_pool *pool) { return snd_seq_pool_available(pool) >= pool->room; } /* * Variable length event: * The event like sysex uses variable length type. * The external data may be stored in three different formats. * 1) kernel space * This is the normal case. * ext.data.len = length * ext.data.ptr = buffer pointer * 2) user space * When an event is generated via read(), the external data is * kept in user space until expanded. * ext.data.len = length | SNDRV_SEQ_EXT_USRPTR * ext.data.ptr = userspace pointer * 3) chained cells * When the variable length event is enqueued (in prioq or fifo), * the external data is decomposed to several cells. * ext.data.len = length | SNDRV_SEQ_EXT_CHAINED * ext.data.ptr = the additiona cell head * -> cell.next -> cell.next -> .. */ /* * exported: * call dump function to expand external data. */ static int get_var_len(const struct snd_seq_event *event) { if ((event->flags & SNDRV_SEQ_EVENT_LENGTH_MASK) != SNDRV_SEQ_EVENT_LENGTH_VARIABLE) return -EINVAL; return event->data.ext.len & ~SNDRV_SEQ_EXT_MASK; } static int dump_var_event(const struct snd_seq_event *event, snd_seq_dump_func_t func, void *private_data, int offset, int maxlen) { int len, err; struct snd_seq_event_cell *cell; len = get_var_len(event); if (len <= 0) return len; if (len <= offset) return 0; if (maxlen && len > offset + maxlen) len = offset + maxlen; if (event->data.ext.len & SNDRV_SEQ_EXT_USRPTR) { char buf[32]; char __user *curptr = (char __force __user *)event->data.ext.ptr; curptr += offset; len -= offset; while (len > 0) { int size = sizeof(buf); if (len < size) size = len; if (copy_from_user(buf, curptr, size)) return -EFAULT; err = func(private_data, buf, size); if (err < 0) return err; curptr += size; len -= size; } return 0; } if (!(event->data.ext.len & SNDRV_SEQ_EXT_CHAINED)) return func(private_data, event->data.ext.ptr + offset, len - offset); cell = (struct snd_seq_event_cell *)event->data.ext.ptr; for (; len > 0 && cell; cell = cell->next) { int size = sizeof(struct snd_seq_event); char *curptr = (char *)&cell->event; if (offset >= size) { offset -= size; len -= size; continue; } if (len < size) size = len; err = func(private_data, curptr + offset, size - offset); if (err < 0) return err; offset = 0; len -= size; } return 0; } int snd_seq_dump_var_event(const struct snd_seq_event *event, snd_seq_dump_func_t func, void *private_data) { return dump_var_event(event, func, private_data, 0, 0); } EXPORT_SYMBOL(snd_seq_dump_var_event); /* * exported: * expand the variable length event to linear buffer space. */ static int seq_copy_in_kernel(void *ptr, void *src, int size) { char **bufptr = ptr; memcpy(*bufptr, src, size); *bufptr += size; return 0; } static int seq_copy_in_user(void *ptr, void *src, int size) { char __user **bufptr = ptr; if (copy_to_user(*bufptr, src, size)) return -EFAULT; *bufptr += size; return 0; } static int expand_var_event(const struct snd_seq_event *event, int offset, int size, char *buf, bool in_kernel) { if (event->data.ext.len & SNDRV_SEQ_EXT_USRPTR) { if (! in_kernel) return -EINVAL; if (copy_from_user(buf, (char __force __user *)event->data.ext.ptr + offset, size)) return -EFAULT; return 0; } return dump_var_event(event, in_kernel ? seq_copy_in_kernel : seq_copy_in_user, &buf, offset, size); } int snd_seq_expand_var_event(const struct snd_seq_event *event, int count, char *buf, int in_kernel, int size_aligned) { int len, newlen, err; len = get_var_len(event); if (len < 0) return len; newlen = len; if (size_aligned > 0) newlen = roundup(len, size_aligned); if (count < newlen) return -EAGAIN; err = expand_var_event(event, 0, len, buf, in_kernel); if (err < 0) return err; if (len != newlen) { if (in_kernel) memset(buf + len, 0, newlen - len); else if (clear_user((__force void __user *)buf + len, newlen - len)) return -EFAULT; } return newlen; } EXPORT_SYMBOL(snd_seq_expand_var_event); int snd_seq_expand_var_event_at(const struct snd_seq_event *event, int count, char *buf, int offset) { int len, err; len = get_var_len(event); if (len < 0) return len; if (len <= offset) return 0; len -= offset; if (len > count) len = count; err = expand_var_event(event, offset, count, buf, true); if (err < 0) return err; return len; } EXPORT_SYMBOL_GPL(snd_seq_expand_var_event_at); /* * release this cell, free extended data if available */ static inline void free_cell(struct snd_seq_pool *pool, struct snd_seq_event_cell *cell) { cell->next = pool->free; pool->free = cell; atomic_dec(&pool->counter); } void snd_seq_cell_free(struct snd_seq_event_cell * cell) { struct snd_seq_pool *pool; if (snd_BUG_ON(!cell)) return; pool = cell->pool; if (snd_BUG_ON(!pool)) return; guard(spinlock_irqsave)(&pool->lock); free_cell(pool, cell); if (snd_seq_ev_is_variable(&cell->event)) { if (cell->event.data.ext.len & SNDRV_SEQ_EXT_CHAINED) { struct snd_seq_event_cell *curp, *nextptr; curp = cell->event.data.ext.ptr; for (; curp; curp = nextptr) { nextptr = curp->next; curp->next = pool->free; free_cell(pool, curp); } } } if (waitqueue_active(&pool->output_sleep)) { /* has enough space now? */ if (snd_seq_output_ok(pool)) wake_up(&pool->output_sleep); } } /* * allocate an event cell. */ static int snd_seq_cell_alloc(struct snd_seq_pool *pool, struct snd_seq_event_cell **cellp, int nonblock, struct file *file, struct mutex *mutexp) { struct snd_seq_event_cell *cell; unsigned long flags; int err = -EAGAIN; wait_queue_entry_t wait; if (pool == NULL) return -EINVAL; *cellp = NULL; init_waitqueue_entry(&wait, current); spin_lock_irqsave(&pool->lock, flags); if (pool->ptr == NULL) { /* not initialized */ pr_debug("ALSA: seq: pool is not initialized\n"); err = -EINVAL; goto __error; } while (pool->free == NULL && ! nonblock && ! pool->closing) { set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&pool->output_sleep, &wait); spin_unlock_irqrestore(&pool->lock, flags); if (mutexp) mutex_unlock(mutexp); schedule(); if (mutexp) mutex_lock(mutexp); spin_lock_irqsave(&pool->lock, flags); remove_wait_queue(&pool->output_sleep, &wait); /* interrupted? */ if (signal_pending(current)) { err = -ERESTARTSYS; goto __error; } } if (pool->closing) { /* closing.. */ err = -ENOMEM; goto __error; } cell = pool->free; if (cell) { int used; pool->free = cell->next; atomic_inc(&pool->counter); used = atomic_read(&pool->counter); if (pool->max_used < used) pool->max_used = used; pool->event_alloc_success++; /* clear cell pointers */ cell->next = NULL; err = 0; } else pool->event_alloc_failures++; *cellp = cell; __error: spin_unlock_irqrestore(&pool->lock, flags); return err; } /* * duplicate the event to a cell. * if the event has external data, the data is decomposed to additional * cells. */ int snd_seq_event_dup(struct snd_seq_pool *pool, struct snd_seq_event *event, struct snd_seq_event_cell **cellp, int nonblock, struct file *file, struct mutex *mutexp) { int ncells, err; unsigned int extlen; struct snd_seq_event_cell *cell; int size; *cellp = NULL; ncells = 0; extlen = 0; if (snd_seq_ev_is_variable(event)) { extlen = event->data.ext.len & ~SNDRV_SEQ_EXT_MASK; ncells = DIV_ROUND_UP(extlen, sizeof(struct snd_seq_event)); } if (ncells >= pool->total_elements) return -ENOMEM; err = snd_seq_cell_alloc(pool, &cell, nonblock, file, mutexp); if (err < 0) return err; /* copy the event */ size = snd_seq_event_packet_size(event); memcpy(&cell->ump, event, size); #if IS_ENABLED(CONFIG_SND_SEQ_UMP) if (size < sizeof(cell->event)) cell->ump.raw.extra = 0; #endif /* decompose */ if (snd_seq_ev_is_variable(event)) { int len = extlen; int is_chained = event->data.ext.len & SNDRV_SEQ_EXT_CHAINED; int is_usrptr = event->data.ext.len & SNDRV_SEQ_EXT_USRPTR; struct snd_seq_event_cell *src, *tmp, *tail; char *buf; cell->event.data.ext.len = extlen | SNDRV_SEQ_EXT_CHAINED; cell->event.data.ext.ptr = NULL; src = (struct snd_seq_event_cell *)event->data.ext.ptr; buf = (char *)event->data.ext.ptr; tail = NULL; while (ncells-- > 0) { size = sizeof(struct snd_seq_event); if (len < size) size = len; err = snd_seq_cell_alloc(pool, &tmp, nonblock, file, mutexp); if (err < 0) goto __error; if (cell->event.data.ext.ptr == NULL) cell->event.data.ext.ptr = tmp; if (tail) tail->next = tmp; tail = tmp; /* copy chunk */ if (is_chained && src) { tmp->event = src->event; src = src->next; } else if (is_usrptr) { if (copy_from_user(&tmp->event, (char __force __user *)buf, size)) { err = -EFAULT; goto __error; } } else { memcpy(&tmp->event, buf, size); } buf += size; len -= size; } } *cellp = cell; return 0; __error: snd_seq_cell_free(cell); return err; } /* poll wait */ int snd_seq_pool_poll_wait(struct snd_seq_pool *pool, struct file *file, poll_table *wait) { poll_wait(file, &pool->output_sleep, wait); guard(spinlock_irq)(&pool->lock); return snd_seq_output_ok(pool); } /* allocate room specified number of events */ int snd_seq_pool_init(struct snd_seq_pool *pool) { int cell; struct snd_seq_event_cell *cellptr; if (snd_BUG_ON(!pool)) return -EINVAL; cellptr = kvmalloc_array(pool->size, sizeof(struct snd_seq_event_cell), GFP_KERNEL); if (!cellptr) return -ENOMEM; /* add new cells to the free cell list */ guard(spinlock_irq)(&pool->lock); if (pool->ptr) { kvfree(cellptr); return 0; } pool->ptr = cellptr; pool->free = NULL; for (cell = 0; cell < pool->size; cell++) { cellptr = pool->ptr + cell; cellptr->pool = pool; cellptr->next = pool->free; pool->free = cellptr; } pool->room = (pool->size + 1) / 2; /* init statistics */ pool->max_used = 0; pool->total_elements = pool->size; return 0; } /* refuse the further insertion to the pool */ void snd_seq_pool_mark_closing(struct snd_seq_pool *pool) { if (snd_BUG_ON(!pool)) return; guard(spinlock_irqsave)(&pool->lock); pool->closing = 1; } /* remove events */ int snd_seq_pool_done(struct snd_seq_pool *pool) { struct snd_seq_event_cell *ptr; if (snd_BUG_ON(!pool)) return -EINVAL; /* wait for closing all threads */ if (waitqueue_active(&pool->output_sleep)) wake_up(&pool->output_sleep); while (atomic_read(&pool->counter) > 0) schedule_timeout_uninterruptible(1); /* release all resources */ scoped_guard(spinlock_irq, &pool->lock) { ptr = pool->ptr; pool->ptr = NULL; pool->free = NULL; pool->total_elements = 0; } kvfree(ptr); guard(spinlock_irq)(&pool->lock); pool->closing = 0; return 0; } /* init new memory pool */ struct snd_seq_pool *snd_seq_pool_new(int poolsize) { struct snd_seq_pool *pool; /* create pool block */ pool = kzalloc(sizeof(*pool), GFP_KERNEL); if (!pool) return NULL; spin_lock_init(&pool->lock); pool->ptr = NULL; pool->free = NULL; pool->total_elements = 0; atomic_set(&pool->counter, 0); pool->closing = 0; init_waitqueue_head(&pool->output_sleep); pool->size = poolsize; /* init statistics */ pool->max_used = 0; return pool; } /* remove memory pool */ int snd_seq_pool_delete(struct snd_seq_pool **ppool) { struct snd_seq_pool *pool = *ppool; *ppool = NULL; if (pool == NULL) return 0; snd_seq_pool_mark_closing(pool); snd_seq_pool_done(pool); kfree(pool); return 0; } /* exported to seq_clientmgr.c */ void snd_seq_info_pool(struct snd_info_buffer *buffer, struct snd_seq_pool *pool, char *space) { if (pool == NULL) return; snd_iprintf(buffer, "%sPool size : %d\n", space, pool->total_elements); snd_iprintf(buffer, "%sCells in use : %d\n", space, atomic_read(&pool->counter)); snd_iprintf(buffer, "%sPeak cells in use : %d\n", space, pool->max_used); snd_iprintf(buffer, "%sAlloc success : %d\n", space, pool->event_alloc_success); snd_iprintf(buffer, "%sAlloc failures : %d\n", space, pool->event_alloc_failures); }
3 3 2 18 18 8 16 11 7 18 6 1 1 2 2 2 2 5 1 6 15 14 15 1 15 1 15 4 4 11 69 19 4 5 4 4 3 3 6 5 4 1 9 6 5 38 6 1 36 27 1 3 7 3 3 5 1 5 1 2 2 1 3 13 15 15 24 27 26 10 14 20 11 10 5 11 15 8 18 11 15 3 4 62 28 1 58 1 57 41 2 4 5 4 26 30 21 1 7 1 7 31 2 19 8 7 27 8 8 7 25 32 5 1 4 4 4 133 2 1 3 3 103 2 4 4 2 4 38 62 6 5 1 9 1 5 1 7 4 3 7 4 4 3 1 2060 2016 143 66 7 2 2 11 1 10 10 3 7 10 10 1 1 3 10 3 7 10 8 2 1 1 1 1 3 4 24 3 21 3 21 5 16 1 119 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* * bcm.c - Broadcast Manager to filter/send (cyclic) CAN content * * Copyright (c) 2002-2017 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */ #include <linux/module.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/hrtimer.h> #include <linux/list.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/uio.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/socket.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/can/bcm.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <net/sock.h> #include <net/net_namespace.h> /* * To send multiple CAN frame content within TX_SETUP or to filter * CAN messages with multiplex index within RX_SETUP, the number of * different filters is limited to 256 due to the one byte index value. */ #define MAX_NFRAMES 256 /* limit timers to 400 days for sending/timeouts */ #define BCM_TIMER_SEC_MAX (400 * 24 * 60 * 60) /* use of last_frames[index].flags */ #define RX_LOCAL 0x10 /* frame was created on the local host */ #define RX_OWN 0x20 /* frame was sent via the socket it was received on */ #define RX_RECV 0x40 /* received data for this element */ #define RX_THR 0x80 /* element not been sent due to throttle feature */ #define BCM_CAN_FLAGS_MASK 0x0F /* to clean private flags after usage */ /* get best masking value for can_rx_register() for a given single can_id */ #define REGMASK(id) ((id & CAN_EFF_FLAG) ? \ (CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \ (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG)) MODULE_DESCRIPTION("PF_CAN broadcast manager protocol"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Oliver Hartkopp <oliver.hartkopp@volkswagen.de>"); MODULE_ALIAS("can-proto-2"); #define BCM_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_ifindex) /* * easy access to the first 64 bit of can(fd)_frame payload. cp->data is * 64 bit aligned so the offset has to be multiples of 8 which is ensured * by the only callers in bcm_rx_cmp_to_index() bcm_rx_handler(). */ static inline u64 get_u64(const struct canfd_frame *cp, int offset) { return *(u64 *)(cp->data + offset); } struct bcm_op { struct list_head list; struct rcu_head rcu; int ifindex; canid_t can_id; u32 flags; unsigned long frames_abs, frames_filtered; struct bcm_timeval ival1, ival2; struct hrtimer timer, thrtimer; ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg; int rx_ifindex; int cfsiz; u32 count; u32 nframes; u32 currframe; /* void pointers to arrays of struct can[fd]_frame */ void *frames; void *last_frames; struct canfd_frame sframe; struct canfd_frame last_sframe; struct sock *sk; struct net_device *rx_reg_dev; spinlock_t bcm_tx_lock; /* protect currframe/count in runtime updates */ }; struct bcm_sock { struct sock sk; int bound; int ifindex; struct list_head notifier; struct list_head rx_ops; struct list_head tx_ops; unsigned long dropped_usr_msgs; struct proc_dir_entry *bcm_proc_read; char procname [32]; /* inode number in decimal with \0 */ }; static LIST_HEAD(bcm_notifier_list); static DEFINE_SPINLOCK(bcm_notifier_lock); static struct bcm_sock *bcm_busy_notifier; /* Return pointer to store the extra msg flags for bcm_recvmsg(). * We use the space of one unsigned int beyond the 'struct sockaddr_can' * in skb->cb. */ static inline unsigned int *bcm_flags(struct sk_buff *skb) { /* return pointer after struct sockaddr_can */ return (unsigned int *)(&((struct sockaddr_can *)skb->cb)[1]); } static inline struct bcm_sock *bcm_sk(const struct sock *sk) { return (struct bcm_sock *)sk; } static inline ktime_t bcm_timeval_to_ktime(struct bcm_timeval tv) { return ktime_set(tv.tv_sec, tv.tv_usec * NSEC_PER_USEC); } /* check limitations for timeval provided by user */ static bool bcm_is_invalid_tv(struct bcm_msg_head *msg_head) { if ((msg_head->ival1.tv_sec < 0) || (msg_head->ival1.tv_sec > BCM_TIMER_SEC_MAX) || (msg_head->ival1.tv_usec < 0) || (msg_head->ival1.tv_usec >= USEC_PER_SEC) || (msg_head->ival2.tv_sec < 0) || (msg_head->ival2.tv_sec > BCM_TIMER_SEC_MAX) || (msg_head->ival2.tv_usec < 0) || (msg_head->ival2.tv_usec >= USEC_PER_SEC)) return true; return false; } #define CFSIZ(flags) ((flags & CAN_FD_FRAME) ? CANFD_MTU : CAN_MTU) #define OPSIZ sizeof(struct bcm_op) #define MHSIZ sizeof(struct bcm_msg_head) /* * procfs functions */ #if IS_ENABLED(CONFIG_PROC_FS) static char *bcm_proc_getifname(struct net *net, char *result, int ifindex) { struct net_device *dev; if (!ifindex) return "any"; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (dev) strcpy(result, dev->name); else strcpy(result, "???"); rcu_read_unlock(); return result; } static int bcm_proc_show(struct seq_file *m, void *v) { char ifname[IFNAMSIZ]; struct net *net = m->private; struct sock *sk = (struct sock *)pde_data(m->file->f_inode); struct bcm_sock *bo = bcm_sk(sk); struct bcm_op *op; seq_printf(m, ">>> socket %pK", sk->sk_socket); seq_printf(m, " / sk %pK", sk); seq_printf(m, " / bo %pK", bo); seq_printf(m, " / dropped %lu", bo->dropped_usr_msgs); seq_printf(m, " / bound %s", bcm_proc_getifname(net, ifname, bo->ifindex)); seq_printf(m, " <<<\n"); rcu_read_lock(); list_for_each_entry_rcu(op, &bo->rx_ops, list) { unsigned long reduction; /* print only active entries & prevent division by zero */ if (!op->frames_abs) continue; seq_printf(m, "rx_op: %03X %-5s ", op->can_id, bcm_proc_getifname(net, ifname, op->ifindex)); if (op->flags & CAN_FD_FRAME) seq_printf(m, "(%u)", op->nframes); else seq_printf(m, "[%u]", op->nframes); seq_printf(m, "%c ", (op->flags & RX_CHECK_DLC) ? 'd' : ' '); if (op->kt_ival1) seq_printf(m, "timeo=%lld ", (long long)ktime_to_us(op->kt_ival1)); if (op->kt_ival2) seq_printf(m, "thr=%lld ", (long long)ktime_to_us(op->kt_ival2)); seq_printf(m, "# recv %ld (%ld) => reduction: ", op->frames_filtered, op->frames_abs); reduction = 100 - (op->frames_filtered * 100) / op->frames_abs; seq_printf(m, "%s%ld%%\n", (reduction == 100) ? "near " : "", reduction); } list_for_each_entry(op, &bo->tx_ops, list) { seq_printf(m, "tx_op: %03X %s ", op->can_id, bcm_proc_getifname(net, ifname, op->ifindex)); if (op->flags & CAN_FD_FRAME) seq_printf(m, "(%u) ", op->nframes); else seq_printf(m, "[%u] ", op->nframes); if (op->kt_ival1) seq_printf(m, "t1=%lld ", (long long)ktime_to_us(op->kt_ival1)); if (op->kt_ival2) seq_printf(m, "t2=%lld ", (long long)ktime_to_us(op->kt_ival2)); seq_printf(m, "# sent %ld\n", op->frames_abs); } seq_putc(m, '\n'); rcu_read_unlock(); return 0; } #endif /* CONFIG_PROC_FS */ /* * bcm_can_tx - send the (next) CAN frame to the appropriate CAN interface * of the given bcm tx op */ static void bcm_can_tx(struct bcm_op *op) { struct sk_buff *skb; struct net_device *dev; struct canfd_frame *cf; int err; /* no target device? => exit */ if (!op->ifindex) return; /* read currframe under lock protection */ spin_lock_bh(&op->bcm_tx_lock); cf = op->frames + op->cfsiz * op->currframe; spin_unlock_bh(&op->bcm_tx_lock); dev = dev_get_by_index(sock_net(op->sk), op->ifindex); if (!dev) { /* RFC: should this bcm_op remove itself here? */ return; } skb = alloc_skb(op->cfsiz + sizeof(struct can_skb_priv), gfp_any()); if (!skb) goto out; can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; skb_put_data(skb, cf, op->cfsiz); /* send with loopback */ skb->dev = dev; can_skb_set_owner(skb, op->sk); err = can_send(skb, 1); /* update currframe and count under lock protection */ spin_lock_bh(&op->bcm_tx_lock); if (!err) op->frames_abs++; op->currframe++; /* reached last frame? */ if (op->currframe >= op->nframes) op->currframe = 0; if (op->count > 0) op->count--; spin_unlock_bh(&op->bcm_tx_lock); out: dev_put(dev); } /* * bcm_send_to_user - send a BCM message to the userspace * (consisting of bcm_msg_head + x CAN frames) */ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head, struct canfd_frame *frames, int has_timestamp) { struct sk_buff *skb; struct canfd_frame *firstframe; struct sockaddr_can *addr; struct sock *sk = op->sk; unsigned int datalen = head->nframes * op->cfsiz; int err; unsigned int *pflags; enum skb_drop_reason reason; skb = alloc_skb(sizeof(*head) + datalen, gfp_any()); if (!skb) return; skb_put_data(skb, head, sizeof(*head)); /* ensure space for sockaddr_can and msg flags */ sock_skb_cb_check_size(sizeof(struct sockaddr_can) + sizeof(unsigned int)); /* initialize msg flags */ pflags = bcm_flags(skb); *pflags = 0; if (head->nframes) { /* CAN frames starting here */ firstframe = (struct canfd_frame *)skb_tail_pointer(skb); skb_put_data(skb, frames, datalen); /* * the BCM uses the flags-element of the canfd_frame * structure for internal purposes. This is only * relevant for updates that are generated by the * BCM, where nframes is 1 */ if (head->nframes == 1) { if (firstframe->flags & RX_LOCAL) *pflags |= MSG_DONTROUTE; if (firstframe->flags & RX_OWN) *pflags |= MSG_CONFIRM; firstframe->flags &= BCM_CAN_FLAGS_MASK; } } if (has_timestamp) { /* restore rx timestamp */ skb->tstamp = op->rx_stamp; } /* * Put the datagram to the queue so that bcm_recvmsg() can * get it from there. We need to pass the interface index to * bcm_recvmsg(). We pass a whole struct sockaddr_can in skb->cb * containing the interface index. */ addr = (struct sockaddr_can *)skb->cb; memset(addr, 0, sizeof(*addr)); addr->can_family = AF_CAN; addr->can_ifindex = op->rx_ifindex; err = sock_queue_rcv_skb_reason(sk, skb, &reason); if (err < 0) { struct bcm_sock *bo = bcm_sk(sk); sk_skb_reason_drop(sk, skb, reason); /* don't care about overflows in this statistic */ bo->dropped_usr_msgs++; } } static bool bcm_tx_set_expiry(struct bcm_op *op, struct hrtimer *hrt) { ktime_t ival; if (op->kt_ival1 && op->count) ival = op->kt_ival1; else if (op->kt_ival2) ival = op->kt_ival2; else return false; hrtimer_set_expires(hrt, ktime_add(ktime_get(), ival)); return true; } static void bcm_tx_start_timer(struct bcm_op *op) { if (bcm_tx_set_expiry(op, &op->timer)) hrtimer_start_expires(&op->timer, HRTIMER_MODE_ABS_SOFT); } /* bcm_tx_timeout_handler - performs cyclic CAN frame transmissions */ static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer) { struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer); struct bcm_msg_head msg_head; if (op->kt_ival1 && (op->count > 0)) { bcm_can_tx(op); if (!op->count && (op->flags & TX_COUNTEVT)) { /* create notification to user */ memset(&msg_head, 0, sizeof(msg_head)); msg_head.opcode = TX_EXPIRED; msg_head.flags = op->flags; msg_head.count = op->count; msg_head.ival1 = op->ival1; msg_head.ival2 = op->ival2; msg_head.can_id = op->can_id; msg_head.nframes = 0; bcm_send_to_user(op, &msg_head, NULL, 0); } } else if (op->kt_ival2) { bcm_can_tx(op); } return bcm_tx_set_expiry(op, &op->timer) ? HRTIMER_RESTART : HRTIMER_NORESTART; } /* * bcm_rx_changed - create a RX_CHANGED notification due to changed content */ static void bcm_rx_changed(struct bcm_op *op, struct canfd_frame *data) { struct bcm_msg_head head; /* update statistics */ op->frames_filtered++; /* prevent statistics overflow */ if (op->frames_filtered > ULONG_MAX/100) op->frames_filtered = op->frames_abs = 0; /* this element is not throttled anymore */ data->flags &= ~RX_THR; memset(&head, 0, sizeof(head)); head.opcode = RX_CHANGED; head.flags = op->flags; head.count = op->count; head.ival1 = op->ival1; head.ival2 = op->ival2; head.can_id = op->can_id; head.nframes = 1; bcm_send_to_user(op, &head, data, 1); } /* * bcm_rx_update_and_send - process a detected relevant receive content change * 1. update the last received data * 2. send a notification to the user (if possible) */ static void bcm_rx_update_and_send(struct bcm_op *op, struct canfd_frame *lastdata, const struct canfd_frame *rxdata, unsigned char traffic_flags) { memcpy(lastdata, rxdata, op->cfsiz); /* mark as used and throttled by default */ lastdata->flags |= (RX_RECV|RX_THR); /* add own/local/remote traffic flags */ lastdata->flags |= traffic_flags; /* throttling mode inactive ? */ if (!op->kt_ival2) { /* send RX_CHANGED to the user immediately */ bcm_rx_changed(op, lastdata); return; } /* with active throttling timer we are just done here */ if (hrtimer_active(&op->thrtimer)) return; /* first reception with enabled throttling mode */ if (!op->kt_lastmsg) goto rx_changed_settime; /* got a second frame inside a potential throttle period? */ if (ktime_us_delta(ktime_get(), op->kt_lastmsg) < ktime_to_us(op->kt_ival2)) { /* do not send the saved data - only start throttle timer */ hrtimer_start(&op->thrtimer, ktime_add(op->kt_lastmsg, op->kt_ival2), HRTIMER_MODE_ABS_SOFT); return; } /* the gap was that big, that throttling was not needed here */ rx_changed_settime: bcm_rx_changed(op, lastdata); op->kt_lastmsg = ktime_get(); } /* * bcm_rx_cmp_to_index - (bit)compares the currently received data to formerly * received data stored in op->last_frames[] */ static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index, const struct canfd_frame *rxdata, unsigned char traffic_flags) { struct canfd_frame *cf = op->frames + op->cfsiz * index; struct canfd_frame *lcf = op->last_frames + op->cfsiz * index; int i; /* * no one uses the MSBs of flags for comparison, * so we use it here to detect the first time of reception */ if (!(lcf->flags & RX_RECV)) { /* received data for the first time => send update to user */ bcm_rx_update_and_send(op, lcf, rxdata, traffic_flags); return; } /* do a real check in CAN frame data section */ for (i = 0; i < rxdata->len; i += 8) { if ((get_u64(cf, i) & get_u64(rxdata, i)) != (get_u64(cf, i) & get_u64(lcf, i))) { bcm_rx_update_and_send(op, lcf, rxdata, traffic_flags); return; } } if (op->flags & RX_CHECK_DLC) { /* do a real check in CAN frame length */ if (rxdata->len != lcf->len) { bcm_rx_update_and_send(op, lcf, rxdata, traffic_flags); return; } } } /* * bcm_rx_starttimer - enable timeout monitoring for CAN frame reception */ static void bcm_rx_starttimer(struct bcm_op *op) { if (op->flags & RX_NO_AUTOTIMER) return; if (op->kt_ival1) hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL_SOFT); } /* bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out */ static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer) { struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer); struct bcm_msg_head msg_head; /* if user wants to be informed, when cyclic CAN-Messages come back */ if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) { /* clear received CAN frames to indicate 'nothing received' */ memset(op->last_frames, 0, op->nframes * op->cfsiz); } /* create notification to user */ memset(&msg_head, 0, sizeof(msg_head)); msg_head.opcode = RX_TIMEOUT; msg_head.flags = op->flags; msg_head.count = op->count; msg_head.ival1 = op->ival1; msg_head.ival2 = op->ival2; msg_head.can_id = op->can_id; msg_head.nframes = 0; bcm_send_to_user(op, &msg_head, NULL, 0); return HRTIMER_NORESTART; } /* * bcm_rx_do_flush - helper for bcm_rx_thr_flush */ static inline int bcm_rx_do_flush(struct bcm_op *op, unsigned int index) { struct canfd_frame *lcf = op->last_frames + op->cfsiz * index; if ((op->last_frames) && (lcf->flags & RX_THR)) { bcm_rx_changed(op, lcf); return 1; } return 0; } /* * bcm_rx_thr_flush - Check for throttled data and send it to the userspace */ static int bcm_rx_thr_flush(struct bcm_op *op) { int updated = 0; if (op->nframes > 1) { unsigned int i; /* for MUX filter we start at index 1 */ for (i = 1; i < op->nframes; i++) updated += bcm_rx_do_flush(op, i); } else { /* for RX_FILTER_ID and simple filter */ updated += bcm_rx_do_flush(op, 0); } return updated; } /* * bcm_rx_thr_handler - the time for blocked content updates is over now: * Check for throttled data and send it to the userspace */ static enum hrtimer_restart bcm_rx_thr_handler(struct hrtimer *hrtimer) { struct bcm_op *op = container_of(hrtimer, struct bcm_op, thrtimer); if (bcm_rx_thr_flush(op)) { hrtimer_forward_now(hrtimer, op->kt_ival2); return HRTIMER_RESTART; } else { /* rearm throttle handling */ op->kt_lastmsg = 0; return HRTIMER_NORESTART; } } /* * bcm_rx_handler - handle a CAN frame reception */ static void bcm_rx_handler(struct sk_buff *skb, void *data) { struct bcm_op *op = (struct bcm_op *)data; const struct canfd_frame *rxframe = (struct canfd_frame *)skb->data; unsigned int i; unsigned char traffic_flags; if (op->can_id != rxframe->can_id) return; /* make sure to handle the correct frame type (CAN / CAN FD) */ if (op->flags & CAN_FD_FRAME) { if (!can_is_canfd_skb(skb)) return; } else { if (!can_is_can_skb(skb)) return; } /* disable timeout */ hrtimer_cancel(&op->timer); /* save rx timestamp */ op->rx_stamp = skb->tstamp; /* save originator for recvfrom() */ op->rx_ifindex = skb->dev->ifindex; /* update statistics */ op->frames_abs++; if (op->flags & RX_RTR_FRAME) { /* send reply for RTR-request (placed in op->frames[0]) */ bcm_can_tx(op); return; } /* compute flags to distinguish between own/local/remote CAN traffic */ traffic_flags = 0; if (skb->sk) { traffic_flags |= RX_LOCAL; if (skb->sk == op->sk) traffic_flags |= RX_OWN; } if (op->flags & RX_FILTER_ID) { /* the easiest case */ bcm_rx_update_and_send(op, op->last_frames, rxframe, traffic_flags); goto rx_starttimer; } if (op->nframes == 1) { /* simple compare with index 0 */ bcm_rx_cmp_to_index(op, 0, rxframe, traffic_flags); goto rx_starttimer; } if (op->nframes > 1) { /* * multiplex compare * * find the first multiplex mask that fits. * Remark: The MUX-mask is stored in index 0 - but only the * first 64 bits of the frame data[] are relevant (CAN FD) */ for (i = 1; i < op->nframes; i++) { if ((get_u64(op->frames, 0) & get_u64(rxframe, 0)) == (get_u64(op->frames, 0) & get_u64(op->frames + op->cfsiz * i, 0))) { bcm_rx_cmp_to_index(op, i, rxframe, traffic_flags); break; } } } rx_starttimer: bcm_rx_starttimer(op); } /* * helpers for bcm_op handling: find & delete bcm [rx|tx] op elements */ static struct bcm_op *bcm_find_op(struct list_head *ops, struct bcm_msg_head *mh, int ifindex) { struct bcm_op *op; list_for_each_entry(op, ops, list) { if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) && (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) return op; } return NULL; } static void bcm_free_op_rcu(struct rcu_head *rcu_head) { struct bcm_op *op = container_of(rcu_head, struct bcm_op, rcu); if ((op->frames) && (op->frames != &op->sframe)) kfree(op->frames); if ((op->last_frames) && (op->last_frames != &op->last_sframe)) kfree(op->last_frames); kfree(op); } static void bcm_remove_op(struct bcm_op *op) { hrtimer_cancel(&op->timer); hrtimer_cancel(&op->thrtimer); call_rcu(&op->rcu, bcm_free_op_rcu); } static void bcm_rx_unreg(struct net_device *dev, struct bcm_op *op) { if (op->rx_reg_dev == dev) { can_rx_unregister(dev_net(dev), dev, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op); /* mark as removed subscription */ op->rx_reg_dev = NULL; } else printk(KERN_ERR "can-bcm: bcm_rx_unreg: registered device " "mismatch %p %p\n", op->rx_reg_dev, dev); } /* * bcm_delete_rx_op - find and remove a rx op (returns number of removed ops) */ static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh, int ifindex) { struct bcm_op *op, *n; list_for_each_entry_safe(op, n, ops, list) { if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) && (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) { /* disable automatic timer on frame reception */ op->flags |= RX_NO_AUTOTIMER; /* * Don't care if we're bound or not (due to netdev * problems) can_rx_unregister() is always a save * thing to do here. */ if (op->ifindex) { /* * Only remove subscriptions that had not * been removed due to NETDEV_UNREGISTER * in bcm_notifier() */ if (op->rx_reg_dev) { struct net_device *dev; dev = dev_get_by_index(sock_net(op->sk), op->ifindex); if (dev) { bcm_rx_unreg(dev, op); dev_put(dev); } } } else can_rx_unregister(sock_net(op->sk), NULL, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op); list_del_rcu(&op->list); bcm_remove_op(op); return 1; /* done */ } } return 0; /* not found */ } /* * bcm_delete_tx_op - find and remove a tx op (returns number of removed ops) */ static int bcm_delete_tx_op(struct list_head *ops, struct bcm_msg_head *mh, int ifindex) { struct bcm_op *op, *n; list_for_each_entry_safe(op, n, ops, list) { if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) && (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) { list_del_rcu(&op->list); bcm_remove_op(op); return 1; /* done */ } } return 0; /* not found */ } /* * bcm_read_op - read out a bcm_op and send it to the user (for bcm_sendmsg) */ static int bcm_read_op(struct list_head *ops, struct bcm_msg_head *msg_head, int ifindex) { struct bcm_op *op = bcm_find_op(ops, msg_head, ifindex); if (!op) return -EINVAL; /* put current values into msg_head */ msg_head->flags = op->flags; msg_head->count = op->count; msg_head->ival1 = op->ival1; msg_head->ival2 = op->ival2; msg_head->nframes = op->nframes; bcm_send_to_user(op, msg_head, op->frames, 0); return MHSIZ; } /* * bcm_tx_setup - create or update a bcm tx op (for bcm_sendmsg) */ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, int ifindex, struct sock *sk) { struct bcm_sock *bo = bcm_sk(sk); struct bcm_op *op; struct canfd_frame *cf; unsigned int i; int err; /* we need a real device to send frames */ if (!ifindex) return -ENODEV; /* check nframes boundaries - we need at least one CAN frame */ if (msg_head->nframes < 1 || msg_head->nframes > MAX_NFRAMES) return -EINVAL; /* check timeval limitations */ if ((msg_head->flags & SETTIMER) && bcm_is_invalid_tv(msg_head)) return -EINVAL; /* check the given can_id */ op = bcm_find_op(&bo->tx_ops, msg_head, ifindex); if (op) { /* update existing BCM operation */ /* * Do we need more space for the CAN frames than currently * allocated? -> This is a _really_ unusual use-case and * therefore (complexity / locking) it is not supported. */ if (msg_head->nframes > op->nframes) return -E2BIG; /* update CAN frames content */ for (i = 0; i < msg_head->nframes; i++) { cf = op->frames + op->cfsiz * i; err = memcpy_from_msg((u8 *)cf, msg, op->cfsiz); if (op->flags & CAN_FD_FRAME) { if (cf->len > 64) err = -EINVAL; } else { if (cf->len > 8) err = -EINVAL; } if (err < 0) return err; if (msg_head->flags & TX_CP_CAN_ID) { /* copy can_id into frame */ cf->can_id = msg_head->can_id; } } op->flags = msg_head->flags; /* only lock for unlikely count/nframes/currframe changes */ if (op->nframes != msg_head->nframes || op->flags & TX_RESET_MULTI_IDX || op->flags & SETTIMER) { spin_lock_bh(&op->bcm_tx_lock); if (op->nframes != msg_head->nframes || op->flags & TX_RESET_MULTI_IDX) { /* potentially update changed nframes */ op->nframes = msg_head->nframes; /* restart multiple frame transmission */ op->currframe = 0; } if (op->flags & SETTIMER) op->count = msg_head->count; spin_unlock_bh(&op->bcm_tx_lock); } } else { /* insert new BCM operation for the given can_id */ op = kzalloc(OPSIZ, GFP_KERNEL); if (!op) return -ENOMEM; spin_lock_init(&op->bcm_tx_lock); op->can_id = msg_head->can_id; op->cfsiz = CFSIZ(msg_head->flags); op->flags = msg_head->flags; op->nframes = msg_head->nframes; if (op->flags & SETTIMER) op->count = msg_head->count; /* create array for CAN frames and copy the data */ if (msg_head->nframes > 1) { op->frames = kmalloc_array(msg_head->nframes, op->cfsiz, GFP_KERNEL); if (!op->frames) { kfree(op); return -ENOMEM; } } else op->frames = &op->sframe; for (i = 0; i < msg_head->nframes; i++) { cf = op->frames + op->cfsiz * i; err = memcpy_from_msg((u8 *)cf, msg, op->cfsiz); if (err < 0) goto free_op; if (op->flags & CAN_FD_FRAME) { if (cf->len > 64) err = -EINVAL; } else { if (cf->len > 8) err = -EINVAL; } if (err < 0) goto free_op; if (msg_head->flags & TX_CP_CAN_ID) { /* copy can_id into frame */ cf->can_id = msg_head->can_id; } } /* tx_ops never compare with previous received messages */ op->last_frames = NULL; /* bcm_can_tx / bcm_tx_timeout_handler needs this */ op->sk = sk; op->ifindex = ifindex; /* initialize uninitialized (kzalloc) structure */ hrtimer_setup(&op->timer, bcm_tx_timeout_handler, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); /* currently unused in tx_ops */ hrtimer_setup(&op->thrtimer, hrtimer_dummy_timeout, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); /* add this bcm_op to the list of the tx_ops */ list_add(&op->list, &bo->tx_ops); } /* if ((op = bcm_find_op(&bo->tx_ops, msg_head->can_id, ifindex))) */ if (op->flags & SETTIMER) { /* set timer values */ op->ival1 = msg_head->ival1; op->ival2 = msg_head->ival2; op->kt_ival1 = bcm_timeval_to_ktime(msg_head->ival1); op->kt_ival2 = bcm_timeval_to_ktime(msg_head->ival2); /* disable an active timer due to zero values? */ if (!op->kt_ival1 && !op->kt_ival2) hrtimer_cancel(&op->timer); } if (op->flags & STARTTIMER) { hrtimer_cancel(&op->timer); /* spec: send CAN frame when starting timer */ op->flags |= TX_ANNOUNCE; } if (op->flags & TX_ANNOUNCE) bcm_can_tx(op); if (op->flags & STARTTIMER) bcm_tx_start_timer(op); return msg_head->nframes * op->cfsiz + MHSIZ; free_op: if (op->frames != &op->sframe) kfree(op->frames); kfree(op); return err; } /* * bcm_rx_setup - create or update a bcm rx op (for bcm_sendmsg) */ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, int ifindex, struct sock *sk) { struct bcm_sock *bo = bcm_sk(sk); struct bcm_op *op; int do_rx_register; int err = 0; if ((msg_head->flags & RX_FILTER_ID) || (!(msg_head->nframes))) { /* be robust against wrong usage ... */ msg_head->flags |= RX_FILTER_ID; /* ignore trailing garbage */ msg_head->nframes = 0; } /* the first element contains the mux-mask => MAX_NFRAMES + 1 */ if (msg_head->nframes > MAX_NFRAMES + 1) return -EINVAL; if ((msg_head->flags & RX_RTR_FRAME) && ((msg_head->nframes != 1) || (!(msg_head->can_id & CAN_RTR_FLAG)))) return -EINVAL; /* check timeval limitations */ if ((msg_head->flags & SETTIMER) && bcm_is_invalid_tv(msg_head)) return -EINVAL; /* check the given can_id */ op = bcm_find_op(&bo->rx_ops, msg_head, ifindex); if (op) { /* update existing BCM operation */ /* * Do we need more space for the CAN frames than currently * allocated? -> This is a _really_ unusual use-case and * therefore (complexity / locking) it is not supported. */ if (msg_head->nframes > op->nframes) return -E2BIG; if (msg_head->nframes) { /* update CAN frames content */ err = memcpy_from_msg(op->frames, msg, msg_head->nframes * op->cfsiz); if (err < 0) return err; /* clear last_frames to indicate 'nothing received' */ memset(op->last_frames, 0, msg_head->nframes * op->cfsiz); } op->nframes = msg_head->nframes; op->flags = msg_head->flags; /* Only an update -> do not call can_rx_register() */ do_rx_register = 0; } else { /* insert new BCM operation for the given can_id */ op = kzalloc(OPSIZ, GFP_KERNEL); if (!op) return -ENOMEM; op->can_id = msg_head->can_id; op->nframes = msg_head->nframes; op->cfsiz = CFSIZ(msg_head->flags); op->flags = msg_head->flags; if (msg_head->nframes > 1) { /* create array for CAN frames and copy the data */ op->frames = kmalloc_array(msg_head->nframes, op->cfsiz, GFP_KERNEL); if (!op->frames) { kfree(op); return -ENOMEM; } /* create and init array for received CAN frames */ op->last_frames = kcalloc(msg_head->nframes, op->cfsiz, GFP_KERNEL); if (!op->last_frames) { kfree(op->frames); kfree(op); return -ENOMEM; } } else { op->frames = &op->sframe; op->last_frames = &op->last_sframe; } if (msg_head->nframes) { err = memcpy_from_msg(op->frames, msg, msg_head->nframes * op->cfsiz); if (err < 0) { if (op->frames != &op->sframe) kfree(op->frames); if (op->last_frames != &op->last_sframe) kfree(op->last_frames); kfree(op); return err; } } /* bcm_can_tx / bcm_tx_timeout_handler needs this */ op->sk = sk; op->ifindex = ifindex; /* ifindex for timeout events w/o previous frame reception */ op->rx_ifindex = ifindex; /* initialize uninitialized (kzalloc) structure */ hrtimer_setup(&op->timer, bcm_rx_timeout_handler, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); hrtimer_setup(&op->thrtimer, bcm_rx_thr_handler, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); /* add this bcm_op to the list of the rx_ops */ list_add(&op->list, &bo->rx_ops); /* call can_rx_register() */ do_rx_register = 1; } /* if ((op = bcm_find_op(&bo->rx_ops, msg_head->can_id, ifindex))) */ /* check flags */ if (op->flags & RX_RTR_FRAME) { struct canfd_frame *frame0 = op->frames; /* no timers in RTR-mode */ hrtimer_cancel(&op->thrtimer); hrtimer_cancel(&op->timer); /* * funny feature in RX(!)_SETUP only for RTR-mode: * copy can_id into frame BUT without RTR-flag to * prevent a full-load-loopback-test ... ;-] */ if ((op->flags & TX_CP_CAN_ID) || (frame0->can_id == op->can_id)) frame0->can_id = op->can_id & ~CAN_RTR_FLAG; } else { if (op->flags & SETTIMER) { /* set timer value */ op->ival1 = msg_head->ival1; op->ival2 = msg_head->ival2; op->kt_ival1 = bcm_timeval_to_ktime(msg_head->ival1); op->kt_ival2 = bcm_timeval_to_ktime(msg_head->ival2); /* disable an active timer due to zero value? */ if (!op->kt_ival1) hrtimer_cancel(&op->timer); /* * In any case cancel the throttle timer, flush * potentially blocked msgs and reset throttle handling */ op->kt_lastmsg = 0; hrtimer_cancel(&op->thrtimer); bcm_rx_thr_flush(op); } if ((op->flags & STARTTIMER) && op->kt_ival1) hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL_SOFT); } /* now we can register for can_ids, if we added a new bcm_op */ if (do_rx_register) { if (ifindex) { struct net_device *dev; dev = dev_get_by_index(sock_net(sk), ifindex); if (dev) { err = can_rx_register(sock_net(sk), dev, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op, "bcm", sk); op->rx_reg_dev = dev; dev_put(dev); } } else err = can_rx_register(sock_net(sk), NULL, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op, "bcm", sk); if (err) { /* this bcm rx op is broken -> remove it */ list_del_rcu(&op->list); bcm_remove_op(op); return err; } } return msg_head->nframes * op->cfsiz + MHSIZ; } /* * bcm_tx_send - send a single CAN frame to the CAN interface (for bcm_sendmsg) */ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk, int cfsiz) { struct sk_buff *skb; struct net_device *dev; int err; /* we need a real device to send frames */ if (!ifindex) return -ENODEV; skb = alloc_skb(cfsiz + sizeof(struct can_skb_priv), GFP_KERNEL); if (!skb) return -ENOMEM; can_skb_reserve(skb); err = memcpy_from_msg(skb_put(skb, cfsiz), msg, cfsiz); if (err < 0) { kfree_skb(skb); return err; } dev = dev_get_by_index(sock_net(sk), ifindex); if (!dev) { kfree_skb(skb); return -ENODEV; } can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; skb->dev = dev; can_skb_set_owner(skb, sk); err = can_send(skb, 1); /* send with loopback */ dev_put(dev); if (err) return err; return cfsiz + MHSIZ; } /* * bcm_sendmsg - process BCM commands (opcodes) from the userspace */ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct bcm_sock *bo = bcm_sk(sk); int ifindex = bo->ifindex; /* default ifindex for this bcm_op */ struct bcm_msg_head msg_head; int cfsiz; int ret; /* read bytes or error codes as return value */ if (!bo->bound) return -ENOTCONN; /* check for valid message length from userspace */ if (size < MHSIZ) return -EINVAL; /* read message head information */ ret = memcpy_from_msg((u8 *)&msg_head, msg, MHSIZ); if (ret < 0) return ret; cfsiz = CFSIZ(msg_head.flags); if ((size - MHSIZ) % cfsiz) return -EINVAL; /* check for alternative ifindex for this bcm_op */ if (!ifindex && msg->msg_name) { /* no bound device as default => check msg_name */ DECLARE_SOCKADDR(struct sockaddr_can *, addr, msg->msg_name); if (msg->msg_namelen < BCM_MIN_NAMELEN) return -EINVAL; if (addr->can_family != AF_CAN) return -EINVAL; /* ifindex from sendto() */ ifindex = addr->can_ifindex; if (ifindex) { struct net_device *dev; dev = dev_get_by_index(sock_net(sk), ifindex); if (!dev) return -ENODEV; if (dev->type != ARPHRD_CAN) { dev_put(dev); return -ENODEV; } dev_put(dev); } } lock_sock(sk); switch (msg_head.opcode) { case TX_SETUP: ret = bcm_tx_setup(&msg_head, msg, ifindex, sk); break; case RX_SETUP: ret = bcm_rx_setup(&msg_head, msg, ifindex, sk); break; case TX_DELETE: if (bcm_delete_tx_op(&bo->tx_ops, &msg_head, ifindex)) ret = MHSIZ; else ret = -EINVAL; break; case RX_DELETE: if (bcm_delete_rx_op(&bo->rx_ops, &msg_head, ifindex)) ret = MHSIZ; else ret = -EINVAL; break; case TX_READ: /* reuse msg_head for the reply to TX_READ */ msg_head.opcode = TX_STATUS; ret = bcm_read_op(&bo->tx_ops, &msg_head, ifindex); break; case RX_READ: /* reuse msg_head for the reply to RX_READ */ msg_head.opcode = RX_STATUS; ret = bcm_read_op(&bo->rx_ops, &msg_head, ifindex); break; case TX_SEND: /* we need exactly one CAN frame behind the msg head */ if ((msg_head.nframes != 1) || (size != cfsiz + MHSIZ)) ret = -EINVAL; else ret = bcm_tx_send(msg, ifindex, sk, cfsiz); break; default: ret = -EINVAL; break; } release_sock(sk); return ret; } /* * notification handler for netdevice status changes */ static void bcm_notify(struct bcm_sock *bo, unsigned long msg, struct net_device *dev) { struct sock *sk = &bo->sk; struct bcm_op *op; int notify_enodev = 0; if (!net_eq(dev_net(dev), sock_net(sk))) return; switch (msg) { case NETDEV_UNREGISTER: lock_sock(sk); /* remove device specific receive entries */ list_for_each_entry(op, &bo->rx_ops, list) if (op->rx_reg_dev == dev) bcm_rx_unreg(dev, op); /* remove device reference, if this is our bound device */ if (bo->bound && bo->ifindex == dev->ifindex) { #if IS_ENABLED(CONFIG_PROC_FS) if (sock_net(sk)->can.bcmproc_dir && bo->bcm_proc_read) { remove_proc_entry(bo->procname, sock_net(sk)->can.bcmproc_dir); bo->bcm_proc_read = NULL; } #endif bo->bound = 0; bo->ifindex = 0; notify_enodev = 1; } release_sock(sk); if (notify_enodev) { sk->sk_err = ENODEV; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } break; case NETDEV_DOWN: if (bo->bound && bo->ifindex == dev->ifindex) { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } } } static int bcm_notifier(struct notifier_block *nb, unsigned long msg, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (dev->type != ARPHRD_CAN) return NOTIFY_DONE; if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN) return NOTIFY_DONE; if (unlikely(bcm_busy_notifier)) /* Check for reentrant bug. */ return NOTIFY_DONE; spin_lock(&bcm_notifier_lock); list_for_each_entry(bcm_busy_notifier, &bcm_notifier_list, notifier) { spin_unlock(&bcm_notifier_lock); bcm_notify(bcm_busy_notifier, msg, dev); spin_lock(&bcm_notifier_lock); } bcm_busy_notifier = NULL; spin_unlock(&bcm_notifier_lock); return NOTIFY_DONE; } /* * initial settings for all BCM sockets to be set at socket creation time */ static int bcm_init(struct sock *sk) { struct bcm_sock *bo = bcm_sk(sk); bo->bound = 0; bo->ifindex = 0; bo->dropped_usr_msgs = 0; bo->bcm_proc_read = NULL; INIT_LIST_HEAD(&bo->tx_ops); INIT_LIST_HEAD(&bo->rx_ops); /* set notifier */ spin_lock(&bcm_notifier_lock); list_add_tail(&bo->notifier, &bcm_notifier_list); spin_unlock(&bcm_notifier_lock); return 0; } /* * standard socket functions */ static int bcm_release(struct socket *sock) { struct sock *sk = sock->sk; struct net *net; struct bcm_sock *bo; struct bcm_op *op, *next; if (!sk) return 0; net = sock_net(sk); bo = bcm_sk(sk); /* remove bcm_ops, timer, rx_unregister(), etc. */ spin_lock(&bcm_notifier_lock); while (bcm_busy_notifier == bo) { spin_unlock(&bcm_notifier_lock); schedule_timeout_uninterruptible(1); spin_lock(&bcm_notifier_lock); } list_del(&bo->notifier); spin_unlock(&bcm_notifier_lock); lock_sock(sk); #if IS_ENABLED(CONFIG_PROC_FS) /* remove procfs entry */ if (net->can.bcmproc_dir && bo->bcm_proc_read) remove_proc_entry(bo->procname, net->can.bcmproc_dir); #endif /* CONFIG_PROC_FS */ list_for_each_entry_safe(op, next, &bo->tx_ops, list) bcm_remove_op(op); list_for_each_entry_safe(op, next, &bo->rx_ops, list) { /* * Don't care if we're bound or not (due to netdev problems) * can_rx_unregister() is always a save thing to do here. */ if (op->ifindex) { /* * Only remove subscriptions that had not * been removed due to NETDEV_UNREGISTER * in bcm_notifier() */ if (op->rx_reg_dev) { struct net_device *dev; dev = dev_get_by_index(net, op->ifindex); if (dev) { bcm_rx_unreg(dev, op); dev_put(dev); } } } else can_rx_unregister(net, NULL, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op); } synchronize_rcu(); list_for_each_entry_safe(op, next, &bo->rx_ops, list) bcm_remove_op(op); /* remove device reference */ if (bo->bound) { bo->bound = 0; bo->ifindex = 0; } sock_orphan(sk); sock->sk = NULL; release_sock(sk); sock_prot_inuse_add(net, sk->sk_prot, -1); sock_put(sk); return 0; } static int bcm_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int len, int flags) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct bcm_sock *bo = bcm_sk(sk); struct net *net = sock_net(sk); int ret = 0; if (len < BCM_MIN_NAMELEN) return -EINVAL; lock_sock(sk); if (bo->bound) { ret = -EISCONN; goto fail; } /* bind a device to this socket */ if (addr->can_ifindex) { struct net_device *dev; dev = dev_get_by_index(net, addr->can_ifindex); if (!dev) { ret = -ENODEV; goto fail; } if (dev->type != ARPHRD_CAN) { dev_put(dev); ret = -ENODEV; goto fail; } bo->ifindex = dev->ifindex; dev_put(dev); } else { /* no interface reference for ifindex = 0 ('any' CAN device) */ bo->ifindex = 0; } #if IS_ENABLED(CONFIG_PROC_FS) if (net->can.bcmproc_dir) { /* unique socket address as filename */ sprintf(bo->procname, "%lu", sock_i_ino(sk)); bo->bcm_proc_read = proc_create_net_single(bo->procname, 0644, net->can.bcmproc_dir, bcm_proc_show, sk); if (!bo->bcm_proc_read) { ret = -ENOMEM; goto fail; } } #endif /* CONFIG_PROC_FS */ bo->bound = 1; fail: release_sock(sk); return ret; } static int bcm_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; int error = 0; int err; skb = skb_recv_datagram(sk, flags, &error); if (!skb) return error; if (skb->len < size) size = skb->len; err = memcpy_to_msg(msg, skb->data, size); if (err < 0) { skb_free_datagram(sk, skb); return err; } sock_recv_cmsgs(msg, sk, skb); if (msg->msg_name) { __sockaddr_check_size(BCM_MIN_NAMELEN); msg->msg_namelen = BCM_MIN_NAMELEN; memcpy(msg->msg_name, skb->cb, msg->msg_namelen); } /* assign the flags that have been recorded in bcm_send_to_user() */ msg->msg_flags |= *(bcm_flags(skb)); skb_free_datagram(sk, skb); return size; } static int bcm_sock_no_ioctlcmd(struct socket *sock, unsigned int cmd, unsigned long arg) { /* no ioctls for socket layer -> hand it down to NIC layer */ return -ENOIOCTLCMD; } static const struct proto_ops bcm_ops = { .family = PF_CAN, .release = bcm_release, .bind = sock_no_bind, .connect = bcm_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, .ioctl = bcm_sock_no_ioctlcmd, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .sendmsg = bcm_sendmsg, .recvmsg = bcm_recvmsg, .mmap = sock_no_mmap, }; static struct proto bcm_proto __read_mostly = { .name = "CAN_BCM", .owner = THIS_MODULE, .obj_size = sizeof(struct bcm_sock), .init = bcm_init, }; static const struct can_proto bcm_can_proto = { .type = SOCK_DGRAM, .protocol = CAN_BCM, .ops = &bcm_ops, .prot = &bcm_proto, }; static int canbcm_pernet_init(struct net *net) { #if IS_ENABLED(CONFIG_PROC_FS) /* create /proc/net/can-bcm directory */ net->can.bcmproc_dir = proc_net_mkdir(net, "can-bcm", net->proc_net); #endif /* CONFIG_PROC_FS */ return 0; } static void canbcm_pernet_exit(struct net *net) { #if IS_ENABLED(CONFIG_PROC_FS) /* remove /proc/net/can-bcm directory */ if (net->can.bcmproc_dir) remove_proc_entry("can-bcm", net->proc_net); #endif /* CONFIG_PROC_FS */ } static struct pernet_operations canbcm_pernet_ops __read_mostly = { .init = canbcm_pernet_init, .exit = canbcm_pernet_exit, }; static struct notifier_block canbcm_notifier = { .notifier_call = bcm_notifier }; static int __init bcm_module_init(void) { int err; pr_info("can: broadcast manager protocol\n"); err = register_pernet_subsys(&canbcm_pernet_ops); if (err) return err; err = register_netdevice_notifier(&canbcm_notifier); if (err) goto register_notifier_failed; err = can_proto_register(&bcm_can_proto); if (err < 0) { printk(KERN_ERR "can: registration of bcm protocol failed\n"); goto register_proto_failed; } return 0; register_proto_failed: unregister_netdevice_notifier(&canbcm_notifier); register_notifier_failed: unregister_pernet_subsys(&canbcm_pernet_ops); return err; } static void __exit bcm_module_exit(void) { can_proto_unregister(&bcm_can_proto); unregister_netdevice_notifier(&canbcm_notifier); unregister_pernet_subsys(&canbcm_pernet_ops); } module_init(bcm_module_init); module_exit(bcm_module_exit);
4 3 3 1 1 7 5 3 3 4 4 4 4 3 4 2 2 4 4 4 4 4 3 3 3 4 4 4 4 4 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPVS: Source Hashing scheduling module * * Authors: Wensong Zhang <wensong@gnuchina.org> * * Changes: */ /* * The sh algorithm is to select server by the hash key of source IP * address. The pseudo code is as follows: * * n <- servernode[src_ip]; * if (n is dead) OR * (n is overloaded) or (n.weight <= 0) then * return NULL; * * return n; * * Notes that servernode is a 256-bucket hash table that maps the hash * index derived from packet source IP address to the current server * array. If the sh scheduler is used in cache cluster, it is good to * combine it with cache_bypass feature. When the statically assigned * server is dead or overloaded, the load balancer can bypass the cache * server and send requests to the original server directly. * * The weight destination attribute can be used to control the * distribution of connections to the destinations in servernode. The * greater the weight, the more connections the destination * will receive. * */ #define pr_fmt(fmt) "IPVS: " fmt #include <linux/ip.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <net/ip_vs.h> #include <net/tcp.h> #include <linux/udp.h> #include <linux/sctp.h> /* * IPVS SH bucket */ struct ip_vs_sh_bucket { struct ip_vs_dest __rcu *dest; /* real server (cache) */ }; /* * for IPVS SH entry hash table */ #ifndef CONFIG_IP_VS_SH_TAB_BITS #define CONFIG_IP_VS_SH_TAB_BITS 8 #endif #define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS #define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS) #define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1) struct ip_vs_sh_state { struct rcu_head rcu_head; struct ip_vs_sh_bucket buckets[IP_VS_SH_TAB_SIZE]; }; /* Helper function to determine if server is unavailable */ static inline bool is_unavailable(struct ip_vs_dest *dest) { return atomic_read(&dest->weight) <= 0 || dest->flags & IP_VS_DEST_F_OVERLOAD; } /* * Returns hash value for IPVS SH entry */ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr, __be16 port, unsigned int offset) { __be32 addr_fold = addr->ip; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif return (offset + hash_32(ntohs(port) + ntohl(addr_fold), IP_VS_SH_TAB_BITS)) & IP_VS_SH_TAB_MASK; } /* * Get ip_vs_dest associated with supplied parameters. */ static inline struct ip_vs_dest * ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s, const union nf_inet_addr *addr, __be16 port) { unsigned int hash = ip_vs_sh_hashkey(svc->af, addr, port, 0); struct ip_vs_dest *dest = rcu_dereference(s->buckets[hash].dest); return (!dest || is_unavailable(dest)) ? NULL : dest; } /* As ip_vs_sh_get, but with fallback if selected server is unavailable * * The fallback strategy loops around the table starting from a "random" * point (in fact, it is chosen to be the original hash value to make the * algorithm deterministic) to find a new server. */ static inline struct ip_vs_dest * ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s, const union nf_inet_addr *addr, __be16 port) { unsigned int offset, roffset; unsigned int hash, ihash; struct ip_vs_dest *dest; /* first try the dest it's supposed to go to */ ihash = ip_vs_sh_hashkey(svc->af, addr, port, 0); dest = rcu_dereference(s->buckets[ihash].dest); if (!dest) return NULL; if (!is_unavailable(dest)) return dest; IP_VS_DBG_BUF(6, "SH: selected unavailable server %s:%d, reselecting", IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); /* if the original dest is unavailable, loop around the table * starting from ihash to find a new dest */ for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) { roffset = (offset + ihash) % IP_VS_SH_TAB_SIZE; hash = ip_vs_sh_hashkey(svc->af, addr, port, roffset); dest = rcu_dereference(s->buckets[hash].dest); if (!dest) break; if (!is_unavailable(dest)) return dest; IP_VS_DBG_BUF(6, "SH: selected unavailable " "server %s:%d (offset %d), reselecting", IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), roffset); } return NULL; } /* * Assign all the hash buckets of the specified table with the service. */ static int ip_vs_sh_reassign(struct ip_vs_sh_state *s, struct ip_vs_service *svc) { int i; struct ip_vs_sh_bucket *b; struct list_head *p; struct ip_vs_dest *dest; int d_count; bool empty; b = &s->buckets[0]; p = &svc->destinations; empty = list_empty(p); d_count = 0; for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { dest = rcu_dereference_protected(b->dest, 1); if (dest) ip_vs_dest_put(dest); if (empty) RCU_INIT_POINTER(b->dest, NULL); else { if (p == &svc->destinations) p = p->next; dest = list_entry(p, struct ip_vs_dest, n_list); ip_vs_dest_hold(dest); RCU_INIT_POINTER(b->dest, dest); IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n", i, IP_VS_DBG_ADDR(dest->af, &dest->addr), atomic_read(&dest->weight)); /* Don't move to next dest until filling weight */ if (++d_count >= atomic_read(&dest->weight)) { p = p->next; d_count = 0; } } b++; } return 0; } /* * Flush all the hash buckets of the specified table. */ static void ip_vs_sh_flush(struct ip_vs_sh_state *s) { int i; struct ip_vs_sh_bucket *b; struct ip_vs_dest *dest; b = &s->buckets[0]; for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { dest = rcu_dereference_protected(b->dest, 1); if (dest) { ip_vs_dest_put(dest); RCU_INIT_POINTER(b->dest, NULL); } b++; } } static int ip_vs_sh_init_svc(struct ip_vs_service *svc) { struct ip_vs_sh_state *s; /* allocate the SH table for this service */ s = kzalloc(sizeof(struct ip_vs_sh_state), GFP_KERNEL); if (s == NULL) return -ENOMEM; svc->sched_data = s; IP_VS_DBG(6, "SH hash table (memory=%zdbytes) allocated for " "current service\n", sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); /* assign the hash buckets with current dests */ ip_vs_sh_reassign(s, svc); return 0; } static void ip_vs_sh_done_svc(struct ip_vs_service *svc) { struct ip_vs_sh_state *s = svc->sched_data; /* got to clean up hash buckets here */ ip_vs_sh_flush(s); /* release the table itself */ kfree_rcu(s, rcu_head); IP_VS_DBG(6, "SH hash table (memory=%zdbytes) released\n", sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); } static int ip_vs_sh_dest_changed(struct ip_vs_service *svc, struct ip_vs_dest *dest) { struct ip_vs_sh_state *s = svc->sched_data; /* assign the hash buckets with the updated service */ ip_vs_sh_reassign(s, svc); return 0; } /* Helper function to get port number */ static inline __be16 ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph) { __be16 _ports[2], *ports; /* At this point we know that we have a valid packet of some kind. * Because ICMP packets are only guaranteed to have the first 8 * bytes, let's just grab the ports. Fortunately they're in the * same position for all three of the protocols we care about. */ switch (iph->protocol) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_SCTP: ports = skb_header_pointer(skb, iph->len, sizeof(_ports), &_ports); if (unlikely(!ports)) return 0; if (likely(!ip_vs_iph_inverse(iph))) return ports[0]; else return ports[1]; default: return 0; } } /* * Source Hashing scheduling */ static struct ip_vs_dest * ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest; struct ip_vs_sh_state *s; __be16 port = 0; const union nf_inet_addr *hash_addr; hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr; IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT) port = ip_vs_sh_get_port(skb, iph); s = (struct ip_vs_sh_state *) svc->sched_data; if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK) dest = ip_vs_sh_get_fallback(svc, s, hash_addr, port); else dest = ip_vs_sh_get(svc, s, hash_addr, port); if (!dest) { ip_vs_scheduler_err(svc, "no destination available"); return NULL; } IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n", IP_VS_DBG_ADDR(svc->af, hash_addr), IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); return dest; } /* * IPVS SH Scheduler structure */ static struct ip_vs_scheduler ip_vs_sh_scheduler = { .name = "sh", .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list), .init_service = ip_vs_sh_init_svc, .done_service = ip_vs_sh_done_svc, .add_dest = ip_vs_sh_dest_changed, .del_dest = ip_vs_sh_dest_changed, .upd_dest = ip_vs_sh_dest_changed, .schedule = ip_vs_sh_schedule, }; static int __init ip_vs_sh_init(void) { return register_ip_vs_scheduler(&ip_vs_sh_scheduler); } static void __exit ip_vs_sh_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_sh_scheduler); synchronize_rcu(); } module_init(ip_vs_sh_init); module_exit(ip_vs_sh_cleanup); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("ipvs source hashing scheduler");
3 16 105 104 100 1 99 66 31 10 122 14 2 107 108 3 19 2 1 16 19 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 /* * llc_input.c - Minimal input path for LLC * * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/llc.h> #include <net/llc_pdu.h> #include <net/llc_sap.h> #if 0 #define dprintk(args...) printk(KERN_DEBUG args) #else #define dprintk(args...) #endif /* * Packet handler for the station, registerable because in the minimal * LLC core that is taking shape only the very minimal subset of LLC that * is needed for things like IPX, Appletalk, etc will stay, with all the * rest in the llc1 and llc2 modules. */ static void (*llc_station_handler)(struct sk_buff *skb); /* * Packet handlers for LLC_DEST_SAP and LLC_DEST_CONN. */ static void (*llc_type_handlers[2])(struct llc_sap *sap, struct sk_buff *skb); void llc_add_pack(int type, void (*handler)(struct llc_sap *sap, struct sk_buff *skb)) { smp_wmb(); /* ensure initialisation is complete before it's called */ if (type == LLC_DEST_SAP || type == LLC_DEST_CONN) llc_type_handlers[type - 1] = handler; } void llc_remove_pack(int type) { if (type == LLC_DEST_SAP || type == LLC_DEST_CONN) llc_type_handlers[type - 1] = NULL; synchronize_net(); } void llc_set_station_handler(void (*handler)(struct sk_buff *skb)) { /* Ensure initialisation is complete before it's called */ if (handler) smp_wmb(); llc_station_handler = handler; if (!handler) synchronize_net(); } /** * llc_pdu_type - returns which LLC component must handle for PDU * @skb: input skb * * This function returns which LLC component must handle this PDU. */ static __inline__ int llc_pdu_type(struct sk_buff *skb) { int type = LLC_DEST_CONN; /* I-PDU or S-PDU type */ struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); if ((pdu->ctrl_1 & LLC_PDU_TYPE_MASK) != LLC_PDU_TYPE_U) goto out; switch (LLC_U_PDU_CMD(pdu)) { case LLC_1_PDU_CMD_XID: case LLC_1_PDU_CMD_UI: case LLC_1_PDU_CMD_TEST: type = LLC_DEST_SAP; break; case LLC_2_PDU_CMD_SABME: case LLC_2_PDU_CMD_DISC: case LLC_2_PDU_RSP_UA: case LLC_2_PDU_RSP_DM: case LLC_2_PDU_RSP_FRMR: break; default: type = LLC_DEST_INVALID; break; } out: return type; } /** * llc_fixup_skb - initializes skb pointers * @skb: This argument points to incoming skb * * Initializes internal skb pointer to start of network layer by deriving * length of LLC header; finds length of LLC control field in LLC header * by looking at the two lowest-order bits of the first control field * byte; field is either 3 or 4 bytes long. */ static inline int llc_fixup_skb(struct sk_buff *skb) { u8 llc_len = 2; struct llc_pdu_un *pdu; if (unlikely(!pskb_may_pull(skb, sizeof(*pdu)))) return 0; pdu = (struct llc_pdu_un *)skb->data; if ((pdu->ctrl_1 & LLC_PDU_TYPE_MASK) == LLC_PDU_TYPE_U) llc_len = 1; llc_len += 2; if (unlikely(!pskb_may_pull(skb, llc_len))) return 0; skb_pull(skb, llc_len); skb_reset_transport_header(skb); if (skb->protocol == htons(ETH_P_802_2)) { __be16 pdulen; s32 data_size; if (skb->mac_len < ETH_HLEN) return 0; pdulen = eth_hdr(skb)->h_proto; data_size = ntohs(pdulen) - llc_len; if (data_size < 0 || !pskb_may_pull(skb, data_size)) return 0; if (unlikely(pskb_trim_rcsum(skb, data_size))) return 0; } return 1; } /** * llc_rcv - 802.2 entry point from net lower layers * @skb: received pdu * @dev: device that receive pdu * @pt: packet type * @orig_dev: the original receive net device * * When the system receives a 802.2 frame this function is called. It * checks SAP and connection of received pdu and passes frame to * llc_{station,sap,conn}_rcv for sending to proper state machine. If * the frame is related to a busy connection (a connection is sending * data now), it queues this frame in the connection's backlog. */ int llc_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct llc_sap *sap; struct llc_pdu_sn *pdu; int dest; int (*rcv)(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); void (*sta_handler)(struct sk_buff *skb); void (*sap_handler)(struct llc_sap *sap, struct sk_buff *skb); /* * When the interface is in promisc. mode, drop all the crap that it * receives, do not try to analyse it. */ if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) { dprintk("%s: PACKET_OTHERHOST\n", __func__); goto drop; } skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) goto out; if (unlikely(!llc_fixup_skb(skb))) goto drop; pdu = llc_pdu_sn_hdr(skb); if (unlikely(!pdu->dsap)) /* NULL DSAP, refer to station */ goto handle_station; sap = llc_sap_find(pdu->dsap); if (unlikely(!sap)) {/* unknown SAP */ dprintk("%s: llc_sap_find(%02X) failed!\n", __func__, pdu->dsap); goto drop; } /* * First the upper layer protocols that don't need the full * LLC functionality */ rcv = rcu_dereference(sap->rcv_func); dest = llc_pdu_type(skb); sap_handler = dest ? READ_ONCE(llc_type_handlers[dest - 1]) : NULL; if (unlikely(!sap_handler)) { if (rcv) rcv(skb, dev, pt, orig_dev); else kfree_skb(skb); } else { if (rcv) { struct sk_buff *cskb = skb_clone(skb, GFP_ATOMIC); if (cskb) rcv(cskb, dev, pt, orig_dev); } sap_handler(sap, skb); } llc_sap_put(sap); out: return 0; drop: kfree_skb(skb); goto out; handle_station: sta_handler = READ_ONCE(llc_station_handler); if (!sta_handler) goto drop; sta_handler(skb); goto out; } EXPORT_SYMBOL(llc_add_pack); EXPORT_SYMBOL(llc_remove_pack); EXPORT_SYMBOL(llc_set_station_handler);
6 377 14 14 120 61 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_GFP_H #define __LINUX_GFP_H #include <linux/gfp_types.h> #include <linux/mmzone.h> #include <linux/topology.h> #include <linux/alloc_tag.h> #include <linux/cleanup.h> #include <linux/sched.h> struct vm_area_struct; struct mempolicy; /* Convert GFP flags to their corresponding migrate type */ #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) #define GFP_MOVABLE_SHIFT 3 static inline int gfp_migratetype(const gfp_t gfp_flags) { VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK); BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE); BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE); BUILD_BUG_ON((___GFP_RECLAIMABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_RECLAIMABLE); BUILD_BUG_ON(((___GFP_MOVABLE | ___GFP_RECLAIMABLE) >> GFP_MOVABLE_SHIFT) != MIGRATE_HIGHATOMIC); if (unlikely(page_group_by_mobility_disabled)) return MIGRATE_UNMOVABLE; /* Group based on mobility */ return (__force unsigned long)(gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT; } #undef GFP_MOVABLE_MASK #undef GFP_MOVABLE_SHIFT static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) { return !!(gfp_flags & __GFP_DIRECT_RECLAIM); } static inline bool gfpflags_allow_spinning(const gfp_t gfp_flags) { /* * !__GFP_DIRECT_RECLAIM -> direct claim is not allowed. * !__GFP_KSWAPD_RECLAIM -> it's not safe to wake up kswapd. * All GFP_* flags including GFP_NOWAIT use one or both flags. * alloc_pages_nolock() is the only API that doesn't specify either flag. * * This is stronger than GFP_NOWAIT or GFP_ATOMIC because * those are guaranteed to never block on a sleeping lock. * Here we are enforcing that the allocation doesn't ever spin * on any locks (i.e. only trylocks). There is no high level * GFP_$FOO flag for this use in alloc_pages_nolock() as the * regular page allocator doesn't fully support this * allocation mode. */ return !!(gfp_flags & __GFP_RECLAIM); } #ifdef CONFIG_HIGHMEM #define OPT_ZONE_HIGHMEM ZONE_HIGHMEM #else #define OPT_ZONE_HIGHMEM ZONE_NORMAL #endif #ifdef CONFIG_ZONE_DMA #define OPT_ZONE_DMA ZONE_DMA #else #define OPT_ZONE_DMA ZONE_NORMAL #endif #ifdef CONFIG_ZONE_DMA32 #define OPT_ZONE_DMA32 ZONE_DMA32 #else #define OPT_ZONE_DMA32 ZONE_NORMAL #endif /* * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the * zone to use given the lowest 4 bits of gfp_t. Entries are GFP_ZONES_SHIFT * bits long and there are 16 of them to cover all possible combinations of * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM. * * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA. * But GFP_MOVABLE is not only a zone specifier but also an allocation * policy. Therefore __GFP_MOVABLE plus another zone selector is valid. * Only 1 bit of the lowest 3 bits (DMA,DMA32,HIGHMEM) can be set to "1". * * bit result * ================= * 0x0 => NORMAL * 0x1 => DMA or NORMAL * 0x2 => HIGHMEM or NORMAL * 0x3 => BAD (DMA+HIGHMEM) * 0x4 => DMA32 or NORMAL * 0x5 => BAD (DMA+DMA32) * 0x6 => BAD (HIGHMEM+DMA32) * 0x7 => BAD (HIGHMEM+DMA32+DMA) * 0x8 => NORMAL (MOVABLE+0) * 0x9 => DMA or NORMAL (MOVABLE+DMA) * 0xa => MOVABLE (Movable is valid only if HIGHMEM is set too) * 0xb => BAD (MOVABLE+HIGHMEM+DMA) * 0xc => DMA32 or NORMAL (MOVABLE+DMA32) * 0xd => BAD (MOVABLE+DMA32+DMA) * 0xe => BAD (MOVABLE+DMA32+HIGHMEM) * 0xf => BAD (MOVABLE+DMA32+HIGHMEM+DMA) * * GFP_ZONES_SHIFT must be <= 2 on 32 bit platforms. */ #if defined(CONFIG_ZONE_DEVICE) && (MAX_NR_ZONES-1) <= 4 /* ZONE_DEVICE is not a valid GFP zone specifier */ #define GFP_ZONES_SHIFT 2 #else #define GFP_ZONES_SHIFT ZONES_SHIFT #endif #if 16 * GFP_ZONES_SHIFT > BITS_PER_LONG #error GFP_ZONES_SHIFT too large to create GFP_ZONE_TABLE integer #endif #define GFP_ZONE_TABLE ( \ (ZONE_NORMAL << 0 * GFP_ZONES_SHIFT) \ | (OPT_ZONE_DMA << ___GFP_DMA * GFP_ZONES_SHIFT) \ | (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * GFP_ZONES_SHIFT) \ | (OPT_ZONE_DMA32 << ___GFP_DMA32 * GFP_ZONES_SHIFT) \ | (ZONE_NORMAL << ___GFP_MOVABLE * GFP_ZONES_SHIFT) \ | (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * GFP_ZONES_SHIFT) \ | (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * GFP_ZONES_SHIFT)\ | (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * GFP_ZONES_SHIFT)\ ) /* * GFP_ZONE_BAD is a bitmap for all combinations of __GFP_DMA, __GFP_DMA32 * __GFP_HIGHMEM and __GFP_MOVABLE that are not permitted. One flag per * entry starting with bit 0. Bit is set if the combination is not * allowed. */ #define GFP_ZONE_BAD ( \ 1 << (___GFP_DMA | ___GFP_HIGHMEM) \ | 1 << (___GFP_DMA | ___GFP_DMA32) \ | 1 << (___GFP_DMA32 | ___GFP_HIGHMEM) \ | 1 << (___GFP_DMA | ___GFP_DMA32 | ___GFP_HIGHMEM) \ | 1 << (___GFP_MOVABLE | ___GFP_HIGHMEM | ___GFP_DMA) \ | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA) \ | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_HIGHMEM) \ | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA | ___GFP_HIGHMEM) \ ) static inline enum zone_type gfp_zone(gfp_t flags) { enum zone_type z; int bit = (__force int) (flags & GFP_ZONEMASK); z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) & ((1 << GFP_ZONES_SHIFT) - 1); VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1); return z; } /* * There is only one page-allocator function, and two main namespaces to * it. The alloc_page*() variants return 'struct page *' and as such * can allocate highmem pages, the *get*page*() variants return * virtual kernel addresses to the allocated page(s). */ static inline int gfp_zonelist(gfp_t flags) { #ifdef CONFIG_NUMA if (unlikely(flags & __GFP_THISNODE)) return ZONELIST_NOFALLBACK; #endif return ZONELIST_FALLBACK; } /* * gfp flag masking for nested internal allocations. * * For code that needs to do allocations inside the public allocation API (e.g. * memory allocation tracking code) the allocations need to obey the caller * allocation context constrains to prevent allocation context mismatches (e.g. * GFP_KERNEL allocations in GFP_NOFS contexts) from potential deadlock * situations. * * It is also assumed that these nested allocations are for internal kernel * object storage purposes only and are not going to be used for DMA, etc. Hence * we strip out all the zone information and leave just the context information * intact. * * Further, internal allocations must fail before the higher level allocation * can fail, so we must make them fail faster and fail silently. We also don't * want them to deplete emergency reserves. Hence nested allocations must be * prepared for these allocations to fail. */ static inline gfp_t gfp_nested_mask(gfp_t flags) { return ((flags & (GFP_KERNEL | GFP_ATOMIC | __GFP_NOLOCKDEP)) | (__GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN)); } /* * We get the zone list from the current node and the gfp_mask. * This zone list contains a maximum of MAX_NUMNODES*MAX_NR_ZONES zones. * There are two zonelists per node, one for all zones with memory and * one containing just zones from the node the zonelist belongs to. * * For the case of non-NUMA systems the NODE_DATA() gets optimized to * &contig_page_data at compile-time. */ static inline struct zonelist *node_zonelist(int nid, gfp_t flags) { return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags); } #ifndef HAVE_ARCH_FREE_PAGE static inline void arch_free_page(struct page *page, int order) { } #endif #ifndef HAVE_ARCH_ALLOC_PAGE static inline void arch_alloc_page(struct page *page, int order) { } #endif struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask); #define __alloc_pages(...) alloc_hooks(__alloc_pages_noprof(__VA_ARGS__)) struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask); #define __folio_alloc(...) alloc_hooks(__folio_alloc_noprof(__VA_ARGS__)) unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, struct page **page_array); #define __alloc_pages_bulk(...) alloc_hooks(alloc_pages_bulk_noprof(__VA_ARGS__)) unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp, unsigned long nr_pages, struct page **page_array); #define alloc_pages_bulk_mempolicy(...) \ alloc_hooks(alloc_pages_bulk_mempolicy_noprof(__VA_ARGS__)) /* Bulk allocate order-0 pages */ #define alloc_pages_bulk(_gfp, _nr_pages, _page_array) \ __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _page_array) static inline unsigned long alloc_pages_bulk_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array) { if (nid == NUMA_NO_NODE) nid = numa_mem_id(); return alloc_pages_bulk_noprof(gfp, nid, NULL, nr_pages, page_array); } #define alloc_pages_bulk_node(...) \ alloc_hooks(alloc_pages_bulk_node_noprof(__VA_ARGS__)) static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask) { gfp_t warn_gfp = gfp_mask & (__GFP_THISNODE|__GFP_NOWARN); if (warn_gfp != (__GFP_THISNODE|__GFP_NOWARN)) return; if (node_online(this_node)) return; pr_warn("%pGg allocation from offline node %d\n", &gfp_mask, this_node); dump_stack(); } /* * Allocate pages, preferring the node given as nid. The node must be valid and * online. For more general interface, see alloc_pages_node(). */ static inline struct page * __alloc_pages_node_noprof(int nid, gfp_t gfp_mask, unsigned int order) { VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); warn_if_node_offline(nid, gfp_mask); return __alloc_pages_noprof(gfp_mask, order, nid, NULL); } #define __alloc_pages_node(...) alloc_hooks(__alloc_pages_node_noprof(__VA_ARGS__)) static inline struct folio *__folio_alloc_node_noprof(gfp_t gfp, unsigned int order, int nid) { VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); warn_if_node_offline(nid, gfp); return __folio_alloc_noprof(gfp, order, nid, NULL); } #define __folio_alloc_node(...) alloc_hooks(__folio_alloc_node_noprof(__VA_ARGS__)) /* * Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE, * prefer the current CPU's closest node. Otherwise node must be valid and * online. */ static inline struct page *alloc_pages_node_noprof(int nid, gfp_t gfp_mask, unsigned int order) { if (nid == NUMA_NO_NODE) nid = numa_mem_id(); return __alloc_pages_node_noprof(nid, gfp_mask, order); } #define alloc_pages_node(...) alloc_hooks(alloc_pages_node_noprof(__VA_ARGS__)) #ifdef CONFIG_NUMA struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order); struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order); struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *mpol, pgoff_t ilx, int nid); struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr); #else static inline struct page *alloc_pages_noprof(gfp_t gfp_mask, unsigned int order) { return alloc_pages_node_noprof(numa_node_id(), gfp_mask, order); } static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) { return __folio_alloc_node_noprof(gfp, order, numa_node_id()); } static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *mpol, pgoff_t ilx, int nid) { return folio_alloc_noprof(gfp, order); } #define vma_alloc_folio_noprof(gfp, order, vma, addr) \ folio_alloc_noprof(gfp, order) #endif #define alloc_pages(...) alloc_hooks(alloc_pages_noprof(__VA_ARGS__)) #define folio_alloc(...) alloc_hooks(folio_alloc_noprof(__VA_ARGS__)) #define folio_alloc_mpol(...) alloc_hooks(folio_alloc_mpol_noprof(__VA_ARGS__)) #define vma_alloc_folio(...) alloc_hooks(vma_alloc_folio_noprof(__VA_ARGS__)) #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) static inline struct page *alloc_page_vma_noprof(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { struct folio *folio = vma_alloc_folio_noprof(gfp, 0, vma, addr); return &folio->page; } #define alloc_page_vma(...) alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__)) struct page *alloc_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order); #define alloc_pages_nolock(...) alloc_hooks(alloc_pages_nolock_noprof(__VA_ARGS__)) extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order); #define __get_free_pages(...) alloc_hooks(get_free_pages_noprof(__VA_ARGS__)) extern unsigned long get_zeroed_page_noprof(gfp_t gfp_mask); #define get_zeroed_page(...) alloc_hooks(get_zeroed_page_noprof(__VA_ARGS__)) void *alloc_pages_exact_noprof(size_t size, gfp_t gfp_mask) __alloc_size(1); #define alloc_pages_exact(...) alloc_hooks(alloc_pages_exact_noprof(__VA_ARGS__)) void free_pages_exact(void *virt, size_t size); __meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mask) __alloc_size(2); #define alloc_pages_exact_nid(...) \ alloc_hooks(alloc_pages_exact_nid_noprof(__VA_ARGS__)) #define __get_free_page(gfp_mask) \ __get_free_pages((gfp_mask), 0) #define __get_dma_pages(gfp_mask, order) \ __get_free_pages((gfp_mask) | GFP_DMA, (order)) extern void __free_pages(struct page *page, unsigned int order); extern void free_pages_nolock(struct page *page, unsigned int order); extern void free_pages(unsigned long addr, unsigned int order); #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) void page_alloc_init_cpuhp(void); bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp); void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(struct zone *zone); void drain_local_pages(struct zone *zone); void page_alloc_init_late(void); void setup_pcp_cacheinfo(unsigned int cpu); /* * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what * GFP flags are used before interrupts are enabled. Once interrupts are * enabled, it is set to __GFP_BITS_MASK while the system is running. During * hibernation, it is used by PM to avoid I/O during memory allocation while * devices are suspended. */ extern gfp_t gfp_allowed_mask; /* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask); static inline bool gfp_has_io_fs(gfp_t gfp) { return (gfp & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS); } /* * Check if the gfp flags allow compaction - GFP_NOIO is a really * tricky context because the migration might require IO. */ static inline bool gfp_compaction_allowed(gfp_t gfp_mask) { return IS_ENABLED(CONFIG_COMPACTION) && (gfp_mask & __GFP_IO); } extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma); #ifdef CONFIG_CONTIG_ALLOC typedef unsigned int __bitwise acr_flags_t; #define ACR_FLAGS_NONE ((__force acr_flags_t)0) // ordinary allocation request #define ACR_FLAGS_CMA ((__force acr_flags_t)BIT(0)) // allocate for CMA /* The below functions must be run on a range from a single zone. */ extern int alloc_contig_range_noprof(unsigned long start, unsigned long end, acr_flags_t alloc_flags, gfp_t gfp_mask); #define alloc_contig_range(...) alloc_hooks(alloc_contig_range_noprof(__VA_ARGS__)) extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, int nid, nodemask_t *nodemask); #define alloc_contig_pages(...) alloc_hooks(alloc_contig_pages_noprof(__VA_ARGS__)) #endif void free_contig_range(unsigned long pfn, unsigned long nr_pages); #ifdef CONFIG_CONTIG_ALLOC static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp, int nid, nodemask_t *node) { struct page *page; if (WARN_ON(!order || !(gfp & __GFP_COMP))) return NULL; page = alloc_contig_pages_noprof(1 << order, gfp, nid, node); return page ? page_folio(page) : NULL; } #else static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp, int nid, nodemask_t *node) { return NULL; } #endif /* This should be paired with folio_put() rather than free_contig_range(). */ #define folio_alloc_gigantic(...) alloc_hooks(folio_alloc_gigantic_noprof(__VA_ARGS__)) DEFINE_FREE(free_page, void *, free_page((unsigned long)_T)) #endif /* __LINUX_GFP_H */
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 // SPDX-License-Identifier: GPL-2.0-only /* * IPv6 raw table, a port of the IPv4 raw table to IPv6 * * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/slab.h> #define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)) static bool raw_before_defrag __read_mostly; MODULE_PARM_DESC(raw_before_defrag, "Enable raw table before defrag"); module_param(raw_before_defrag, bool, 0000); static const struct xt_table packet_raw = { .name = "raw", .valid_hooks = RAW_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_RAW, }; static const struct xt_table packet_raw_before_defrag = { .name = "raw", .valid_hooks = RAW_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_RAW_BEFORE_DEFRAG, }; static struct nf_hook_ops *rawtable_ops __read_mostly; static int ip6table_raw_table_init(struct net *net) { struct ip6t_replace *repl; const struct xt_table *table = &packet_raw; int ret; if (raw_before_defrag) table = &packet_raw_before_defrag; repl = ip6t_alloc_initial_table(table); if (repl == NULL) return -ENOMEM; ret = ip6t_register_table(net, table, repl, rawtable_ops); kfree(repl); return ret; } static void __net_exit ip6table_raw_net_pre_exit(struct net *net) { ip6t_unregister_table_pre_exit(net, "raw"); } static void __net_exit ip6table_raw_net_exit(struct net *net) { ip6t_unregister_table_exit(net, "raw"); } static struct pernet_operations ip6table_raw_net_ops = { .pre_exit = ip6table_raw_net_pre_exit, .exit = ip6table_raw_net_exit, }; static int __init ip6table_raw_init(void) { const struct xt_table *table = &packet_raw; int ret; if (raw_before_defrag) { table = &packet_raw_before_defrag; pr_info("Enabling raw table before defrag\n"); } ret = xt_register_template(table, ip6table_raw_table_init); if (ret < 0) return ret; /* Register hooks */ rawtable_ops = xt_hook_ops_alloc(table, ip6t_do_table); if (IS_ERR(rawtable_ops)) { xt_unregister_template(table); return PTR_ERR(rawtable_ops); } ret = register_pernet_subsys(&ip6table_raw_net_ops); if (ret < 0) { kfree(rawtable_ops); xt_unregister_template(table); return ret; } return ret; } static void __exit ip6table_raw_fini(void) { unregister_pernet_subsys(&ip6table_raw_net_ops); xt_unregister_template(&packet_raw); kfree(rawtable_ops); } module_init(ip6table_raw_init); module_exit(ip6table_raw_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Ip6tables legacy raw table");
514 3 1051 278 81 377 27 4 3 416 260 341 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the Forwarding Information Base. * * Authors: A.N.Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #ifndef _NET_IP_FIB_H #define _NET_IP_FIB_H #include <net/flow.h> #include <linux/seq_file.h> #include <linux/rcupdate.h> #include <net/fib_notifier.h> #include <net/fib_rules.h> #include <net/inet_dscp.h> #include <net/inetpeer.h> #include <linux/percpu.h> #include <linux/notifier.h> #include <linux/refcount.h> #include <linux/ip.h> #include <linux/in_route.h> struct fib_config { u8 fc_dst_len; dscp_t fc_dscp; u8 fc_protocol; u8 fc_scope; u8 fc_type; u8 fc_gw_family; /* 2 bytes unused */ u32 fc_table; __be32 fc_dst; union { __be32 fc_gw4; struct in6_addr fc_gw6; }; int fc_oif; u32 fc_flags; u32 fc_priority; __be32 fc_prefsrc; u32 fc_nh_id; struct nlattr *fc_mx; struct rtnexthop *fc_mp; int fc_mx_len; int fc_mp_len; u32 fc_flow; u32 fc_nlflags; struct nl_info fc_nlinfo; struct nlattr *fc_encap; u16 fc_encap_type; }; struct fib_info; struct rtable; struct fib_nh_exception { struct fib_nh_exception __rcu *fnhe_next; int fnhe_genid; __be32 fnhe_daddr; u32 fnhe_pmtu; bool fnhe_mtu_locked; __be32 fnhe_gw; unsigned long fnhe_expires; struct rtable __rcu *fnhe_rth_input; struct rtable __rcu *fnhe_rth_output; unsigned long fnhe_stamp; struct rcu_head rcu; }; struct fnhe_hash_bucket { struct fib_nh_exception __rcu *chain; }; #define FNHE_HASH_SHIFT 11 #define FNHE_HASH_SIZE (1 << FNHE_HASH_SHIFT) #define FNHE_RECLAIM_DEPTH 5 struct fib_nh_common { struct net_device *nhc_dev; netdevice_tracker nhc_dev_tracker; int nhc_oif; unsigned char nhc_scope; u8 nhc_family; u8 nhc_gw_family; unsigned char nhc_flags; struct lwtunnel_state *nhc_lwtstate; union { __be32 ipv4; struct in6_addr ipv6; } nhc_gw; int nhc_weight; atomic_t nhc_upper_bound; /* v4 specific, but allows fib6_nh with v4 routes */ struct rtable __rcu * __percpu *nhc_pcpu_rth_output; struct rtable __rcu *nhc_rth_input; struct fnhe_hash_bucket __rcu *nhc_exceptions; }; struct fib_nh { struct fib_nh_common nh_common; struct hlist_node nh_hash; struct fib_info *nh_parent; #ifdef CONFIG_IP_ROUTE_CLASSID __u32 nh_tclassid; #endif __be32 nh_saddr; int nh_saddr_genid; #define fib_nh_family nh_common.nhc_family #define fib_nh_dev nh_common.nhc_dev #define fib_nh_dev_tracker nh_common.nhc_dev_tracker #define fib_nh_oif nh_common.nhc_oif #define fib_nh_flags nh_common.nhc_flags #define fib_nh_lws nh_common.nhc_lwtstate #define fib_nh_scope nh_common.nhc_scope #define fib_nh_gw_family nh_common.nhc_gw_family #define fib_nh_gw4 nh_common.nhc_gw.ipv4 #define fib_nh_gw6 nh_common.nhc_gw.ipv6 #define fib_nh_weight nh_common.nhc_weight #define fib_nh_upper_bound nh_common.nhc_upper_bound }; /* * This structure contains data shared by many of routes. */ struct nexthop; struct fib_info { struct hlist_node fib_hash; struct hlist_node fib_lhash; struct list_head nh_list; struct net *fib_net; refcount_t fib_treeref; refcount_t fib_clntref; unsigned int fib_flags; unsigned char fib_dead; unsigned char fib_protocol; unsigned char fib_scope; unsigned char fib_type; __be32 fib_prefsrc; u32 fib_tb_id; u32 fib_priority; struct dst_metrics *fib_metrics; #define fib_mtu fib_metrics->metrics[RTAX_MTU-1] #define fib_window fib_metrics->metrics[RTAX_WINDOW-1] #define fib_rtt fib_metrics->metrics[RTAX_RTT-1] #define fib_advmss fib_metrics->metrics[RTAX_ADVMSS-1] int fib_nhs; bool fib_nh_is_v6; bool nh_updated; bool pfsrc_removed; struct nexthop *nh; struct rcu_head rcu; struct fib_nh fib_nh[] __counted_by(fib_nhs); }; int __net_init fib4_semantics_init(struct net *net); void __net_exit fib4_semantics_exit(struct net *net); #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rule; #endif struct fib_table; struct fib_result { __be32 prefix; unsigned char prefixlen; unsigned char nh_sel; unsigned char type; unsigned char scope; u32 tclassid; dscp_t dscp; struct fib_nh_common *nhc; struct fib_info *fi; struct fib_table *table; struct hlist_head *fa_head; }; struct fib_result_nl { __be32 fl_addr; /* To be looked up*/ u32 fl_mark; unsigned char fl_tos; unsigned char fl_scope; unsigned char tb_id_in; unsigned char tb_id; /* Results */ unsigned char prefixlen; unsigned char nh_sel; unsigned char type; unsigned char scope; int err; }; #ifdef CONFIG_IP_MULTIPLE_TABLES #define FIB_TABLE_HASHSZ 256 #else #define FIB_TABLE_HASHSZ 2 #endif __be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc, unsigned char scope); __be32 fib_result_prefsrc(struct net *net, struct fib_result *res); #define FIB_RES_NHC(res) ((res).nhc) #define FIB_RES_DEV(res) (FIB_RES_NHC(res)->nhc_dev) #define FIB_RES_OIF(res) (FIB_RES_NHC(res)->nhc_oif) struct fib_rt_info { struct fib_info *fi; u32 tb_id; __be32 dst; int dst_len; dscp_t dscp; u8 type; u8 offload:1, trap:1, offload_failed:1, unused:5; }; struct fib_entry_notifier_info { struct fib_notifier_info info; /* must be first */ u32 dst; int dst_len; struct fib_info *fi; dscp_t dscp; u8 type; u32 tb_id; }; struct fib_nh_notifier_info { struct fib_notifier_info info; /* must be first */ struct fib_nh *fib_nh; }; int call_fib4_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info); int call_fib4_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info); int __net_init fib4_notifier_init(struct net *net); void __net_exit fib4_notifier_exit(struct net *net); void fib_info_notify_update(struct net *net, struct nl_info *info); int fib_notify(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); struct fib_table { struct hlist_node tb_hlist; u32 tb_id; int tb_num_default; struct rcu_head rcu; unsigned long *tb_data; unsigned long __data[]; }; struct fib_dump_filter { u32 table_id; /* filter_set is an optimization that an entry is set */ bool filter_set; bool dump_routes; bool dump_exceptions; bool rtnl_held; unsigned char protocol; unsigned char rt_type; unsigned int flags; struct net_device *dev; }; int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, struct fib_result *res, int fib_flags); int fib_table_insert(struct net *, struct fib_table *, struct fib_config *, struct netlink_ext_ack *extack); int fib_table_delete(struct net *, struct fib_table *, struct fib_config *, struct netlink_ext_ack *extack); int fib_table_dump(struct fib_table *table, struct sk_buff *skb, struct netlink_callback *cb, struct fib_dump_filter *filter); int fib_table_flush(struct net *net, struct fib_table *table, bool flush_all); struct fib_table *fib_trie_unmerge(struct fib_table *main_tb); void fib_table_flush_external(struct fib_table *table); void fib_free_table(struct fib_table *tb); #ifndef CONFIG_IP_MULTIPLE_TABLES #define TABLE_LOCAL_INDEX (RT_TABLE_LOCAL & (FIB_TABLE_HASHSZ - 1)) #define TABLE_MAIN_INDEX (RT_TABLE_MAIN & (FIB_TABLE_HASHSZ - 1)) static inline struct fib_table *fib_get_table(struct net *net, u32 id) { struct hlist_node *tb_hlist; struct hlist_head *ptr; ptr = id == RT_TABLE_LOCAL ? &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX] : &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]; tb_hlist = rcu_dereference_rtnl(hlist_first_rcu(ptr)); return hlist_entry(tb_hlist, struct fib_table, tb_hlist); } static inline struct fib_table *fib_new_table(struct net *net, u32 id) { return fib_get_table(net, id); } static inline int fib_lookup(struct net *net, const struct flowi4 *flp, struct fib_result *res, unsigned int flags) { struct fib_table *tb; int err = -ENETUNREACH; rcu_read_lock(); tb = fib_get_table(net, RT_TABLE_MAIN); if (tb) err = fib_table_lookup(tb, flp, res, flags | FIB_LOOKUP_NOREF); if (err == -EAGAIN) err = -ENETUNREACH; rcu_read_unlock(); return err; } static inline bool fib4_has_custom_rules(const struct net *net) { return false; } static inline bool fib4_rule_default(const struct fib_rule *rule) { return true; } static inline int fib4_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return 0; } static inline unsigned int fib4_rules_seq_read(const struct net *net) { return 0; } static inline bool fib4_rules_early_flow_dissect(struct net *net, struct sk_buff *skb, struct flowi4 *fl4, struct flow_keys *flkeys) { return false; } #else /* CONFIG_IP_MULTIPLE_TABLES */ int __net_init fib4_rules_init(struct net *net); void __net_exit fib4_rules_exit(struct net *net); struct fib_table *fib_new_table(struct net *net, u32 id); struct fib_table *fib_get_table(struct net *net, u32 id); int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res, unsigned int flags); static inline int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res, unsigned int flags) { struct fib_table *tb; int err = -ENETUNREACH; flags |= FIB_LOOKUP_NOREF; if (net->ipv4.fib_has_custom_rules) return __fib_lookup(net, flp, res, flags); rcu_read_lock(); res->tclassid = 0; tb = rcu_dereference_rtnl(net->ipv4.fib_main); if (tb) err = fib_table_lookup(tb, flp, res, flags); if (!err) goto out; tb = rcu_dereference_rtnl(net->ipv4.fib_default); if (tb) err = fib_table_lookup(tb, flp, res, flags); out: if (err == -EAGAIN) err = -ENETUNREACH; rcu_read_unlock(); return err; } static inline bool fib4_has_custom_rules(const struct net *net) { return net->ipv4.fib_has_custom_rules; } bool fib4_rule_default(const struct fib_rule *rule); int fib4_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); unsigned int fib4_rules_seq_read(const struct net *net); static inline bool fib4_rules_early_flow_dissect(struct net *net, struct sk_buff *skb, struct flowi4 *fl4, struct flow_keys *flkeys) { unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; if (!net->ipv4.fib_rules_require_fldissect) return false; memset(flkeys, 0, sizeof(*flkeys)); __skb_flow_dissect(net, skb, &flow_keys_dissector, flkeys, NULL, 0, 0, 0, flag); fl4->fl4_sport = flkeys->ports.src; fl4->fl4_dport = flkeys->ports.dst; fl4->flowi4_proto = flkeys->basic.ip_proto; return true; } #endif /* CONFIG_IP_MULTIPLE_TABLES */ static inline bool fib_dscp_masked_match(dscp_t dscp, const struct flowi4 *fl4) { return dscp == (fl4->flowi4_dscp & INET_DSCP_LEGACY_TOS_MASK); } /* Exported by fib_frontend.c */ extern const struct nla_policy rtm_ipv4_policy[]; void ip_fib_init(void); int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla, struct netlink_ext_ack *extack); __be32 fib_compute_spec_dst(struct sk_buff *skb); bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev); int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, dscp_t dscp, int oif, struct net_device *dev, struct in_device *idev, u32 *itag); static inline enum skb_drop_reason fib_validate_source_reason(struct sk_buff *skb, __be32 src, __be32 dst, dscp_t dscp, int oif, struct net_device *dev, struct in_device *idev, u32 *itag) { int err = fib_validate_source(skb, src, dst, dscp, oif, dev, idev, itag); if (err < 0) return -err; return SKB_NOT_DROPPED_YET; } #ifdef CONFIG_IP_ROUTE_CLASSID static inline int fib_num_tclassid_users(struct net *net) { return atomic_read(&net->ipv4.fib_num_tclassid_users); } #else static inline int fib_num_tclassid_users(struct net *net) { return 0; } #endif int fib_unmerge(struct net *net); static inline bool nhc_l3mdev_matches_dev(const struct fib_nh_common *nhc, const struct net_device *dev) { if (nhc->nhc_dev == dev || l3mdev_master_ifindex_rcu(nhc->nhc_dev) == dev->ifindex) return true; return false; } /* Exported by fib_semantics.c */ int ip_fib_check_default(__be32 gw, struct net_device *dev); int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force); int fib_sync_down_addr(struct net_device *dev, __be32 local); int fib_sync_up(struct net_device *dev, unsigned char nh_flags); void fib_sync_mtu(struct net_device *dev, u32 orig_mtu); void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig); /* Fields used for sysctl_fib_multipath_hash_fields. * Common to IPv4 and IPv6. * * Add new fields at the end. This is user API. */ #define FIB_MULTIPATH_HASH_FIELD_SRC_IP BIT(0) #define FIB_MULTIPATH_HASH_FIELD_DST_IP BIT(1) #define FIB_MULTIPATH_HASH_FIELD_IP_PROTO BIT(2) #define FIB_MULTIPATH_HASH_FIELD_FLOWLABEL BIT(3) #define FIB_MULTIPATH_HASH_FIELD_SRC_PORT BIT(4) #define FIB_MULTIPATH_HASH_FIELD_DST_PORT BIT(5) #define FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP BIT(6) #define FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP BIT(7) #define FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO BIT(8) #define FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL BIT(9) #define FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT BIT(10) #define FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT BIT(11) #define FIB_MULTIPATH_HASH_FIELD_OUTER_MASK \ (FIB_MULTIPATH_HASH_FIELD_SRC_IP | \ FIB_MULTIPATH_HASH_FIELD_DST_IP | \ FIB_MULTIPATH_HASH_FIELD_IP_PROTO | \ FIB_MULTIPATH_HASH_FIELD_FLOWLABEL | \ FIB_MULTIPATH_HASH_FIELD_SRC_PORT | \ FIB_MULTIPATH_HASH_FIELD_DST_PORT) #define FIB_MULTIPATH_HASH_FIELD_INNER_MASK \ (FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP | \ FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP | \ FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO | \ FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL | \ FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT | \ FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) #define FIB_MULTIPATH_HASH_FIELD_ALL_MASK \ (FIB_MULTIPATH_HASH_FIELD_OUTER_MASK | \ FIB_MULTIPATH_HASH_FIELD_INNER_MASK) #define FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK \ (FIB_MULTIPATH_HASH_FIELD_SRC_IP | \ FIB_MULTIPATH_HASH_FIELD_DST_IP | \ FIB_MULTIPATH_HASH_FIELD_IP_PROTO) #ifdef CONFIG_IP_ROUTE_MULTIPATH int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, const struct sk_buff *skb, struct flow_keys *flkeys); static void fib_multipath_hash_construct_key(siphash_key_t *key, u32 mp_seed) { u64 mp_seed_64 = mp_seed; key->key[0] = (mp_seed_64 << 32) | mp_seed_64; key->key[1] = key->key[0]; } static inline u32 fib_multipath_hash_from_keys(const struct net *net, struct flow_keys *keys) { siphash_aligned_key_t hash_key; u32 mp_seed; mp_seed = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed).mp_seed; fib_multipath_hash_construct_key(&hash_key, mp_seed); return flow_hash_from_keys_seed(keys, &hash_key); } #else static inline u32 fib_multipath_hash_from_keys(const struct net *net, struct flow_keys *keys) { return flow_hash_from_keys(keys); } #endif int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope, struct netlink_ext_ack *extack); void fib_select_multipath(struct fib_result *res, int hash, const struct flowi4 *fl4); void fib_select_path(struct net *net, struct fib_result *res, struct flowi4 *fl4, const struct sk_buff *skb); int fib_nh_init(struct net *net, struct fib_nh *fib_nh, struct fib_config *cfg, int nh_weight, struct netlink_ext_ack *extack); void fib_nh_release(struct net *net, struct fib_nh *fib_nh); int fib_nh_common_init(struct net *net, struct fib_nh_common *nhc, struct nlattr *fc_encap, u16 fc_encap_type, void *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); void fib_nh_common_release(struct fib_nh_common *nhc); /* Exported by fib_trie.c */ void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri); void fib_trie_init(void); struct fib_table *fib_trie_table(u32 id, struct fib_table *alias); bool fib_lookup_good_nhc(const struct fib_nh_common *nhc, int fib_flags, const struct flowi4 *flp); static inline void fib_combine_itag(u32 *itag, const struct fib_result *res) { #ifdef CONFIG_IP_ROUTE_CLASSID struct fib_nh_common *nhc = res->nhc; #ifdef CONFIG_IP_MULTIPLE_TABLES u32 rtag; #endif if (nhc->nhc_family == AF_INET) { struct fib_nh *nh; nh = container_of(nhc, struct fib_nh, nh_common); *itag = nh->nh_tclassid << 16; } else { *itag = 0; } #ifdef CONFIG_IP_MULTIPLE_TABLES rtag = res->tclassid; if (*itag == 0) *itag = (rtag<<16); *itag |= (rtag>>16); #endif #endif } void fib_flush(struct net *net); void free_fib_info(struct fib_info *fi); static inline void fib_info_hold(struct fib_info *fi) { refcount_inc(&fi->fib_clntref); } static inline void fib_info_put(struct fib_info *fi) { if (refcount_dec_and_test(&fi->fib_clntref)) free_fib_info(fi); } #ifdef CONFIG_PROC_FS int __net_init fib_proc_init(struct net *net); void __net_exit fib_proc_exit(struct net *net); #else static inline int fib_proc_init(struct net *net) { return 0; } static inline void fib_proc_exit(struct net *net) { } #endif u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr); int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, struct fib_dump_filter *filter, struct netlink_callback *cb); int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nh, u8 rt_family, unsigned char *flags, bool skip_oif); int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nh, int nh_weight, u8 rt_family, u32 nh_tclassid); #endif /* _NET_FIB_H */
19 8 8 12 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 // SPDX-License-Identifier: GPL-2.0-only /* Kernel module to match one of a list of TCP/UDP(-Lite)/SCTP/DCCP ports: ports are in the same place so we can treat them as equal. */ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/udp.h> #include <linux/skbuff.h> #include <linux/in.h> #include <linux/netfilter/xt_multiport.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("Xtables: multiple port matching for TCP, UDP, UDP-Lite, SCTP and DCCP"); MODULE_ALIAS("ipt_multiport"); MODULE_ALIAS("ip6t_multiport"); /* Returns 1 if the port is matched by the test, 0 otherwise. */ static inline bool ports_match_v1(const struct xt_multiport_v1 *minfo, u_int16_t src, u_int16_t dst) { unsigned int i; u_int16_t s, e; for (i = 0; i < minfo->count; i++) { s = minfo->ports[i]; if (minfo->pflags[i]) { /* range port matching */ e = minfo->ports[++i]; pr_debug("src or dst matches with %d-%d?\n", s, e); switch (minfo->flags) { case XT_MULTIPORT_SOURCE: if (src >= s && src <= e) return true ^ minfo->invert; break; case XT_MULTIPORT_DESTINATION: if (dst >= s && dst <= e) return true ^ minfo->invert; break; case XT_MULTIPORT_EITHER: if ((dst >= s && dst <= e) || (src >= s && src <= e)) return true ^ minfo->invert; break; default: break; } } else { /* exact port matching */ pr_debug("src or dst matches with %d?\n", s); switch (minfo->flags) { case XT_MULTIPORT_SOURCE: if (src == s) return true ^ minfo->invert; break; case XT_MULTIPORT_DESTINATION: if (dst == s) return true ^ minfo->invert; break; case XT_MULTIPORT_EITHER: if (src == s || dst == s) return true ^ minfo->invert; break; default: break; } } } return minfo->invert; } static bool multiport_mt(const struct sk_buff *skb, struct xt_action_param *par) { const __be16 *pptr; __be16 _ports[2]; const struct xt_multiport_v1 *multiinfo = par->matchinfo; if (par->fragoff != 0) return false; pptr = skb_header_pointer(skb, par->thoff, sizeof(_ports), _ports); if (pptr == NULL) { /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ pr_debug("Dropping evil offset=0 tinygram.\n"); par->hotdrop = true; return false; } return ports_match_v1(multiinfo, ntohs(pptr[0]), ntohs(pptr[1])); } static inline bool check(u_int16_t proto, u_int8_t ip_invflags, u_int8_t match_flags, u_int8_t count) { /* Must specify supported protocol, no unknown flags or bad count */ return (proto == IPPROTO_TCP || proto == IPPROTO_UDP || proto == IPPROTO_UDPLITE || proto == IPPROTO_SCTP || proto == IPPROTO_DCCP) && !(ip_invflags & XT_INV_PROTO) && (match_flags == XT_MULTIPORT_SOURCE || match_flags == XT_MULTIPORT_DESTINATION || match_flags == XT_MULTIPORT_EITHER) && count <= XT_MULTI_PORTS; } static int multiport_mt_check(const struct xt_mtchk_param *par) { const struct ipt_ip *ip = par->entryinfo; const struct xt_multiport_v1 *multiinfo = par->matchinfo; return check(ip->proto, ip->invflags, multiinfo->flags, multiinfo->count) ? 0 : -EINVAL; } static int multiport_mt6_check(const struct xt_mtchk_param *par) { const struct ip6t_ip6 *ip = par->entryinfo; const struct xt_multiport_v1 *multiinfo = par->matchinfo; return check(ip->proto, ip->invflags, multiinfo->flags, multiinfo->count) ? 0 : -EINVAL; } static struct xt_match multiport_mt_reg[] __read_mostly = { { .name = "multiport", .family = NFPROTO_IPV4, .revision = 1, .checkentry = multiport_mt_check, .match = multiport_mt, .matchsize = sizeof(struct xt_multiport_v1), .me = THIS_MODULE, }, { .name = "multiport", .family = NFPROTO_IPV6, .revision = 1, .checkentry = multiport_mt6_check, .match = multiport_mt, .matchsize = sizeof(struct xt_multiport_v1), .me = THIS_MODULE, }, }; static int __init multiport_mt_init(void) { return xt_register_matches(multiport_mt_reg, ARRAY_SIZE(multiport_mt_reg)); } static void __exit multiport_mt_exit(void) { xt_unregister_matches(multiport_mt_reg, ARRAY_SIZE(multiport_mt_reg)); } module_init(multiport_mt_init); module_exit(multiport_mt_exit);
5 3 5 6 2 8 6 2 5 3 8 6 1 3 5 3 1 2 2 1 2 1 5 5 5 2 3 3 4 3 1 4 2 5 4 5 3 2 1 1 1 1 1 1 17 3 9 2 3 2 7 8 1 5 10 10 164 164 130 130 137 30 1 9 1 9 702 44 1269 997 998 994 4 1 994 396 613 511 6 3 3 2 18 1 18 3 3 11 3 4 2 8 3 8 23 7 8 10 1 3 2 11 6 5 11 7 10 4 1 3 1 1 1 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE * Copyright (C) 2016 - 2020 Christoph Hellwig */ #include <linux/init.h> #include <linux/mm.h> #include <linux/blkdev.h> #include <linux/blk-integrity.h> #include <linux/buffer_head.h> #include <linux/mpage.h> #include <linux/uio.h> #include <linux/namei.h> #include <linux/task_io_accounting_ops.h> #include <linux/falloc.h> #include <linux/suspend.h> #include <linux/fs.h> #include <linux/iomap.h> #include <linux/module.h> #include <linux/io_uring/cmd.h> #include "blk.h" static inline struct inode *bdev_file_inode(struct file *file) { return file->f_mapping->host; } static blk_opf_t dio_bio_write_op(struct kiocb *iocb) { blk_opf_t opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; /* avoid the need for a I/O completion work item */ if (iocb_is_dsync(iocb)) opf |= REQ_FUA; return opf; } static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb, struct iov_iter *iter) { return (iocb->ki_pos | iov_iter_count(iter)) & (bdev_logical_block_size(bdev) - 1); } static inline int blkdev_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, struct block_device *bdev) { return bio_iov_iter_get_pages(bio, iter, bdev_logical_block_size(bdev) - 1); } #define DIO_INLINE_BIO_VECS 4 static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, struct block_device *bdev, unsigned int nr_pages) { struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs; loff_t pos = iocb->ki_pos; bool should_dirty = false; struct bio bio; ssize_t ret; if (nr_pages <= DIO_INLINE_BIO_VECS) vecs = inline_vecs; else { vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec), GFP_KERNEL); if (!vecs) return -ENOMEM; } if (iov_iter_rw(iter) == READ) { bio_init(&bio, bdev, vecs, nr_pages, REQ_OP_READ); if (user_backed_iter(iter)) should_dirty = true; } else { bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb)); } bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; bio.bi_write_stream = iocb->ki_write_stream; bio.bi_ioprio = iocb->ki_ioprio; if (iocb->ki_flags & IOCB_ATOMIC) bio.bi_opf |= REQ_ATOMIC; ret = blkdev_iov_iter_get_pages(&bio, iter, bdev); if (unlikely(ret)) goto out; ret = bio.bi_iter.bi_size; if (iov_iter_rw(iter) == WRITE) task_io_account_write(ret); if (iocb->ki_flags & IOCB_NOWAIT) bio.bi_opf |= REQ_NOWAIT; submit_bio_wait(&bio); bio_release_pages(&bio, should_dirty); if (unlikely(bio.bi_status)) ret = blk_status_to_errno(bio.bi_status); out: if (vecs != inline_vecs) kfree(vecs); bio_uninit(&bio); return ret; } enum { DIO_SHOULD_DIRTY = 1, DIO_IS_SYNC = 2, }; struct blkdev_dio { union { struct kiocb *iocb; struct task_struct *waiter; }; size_t size; atomic_t ref; unsigned int flags; struct bio bio ____cacheline_aligned_in_smp; }; static struct bio_set blkdev_dio_pool; static void blkdev_bio_end_io(struct bio *bio) { struct blkdev_dio *dio = bio->bi_private; bool should_dirty = dio->flags & DIO_SHOULD_DIRTY; bool is_sync = dio->flags & DIO_IS_SYNC; if (bio->bi_status && !dio->bio.bi_status) dio->bio.bi_status = bio->bi_status; if (bio_integrity(bio)) bio_integrity_unmap_user(bio); if (atomic_dec_and_test(&dio->ref)) { if (!is_sync) { struct kiocb *iocb = dio->iocb; ssize_t ret; WRITE_ONCE(iocb->private, NULL); if (likely(!dio->bio.bi_status)) { ret = dio->size; iocb->ki_pos += ret; } else { ret = blk_status_to_errno(dio->bio.bi_status); } dio->iocb->ki_complete(iocb, ret); bio_put(&dio->bio); } else { struct task_struct *waiter = dio->waiter; WRITE_ONCE(dio->waiter, NULL); blk_wake_io_task(waiter); } } if (should_dirty) { bio_check_pages_dirty(bio); } else { bio_release_pages(bio, false); bio_put(bio); } } static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, struct block_device *bdev, unsigned int nr_pages) { struct blk_plug plug; struct blkdev_dio *dio; struct bio *bio; bool is_read = (iov_iter_rw(iter) == READ), is_sync; blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb); loff_t pos = iocb->ki_pos; int ret = 0; bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL, &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); atomic_set(&dio->ref, 1); /* * Grab an extra reference to ensure the dio structure which is embedded * into the first bio stays around. */ bio_get(bio); is_sync = is_sync_kiocb(iocb); if (is_sync) { dio->flags = DIO_IS_SYNC; dio->waiter = current; } else { dio->flags = 0; dio->iocb = iocb; } dio->size = 0; if (is_read && user_backed_iter(iter)) dio->flags |= DIO_SHOULD_DIRTY; blk_start_plug(&plug); for (;;) { bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; bio->bi_write_stream = iocb->ki_write_stream; bio->bi_private = dio; bio->bi_end_io = blkdev_bio_end_io; bio->bi_ioprio = iocb->ki_ioprio; ret = blkdev_iov_iter_get_pages(bio, iter, bdev); if (unlikely(ret)) { bio->bi_status = BLK_STS_IOERR; bio_endio(bio); break; } if (iocb->ki_flags & IOCB_NOWAIT) { /* * This is nonblocking IO, and we need to allocate * another bio if we have data left to map. As we * cannot guarantee that one of the sub bios will not * fail getting issued FOR NOWAIT and as error results * are coalesced across all of them, be safe and ask for * a retry of this from blocking context. */ if (unlikely(iov_iter_count(iter))) { ret = -EAGAIN; goto fail; } bio->bi_opf |= REQ_NOWAIT; } if (iocb->ki_flags & IOCB_HAS_METADATA) { ret = bio_integrity_map_iter(bio, iocb->private); if (unlikely(ret)) goto fail; } if (is_read) { if (dio->flags & DIO_SHOULD_DIRTY) bio_set_pages_dirty(bio); } else { task_io_account_write(bio->bi_iter.bi_size); } dio->size += bio->bi_iter.bi_size; pos += bio->bi_iter.bi_size; nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS); if (!nr_pages) { submit_bio(bio); break; } atomic_inc(&dio->ref); submit_bio(bio); bio = bio_alloc(bdev, nr_pages, opf, GFP_KERNEL); } blk_finish_plug(&plug); if (!is_sync) return -EIOCBQUEUED; for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (!READ_ONCE(dio->waiter)) break; blk_io_schedule(); } __set_current_state(TASK_RUNNING); if (!ret) ret = blk_status_to_errno(dio->bio.bi_status); if (likely(!ret)) ret = dio->size; bio_put(&dio->bio); return ret; fail: bio_release_pages(bio, false); bio_clear_flag(bio, BIO_REFFED); bio_put(bio); blk_finish_plug(&plug); return ret; } static void blkdev_bio_end_io_async(struct bio *bio) { struct blkdev_dio *dio = container_of(bio, struct blkdev_dio, bio); struct kiocb *iocb = dio->iocb; ssize_t ret; WRITE_ONCE(iocb->private, NULL); if (likely(!bio->bi_status)) { ret = dio->size; iocb->ki_pos += ret; } else { ret = blk_status_to_errno(bio->bi_status); } if (bio_integrity(bio)) bio_integrity_unmap_user(bio); iocb->ki_complete(iocb, ret); if (dio->flags & DIO_SHOULD_DIRTY) { bio_check_pages_dirty(bio); } else { bio_release_pages(bio, false); bio_put(bio); } } static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, struct iov_iter *iter, struct block_device *bdev, unsigned int nr_pages) { bool is_read = iov_iter_rw(iter) == READ; blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb); struct blkdev_dio *dio; struct bio *bio; loff_t pos = iocb->ki_pos; int ret = 0; bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL, &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); dio->flags = 0; dio->iocb = iocb; bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; bio->bi_write_stream = iocb->ki_write_stream; bio->bi_end_io = blkdev_bio_end_io_async; bio->bi_ioprio = iocb->ki_ioprio; if (iov_iter_is_bvec(iter)) { /* * Users don't rely on the iterator being in any particular * state for async I/O returning -EIOCBQUEUED, hence we can * avoid expensive iov_iter_advance(). Bypass * bio_iov_iter_get_pages() and set the bvec directly. */ bio_iov_bvec_set(bio, iter); } else { ret = blkdev_iov_iter_get_pages(bio, iter, bdev); if (unlikely(ret)) goto out_bio_put; } dio->size = bio->bi_iter.bi_size; if (is_read) { if (user_backed_iter(iter)) { dio->flags |= DIO_SHOULD_DIRTY; bio_set_pages_dirty(bio); } } else { task_io_account_write(bio->bi_iter.bi_size); } if (iocb->ki_flags & IOCB_HAS_METADATA) { ret = bio_integrity_map_iter(bio, iocb->private); WRITE_ONCE(iocb->private, NULL); if (unlikely(ret)) goto out_bio_put; } if (iocb->ki_flags & IOCB_ATOMIC) bio->bi_opf |= REQ_ATOMIC; if (iocb->ki_flags & IOCB_NOWAIT) bio->bi_opf |= REQ_NOWAIT; if (iocb->ki_flags & IOCB_HIPRI) { bio->bi_opf |= REQ_POLLED; submit_bio(bio); WRITE_ONCE(iocb->private, bio); } else { submit_bio(bio); } return -EIOCBQUEUED; out_bio_put: bio_put(bio); return ret; } static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); unsigned int nr_pages; if (!iov_iter_count(iter)) return 0; if (blkdev_dio_invalid(bdev, iocb, iter)) return -EINVAL; if (iov_iter_rw(iter) == WRITE) { u16 max_write_streams = bdev_max_write_streams(bdev); if (iocb->ki_write_stream) { if (iocb->ki_write_stream > max_write_streams) return -EINVAL; } else if (max_write_streams) { enum rw_hint write_hint = file_inode(iocb->ki_filp)->i_write_hint; /* * Just use the write hint as write stream for block * device writes. This assumes no file system is * mounted that would use the streams differently. */ if (write_hint <= max_write_streams) iocb->ki_write_stream = write_hint; } } nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); if (likely(nr_pages <= BIO_MAX_VECS && !(iocb->ki_flags & IOCB_HAS_METADATA))) { if (is_sync_kiocb(iocb)) return __blkdev_direct_IO_simple(iocb, iter, bdev, nr_pages); return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages); } else if (iocb->ki_flags & IOCB_ATOMIC) { return -EINVAL; } return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages)); } static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { struct block_device *bdev = I_BDEV(inode); loff_t isize = i_size_read(inode); if (offset >= isize) return -EIO; iomap->bdev = bdev; iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev)); iomap->type = IOMAP_MAPPED; iomap->addr = iomap->offset; iomap->length = isize - iomap->offset; iomap->flags |= IOMAP_F_BUFFER_HEAD; /* noop for !CONFIG_BUFFER_HEAD */ return 0; } static const struct iomap_ops blkdev_iomap_ops = { .iomap_begin = blkdev_iomap_begin, }; #ifdef CONFIG_BUFFER_HEAD static int blkdev_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { bh->b_bdev = I_BDEV(inode); bh->b_blocknr = iblock; set_buffer_mapped(bh); return 0; } /* * We cannot call mpage_writepages() as it does not take the buffer lock. * We must use block_write_full_folio() directly which holds the buffer * lock. The buffer lock provides the synchronisation with writeback * that filesystems rely on when they use the blockdev's mapping. */ static int blkdev_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct folio *folio = NULL; struct blk_plug plug; int err; blk_start_plug(&plug); while ((folio = writeback_iter(mapping, wbc, folio, &err))) err = block_write_full_folio(folio, wbc, blkdev_get_block); blk_finish_plug(&plug); return err; } static int blkdev_read_folio(struct file *file, struct folio *folio) { return block_read_full_folio(folio, blkdev_get_block); } static void blkdev_readahead(struct readahead_control *rac) { mpage_readahead(rac, blkdev_get_block); } static int blkdev_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { return block_write_begin(mapping, pos, len, foliop, blkdev_get_block); } static int blkdev_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { int ret; ret = block_write_end(pos, len, copied, folio); folio_unlock(folio); folio_put(folio); return ret; } const struct address_space_operations def_blk_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = blkdev_read_folio, .readahead = blkdev_readahead, .writepages = blkdev_writepages, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, .migrate_folio = buffer_migrate_folio_norefs, .is_dirty_writeback = buffer_check_dirty_writeback, }; #else /* CONFIG_BUFFER_HEAD */ static int blkdev_read_folio(struct file *file, struct folio *folio) { iomap_bio_read_folio(folio, &blkdev_iomap_ops); return 0; } static void blkdev_readahead(struct readahead_control *rac) { iomap_bio_readahead(rac, &blkdev_iomap_ops); } static ssize_t blkdev_writeback_range(struct iomap_writepage_ctx *wpc, struct folio *folio, u64 offset, unsigned int len, u64 end_pos) { loff_t isize = i_size_read(wpc->inode); if (WARN_ON_ONCE(offset >= isize)) return -EIO; if (offset < wpc->iomap.offset || offset >= wpc->iomap.offset + wpc->iomap.length) { int error; error = blkdev_iomap_begin(wpc->inode, offset, isize - offset, IOMAP_WRITE, &wpc->iomap, NULL); if (error) return error; } return iomap_add_to_ioend(wpc, folio, offset, end_pos, len); } static const struct iomap_writeback_ops blkdev_writeback_ops = { .writeback_range = blkdev_writeback_range, .writeback_submit = iomap_ioend_writeback_submit, }; static int blkdev_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct iomap_writepage_ctx wpc = { .inode = mapping->host, .wbc = wbc, .ops = &blkdev_writeback_ops }; return iomap_writepages(&wpc); } const struct address_space_operations def_blk_aops = { .dirty_folio = filemap_dirty_folio, .release_folio = iomap_release_folio, .invalidate_folio = iomap_invalidate_folio, .read_folio = blkdev_read_folio, .readahead = blkdev_readahead, .writepages = blkdev_writepages, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, .migrate_folio = filemap_migrate_folio, }; #endif /* CONFIG_BUFFER_HEAD */ /* * for a block special file file_inode(file)->i_size is zero * so we compute the size by hand (just as in block_read/write above) */ static loff_t blkdev_llseek(struct file *file, loff_t offset, int whence) { struct inode *bd_inode = bdev_file_inode(file); loff_t retval; inode_lock(bd_inode); retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode)); inode_unlock(bd_inode); return retval; } static int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct block_device *bdev = I_BDEV(filp->f_mapping->host); int error; error = file_write_and_wait_range(filp, start, end); if (error) return error; /* * There is no need to serialise calls to blkdev_issue_flush with * i_mutex and doing so causes performance issues with concurrent * O_SYNC writers to a block device. */ error = blkdev_issue_flush(bdev); if (error == -EOPNOTSUPP) error = 0; return error; } /** * file_to_blk_mode - get block open flags from file flags * @file: file whose open flags should be converted * * Look at file open flags and generate corresponding block open flags from * them. The function works both for file just being open (e.g. during ->open * callback) and for file that is already open. This is actually non-trivial * (see comment in the function). */ blk_mode_t file_to_blk_mode(struct file *file) { blk_mode_t mode = 0; if (file->f_mode & FMODE_READ) mode |= BLK_OPEN_READ; if (file->f_mode & FMODE_WRITE) mode |= BLK_OPEN_WRITE; /* * do_dentry_open() clears O_EXCL from f_flags, use file->private_data * to determine whether the open was exclusive for already open files. */ if (file->private_data) mode |= BLK_OPEN_EXCL; else if (file->f_flags & O_EXCL) mode |= BLK_OPEN_EXCL; if (file->f_flags & O_NDELAY) mode |= BLK_OPEN_NDELAY; /* * If all bits in O_ACCMODE set (aka O_RDWR | O_WRONLY), the floppy * driver has historically allowed ioctls as if the file was opened for * writing, but does not allow and actual reads or writes. */ if ((file->f_flags & O_ACCMODE) == (O_RDWR | O_WRONLY)) mode |= BLK_OPEN_WRITE_IOCTL; return mode; } static int blkdev_open(struct inode *inode, struct file *filp) { struct block_device *bdev; blk_mode_t mode; int ret; mode = file_to_blk_mode(filp); /* Use the file as the holder. */ if (mode & BLK_OPEN_EXCL) filp->private_data = filp; ret = bdev_permission(inode->i_rdev, mode, filp->private_data); if (ret) return ret; bdev = blkdev_get_no_open(inode->i_rdev, true); if (!bdev) return -ENXIO; if (bdev_can_atomic_write(bdev)) filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; if (blk_get_integrity(bdev->bd_disk)) filp->f_mode |= FMODE_HAS_METADATA; ret = bdev_open(bdev, mode, filp->private_data, NULL, filp); if (ret) blkdev_put_no_open(bdev); return ret; } static int blkdev_release(struct inode *inode, struct file *filp) { bdev_release(filp); return 0; } static ssize_t blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from) { size_t count = iov_iter_count(from); ssize_t written; written = kiocb_invalidate_pages(iocb, count); if (written) { if (written == -EBUSY) return 0; return written; } written = blkdev_direct_IO(iocb, from); if (written > 0) { kiocb_invalidate_post_direct_write(iocb, count); iocb->ki_pos += written; count -= written; } if (written != -EIOCBQUEUED) iov_iter_revert(from, count - iov_iter_count(from)); return written; } static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from) { return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops, NULL, NULL); } /* * Write data to the block device. Only intended for the block device itself * and the raw driver which basically is a fake block device. * * Does not take i_mutex for the write and thus is not for general purpose * use. */ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *bd_inode = bdev_file_inode(file); struct block_device *bdev = I_BDEV(bd_inode); bool atomic = iocb->ki_flags & IOCB_ATOMIC; loff_t size = bdev_nr_bytes(bdev); size_t shorted = 0; ssize_t ret; if (bdev_read_only(bdev)) return -EPERM; if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev)) return -ETXTBSY; if (!iov_iter_count(from)) return 0; if (iocb->ki_pos >= size) return -ENOSPC; if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT) return -EOPNOTSUPP; if (atomic) { ret = generic_atomic_write_valid(iocb, from); if (ret) return ret; } size -= iocb->ki_pos; if (iov_iter_count(from) > size) { if (atomic) return -EINVAL; shorted = iov_iter_count(from) - size; iov_iter_truncate(from, size); } ret = file_update_time(file); if (ret) return ret; if (iocb->ki_flags & IOCB_DIRECT) { ret = blkdev_direct_write(iocb, from); if (ret >= 0 && iov_iter_count(from)) ret = direct_write_fallback(iocb, from, ret, blkdev_buffered_write(iocb, from)); } else { /* * Take i_rwsem and invalidate_lock to avoid racing with * set_blocksize changing i_blkbits/folio order and punching * out the pagecache. */ inode_lock_shared(bd_inode); ret = blkdev_buffered_write(iocb, from); inode_unlock_shared(bd_inode); } if (ret > 0) ret = generic_write_sync(iocb, ret); iov_iter_reexpand(from, iov_iter_count(from) + shorted); return ret; } static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *bd_inode = bdev_file_inode(iocb->ki_filp); struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); loff_t size = bdev_nr_bytes(bdev); loff_t pos = iocb->ki_pos; size_t shorted = 0; ssize_t ret = 0; size_t count; if (unlikely(pos + iov_iter_count(to) > size)) { if (pos >= size) return 0; size -= pos; shorted = iov_iter_count(to) - size; iov_iter_truncate(to, size); } count = iov_iter_count(to); if (!count) goto reexpand; /* skip atime */ if (iocb->ki_flags & IOCB_DIRECT) { ret = kiocb_write_and_wait(iocb, count); if (ret < 0) goto reexpand; file_accessed(iocb->ki_filp); ret = blkdev_direct_IO(iocb, to); if (ret > 0) { iocb->ki_pos += ret; count -= ret; } if (ret != -EIOCBQUEUED) iov_iter_revert(to, count - iov_iter_count(to)); if (ret < 0 || !count) goto reexpand; } /* * Take i_rwsem and invalidate_lock to avoid racing with set_blocksize * changing i_blkbits/folio order and punching out the pagecache. */ inode_lock_shared(bd_inode); ret = filemap_read(iocb, to, ret); inode_unlock_shared(bd_inode); reexpand: if (unlikely(shorted)) iov_iter_reexpand(to, iov_iter_count(to) + shorted); return ret; } #define BLKDEV_FALLOC_FL_SUPPORTED \ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ FALLOC_FL_ZERO_RANGE | FALLOC_FL_WRITE_ZEROES) static long blkdev_fallocate(struct file *file, int mode, loff_t start, loff_t len) { struct inode *inode = bdev_file_inode(file); struct block_device *bdev = I_BDEV(inode); loff_t end = start + len - 1; loff_t isize; unsigned int flags; int error; /* Fail if we don't recognize the flags. */ if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED) return -EOPNOTSUPP; /* * Don't allow writing zeroes if the device does not enable the * unmap write zeroes operation. */ if ((mode & FALLOC_FL_WRITE_ZEROES) && !bdev_write_zeroes_unmap_sectors(bdev)) return -EOPNOTSUPP; /* Don't go off the end of the device. */ isize = bdev_nr_bytes(bdev); if (start >= isize) return -EINVAL; if (end >= isize) { if (mode & FALLOC_FL_KEEP_SIZE) { len = isize - start; end = start + len - 1; } else return -EINVAL; } /* * Don't allow IO that isn't aligned to logical block size. */ if ((start | len) & (bdev_logical_block_size(bdev) - 1)) return -EINVAL; inode_lock(inode); filemap_invalidate_lock(inode->i_mapping); switch (mode) { case FALLOC_FL_ZERO_RANGE: case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE: flags = BLKDEV_ZERO_NOUNMAP; break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: flags = BLKDEV_ZERO_NOFALLBACK; break; case FALLOC_FL_WRITE_ZEROES: flags = 0; break; default: error = -EOPNOTSUPP; goto fail; } /* * Invalidate the page cache, including dirty pages, for valid * de-allocate mode calls to fallocate(). */ error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end); if (error) goto fail; error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT, len >> SECTOR_SHIFT, GFP_KERNEL, flags); fail: filemap_invalidate_unlock(inode->i_mapping); inode_unlock(inode); return error; } static int blkdev_mmap_prepare(struct vm_area_desc *desc) { struct file *file = desc->file; if (bdev_read_only(I_BDEV(bdev_file_inode(file)))) return generic_file_readonly_mmap_prepare(desc); return generic_file_mmap_prepare(desc); } const struct file_operations def_blk_fops = { .open = blkdev_open, .release = blkdev_release, .llseek = blkdev_llseek, .read_iter = blkdev_read_iter, .write_iter = blkdev_write_iter, .iopoll = iocb_bio_iopoll, .mmap_prepare = blkdev_mmap_prepare, .fsync = blkdev_fsync, .unlocked_ioctl = blkdev_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_blkdev_ioctl, #endif .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .fallocate = blkdev_fallocate, .uring_cmd = blkdev_uring_cmd, .fop_flags = FOP_BUFFER_RASYNC, }; static __init int blkdev_init(void) { return bioset_init(&blkdev_dio_pool, 4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE); } module_init(blkdev_init);
2233 2235 186 2232 176 143 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BLK_CGROUP_PRIVATE_H #define _BLK_CGROUP_PRIVATE_H /* * block cgroup private header * * Based on ideas and code from CFQ, CFS and BFQ: * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> * * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> * Paolo Valente <paolo.valente@unimore.it> * * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> * Nauman Rafique <nauman@google.com> */ #include <linux/blk-cgroup.h> #include <linux/cgroup.h> #include <linux/kthread.h> #include <linux/blk-mq.h> #include <linux/llist.h> #include "blk.h" struct blkcg_gq; struct blkg_policy_data; /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ #define BLKG_STAT_CPU_BATCH (INT_MAX / 2) #ifdef CONFIG_BLK_CGROUP enum blkg_iostat_type { BLKG_IOSTAT_READ, BLKG_IOSTAT_WRITE, BLKG_IOSTAT_DISCARD, BLKG_IOSTAT_NR, }; struct blkg_iostat { u64 bytes[BLKG_IOSTAT_NR]; u64 ios[BLKG_IOSTAT_NR]; }; struct blkg_iostat_set { struct u64_stats_sync sync; struct blkcg_gq *blkg; struct llist_node lnode; int lqueued; /* queued in llist */ struct blkg_iostat cur; struct blkg_iostat last; }; /* association between a blk cgroup and a request queue */ struct blkcg_gq { /* Pointer to the associated request_queue */ struct request_queue *q; struct list_head q_node; struct hlist_node blkcg_node; struct blkcg *blkcg; /* all non-root blkcg_gq's are guaranteed to have access to parent */ struct blkcg_gq *parent; /* reference count */ struct percpu_ref refcnt; /* is this blkg online? protected by both blkcg and q locks */ bool online; struct blkg_iostat_set __percpu *iostat_cpu; struct blkg_iostat_set iostat; struct blkg_policy_data *pd[BLKCG_MAX_POLS]; #ifdef CONFIG_BLK_CGROUP_PUNT_BIO spinlock_t async_bio_lock; struct bio_list async_bios; #endif union { struct work_struct async_bio_work; struct work_struct free_work; }; atomic_t use_delay; atomic64_t delay_nsec; atomic64_t delay_start; u64 last_delay; int last_use; struct rcu_head rcu_head; }; struct blkcg { struct cgroup_subsys_state css; spinlock_t lock; refcount_t online_pin; /* If there is block congestion on this cgroup. */ atomic_t congestion_count; struct radix_tree_root blkg_tree; struct blkcg_gq __rcu *blkg_hint; struct hlist_head blkg_list; struct blkcg_policy_data *cpd[BLKCG_MAX_POLS]; struct list_head all_blkcgs_node; /* * List of updated percpu blkg_iostat_set's since the last flush. */ struct llist_head __percpu *lhead; #ifdef CONFIG_BLK_CGROUP_FC_APPID char fc_app_id[FC_APPID_LEN]; #endif #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; #endif }; static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) { return css ? container_of(css, struct blkcg, css) : NULL; } /* * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a * request_queue (q). This is used by blkcg policies which need to track * information per blkcg - q pair. * * There can be multiple active blkcg policies and each blkg:policy pair is * represented by a blkg_policy_data which is allocated and freed by each * policy's pd_alloc/free_fn() methods. A policy can allocate private data * area by allocating larger data structure which embeds blkg_policy_data * at the beginning. */ struct blkg_policy_data { /* the blkg and policy id this per-policy data belongs to */ struct blkcg_gq *blkg; int plid; bool online; }; /* * Policies that need to keep per-blkcg data which is independent from any * request_queue associated to it should implement cpd_alloc/free_fn() * methods. A policy can allocate private data area by allocating larger * data structure which embeds blkcg_policy_data at the beginning. * cpd_init() is invoked to let each policy handle per-blkcg data. */ struct blkcg_policy_data { /* the blkcg and policy id this per-policy data belongs to */ struct blkcg *blkcg; int plid; }; typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd); typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp); typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, struct seq_file *s); struct blkcg_policy { int plid; /* cgroup files for the policy */ struct cftype *dfl_cftypes; struct cftype *legacy_cftypes; /* operations */ blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; blkcg_pol_free_cpd_fn *cpd_free_fn; blkcg_pol_alloc_pd_fn *pd_alloc_fn; blkcg_pol_init_pd_fn *pd_init_fn; blkcg_pol_online_pd_fn *pd_online_fn; blkcg_pol_offline_pd_fn *pd_offline_fn; blkcg_pol_free_pd_fn *pd_free_fn; blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; blkcg_pol_stat_pd_fn *pd_stat_fn; }; extern struct blkcg blkcg_root; extern bool blkcg_debug_stats; void blkg_init_queue(struct request_queue *q); int blkcg_init_disk(struct gendisk *disk); void blkcg_exit_disk(struct gendisk *disk); /* Blkio controller policy registration */ int blkcg_policy_register(struct blkcg_policy *pol); void blkcg_policy_unregister(struct blkcg_policy *pol); int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol); void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol); const char *blkg_dev_name(struct blkcg_gq *blkg); void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int), const struct blkcg_policy *pol, int data, bool show_total); u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); struct blkg_conf_ctx { char *input; char *body; struct block_device *bdev; struct blkcg_gq *blkg; }; void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input); int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx); unsigned long blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx); int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct blkg_conf_ctx *ctx); void blkg_conf_exit(struct blkg_conf_ctx *ctx); void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags); /** * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg * @bio: the target &bio * * Return: true if this bio needs to be submitted with the root blkg context. * * In order to avoid priority inversions we sometimes need to issue a bio as if * it were attached to the root blkg, and then backcharge to the actual owning * blkg. The idea is we do bio_blkcg_css() to look up the actual context for * the bio and attach the appropriate blkg to the bio. Then we call this helper * and if it is true run with the root blkg for that queue and then do any * backcharging to the originating cgroup once the io is complete. */ static inline bool bio_issue_as_root_blkg(struct bio *bio) { return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0; } /** * blkg_lookup - lookup blkg for the specified blkcg - q pair * @blkcg: blkcg of interest * @q: request_queue of interest * * Lookup blkg for the @blkcg - @q pair. * * Must be called in a RCU critical section. */ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) { struct blkcg_gq *blkg; if (blkcg == &blkcg_root) return q->root_blkg; blkg = rcu_dereference_check(blkcg->blkg_hint, lockdep_is_held(&q->queue_lock)); if (blkg && blkg->q == q) return blkg; blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); if (blkg && blkg->q != q) blkg = NULL; return blkg; } /** * blkg_to_pd - get policy private data * @blkg: blkg of interest * @pol: policy of interest * * Return pointer to private data associated with the @blkg-@pol pair. */ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, struct blkcg_policy *pol) { return blkg ? blkg->pd[pol->plid] : NULL; } static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, struct blkcg_policy *pol) { return blkcg ? blkcg->cpd[pol->plid] : NULL; } /** * pd_to_blkg - get blkg associated with policy private data * @pd: policy private data of interest * * @pd is policy private data. Determine the blkg it's associated with. */ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return pd ? pd->blkg : NULL; } static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) { return cpd ? cpd->blkcg : NULL; } /** * blkg_get - get a blkg reference * @blkg: blkg to get * * The caller should be holding an existing reference. */ static inline void blkg_get(struct blkcg_gq *blkg) { percpu_ref_get(&blkg->refcnt); } /** * blkg_tryget - try and get a blkg reference * @blkg: blkg to get * * This is for use when doing an RCU lookup of the blkg. We may be in the midst * of freeing this blkg, so we can only use it if the refcnt is not zero. */ static inline bool blkg_tryget(struct blkcg_gq *blkg) { return blkg && percpu_ref_tryget(&blkg->refcnt); } /** * blkg_put - put a blkg reference * @blkg: blkg to put */ static inline void blkg_put(struct blkcg_gq *blkg) { percpu_ref_put(&blkg->refcnt); } /** * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant * @pos_css: used for iteration * @p_blkg: target blkg to walk descendants of * * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU * read locked. If called under either blkcg or queue lock, the iteration * is guaranteed to include all and only online blkgs. The caller may * update @pos_css by calling css_rightmost_descendant() to skip subtree. * @p_blkg is included in the iteration and the first node to be visited. */ #define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q))) /** * blkg_for_each_descendant_post - post-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant * @pos_css: used for iteration * @p_blkg: target blkg to walk descendants of * * Similar to blkg_for_each_descendant_pre() but performs post-order * traversal instead. Synchronization rules are the same. @p_blkg is * included in the iteration and the last node to be visited. */ #define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q))) static inline void blkcg_use_delay(struct blkcg_gq *blkg) { if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) return; if (atomic_add_return(1, &blkg->use_delay) == 1) atomic_inc(&blkg->blkcg->congestion_count); } static inline int blkcg_unuse_delay(struct blkcg_gq *blkg) { int old = atomic_read(&blkg->use_delay); if (WARN_ON_ONCE(old < 0)) return 0; if (old == 0) return 0; /* * We do this song and dance because we can race with somebody else * adding or removing delay. If we just did an atomic_dec we'd end up * negative and we'd already be in trouble. We need to subtract 1 and * then check to see if we were the last delay so we can drop the * congestion count on the cgroup. */ while (old && !atomic_try_cmpxchg(&blkg->use_delay, &old, old - 1)) ; if (old == 0) return 0; if (old == 1) atomic_dec(&blkg->blkcg->congestion_count); return 1; } /** * blkcg_set_delay - Enable allocator delay mechanism with the specified delay amount * @blkg: target blkg * @delay: delay duration in nsecs * * When enabled with this function, the delay is not decayed and must be * explicitly cleared with blkcg_clear_delay(). Must not be mixed with * blkcg_[un]use_delay() and blkcg_add_delay() usages. */ static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay) { int old = atomic_read(&blkg->use_delay); /* We only want 1 person setting the congestion count for this blkg. */ if (!old && atomic_try_cmpxchg(&blkg->use_delay, &old, -1)) atomic_inc(&blkg->blkcg->congestion_count); atomic64_set(&blkg->delay_nsec, delay); } /** * blkcg_clear_delay - Disable allocator delay mechanism * @blkg: target blkg * * Disable use_delay mechanism. See blkcg_set_delay(). */ static inline void blkcg_clear_delay(struct blkcg_gq *blkg) { int old = atomic_read(&blkg->use_delay); /* We only want 1 person clearing the congestion count for this blkg. */ if (old && atomic_try_cmpxchg(&blkg->use_delay, &old, 0)) atomic_dec(&blkg->blkcg->congestion_count); } /** * blk_cgroup_mergeable - Determine whether to allow or disallow merges * @rq: request to merge into * @bio: bio to merge * * @bio and @rq should belong to the same cgroup and their issue_as_root should * match. The latter is necessary as we don't want to throttle e.g. a metadata * update because it happens to be next to a regular IO. */ static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return rq->bio->bi_blkg == bio->bi_blkg && bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio); } static inline bool blkcg_policy_enabled(struct request_queue *q, const struct blkcg_policy *pol) { return pol && test_bit(pol->plid, q->blkcg_pols); } void blk_cgroup_bio_start(struct bio *bio); void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); #else /* CONFIG_BLK_CGROUP */ struct blkg_policy_data { }; struct blkcg_policy_data { }; struct blkcg_policy { }; struct blkcg { }; static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } static inline void blkg_init_queue(struct request_queue *q) { } static inline int blkcg_init_disk(struct gendisk *disk) { return 0; } static inline void blkcg_exit_disk(struct gendisk *disk) { } static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } static inline int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { return 0; } static inline void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { } static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, struct blkcg_policy *pol) { return NULL; } static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } static inline void blkg_get(struct blkcg_gq *blkg) { } static inline void blkg_put(struct blkcg_gq *blkg) { } static inline void blk_cgroup_bio_start(struct bio *bio) { } static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; } #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) #endif /* CONFIG_BLK_CGROUP */ #endif /* _BLK_CGROUP_PRIVATE_H */
4 3 2 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 // SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netfilter.h> #include <linux/rhashtable.h> #include <net/netfilter/nf_flow_table.h> #include <net/netfilter/nf_tables.h> #include <linux/if_vlan.h> static unsigned int nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct vlan_ethhdr *veth; __be16 proto; switch (skb->protocol) { case htons(ETH_P_8021Q): if (!pskb_may_pull(skb, skb_mac_offset(skb) + sizeof(*veth))) return NF_ACCEPT; veth = (struct vlan_ethhdr *)skb_mac_header(skb); proto = veth->h_vlan_encapsulated_proto; break; case htons(ETH_P_PPP_SES): if (!nf_flow_pppoe_proto(skb, &proto)) return NF_ACCEPT; break; default: proto = skb->protocol; break; } switch (proto) { case htons(ETH_P_IP): return nf_flow_offload_ip_hook(priv, skb, state); case htons(ETH_P_IPV6): return nf_flow_offload_ipv6_hook(priv, skb, state); } return NF_ACCEPT; } static int nf_flow_rule_route_inet(struct net *net, struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { const struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple; int err; switch (flow_tuple->l3proto) { case NFPROTO_IPV4: err = nf_flow_rule_route_ipv4(net, flow, dir, flow_rule); break; case NFPROTO_IPV6: err = nf_flow_rule_route_ipv6(net, flow, dir, flow_rule); break; default: err = -1; break; } return err; } static struct nf_flowtable_type flowtable_inet = { .family = NFPROTO_INET, .init = nf_flow_table_init, .setup = nf_flow_table_offload_setup, .action = nf_flow_rule_route_inet, .free = nf_flow_table_free, .hook = nf_flow_offload_inet_hook, .owner = THIS_MODULE, }; static struct nf_flowtable_type flowtable_ipv4 = { .family = NFPROTO_IPV4, .init = nf_flow_table_init, .setup = nf_flow_table_offload_setup, .action = nf_flow_rule_route_ipv4, .free = nf_flow_table_free, .hook = nf_flow_offload_ip_hook, .owner = THIS_MODULE, }; static struct nf_flowtable_type flowtable_ipv6 = { .family = NFPROTO_IPV6, .init = nf_flow_table_init, .setup = nf_flow_table_offload_setup, .action = nf_flow_rule_route_ipv6, .free = nf_flow_table_free, .hook = nf_flow_offload_ipv6_hook, .owner = THIS_MODULE, }; static int __init nf_flow_inet_module_init(void) { nft_register_flowtable_type(&flowtable_ipv4); nft_register_flowtable_type(&flowtable_ipv6); nft_register_flowtable_type(&flowtable_inet); return 0; } static void __exit nf_flow_inet_module_exit(void) { nft_unregister_flowtable_type(&flowtable_inet); nft_unregister_flowtable_type(&flowtable_ipv6); nft_unregister_flowtable_type(&flowtable_ipv4); } module_init(nf_flow_inet_module_init); module_exit(nf_flow_inet_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_ALIAS_NF_FLOWTABLE(AF_INET); MODULE_ALIAS_NF_FLOWTABLE(AF_INET6); MODULE_ALIAS_NF_FLOWTABLE(1); /* NFPROTO_INET */ MODULE_DESCRIPTION("Netfilter flow table mixed IPv4/IPv6 module");
5 4 4 3 19 18 20 12 2 2 3 3 3 45 37 39 38 6 2 2 2 2 2 4 2 2 2 12 19 20 39 38 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/syscalls.h> #include <linux/fdtable.h> #include <linux/string.h> #include <linux/random.h> #include <linux/module.h> #include <linux/ptrace.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/cache.h> #include <linux/bug.h> #include <linux/err.h> #include <linux/kcmp.h> #include <linux/capability.h> #include <linux/list.h> #include <linux/eventpoll.h> #include <linux/file.h> #include <asm/unistd.h> /* * We don't expose the real in-memory order of objects for security reasons. * But still the comparison results should be suitable for sorting. So we * obfuscate kernel pointers values and compare the production instead. * * The obfuscation is done in two steps. First we xor the kernel pointer with * a random value, which puts pointer into a new position in a reordered space. * Secondly we multiply the xor production with a large odd random number to * permute its bits even more (the odd multiplier guarantees that the product * is unique ever after the high bits are truncated, since any odd number is * relative prime to 2^n). * * Note also that the obfuscation itself is invisible to userspace and if needed * it can be changed to an alternate scheme. */ static unsigned long cookies[KCMP_TYPES][2] __read_mostly; static long kptr_obfuscate(long v, int type) { return (v ^ cookies[type][0]) * cookies[type][1]; } /* * 0 - equal, i.e. v1 = v2 * 1 - less than, i.e. v1 < v2 * 2 - greater than, i.e. v1 > v2 * 3 - not equal but ordering unavailable (reserved for future) */ static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type) { long t1, t2; t1 = kptr_obfuscate((long)v1, type); t2 = kptr_obfuscate((long)v2, type); return (t1 < t2) | ((t1 > t2) << 1); } /* The caller must have pinned the task */ static struct file * get_file_raw_ptr(struct task_struct *task, unsigned int idx) { struct file *file; file = fget_task(task, idx); if (file) fput(file); return file; } static void kcmp_unlock(struct rw_semaphore *l1, struct rw_semaphore *l2) { if (likely(l2 != l1)) up_read(l2); up_read(l1); } static int kcmp_lock(struct rw_semaphore *l1, struct rw_semaphore *l2) { int err; if (l2 > l1) swap(l1, l2); err = down_read_killable(l1); if (!err && likely(l1 != l2)) { err = down_read_killable_nested(l2, SINGLE_DEPTH_NESTING); if (err) up_read(l1); } return err; } #ifdef CONFIG_EPOLL static int kcmp_epoll_target(struct task_struct *task1, struct task_struct *task2, unsigned long idx1, struct kcmp_epoll_slot __user *uslot) { struct file *filp, *filp_epoll, *filp_tgt; struct kcmp_epoll_slot slot; if (copy_from_user(&slot, uslot, sizeof(slot))) return -EFAULT; filp = get_file_raw_ptr(task1, idx1); if (!filp) return -EBADF; filp_epoll = fget_task(task2, slot.efd); if (!filp_epoll) return -EBADF; filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff); fput(filp_epoll); if (IS_ERR(filp_tgt)) return PTR_ERR(filp_tgt); return kcmp_ptr(filp, filp_tgt, KCMP_FILE); } #else static int kcmp_epoll_target(struct task_struct *task1, struct task_struct *task2, unsigned long idx1, struct kcmp_epoll_slot __user *uslot) { return -EOPNOTSUPP; } #endif SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, unsigned long, idx1, unsigned long, idx2) { struct task_struct *task1, *task2; int ret; rcu_read_lock(); /* * Tasks are looked up in caller's PID namespace only. */ task1 = find_task_by_vpid(pid1); task2 = find_task_by_vpid(pid2); if (unlikely(!task1 || !task2)) goto err_no_task; get_task_struct(task1); get_task_struct(task2); rcu_read_unlock(); /* * One should have enough rights to inspect task details. */ ret = kcmp_lock(&task1->signal->exec_update_lock, &task2->signal->exec_update_lock); if (ret) goto err; if (!ptrace_may_access(task1, PTRACE_MODE_READ_REALCREDS) || !ptrace_may_access(task2, PTRACE_MODE_READ_REALCREDS)) { ret = -EPERM; goto err_unlock; } switch (type) { case KCMP_FILE: { struct file *filp1, *filp2; filp1 = get_file_raw_ptr(task1, idx1); filp2 = get_file_raw_ptr(task2, idx2); if (filp1 && filp2) ret = kcmp_ptr(filp1, filp2, KCMP_FILE); else ret = -EBADF; break; } case KCMP_VM: ret = kcmp_ptr(task1->mm, task2->mm, KCMP_VM); break; case KCMP_FILES: ret = kcmp_ptr(task1->files, task2->files, KCMP_FILES); break; case KCMP_FS: ret = kcmp_ptr(task1->fs, task2->fs, KCMP_FS); break; case KCMP_SIGHAND: ret = kcmp_ptr(task1->sighand, task2->sighand, KCMP_SIGHAND); break; case KCMP_IO: ret = kcmp_ptr(task1->io_context, task2->io_context, KCMP_IO); break; case KCMP_SYSVSEM: #ifdef CONFIG_SYSVIPC ret = kcmp_ptr(task1->sysvsem.undo_list, task2->sysvsem.undo_list, KCMP_SYSVSEM); #else ret = -EOPNOTSUPP; #endif break; case KCMP_EPOLL_TFD: ret = kcmp_epoll_target(task1, task2, idx1, (void *)idx2); break; default: ret = -EINVAL; break; } err_unlock: kcmp_unlock(&task1->signal->exec_update_lock, &task2->signal->exec_update_lock); err: put_task_struct(task1); put_task_struct(task2); return ret; err_no_task: rcu_read_unlock(); return -ESRCH; } static __init int kcmp_cookies_init(void) { int i; get_random_bytes(cookies, sizeof(cookies)); for (i = 0; i < KCMP_TYPES; i++) cookies[i][1] |= (~(~0UL >> 1) | 1); return 0; } arch_initcall(kcmp_cookies_init);
46 47 1 48 47 1 1 1 6 47 45 6 47 47 47 47 45 6 30 1 1 1 8 25 25 25 25 30 1 30 8 8 8 8 30 30 1 30 30 30 4 28 28 1 5 5 5 3 2 30 22 8 30 2 23 2 3 23 1 2 2 21 3 48 1 44 2 1 1 1 1 1 1 39 1 2 1 2 2 3 34 6 2 29 29 1 5 25 25 25 2 21 21 21 21 20 1 17 3 21 21 21 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/cls_u32.c Ugly (or Universal) 32bit key Packet Classifier. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * The filters are packed to hash tables of key nodes * with a set of 32bit key/mask pairs at every node. * Nodes reference next level hash tables etc. * * This scheme is the best universal classifier I managed to * invent; it is not super-fast, but it is not slow (provided you * program it correctly), and general enough. And its relative * speed grows as the number of rules becomes larger. * * It seems that it represents the best middle point between * speed and manageability both by human and by machine. * * It is especially useful for link sharing combined with QoS; * pure RSVP doesn't need such a general approach and can use * much simpler (and faster) schemes, sort of cls_rsvp.c. * * nfmark match added by Catalin(ux aka Dino) BOIE <catab at umbrella.ro> */ #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/percpu.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <linux/bitmap.h> #include <linux/netdevice.h> #include <linux/hash.h> #include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> #include <linux/idr.h> #include <net/tc_wrapper.h> struct tc_u_knode { struct tc_u_knode __rcu *next; u32 handle; struct tc_u_hnode __rcu *ht_up; struct tcf_exts exts; int ifindex; u8 fshift; struct tcf_result res; struct tc_u_hnode __rcu *ht_down; #ifdef CONFIG_CLS_U32_PERF struct tc_u32_pcnt __percpu *pf; #endif u32 flags; unsigned int in_hw_count; #ifdef CONFIG_CLS_U32_MARK u32 val; u32 mask; u32 __percpu *pcpu_success; #endif struct rcu_work rwork; /* The 'sel' field MUST be the last field in structure to allow for * tc_u32_keys allocated at end of structure. */ struct tc_u32_sel sel; }; struct tc_u_hnode { struct tc_u_hnode __rcu *next; u32 handle; u32 prio; refcount_t refcnt; unsigned int divisor; struct idr handle_idr; bool is_root; struct rcu_head rcu; u32 flags; /* The 'ht' field MUST be the last field in structure to allow for * more entries allocated at end of structure. */ struct tc_u_knode __rcu *ht[]; }; struct tc_u_common { struct tc_u_hnode __rcu *hlist; void *ptr; refcount_t refcnt; struct idr handle_idr; struct hlist_node hnode; long knodes; }; static u32 handle2id(u32 h) { return ((h & 0x80000000) ? ((h >> 20) & 0x7FF) : h); } static u32 id2handle(u32 id) { return (id | 0x800U) << 20; } static inline unsigned int u32_hash_fold(__be32 key, const struct tc_u32_sel *sel, u8 fshift) { unsigned int h = ntohl(key & sel->hmask) >> fshift; return h; } TC_INDIRECT_SCOPE int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct { struct tc_u_knode *knode; unsigned int off; } stack[TC_U32_MAXDEPTH]; struct tc_u_hnode *ht = rcu_dereference_bh(tp->root); unsigned int off = skb_network_offset(skb); struct tc_u_knode *n; int sdepth = 0; int off2 = 0; int sel = 0; #ifdef CONFIG_CLS_U32_PERF int j; #endif int i, r; next_ht: n = rcu_dereference_bh(ht->ht[sel]); next_knode: if (n) { struct tc_u32_key *key = n->sel.keys; #ifdef CONFIG_CLS_U32_PERF __this_cpu_inc(n->pf->rcnt); j = 0; #endif if (tc_skip_sw(n->flags)) { n = rcu_dereference_bh(n->next); goto next_knode; } #ifdef CONFIG_CLS_U32_MARK if ((skb->mark & n->mask) != n->val) { n = rcu_dereference_bh(n->next); goto next_knode; } else { __this_cpu_inc(*n->pcpu_success); } #endif for (i = n->sel.nkeys; i > 0; i--, key++) { int toff = off + key->off + (off2 & key->offmask); __be32 *data, hdata; if (skb_headroom(skb) + toff > INT_MAX) goto out; data = skb_header_pointer(skb, toff, 4, &hdata); if (!data) goto out; if ((*data ^ key->val) & key->mask) { n = rcu_dereference_bh(n->next); goto next_knode; } #ifdef CONFIG_CLS_U32_PERF __this_cpu_inc(n->pf->kcnts[j]); j++; #endif } ht = rcu_dereference_bh(n->ht_down); if (!ht) { check_terminal: if (n->sel.flags & TC_U32_TERMINAL) { *res = n->res; if (!tcf_match_indev(skb, n->ifindex)) { n = rcu_dereference_bh(n->next); goto next_knode; } #ifdef CONFIG_CLS_U32_PERF __this_cpu_inc(n->pf->rhit); #endif r = tcf_exts_exec(skb, &n->exts, res); if (r < 0) { n = rcu_dereference_bh(n->next); goto next_knode; } return r; } n = rcu_dereference_bh(n->next); goto next_knode; } /* PUSH */ if (sdepth >= TC_U32_MAXDEPTH) goto deadloop; stack[sdepth].knode = n; stack[sdepth].off = off; sdepth++; ht = rcu_dereference_bh(n->ht_down); sel = 0; if (ht->divisor) { __be32 *data, hdata; data = skb_header_pointer(skb, off + n->sel.hoff, 4, &hdata); if (!data) goto out; sel = ht->divisor & u32_hash_fold(*data, &n->sel, n->fshift); } if (!(n->sel.flags & (TC_U32_VAROFFSET | TC_U32_OFFSET | TC_U32_EAT))) goto next_ht; if (n->sel.flags & (TC_U32_OFFSET | TC_U32_VAROFFSET)) { off2 = n->sel.off + 3; if (n->sel.flags & TC_U32_VAROFFSET) { __be16 *data, hdata; data = skb_header_pointer(skb, off + n->sel.offoff, 2, &hdata); if (!data) goto out; off2 += ntohs(n->sel.offmask & *data) >> n->sel.offshift; } off2 &= ~3; } if (n->sel.flags & TC_U32_EAT) { off += off2; off2 = 0; } if (off < skb->len) goto next_ht; } /* POP */ if (sdepth--) { n = stack[sdepth].knode; ht = rcu_dereference_bh(n->ht_up); off = stack[sdepth].off; goto check_terminal; } out: return -1; deadloop: net_warn_ratelimited("cls_u32: dead loop\n"); return -1; } static struct tc_u_hnode *u32_lookup_ht(struct tc_u_common *tp_c, u32 handle) { struct tc_u_hnode *ht; for (ht = rtnl_dereference(tp_c->hlist); ht; ht = rtnl_dereference(ht->next)) if (ht->handle == handle) break; return ht; } static struct tc_u_knode *u32_lookup_key(struct tc_u_hnode *ht, u32 handle) { unsigned int sel; struct tc_u_knode *n = NULL; sel = TC_U32_HASH(handle); if (sel > ht->divisor) goto out; for (n = rtnl_dereference(ht->ht[sel]); n; n = rtnl_dereference(n->next)) if (n->handle == handle) break; out: return n; } static void *u32_get(struct tcf_proto *tp, u32 handle) { struct tc_u_hnode *ht; struct tc_u_common *tp_c = tp->data; if (TC_U32_HTID(handle) == TC_U32_ROOT) ht = rtnl_dereference(tp->root); else ht = u32_lookup_ht(tp_c, TC_U32_HTID(handle)); if (!ht) return NULL; if (TC_U32_KEY(handle) == 0) return ht; return u32_lookup_key(ht, handle); } /* Protected by rtnl lock */ static u32 gen_new_htid(struct tc_u_common *tp_c, struct tc_u_hnode *ptr) { int id = idr_alloc_cyclic(&tp_c->handle_idr, ptr, 1, 0x7FF, GFP_KERNEL); if (id < 0) return 0; return id2handle(id); } static struct hlist_head *tc_u_common_hash; #define U32_HASH_SHIFT 10 #define U32_HASH_SIZE (1 << U32_HASH_SHIFT) static void *tc_u_common_ptr(const struct tcf_proto *tp) { struct tcf_block *block = tp->chain->block; /* The block sharing is currently supported only * for classless qdiscs. In that case we use block * for tc_u_common identification. In case the * block is not shared, block->q is a valid pointer * and we can use that. That works for classful qdiscs. */ if (tcf_block_shared(block)) return block; else return block->q; } static struct hlist_head *tc_u_hash(void *key) { return tc_u_common_hash + hash_ptr(key, U32_HASH_SHIFT); } static struct tc_u_common *tc_u_common_find(void *key) { struct tc_u_common *tc; hlist_for_each_entry(tc, tc_u_hash(key), hnode) { if (tc->ptr == key) return tc; } return NULL; } static int u32_init(struct tcf_proto *tp) { struct tc_u_hnode *root_ht; void *key = tc_u_common_ptr(tp); struct tc_u_common *tp_c = tc_u_common_find(key); root_ht = kzalloc(struct_size(root_ht, ht, 1), GFP_KERNEL); if (root_ht == NULL) return -ENOBUFS; refcount_set(&root_ht->refcnt, 1); root_ht->handle = tp_c ? gen_new_htid(tp_c, root_ht) : id2handle(0); root_ht->prio = tp->prio; root_ht->is_root = true; idr_init(&root_ht->handle_idr); if (tp_c == NULL) { tp_c = kzalloc(sizeof(*tp_c), GFP_KERNEL); if (tp_c == NULL) { kfree(root_ht); return -ENOBUFS; } refcount_set(&tp_c->refcnt, 1); tp_c->ptr = key; INIT_HLIST_NODE(&tp_c->hnode); idr_init(&tp_c->handle_idr); hlist_add_head(&tp_c->hnode, tc_u_hash(key)); } else { refcount_inc(&tp_c->refcnt); } RCU_INIT_POINTER(root_ht->next, tp_c->hlist); rcu_assign_pointer(tp_c->hlist, root_ht); /* root_ht must be destroyed when tcf_proto is destroyed */ rcu_assign_pointer(tp->root, root_ht); tp->data = tp_c; return 0; } static void __u32_destroy_key(struct tc_u_knode *n) { struct tc_u_hnode *ht = rtnl_dereference(n->ht_down); tcf_exts_destroy(&n->exts); if (ht && refcount_dec_and_test(&ht->refcnt)) kfree(ht); kfree(n); } static void u32_destroy_key(struct tc_u_knode *n, bool free_pf) { tcf_exts_put_net(&n->exts); #ifdef CONFIG_CLS_U32_PERF if (free_pf) free_percpu(n->pf); #endif #ifdef CONFIG_CLS_U32_MARK if (free_pf) free_percpu(n->pcpu_success); #endif __u32_destroy_key(n); } /* u32_delete_key_rcu should be called when free'ing a copied * version of a tc_u_knode obtained from u32_init_knode(). When * copies are obtained from u32_init_knode() the statistics are * shared between the old and new copies to allow readers to * continue to update the statistics during the copy. To support * this the u32_delete_key_rcu variant does not free the percpu * statistics. */ static void u32_delete_key_work(struct work_struct *work) { struct tc_u_knode *key = container_of(to_rcu_work(work), struct tc_u_knode, rwork); rtnl_lock(); u32_destroy_key(key, false); rtnl_unlock(); } /* u32_delete_key_freepf_rcu is the rcu callback variant * that free's the entire structure including the statistics * percpu variables. Only use this if the key is not a copy * returned by u32_init_knode(). See u32_delete_key_rcu() * for the variant that should be used with keys return from * u32_init_knode() */ static void u32_delete_key_freepf_work(struct work_struct *work) { struct tc_u_knode *key = container_of(to_rcu_work(work), struct tc_u_knode, rwork); rtnl_lock(); u32_destroy_key(key, true); rtnl_unlock(); } static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) { struct tc_u_common *tp_c = tp->data; struct tc_u_knode __rcu **kp; struct tc_u_knode *pkp; struct tc_u_hnode *ht = rtnl_dereference(key->ht_up); if (ht) { kp = &ht->ht[TC_U32_HASH(key->handle)]; for (pkp = rtnl_dereference(*kp); pkp; kp = &pkp->next, pkp = rtnl_dereference(*kp)) { if (pkp == key) { RCU_INIT_POINTER(*kp, key->next); tp_c->knodes--; tcf_unbind_filter(tp, &key->res); idr_remove(&ht->handle_idr, key->handle); tcf_exts_get_net(&key->exts); tcf_queue_work(&key->rwork, u32_delete_key_freepf_work); return 0; } } } WARN_ON(1); return 0; } static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, struct netlink_ext_ack *extack) { struct tcf_block *block = tp->chain->block; struct tc_cls_u32_offload cls_u32 = {}; tc_cls_common_offload_init(&cls_u32.common, tp, h->flags, extack); cls_u32.command = TC_CLSU32_DELETE_HNODE; cls_u32.hnode.divisor = h->divisor; cls_u32.hnode.handle = h->handle; cls_u32.hnode.prio = h->prio; tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, false, true); } static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, u32 flags, struct netlink_ext_ack *extack) { struct tcf_block *block = tp->chain->block; struct tc_cls_u32_offload cls_u32 = {}; bool skip_sw = tc_skip_sw(flags); bool offloaded = false; int err; tc_cls_common_offload_init(&cls_u32.common, tp, flags, extack); cls_u32.command = TC_CLSU32_NEW_HNODE; cls_u32.hnode.divisor = h->divisor; cls_u32.hnode.handle = h->handle; cls_u32.hnode.prio = h->prio; err = tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, skip_sw, true); if (err < 0) { u32_clear_hw_hnode(tp, h, NULL); return err; } else if (err > 0) { offloaded = true; } if (skip_sw && !offloaded) return -EINVAL; return 0; } static void u32_remove_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, struct netlink_ext_ack *extack) { struct tcf_block *block = tp->chain->block; struct tc_cls_u32_offload cls_u32 = {}; tc_cls_common_offload_init(&cls_u32.common, tp, n->flags, extack); cls_u32.command = TC_CLSU32_DELETE_KNODE; cls_u32.knode.handle = n->handle; tc_setup_cb_destroy(block, tp, TC_SETUP_CLSU32, &cls_u32, false, &n->flags, &n->in_hw_count, true); } static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, u32 flags, struct netlink_ext_ack *extack) { struct tc_u_hnode *ht = rtnl_dereference(n->ht_down); struct tcf_block *block = tp->chain->block; struct tc_cls_u32_offload cls_u32 = {}; bool skip_sw = tc_skip_sw(flags); int err; tc_cls_common_offload_init(&cls_u32.common, tp, flags, extack); cls_u32.command = TC_CLSU32_REPLACE_KNODE; cls_u32.knode.handle = n->handle; cls_u32.knode.fshift = n->fshift; #ifdef CONFIG_CLS_U32_MARK cls_u32.knode.val = n->val; cls_u32.knode.mask = n->mask; #else cls_u32.knode.val = 0; cls_u32.knode.mask = 0; #endif cls_u32.knode.sel = &n->sel; cls_u32.knode.res = &n->res; cls_u32.knode.exts = &n->exts; if (n->ht_down) cls_u32.knode.link_handle = ht->handle; err = tc_setup_cb_add(block, tp, TC_SETUP_CLSU32, &cls_u32, skip_sw, &n->flags, &n->in_hw_count, true); if (err) { u32_remove_hw_knode(tp, n, NULL); return err; } if (skip_sw && !(n->flags & TCA_CLS_FLAGS_IN_HW)) return -EINVAL; return 0; } static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, struct netlink_ext_ack *extack) { struct tc_u_common *tp_c = tp->data; struct tc_u_knode *n; unsigned int h; for (h = 0; h <= ht->divisor; h++) { while ((n = rtnl_dereference(ht->ht[h])) != NULL) { RCU_INIT_POINTER(ht->ht[h], rtnl_dereference(n->next)); tp_c->knodes--; tcf_unbind_filter(tp, &n->res); u32_remove_hw_knode(tp, n, extack); idr_remove(&ht->handle_idr, n->handle); if (tcf_exts_get_net(&n->exts)) tcf_queue_work(&n->rwork, u32_delete_key_freepf_work); else u32_destroy_key(n, true); } } } static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, struct netlink_ext_ack *extack) { struct tc_u_common *tp_c = tp->data; struct tc_u_hnode __rcu **hn; struct tc_u_hnode *phn; u32_clear_hnode(tp, ht, extack); hn = &tp_c->hlist; for (phn = rtnl_dereference(*hn); phn; hn = &phn->next, phn = rtnl_dereference(*hn)) { if (phn == ht) { u32_clear_hw_hnode(tp, ht, extack); idr_destroy(&ht->handle_idr); idr_remove(&tp_c->handle_idr, handle2id(ht->handle)); RCU_INIT_POINTER(*hn, ht->next); kfree_rcu(ht, rcu); return 0; } } return -ENOENT; } static void u32_destroy(struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack) { struct tc_u_common *tp_c = tp->data; struct tc_u_hnode *root_ht = rtnl_dereference(tp->root); WARN_ON(root_ht == NULL); if (root_ht && refcount_dec_and_test(&root_ht->refcnt)) u32_destroy_hnode(tp, root_ht, extack); if (refcount_dec_and_test(&tp_c->refcnt)) { struct tc_u_hnode *ht; hlist_del(&tp_c->hnode); while ((ht = rtnl_dereference(tp_c->hlist)) != NULL) { u32_clear_hnode(tp, ht, extack); RCU_INIT_POINTER(tp_c->hlist, ht->next); /* u32_destroy_key() will later free ht for us, if it's * still referenced by some knode */ if (refcount_dec_and_test(&ht->refcnt)) kfree_rcu(ht, rcu); } idr_destroy(&tp_c->handle_idr); kfree(tp_c); } tp->data = NULL; } static int u32_delete(struct tcf_proto *tp, void *arg, bool *last, bool rtnl_held, struct netlink_ext_ack *extack) { struct tc_u_hnode *ht = arg; struct tc_u_common *tp_c = tp->data; int ret = 0; if (TC_U32_KEY(ht->handle)) { u32_remove_hw_knode(tp, (struct tc_u_knode *)ht, extack); ret = u32_delete_key(tp, (struct tc_u_knode *)ht); goto out; } if (ht->is_root) { NL_SET_ERR_MSG_MOD(extack, "Not allowed to delete root node"); return -EINVAL; } if (refcount_dec_if_one(&ht->refcnt)) { u32_destroy_hnode(tp, ht, extack); } else { NL_SET_ERR_MSG_MOD(extack, "Can not delete in-use filter"); return -EBUSY; } out: *last = refcount_read(&tp_c->refcnt) == 1 && tp_c->knodes == 0; return ret; } static u32 gen_new_kid(struct tc_u_hnode *ht, u32 htid) { u32 index = htid | 0x800; u32 max = htid | 0xFFF; if (idr_alloc_u32(&ht->handle_idr, NULL, &index, max, GFP_KERNEL)) { index = htid + 1; if (idr_alloc_u32(&ht->handle_idr, NULL, &index, max, GFP_KERNEL)) index = max; } return index; } static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { [TCA_U32_CLASSID] = { .type = NLA_U32 }, [TCA_U32_HASH] = { .type = NLA_U32 }, [TCA_U32_LINK] = { .type = NLA_U32 }, [TCA_U32_DIVISOR] = { .type = NLA_U32 }, [TCA_U32_SEL] = { .len = sizeof(struct tc_u32_sel) }, [TCA_U32_INDEV] = { .type = NLA_STRING, .len = IFNAMSIZ }, [TCA_U32_MARK] = { .len = sizeof(struct tc_u32_mark) }, [TCA_U32_FLAGS] = { .type = NLA_U32 }, }; static void u32_unbind_filter(struct tcf_proto *tp, struct tc_u_knode *n, struct nlattr **tb) { if (tb[TCA_U32_CLASSID]) tcf_unbind_filter(tp, &n->res); } static void u32_bind_filter(struct tcf_proto *tp, struct tc_u_knode *n, unsigned long base, struct nlattr **tb) { if (tb[TCA_U32_CLASSID]) { n->res.classid = nla_get_u32(tb[TCA_U32_CLASSID]); tcf_bind_filter(tp, &n->res, base); } } static int u32_set_parms(struct net *net, struct tcf_proto *tp, struct tc_u_knode *n, struct nlattr **tb, struct nlattr *est, u32 flags, u32 fl_flags, struct netlink_ext_ack *extack) { int err, ifindex = -1; err = tcf_exts_validate_ex(net, tp, tb, est, &n->exts, flags, fl_flags, extack); if (err < 0) return err; if (tb[TCA_U32_INDEV]) { ifindex = tcf_change_indev(net, tb[TCA_U32_INDEV], extack); if (ifindex < 0) return -EINVAL; } if (tb[TCA_U32_LINK]) { u32 handle = nla_get_u32(tb[TCA_U32_LINK]); struct tc_u_hnode *ht_down = NULL, *ht_old; if (TC_U32_KEY(handle)) { NL_SET_ERR_MSG_MOD(extack, "u32 Link handle must be a hash table"); return -EINVAL; } if (handle) { ht_down = u32_lookup_ht(tp->data, handle); if (!ht_down) { NL_SET_ERR_MSG_MOD(extack, "Link hash table not found"); return -EINVAL; } if (ht_down->is_root) { NL_SET_ERR_MSG_MOD(extack, "Not linking to root node"); return -EINVAL; } refcount_inc(&ht_down->refcnt); } ht_old = rtnl_dereference(n->ht_down); rcu_assign_pointer(n->ht_down, ht_down); if (ht_old) refcount_dec(&ht_old->refcnt); } if (ifindex >= 0) n->ifindex = ifindex; return 0; } static void u32_replace_knode(struct tcf_proto *tp, struct tc_u_common *tp_c, struct tc_u_knode *n) { struct tc_u_knode __rcu **ins; struct tc_u_knode *pins; struct tc_u_hnode *ht; if (TC_U32_HTID(n->handle) == TC_U32_ROOT) ht = rtnl_dereference(tp->root); else ht = u32_lookup_ht(tp_c, TC_U32_HTID(n->handle)); ins = &ht->ht[TC_U32_HASH(n->handle)]; /* The node must always exist for it to be replaced if this is not the * case then something went very wrong elsewhere. */ for (pins = rtnl_dereference(*ins); ; ins = &pins->next, pins = rtnl_dereference(*ins)) if (pins->handle == n->handle) break; idr_replace(&ht->handle_idr, n, n->handle); RCU_INIT_POINTER(n->next, pins->next); rcu_assign_pointer(*ins, n); } static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp, struct tc_u_knode *n) { struct tc_u_hnode *ht = rtnl_dereference(n->ht_down); struct tc_u32_sel *s = &n->sel; struct tc_u_knode *new; new = kzalloc(struct_size(new, sel.keys, s->nkeys), GFP_KERNEL); if (!new) return NULL; RCU_INIT_POINTER(new->next, n->next); new->handle = n->handle; RCU_INIT_POINTER(new->ht_up, n->ht_up); new->ifindex = n->ifindex; new->fshift = n->fshift; new->flags = n->flags; RCU_INIT_POINTER(new->ht_down, ht); #ifdef CONFIG_CLS_U32_PERF /* Statistics may be incremented by readers during update * so we must keep them in tact. When the node is later destroyed * a special destroy call must be made to not free the pf memory. */ new->pf = n->pf; #endif #ifdef CONFIG_CLS_U32_MARK new->val = n->val; new->mask = n->mask; /* Similarly success statistics must be moved as pointers */ new->pcpu_success = n->pcpu_success; #endif memcpy(&new->sel, s, struct_size(s, keys, s->nkeys)); if (tcf_exts_init(&new->exts, net, TCA_U32_ACT, TCA_U32_POLICE)) { kfree(new); return NULL; } /* bump reference count as long as we hold pointer to structure */ if (ht) refcount_inc(&ht->refcnt); return new; } static int u32_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, void **arg, u32 flags, struct netlink_ext_ack *extack) { struct tc_u_common *tp_c = tp->data; struct tc_u_hnode *ht; struct tc_u_knode *n; struct tc_u32_sel *s; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_U32_MAX + 1]; u32 htid, userflags = 0; size_t sel_size; int err; if (!opt) { if (handle) { NL_SET_ERR_MSG_MOD(extack, "Filter handle requires options"); return -EINVAL; } else { return 0; } } err = nla_parse_nested_deprecated(tb, TCA_U32_MAX, opt, u32_policy, extack); if (err < 0) return err; if (tb[TCA_U32_FLAGS]) { userflags = nla_get_u32(tb[TCA_U32_FLAGS]); if (!tc_flags_valid(userflags)) { NL_SET_ERR_MSG_MOD(extack, "Invalid filter flags"); return -EINVAL; } } n = *arg; if (n) { struct tc_u_knode *new; if (TC_U32_KEY(n->handle) == 0) { NL_SET_ERR_MSG_MOD(extack, "Key node id cannot be zero"); return -EINVAL; } if ((n->flags ^ userflags) & ~(TCA_CLS_FLAGS_IN_HW | TCA_CLS_FLAGS_NOT_IN_HW)) { NL_SET_ERR_MSG_MOD(extack, "Key node flags do not match passed flags"); return -EINVAL; } new = u32_init_knode(net, tp, n); if (!new) return -ENOMEM; err = u32_set_parms(net, tp, new, tb, tca[TCA_RATE], flags, new->flags, extack); if (err) { __u32_destroy_key(new); return err; } u32_bind_filter(tp, new, base, tb); err = u32_replace_hw_knode(tp, new, flags, extack); if (err) { u32_unbind_filter(tp, new, tb); if (tb[TCA_U32_LINK]) { struct tc_u_hnode *ht_old; ht_old = rtnl_dereference(n->ht_down); if (ht_old) refcount_inc(&ht_old->refcnt); } __u32_destroy_key(new); return err; } if (!tc_in_hw(new->flags)) new->flags |= TCA_CLS_FLAGS_NOT_IN_HW; tcf_proto_update_usesw(tp, new->flags); u32_replace_knode(tp, tp_c, new); tcf_unbind_filter(tp, &n->res); tcf_exts_get_net(&n->exts); tcf_queue_work(&n->rwork, u32_delete_key_work); return 0; } if (tb[TCA_U32_DIVISOR]) { unsigned int divisor = nla_get_u32(tb[TCA_U32_DIVISOR]); if (!is_power_of_2(divisor)) { NL_SET_ERR_MSG_MOD(extack, "Divisor is not a power of 2"); return -EINVAL; } if (divisor-- > 0x100) { NL_SET_ERR_MSG_MOD(extack, "Exceeded maximum 256 hash buckets"); return -EINVAL; } if (TC_U32_KEY(handle)) { NL_SET_ERR_MSG_MOD(extack, "Divisor can only be used on a hash table"); return -EINVAL; } ht = kzalloc(struct_size(ht, ht, divisor + 1), GFP_KERNEL); if (ht == NULL) return -ENOBUFS; if (handle == 0) { handle = gen_new_htid(tp->data, ht); if (handle == 0) { kfree(ht); return -ENOMEM; } } else { err = idr_alloc_u32(&tp_c->handle_idr, ht, &handle, handle, GFP_KERNEL); if (err) { kfree(ht); return err; } } refcount_set(&ht->refcnt, 1); ht->divisor = divisor; ht->handle = handle; ht->prio = tp->prio; idr_init(&ht->handle_idr); ht->flags = userflags; err = u32_replace_hw_hnode(tp, ht, userflags, extack); if (err) { idr_remove(&tp_c->handle_idr, handle2id(handle)); kfree(ht); return err; } RCU_INIT_POINTER(ht->next, tp_c->hlist); rcu_assign_pointer(tp_c->hlist, ht); *arg = ht; return 0; } if (tb[TCA_U32_HASH]) { htid = nla_get_u32(tb[TCA_U32_HASH]); if (TC_U32_HTID(htid) == TC_U32_ROOT) { ht = rtnl_dereference(tp->root); htid = ht->handle; } else { ht = u32_lookup_ht(tp->data, TC_U32_HTID(htid)); if (!ht) { NL_SET_ERR_MSG_MOD(extack, "Specified hash table not found"); return -EINVAL; } } } else { ht = rtnl_dereference(tp->root); htid = ht->handle; } if (ht->divisor < TC_U32_HASH(htid)) { NL_SET_ERR_MSG_MOD(extack, "Specified hash table buckets exceed configured value"); return -EINVAL; } /* At this point, we need to derive the new handle that will be used to * uniquely map the identity of this table match entry. The * identity of the entry that we need to construct is 32 bits made of: * htid(12b):bucketid(8b):node/entryid(12b) * * At this point _we have the table(ht)_ in which we will insert this * entry. We carry the table's id in variable "htid". * Note that earlier code picked the ht selection either by a) the user * providing the htid specified via TCA_U32_HASH attribute or b) when * no such attribute is passed then the root ht, is default to at ID * 0x[800][00][000]. Rule: the root table has a single bucket with ID 0. * If OTOH the user passed us the htid, they may also pass a bucketid of * choice. 0 is fine. For example a user htid is 0x[600][01][000] it is * indicating hash bucketid of 1. Rule: the entry/node ID _cannot_ be * passed via the htid, so even if it was non-zero it will be ignored. * * We may also have a handle, if the user passed one. The handle also * carries the same addressing of htid(12b):bucketid(8b):node/entryid(12b). * Rule: the bucketid on the handle is ignored even if one was passed; * rather the value on "htid" is always assumed to be the bucketid. */ if (handle) { /* Rule: The htid from handle and tableid from htid must match */ if (TC_U32_HTID(handle) && TC_U32_HTID(handle ^ htid)) { NL_SET_ERR_MSG_MOD(extack, "Handle specified hash table address mismatch"); return -EINVAL; } /* Ok, so far we have a valid htid(12b):bucketid(8b) but we * need to finalize the table entry identification with the last * part - the node/entryid(12b)). Rule: Nodeid _cannot be 0_ for * entries. Rule: nodeid of 0 is reserved only for tables(see * earlier code which processes TC_U32_DIVISOR attribute). * Rule: The nodeid can only be derived from the handle (and not * htid). * Rule: if the handle specified zero for the node id example * 0x60000000, then pick a new nodeid from the pool of IDs * this hash table has been allocating from. * If OTOH it is specified (i.e for example the user passed a * handle such as 0x60000123), then we use it generate our final * handle which is used to uniquely identify the match entry. */ if (!TC_U32_NODE(handle)) { handle = gen_new_kid(ht, htid); } else { handle = htid | TC_U32_NODE(handle); err = idr_alloc_u32(&ht->handle_idr, NULL, &handle, handle, GFP_KERNEL); if (err) return err; } } else { /* The user did not give us a handle; lets just generate one * from the table's pool of nodeids. */ handle = gen_new_kid(ht, htid); } if (tb[TCA_U32_SEL] == NULL) { NL_SET_ERR_MSG_MOD(extack, "Selector not specified"); err = -EINVAL; goto erridr; } s = nla_data(tb[TCA_U32_SEL]); sel_size = struct_size(s, keys, s->nkeys); if (nla_len(tb[TCA_U32_SEL]) < sel_size) { err = -EINVAL; goto erridr; } n = kzalloc(struct_size(n, sel.keys, s->nkeys), GFP_KERNEL); if (n == NULL) { err = -ENOBUFS; goto erridr; } #ifdef CONFIG_CLS_U32_PERF n->pf = __alloc_percpu(struct_size(n->pf, kcnts, s->nkeys), __alignof__(struct tc_u32_pcnt)); if (!n->pf) { err = -ENOBUFS; goto errfree; } #endif unsafe_memcpy(&n->sel, s, sel_size, /* A composite flex-array structure destination, * which was correctly sized with struct_size(), * bounds-checked against nla_len(), and allocated * above. */); RCU_INIT_POINTER(n->ht_up, ht); n->handle = handle; n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0; n->flags = userflags; err = tcf_exts_init(&n->exts, net, TCA_U32_ACT, TCA_U32_POLICE); if (err < 0) goto errout; #ifdef CONFIG_CLS_U32_MARK n->pcpu_success = alloc_percpu(u32); if (!n->pcpu_success) { err = -ENOMEM; goto errout; } if (tb[TCA_U32_MARK]) { struct tc_u32_mark *mark; mark = nla_data(tb[TCA_U32_MARK]); n->val = mark->val; n->mask = mark->mask; } #endif err = u32_set_parms(net, tp, n, tb, tca[TCA_RATE], flags, n->flags, extack); u32_bind_filter(tp, n, base, tb); if (err == 0) { struct tc_u_knode __rcu **ins; struct tc_u_knode *pins; err = u32_replace_hw_knode(tp, n, flags, extack); if (err) goto errunbind; if (!tc_in_hw(n->flags)) n->flags |= TCA_CLS_FLAGS_NOT_IN_HW; tcf_proto_update_usesw(tp, n->flags); ins = &ht->ht[TC_U32_HASH(handle)]; for (pins = rtnl_dereference(*ins); pins; ins = &pins->next, pins = rtnl_dereference(*ins)) if (TC_U32_NODE(handle) < TC_U32_NODE(pins->handle)) break; RCU_INIT_POINTER(n->next, pins); rcu_assign_pointer(*ins, n); tp_c->knodes++; *arg = n; return 0; } errunbind: u32_unbind_filter(tp, n, tb); #ifdef CONFIG_CLS_U32_MARK free_percpu(n->pcpu_success); #endif errout: tcf_exts_destroy(&n->exts); #ifdef CONFIG_CLS_U32_PERF errfree: free_percpu(n->pf); #endif kfree(n); erridr: idr_remove(&ht->handle_idr, handle); return err; } static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg, bool rtnl_held) { struct tc_u_common *tp_c = tp->data; struct tc_u_hnode *ht; struct tc_u_knode *n; unsigned int h; if (arg->stop) return; for (ht = rtnl_dereference(tp_c->hlist); ht; ht = rtnl_dereference(ht->next)) { if (ht->prio != tp->prio) continue; if (!tc_cls_stats_dump(tp, arg, ht)) return; for (h = 0; h <= ht->divisor; h++) { for (n = rtnl_dereference(ht->ht[h]); n; n = rtnl_dereference(n->next)) { if (!tc_cls_stats_dump(tp, arg, n)) return; } } } } static int u32_reoffload_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, bool add, flow_setup_cb_t *cb, void *cb_priv, struct netlink_ext_ack *extack) { struct tc_cls_u32_offload cls_u32 = {}; int err; tc_cls_common_offload_init(&cls_u32.common, tp, ht->flags, extack); cls_u32.command = add ? TC_CLSU32_NEW_HNODE : TC_CLSU32_DELETE_HNODE; cls_u32.hnode.divisor = ht->divisor; cls_u32.hnode.handle = ht->handle; cls_u32.hnode.prio = ht->prio; err = cb(TC_SETUP_CLSU32, &cls_u32, cb_priv); if (err && add && tc_skip_sw(ht->flags)) return err; return 0; } static int u32_reoffload_knode(struct tcf_proto *tp, struct tc_u_knode *n, bool add, flow_setup_cb_t *cb, void *cb_priv, struct netlink_ext_ack *extack) { struct tc_u_hnode *ht = rtnl_dereference(n->ht_down); struct tcf_block *block = tp->chain->block; struct tc_cls_u32_offload cls_u32 = {}; tc_cls_common_offload_init(&cls_u32.common, tp, n->flags, extack); cls_u32.command = add ? TC_CLSU32_REPLACE_KNODE : TC_CLSU32_DELETE_KNODE; cls_u32.knode.handle = n->handle; if (add) { cls_u32.knode.fshift = n->fshift; #ifdef CONFIG_CLS_U32_MARK cls_u32.knode.val = n->val; cls_u32.knode.mask = n->mask; #else cls_u32.knode.val = 0; cls_u32.knode.mask = 0; #endif cls_u32.knode.sel = &n->sel; cls_u32.knode.res = &n->res; cls_u32.knode.exts = &n->exts; if (n->ht_down) cls_u32.knode.link_handle = ht->handle; } return tc_setup_cb_reoffload(block, tp, add, cb, TC_SETUP_CLSU32, &cls_u32, cb_priv, &n->flags, &n->in_hw_count); } static int u32_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, void *cb_priv, struct netlink_ext_ack *extack) { struct tc_u_common *tp_c = tp->data; struct tc_u_hnode *ht; struct tc_u_knode *n; unsigned int h; int err; for (ht = rtnl_dereference(tp_c->hlist); ht; ht = rtnl_dereference(ht->next)) { if (ht->prio != tp->prio) continue; /* When adding filters to a new dev, try to offload the * hashtable first. When removing, do the filters before the * hashtable. */ if (add && !tc_skip_hw(ht->flags)) { err = u32_reoffload_hnode(tp, ht, add, cb, cb_priv, extack); if (err) return err; } for (h = 0; h <= ht->divisor; h++) { for (n = rtnl_dereference(ht->ht[h]); n; n = rtnl_dereference(n->next)) { if (tc_skip_hw(n->flags)) continue; err = u32_reoffload_knode(tp, n, add, cb, cb_priv, extack); if (err) return err; } } if (!add && !tc_skip_hw(ht->flags)) u32_reoffload_hnode(tp, ht, add, cb, cb_priv, extack); } return 0; } static void u32_bind_class(void *fh, u32 classid, unsigned long cl, void *q, unsigned long base) { struct tc_u_knode *n = fh; tc_cls_bind_class(classid, cl, q, &n->res, base); } static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) { struct tc_u_knode *n = fh; struct tc_u_hnode *ht_up, *ht_down; struct nlattr *nest; if (n == NULL) return skb->len; t->tcm_handle = n->handle; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; if (TC_U32_KEY(n->handle) == 0) { struct tc_u_hnode *ht = fh; u32 divisor = ht->divisor + 1; if (nla_put_u32(skb, TCA_U32_DIVISOR, divisor)) goto nla_put_failure; } else { #ifdef CONFIG_CLS_U32_PERF struct tc_u32_pcnt *gpf; int cpu; #endif if (nla_put(skb, TCA_U32_SEL, struct_size(&n->sel, keys, n->sel.nkeys), &n->sel)) goto nla_put_failure; ht_up = rtnl_dereference(n->ht_up); if (ht_up) { u32 htid = n->handle & 0xFFFFF000; if (nla_put_u32(skb, TCA_U32_HASH, htid)) goto nla_put_failure; } if (n->res.classid && nla_put_u32(skb, TCA_U32_CLASSID, n->res.classid)) goto nla_put_failure; ht_down = rtnl_dereference(n->ht_down); if (ht_down && nla_put_u32(skb, TCA_U32_LINK, ht_down->handle)) goto nla_put_failure; if (n->flags && nla_put_u32(skb, TCA_U32_FLAGS, n->flags)) goto nla_put_failure; #ifdef CONFIG_CLS_U32_MARK if ((n->val || n->mask)) { struct tc_u32_mark mark = {.val = n->val, .mask = n->mask, .success = 0}; int cpum; for_each_possible_cpu(cpum) { __u32 cnt = *per_cpu_ptr(n->pcpu_success, cpum); mark.success += cnt; } if (nla_put(skb, TCA_U32_MARK, sizeof(mark), &mark)) goto nla_put_failure; } #endif if (tcf_exts_dump(skb, &n->exts) < 0) goto nla_put_failure; if (n->ifindex) { struct net_device *dev; dev = __dev_get_by_index(net, n->ifindex); if (dev && nla_put_string(skb, TCA_U32_INDEV, dev->name)) goto nla_put_failure; } #ifdef CONFIG_CLS_U32_PERF gpf = kzalloc(struct_size(gpf, kcnts, n->sel.nkeys), GFP_KERNEL); if (!gpf) goto nla_put_failure; for_each_possible_cpu(cpu) { int i; struct tc_u32_pcnt *pf = per_cpu_ptr(n->pf, cpu); gpf->rcnt += pf->rcnt; gpf->rhit += pf->rhit; for (i = 0; i < n->sel.nkeys; i++) gpf->kcnts[i] += pf->kcnts[i]; } if (nla_put_64bit(skb, TCA_U32_PCNT, struct_size(gpf, kcnts, n->sel.nkeys), gpf, TCA_U32_PAD)) { kfree(gpf); goto nla_put_failure; } kfree(gpf); #endif } nla_nest_end(skb, nest); if (TC_U32_KEY(n->handle)) if (tcf_exts_dump_stats(skb, &n->exts) < 0) goto nla_put_failure; return skb->len; nla_put_failure: nla_nest_cancel(skb, nest); return -1; } static struct tcf_proto_ops cls_u32_ops __read_mostly = { .kind = "u32", .classify = u32_classify, .init = u32_init, .destroy = u32_destroy, .get = u32_get, .change = u32_change, .delete = u32_delete, .walk = u32_walk, .reoffload = u32_reoffload, .dump = u32_dump, .bind_class = u32_bind_class, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_CLS("u32"); static int __init init_u32(void) { int i, ret; pr_info("u32 classifier\n"); #ifdef CONFIG_CLS_U32_PERF pr_info(" Performance counters on\n"); #endif pr_info(" input device check on\n"); #ifdef CONFIG_NET_CLS_ACT pr_info(" Actions configured\n"); #endif tc_u_common_hash = kvmalloc_array(U32_HASH_SIZE, sizeof(struct hlist_head), GFP_KERNEL); if (!tc_u_common_hash) return -ENOMEM; for (i = 0; i < U32_HASH_SIZE; i++) INIT_HLIST_HEAD(&tc_u_common_hash[i]); ret = register_tcf_proto_ops(&cls_u32_ops); if (ret) kvfree(tc_u_common_hash); return ret; } static void __exit exit_u32(void) { unregister_tcf_proto_ops(&cls_u32_ops); kvfree(tc_u_common_hash); } module_init(init_u32) module_exit(exit_u32) MODULE_DESCRIPTION("Universal 32bit based TC Classifier"); MODULE_LICENSE("GPL");
1 1 2 1 1 17 17 34 34 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 // SPDX-License-Identifier: GPL-2.0-only #include <linux/ethtool.h> #include <linux/skbuff.h> #include <linux/xarray.h> #include <net/genetlink.h> #include <net/psp.h> #include <net/sock.h> #include "psp-nl-gen.h" #include "psp.h" /* Netlink helpers */ static struct sk_buff *psp_nl_reply_new(struct genl_info *info) { struct sk_buff *rsp; void *hdr; rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return NULL; hdr = genlmsg_iput(rsp, info); if (!hdr) { nlmsg_free(rsp); return NULL; } return rsp; } static int psp_nl_reply_send(struct sk_buff *rsp, struct genl_info *info) { /* Note that this *only* works with a single message per skb! */ nlmsg_end(rsp, (struct nlmsghdr *)rsp->data); return genlmsg_reply(rsp, info); } /* Device stuff */ static struct psp_dev * psp_device_get_and_lock(struct net *net, struct nlattr *dev_id) { struct psp_dev *psd; int err; mutex_lock(&psp_devs_lock); psd = xa_load(&psp_devs, nla_get_u32(dev_id)); if (!psd) { mutex_unlock(&psp_devs_lock); return ERR_PTR(-ENODEV); } mutex_lock(&psd->lock); mutex_unlock(&psp_devs_lock); err = psp_dev_check_access(psd, net); if (err) { mutex_unlock(&psd->lock); return ERR_PTR(err); } return psd; } int psp_device_get_locked(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { if (GENL_REQ_ATTR_CHECK(info, PSP_A_DEV_ID)) return -EINVAL; info->user_ptr[0] = psp_device_get_and_lock(genl_info_net(info), info->attrs[PSP_A_DEV_ID]); return PTR_ERR_OR_ZERO(info->user_ptr[0]); } void psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { struct socket *socket = info->user_ptr[1]; struct psp_dev *psd = info->user_ptr[0]; mutex_unlock(&psd->lock); if (socket) sockfd_put(socket); } static int psp_nl_dev_fill(struct psp_dev *psd, struct sk_buff *rsp, const struct genl_info *info) { void *hdr; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; if (nla_put_u32(rsp, PSP_A_DEV_ID, psd->id) || nla_put_u32(rsp, PSP_A_DEV_IFINDEX, psd->main_netdev->ifindex) || nla_put_u32(rsp, PSP_A_DEV_PSP_VERSIONS_CAP, psd->caps->versions) || nla_put_u32(rsp, PSP_A_DEV_PSP_VERSIONS_ENA, psd->config.versions)) goto err_cancel_msg; genlmsg_end(rsp, hdr); return 0; err_cancel_msg: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd) { struct genl_info info; struct sk_buff *ntf; if (!genl_has_listeners(&psp_nl_family, dev_net(psd->main_netdev), PSP_NLGRP_MGMT)) return; ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!ntf) return; genl_info_init_ntf(&info, &psp_nl_family, cmd); if (psp_nl_dev_fill(psd, ntf, &info)) { nlmsg_free(ntf); return; } genlmsg_multicast_netns(&psp_nl_family, dev_net(psd->main_netdev), ntf, 0, PSP_NLGRP_MGMT, GFP_KERNEL); } int psp_nl_dev_get_doit(struct sk_buff *req, struct genl_info *info) { struct psp_dev *psd = info->user_ptr[0]; struct sk_buff *rsp; int err; rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; err = psp_nl_dev_fill(psd, rsp, info); if (err) goto err_free_msg; return genlmsg_reply(rsp, info); err_free_msg: nlmsg_free(rsp); return err; } static int psp_nl_dev_get_dumpit_one(struct sk_buff *rsp, struct netlink_callback *cb, struct psp_dev *psd) { if (psp_dev_check_access(psd, sock_net(rsp->sk))) return 0; return psp_nl_dev_fill(psd, rsp, genl_info_dump(cb)); } int psp_nl_dev_get_dumpit(struct sk_buff *rsp, struct netlink_callback *cb) { struct psp_dev *psd; int err = 0; mutex_lock(&psp_devs_lock); xa_for_each_start(&psp_devs, cb->args[0], psd, cb->args[0]) { mutex_lock(&psd->lock); err = psp_nl_dev_get_dumpit_one(rsp, cb, psd); mutex_unlock(&psd->lock); if (err) break; } mutex_unlock(&psp_devs_lock); return err; } int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info) { struct psp_dev *psd = info->user_ptr[0]; struct psp_dev_config new_config; struct sk_buff *rsp; int err; memcpy(&new_config, &psd->config, sizeof(new_config)); if (info->attrs[PSP_A_DEV_PSP_VERSIONS_ENA]) { new_config.versions = nla_get_u32(info->attrs[PSP_A_DEV_PSP_VERSIONS_ENA]); if (new_config.versions & ~psd->caps->versions) { NL_SET_ERR_MSG(info->extack, "Requested PSP versions not supported by the device"); return -EINVAL; } } else { NL_SET_ERR_MSG(info->extack, "No settings present"); return -EINVAL; } rsp = psp_nl_reply_new(info); if (!rsp) return -ENOMEM; if (memcmp(&new_config, &psd->config, sizeof(new_config))) { err = psd->ops->set_config(psd, &new_config, info->extack); if (err) goto err_free_rsp; memcpy(&psd->config, &new_config, sizeof(new_config)); } psp_nl_notify_dev(psd, PSP_CMD_DEV_CHANGE_NTF); return psp_nl_reply_send(rsp, info); err_free_rsp: nlmsg_free(rsp); return err; } int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info) { struct psp_dev *psd = info->user_ptr[0]; struct genl_info ntf_info; struct sk_buff *ntf, *rsp; u8 prev_gen; int err; rsp = psp_nl_reply_new(info); if (!rsp) return -ENOMEM; genl_info_init_ntf(&ntf_info, &psp_nl_family, PSP_CMD_KEY_ROTATE_NTF); ntf = psp_nl_reply_new(&ntf_info); if (!ntf) { err = -ENOMEM; goto err_free_rsp; } if (nla_put_u32(rsp, PSP_A_DEV_ID, psd->id) || nla_put_u32(ntf, PSP_A_DEV_ID, psd->id)) { err = -EMSGSIZE; goto err_free_ntf; } /* suggest the next gen number, driver can override */ prev_gen = psd->generation; psd->generation = (prev_gen + 1) & PSP_GEN_VALID_MASK; err = psd->ops->key_rotate(psd, info->extack); if (err) goto err_free_ntf; WARN_ON_ONCE((psd->generation && psd->generation == prev_gen) || psd->generation & ~PSP_GEN_VALID_MASK); psp_assocs_key_rotated(psd); psd->stats.rotations++; nlmsg_end(ntf, (struct nlmsghdr *)ntf->data); genlmsg_multicast_netns(&psp_nl_family, dev_net(psd->main_netdev), ntf, 0, PSP_NLGRP_USE, GFP_KERNEL); return psp_nl_reply_send(rsp, info); err_free_ntf: nlmsg_free(ntf); err_free_rsp: nlmsg_free(rsp); return err; } /* Key etc. */ int psp_assoc_device_get_locked(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { struct socket *socket; struct psp_dev *psd; struct nlattr *id; int fd, err; if (GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_SOCK_FD)) return -EINVAL; fd = nla_get_u32(info->attrs[PSP_A_ASSOC_SOCK_FD]); socket = sockfd_lookup(fd, &err); if (!socket) return err; if (!sk_is_tcp(socket->sk)) { NL_SET_ERR_MSG_ATTR(info->extack, info->attrs[PSP_A_ASSOC_SOCK_FD], "Unsupported socket family and type"); err = -EOPNOTSUPP; goto err_sock_put; } psd = psp_dev_get_for_sock(socket->sk); if (psd) { err = psp_dev_check_access(psd, genl_info_net(info)); if (err) { psp_dev_put(psd); psd = NULL; } } if (!psd && GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_DEV_ID)) { err = -EINVAL; goto err_sock_put; } id = info->attrs[PSP_A_ASSOC_DEV_ID]; if (psd) { mutex_lock(&psd->lock); if (id && psd->id != nla_get_u32(id)) { mutex_unlock(&psd->lock); NL_SET_ERR_MSG_ATTR(info->extack, id, "Device id vs socket mismatch"); err = -EINVAL; goto err_psd_put; } psp_dev_put(psd); } else { psd = psp_device_get_and_lock(genl_info_net(info), id); if (IS_ERR(psd)) { err = PTR_ERR(psd); goto err_sock_put; } } info->user_ptr[0] = psd; info->user_ptr[1] = socket; return 0; err_psd_put: psp_dev_put(psd); err_sock_put: sockfd_put(socket); return err; } static int psp_nl_parse_key(struct genl_info *info, u32 attr, struct psp_key_parsed *key, unsigned int key_sz) { struct nlattr *nest = info->attrs[attr]; struct nlattr *tb[PSP_A_KEYS_SPI + 1]; u32 spi; int err; err = nla_parse_nested(tb, ARRAY_SIZE(tb) - 1, nest, psp_keys_nl_policy, info->extack); if (err) return err; if (NL_REQ_ATTR_CHECK(info->extack, nest, tb, PSP_A_KEYS_KEY) || NL_REQ_ATTR_CHECK(info->extack, nest, tb, PSP_A_KEYS_SPI)) return -EINVAL; if (nla_len(tb[PSP_A_KEYS_KEY]) != key_sz) { NL_SET_ERR_MSG_ATTR(info->extack, tb[PSP_A_KEYS_KEY], "incorrect key length"); return -EINVAL; } spi = nla_get_u32(tb[PSP_A_KEYS_SPI]); if (!(spi & PSP_SPI_KEY_ID)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[PSP_A_KEYS_KEY], "invalid SPI: lower 31b must be non-zero"); return -EINVAL; } key->spi = cpu_to_be32(spi); memcpy(key->key, nla_data(tb[PSP_A_KEYS_KEY]), key_sz); return 0; } static int psp_nl_put_key(struct sk_buff *skb, u32 attr, u32 version, struct psp_key_parsed *key) { int key_sz = psp_key_size(version); void *nest; nest = nla_nest_start(skb, attr); if (nla_put_u32(skb, PSP_A_KEYS_SPI, be32_to_cpu(key->spi)) || nla_put(skb, PSP_A_KEYS_KEY, key_sz, key->key)) { nla_nest_cancel(skb, nest); return -EMSGSIZE; } nla_nest_end(skb, nest); return 0; } int psp_nl_rx_assoc_doit(struct sk_buff *skb, struct genl_info *info) { struct socket *socket = info->user_ptr[1]; struct psp_dev *psd = info->user_ptr[0]; struct psp_key_parsed key; struct psp_assoc *pas; struct sk_buff *rsp; u32 version; int err; if (GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_VERSION)) return -EINVAL; version = nla_get_u32(info->attrs[PSP_A_ASSOC_VERSION]); if (!(psd->caps->versions & (1 << version))) { NL_SET_BAD_ATTR(info->extack, info->attrs[PSP_A_ASSOC_VERSION]); return -EOPNOTSUPP; } rsp = psp_nl_reply_new(info); if (!rsp) return -ENOMEM; pas = psp_assoc_create(psd); if (!pas) { err = -ENOMEM; goto err_free_rsp; } pas->version = version; err = psd->ops->rx_spi_alloc(psd, version, &key, info->extack); if (err) goto err_free_pas; if (nla_put_u32(rsp, PSP_A_ASSOC_DEV_ID, psd->id) || psp_nl_put_key(rsp, PSP_A_ASSOC_RX_KEY, version, &key)) { err = -EMSGSIZE; goto err_free_pas; } err = psp_sock_assoc_set_rx(socket->sk, pas, &key, info->extack); if (err) { NL_SET_BAD_ATTR(info->extack, info->attrs[PSP_A_ASSOC_SOCK_FD]); goto err_free_pas; } psp_assoc_put(pas); return psp_nl_reply_send(rsp, info); err_free_pas: psp_assoc_put(pas); err_free_rsp: nlmsg_free(rsp); return err; } int psp_nl_tx_assoc_doit(struct sk_buff *skb, struct genl_info *info) { struct socket *socket = info->user_ptr[1]; struct psp_dev *psd = info->user_ptr[0]; struct psp_key_parsed key; struct sk_buff *rsp; unsigned int key_sz; u32 version; int err; if (GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_VERSION) || GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_TX_KEY)) return -EINVAL; version = nla_get_u32(info->attrs[PSP_A_ASSOC_VERSION]); if (!(psd->caps->versions & (1 << version))) { NL_SET_BAD_ATTR(info->extack, info->attrs[PSP_A_ASSOC_VERSION]); return -EOPNOTSUPP; } key_sz = psp_key_size(version); if (!key_sz) return -EINVAL; err = psp_nl_parse_key(info, PSP_A_ASSOC_TX_KEY, &key, key_sz); if (err < 0) return err; rsp = psp_nl_reply_new(info); if (!rsp) return -ENOMEM; err = psp_sock_assoc_set_tx(socket->sk, psd, version, &key, info->extack); if (err) goto err_free_msg; return psp_nl_reply_send(rsp, info); err_free_msg: nlmsg_free(rsp); return err; } static int psp_nl_stats_fill(struct psp_dev *psd, struct sk_buff *rsp, const struct genl_info *info) { unsigned int required_cnt = sizeof(struct psp_dev_stats) / sizeof(u64); struct psp_dev_stats stats; void *hdr; int i; memset(&stats, 0xff, sizeof(stats)); psd->ops->get_stats(psd, &stats); for (i = 0; i < required_cnt; i++) if (WARN_ON_ONCE(stats.required[i] == ETHTOOL_STAT_NOT_SET)) return -EOPNOTSUPP; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; if (nla_put_u32(rsp, PSP_A_STATS_DEV_ID, psd->id) || nla_put_uint(rsp, PSP_A_STATS_KEY_ROTATIONS, psd->stats.rotations) || nla_put_uint(rsp, PSP_A_STATS_STALE_EVENTS, psd->stats.stales) || nla_put_uint(rsp, PSP_A_STATS_RX_PACKETS, stats.rx_packets) || nla_put_uint(rsp, PSP_A_STATS_RX_BYTES, stats.rx_bytes) || nla_put_uint(rsp, PSP_A_STATS_RX_AUTH_FAIL, stats.rx_auth_fail) || nla_put_uint(rsp, PSP_A_STATS_RX_ERROR, stats.rx_error) || nla_put_uint(rsp, PSP_A_STATS_RX_BAD, stats.rx_bad) || nla_put_uint(rsp, PSP_A_STATS_TX_PACKETS, stats.tx_packets) || nla_put_uint(rsp, PSP_A_STATS_TX_BYTES, stats.tx_bytes) || nla_put_uint(rsp, PSP_A_STATS_TX_ERROR, stats.tx_error)) goto err_cancel_msg; genlmsg_end(rsp, hdr); return 0; err_cancel_msg: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } int psp_nl_get_stats_doit(struct sk_buff *skb, struct genl_info *info) { struct psp_dev *psd = info->user_ptr[0]; struct sk_buff *rsp; int err; rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; err = psp_nl_stats_fill(psd, rsp, info); if (err) goto err_free_msg; return genlmsg_reply(rsp, info); err_free_msg: nlmsg_free(rsp); return err; } static int psp_nl_stats_get_dumpit_one(struct sk_buff *rsp, struct netlink_callback *cb, struct psp_dev *psd) { if (psp_dev_check_access(psd, sock_net(rsp->sk))) return 0; return psp_nl_stats_fill(psd, rsp, genl_info_dump(cb)); } int psp_nl_get_stats_dumpit(struct sk_buff *rsp, struct netlink_callback *cb) { struct psp_dev *psd; int err = 0; mutex_lock(&psp_devs_lock); xa_for_each_start(&psp_devs, cb->args[0], psd, cb->args[0]) { mutex_lock(&psd->lock); err = psp_nl_stats_get_dumpit_one(rsp, cb, psd); mutex_unlock(&psd->lock); if (err) break; } mutex_unlock(&psp_devs_lock); return err; }
19 26 40 26 1 1 1 1 1 1 12 11 1 3 11 11 11 16 19 19 21 26 29 29 29 3 4 1 3 3 1 39 10 27 3 26 26 26 26 26 2 10 14 10 16 24 2 3 36 6 3 6 2 3 3 2 4 5 3 2 2 2 1 1 1 25 26 19 7 26 10 21 3 3 1 2 2 3 2 2 2 1 1 1 2 1 1 1 1 1 39 3 37 15 23 14 14 3 14 8 11 38 38 1 11 28 26 28 19 9 9 26 3 9 1 8 7 2 1 31 13 26 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright(c) 1999 - 2004 Intel Corporation. All rights reserved. */ #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/pkt_sched.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/timer.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/if_bonding.h> #include <linux/if_vlan.h> #include <linux/in.h> #include <net/arp.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <asm/byteorder.h> #include <net/bonding.h> #include <net/bond_alb.h> static const u8 mac_v6_allmcast[ETH_ALEN + 2] __long_aligned = { 0x33, 0x33, 0x00, 0x00, 0x00, 0x01 }; static const int alb_delta_in_ticks = HZ / ALB_TIMER_TICKS_PER_SEC; #pragma pack(1) struct learning_pkt { u8 mac_dst[ETH_ALEN]; u8 mac_src[ETH_ALEN]; __be16 type; u8 padding[ETH_ZLEN - ETH_HLEN]; }; struct arp_pkt { __be16 hw_addr_space; __be16 prot_addr_space; u8 hw_addr_len; u8 prot_addr_len; __be16 op_code; u8 mac_src[ETH_ALEN]; /* sender hardware address */ __be32 ip_src; /* sender IP address */ u8 mac_dst[ETH_ALEN]; /* target hardware address */ __be32 ip_dst; /* target IP address */ }; #pragma pack() /* Forward declaration */ static void alb_send_learning_packets(struct slave *slave, const u8 mac_addr[], bool strict_match); static void rlb_purge_src_ip(struct bonding *bond, struct arp_pkt *arp); static void rlb_src_unlink(struct bonding *bond, u32 index); static void rlb_src_link(struct bonding *bond, u32 ip_src_hash, u32 ip_dst_hash); static inline u8 _simple_hash(const u8 *hash_start, int hash_size) { int i; u8 hash = 0; for (i = 0; i < hash_size; i++) hash ^= hash_start[i]; return hash; } /*********************** tlb specific functions ***************************/ static inline void tlb_init_table_entry(struct tlb_client_info *entry, int save_load) { if (save_load) { entry->load_history = 1 + entry->tx_bytes / BOND_TLB_REBALANCE_INTERVAL; entry->tx_bytes = 0; } entry->tx_slave = NULL; entry->next = TLB_NULL_INDEX; entry->prev = TLB_NULL_INDEX; } static inline void tlb_init_slave(struct slave *slave) { SLAVE_TLB_INFO(slave).load = 0; SLAVE_TLB_INFO(slave).head = TLB_NULL_INDEX; } static void __tlb_clear_slave(struct bonding *bond, struct slave *slave, int save_load) { struct tlb_client_info *tx_hash_table; u32 index; /* clear slave from tx_hashtbl */ tx_hash_table = BOND_ALB_INFO(bond).tx_hashtbl; /* skip this if we've already freed the tx hash table */ if (tx_hash_table) { index = SLAVE_TLB_INFO(slave).head; while (index != TLB_NULL_INDEX) { u32 next_index = tx_hash_table[index].next; tlb_init_table_entry(&tx_hash_table[index], save_load); index = next_index; } } tlb_init_slave(slave); } static void tlb_clear_slave(struct bonding *bond, struct slave *slave, int save_load) { spin_lock_bh(&bond->mode_lock); __tlb_clear_slave(bond, slave, save_load); spin_unlock_bh(&bond->mode_lock); } /* Must be called before starting the monitor timer */ static int tlb_initialize(struct bonding *bond) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); int size = TLB_HASH_TABLE_SIZE * sizeof(struct tlb_client_info); struct tlb_client_info *new_hashtbl; int i; new_hashtbl = kzalloc(size, GFP_KERNEL); if (!new_hashtbl) return -ENOMEM; spin_lock_bh(&bond->mode_lock); bond_info->tx_hashtbl = new_hashtbl; for (i = 0; i < TLB_HASH_TABLE_SIZE; i++) tlb_init_table_entry(&bond_info->tx_hashtbl[i], 0); spin_unlock_bh(&bond->mode_lock); return 0; } /* Must be called only after all slaves have been released */ static void tlb_deinitialize(struct bonding *bond) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); spin_lock_bh(&bond->mode_lock); kfree(bond_info->tx_hashtbl); bond_info->tx_hashtbl = NULL; spin_unlock_bh(&bond->mode_lock); } static long long compute_gap(struct slave *slave) { return (s64) (slave->speed << 20) - /* Convert to Megabit per sec */ (s64) (SLAVE_TLB_INFO(slave).load << 3); /* Bytes to bits */ } static struct slave *tlb_get_least_loaded_slave(struct bonding *bond) { struct slave *slave, *least_loaded; struct list_head *iter; long long max_gap; least_loaded = NULL; max_gap = LLONG_MIN; /* Find the slave with the largest gap */ bond_for_each_slave_rcu(bond, slave, iter) { if (bond_slave_can_tx(slave)) { long long gap = compute_gap(slave); if (max_gap < gap) { least_loaded = slave; max_gap = gap; } } } return least_loaded; } static struct slave *__tlb_choose_channel(struct bonding *bond, u32 hash_index, u32 skb_len) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct tlb_client_info *hash_table; struct slave *assigned_slave; hash_table = bond_info->tx_hashtbl; assigned_slave = hash_table[hash_index].tx_slave; if (!assigned_slave) { assigned_slave = tlb_get_least_loaded_slave(bond); if (assigned_slave) { struct tlb_slave_info *slave_info = &(SLAVE_TLB_INFO(assigned_slave)); u32 next_index = slave_info->head; hash_table[hash_index].tx_slave = assigned_slave; hash_table[hash_index].next = next_index; hash_table[hash_index].prev = TLB_NULL_INDEX; if (next_index != TLB_NULL_INDEX) hash_table[next_index].prev = hash_index; slave_info->head = hash_index; slave_info->load += hash_table[hash_index].load_history; } } if (assigned_slave) hash_table[hash_index].tx_bytes += skb_len; return assigned_slave; } static struct slave *tlb_choose_channel(struct bonding *bond, u32 hash_index, u32 skb_len) { struct slave *tx_slave; /* We don't need to disable softirq here, because * tlb_choose_channel() is only called by bond_alb_xmit() * which already has softirq disabled. */ spin_lock(&bond->mode_lock); tx_slave = __tlb_choose_channel(bond, hash_index, skb_len); spin_unlock(&bond->mode_lock); return tx_slave; } /*********************** rlb specific functions ***************************/ /* when an ARP REPLY is received from a client update its info * in the rx_hashtbl */ static void rlb_update_entry_from_arp(struct bonding *bond, struct arp_pkt *arp) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct rlb_client_info *client_info; u32 hash_index; spin_lock_bh(&bond->mode_lock); hash_index = _simple_hash((u8 *)&(arp->ip_src), sizeof(arp->ip_src)); client_info = &(bond_info->rx_hashtbl[hash_index]); if ((client_info->assigned) && (client_info->ip_src == arp->ip_dst) && (client_info->ip_dst == arp->ip_src) && (!ether_addr_equal_64bits(client_info->mac_dst, arp->mac_src))) { /* update the clients MAC address */ ether_addr_copy(client_info->mac_dst, arp->mac_src); client_info->ntt = 1; bond_info->rx_ntt = 1; } spin_unlock_bh(&bond->mode_lock); } static int rlb_arp_recv(const struct sk_buff *skb, struct bonding *bond, struct slave *slave) { struct arp_pkt *arp, _arp; if (skb->protocol != cpu_to_be16(ETH_P_ARP)) goto out; arp = skb_header_pointer(skb, 0, sizeof(_arp), &_arp); if (!arp) goto out; /* We received an ARP from arp->ip_src. * We might have used this IP address previously (on the bonding host * itself or on a system that is bridged together with the bond). * However, if arp->mac_src is different than what is stored in * rx_hashtbl, some other host is now using the IP and we must prevent * sending out client updates with this IP address and the old MAC * address. * Clean up all hash table entries that have this address as ip_src but * have a different mac_src. */ rlb_purge_src_ip(bond, arp); if (arp->op_code == htons(ARPOP_REPLY)) { /* update rx hash table for this ARP */ rlb_update_entry_from_arp(bond, arp); slave_dbg(bond->dev, slave->dev, "Server received an ARP Reply from client\n"); } out: return RX_HANDLER_ANOTHER; } /* Caller must hold rcu_read_lock() */ static struct slave *__rlb_next_rx_slave(struct bonding *bond) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct slave *before = NULL, *rx_slave = NULL, *slave; struct list_head *iter; bool found = false; bond_for_each_slave_rcu(bond, slave, iter) { if (!bond_slave_can_tx(slave)) continue; if (!found) { if (!before || before->speed < slave->speed) before = slave; } else { if (!rx_slave || rx_slave->speed < slave->speed) rx_slave = slave; } if (slave == bond_info->rx_slave) found = true; } /* we didn't find anything after the current or we have something * better before and up to the current slave */ if (!rx_slave || (before && rx_slave->speed < before->speed)) rx_slave = before; if (rx_slave) bond_info->rx_slave = rx_slave; return rx_slave; } /* Caller must hold RTNL, rcu_read_lock is obtained only to silence checkers */ static struct slave *rlb_next_rx_slave(struct bonding *bond) { struct slave *rx_slave; ASSERT_RTNL(); rcu_read_lock(); rx_slave = __rlb_next_rx_slave(bond); rcu_read_unlock(); return rx_slave; } /* teach the switch the mac of a disabled slave * on the primary for fault tolerance * * Caller must hold RTNL */ static void rlb_teach_disabled_mac_on_primary(struct bonding *bond, const u8 addr[]) { struct slave *curr_active = rtnl_dereference(bond->curr_active_slave); if (!curr_active) return; if (!bond->alb_info.primary_is_promisc) { if (!dev_set_promiscuity(curr_active->dev, 1)) bond->alb_info.primary_is_promisc = 1; else bond->alb_info.primary_is_promisc = 0; } bond->alb_info.rlb_promisc_timeout_counter = 0; alb_send_learning_packets(curr_active, addr, true); } /* slave being removed should not be active at this point * * Caller must hold rtnl. */ static void rlb_clear_slave(struct bonding *bond, struct slave *slave) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct rlb_client_info *rx_hash_table; u32 index, next_index; /* clear slave from rx_hashtbl */ spin_lock_bh(&bond->mode_lock); rx_hash_table = bond_info->rx_hashtbl; index = bond_info->rx_hashtbl_used_head; for (; index != RLB_NULL_INDEX; index = next_index) { next_index = rx_hash_table[index].used_next; if (rx_hash_table[index].slave == slave) { struct slave *assigned_slave = rlb_next_rx_slave(bond); if (assigned_slave) { rx_hash_table[index].slave = assigned_slave; if (is_valid_ether_addr(rx_hash_table[index].mac_dst)) { bond_info->rx_hashtbl[index].ntt = 1; bond_info->rx_ntt = 1; /* A slave has been removed from the * table because it is either disabled * or being released. We must retry the * update to avoid clients from not * being updated & disconnecting when * there is stress */ bond_info->rlb_update_retry_counter = RLB_UPDATE_RETRY; } } else { /* there is no active slave */ rx_hash_table[index].slave = NULL; } } } spin_unlock_bh(&bond->mode_lock); if (slave != rtnl_dereference(bond->curr_active_slave)) rlb_teach_disabled_mac_on_primary(bond, slave->dev->dev_addr); } static void rlb_update_client(struct rlb_client_info *client_info) { int i; if (!client_info->slave || !is_valid_ether_addr(client_info->mac_dst)) return; for (i = 0; i < RLB_ARP_BURST_SIZE; i++) { struct sk_buff *skb; skb = arp_create(ARPOP_REPLY, ETH_P_ARP, client_info->ip_dst, client_info->slave->dev, client_info->ip_src, client_info->mac_dst, client_info->slave->dev->dev_addr, client_info->mac_dst); if (!skb) { slave_err(client_info->slave->bond->dev, client_info->slave->dev, "failed to create an ARP packet\n"); continue; } skb->dev = client_info->slave->dev; if (client_info->vlan_id) { __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), client_info->vlan_id); } arp_xmit(skb); } } /* sends ARP REPLIES that update the clients that need updating */ static void rlb_update_rx_clients(struct bonding *bond) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct rlb_client_info *client_info; u32 hash_index; spin_lock_bh(&bond->mode_lock); hash_index = bond_info->rx_hashtbl_used_head; for (; hash_index != RLB_NULL_INDEX; hash_index = client_info->used_next) { client_info = &(bond_info->rx_hashtbl[hash_index]); if (client_info->ntt) { rlb_update_client(client_info); if (bond_info->rlb_update_retry_counter == 0) client_info->ntt = 0; } } /* do not update the entries again until this counter is zero so that * not to confuse the clients. */ bond_info->rlb_update_delay_counter = RLB_UPDATE_DELAY; spin_unlock_bh(&bond->mode_lock); } /* The slave was assigned a new mac address - update the clients */ static void rlb_req_update_slave_clients(struct bonding *bond, struct slave *slave) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct rlb_client_info *client_info; int ntt = 0; u32 hash_index; spin_lock_bh(&bond->mode_lock); hash_index = bond_info->rx_hashtbl_used_head; for (; hash_index != RLB_NULL_INDEX; hash_index = client_info->used_next) { client_info = &(bond_info->rx_hashtbl[hash_index]); if ((client_info->slave == slave) && is_valid_ether_addr(client_info->mac_dst)) { client_info->ntt = 1; ntt = 1; } } /* update the team's flag only after the whole iteration */ if (ntt) { bond_info->rx_ntt = 1; /* fasten the change */ bond_info->rlb_update_retry_counter = RLB_UPDATE_RETRY; } spin_unlock_bh(&bond->mode_lock); } /* mark all clients using src_ip to be updated */ static void rlb_req_update_subnet_clients(struct bonding *bond, __be32 src_ip) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct rlb_client_info *client_info; u32 hash_index; spin_lock(&bond->mode_lock); hash_index = bond_info->rx_hashtbl_used_head; for (; hash_index != RLB_NULL_INDEX; hash_index = client_info->used_next) { client_info = &(bond_info->rx_hashtbl[hash_index]); if (!client_info->slave) { netdev_err(bond->dev, "found a client with no channel in the client's hash table\n"); continue; } /* update all clients using this src_ip, that are not assigned * to the team's address (curr_active_slave) and have a known * unicast mac address. */ if ((client_info->ip_src == src_ip) && !ether_addr_equal_64bits(client_info->slave->dev->dev_addr, bond->dev->dev_addr) && is_valid_ether_addr(client_info->mac_dst)) { client_info->ntt = 1; bond_info->rx_ntt = 1; } } spin_unlock(&bond->mode_lock); } static struct slave *rlb_choose_channel(struct sk_buff *skb, struct bonding *bond, const struct arp_pkt *arp) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct slave *assigned_slave, *curr_active_slave; struct rlb_client_info *client_info; u32 hash_index = 0; spin_lock(&bond->mode_lock); curr_active_slave = rcu_dereference(bond->curr_active_slave); hash_index = _simple_hash((u8 *)&arp->ip_dst, sizeof(arp->ip_dst)); client_info = &(bond_info->rx_hashtbl[hash_index]); if (client_info->assigned) { if ((client_info->ip_src == arp->ip_src) && (client_info->ip_dst == arp->ip_dst)) { /* the entry is already assigned to this client */ if (!is_broadcast_ether_addr(arp->mac_dst)) { /* update mac address from arp */ ether_addr_copy(client_info->mac_dst, arp->mac_dst); } ether_addr_copy(client_info->mac_src, arp->mac_src); assigned_slave = client_info->slave; if (assigned_slave) { spin_unlock(&bond->mode_lock); return assigned_slave; } } else { /* the entry is already assigned to some other client, * move the old client to primary (curr_active_slave) so * that the new client can be assigned to this entry. */ if (curr_active_slave && client_info->slave != curr_active_slave) { client_info->slave = curr_active_slave; rlb_update_client(client_info); } } } /* assign a new slave */ assigned_slave = __rlb_next_rx_slave(bond); if (assigned_slave) { if (!(client_info->assigned && client_info->ip_src == arp->ip_src)) { /* ip_src is going to be updated, * fix the src hash list */ u32 hash_src = _simple_hash((u8 *)&arp->ip_src, sizeof(arp->ip_src)); rlb_src_unlink(bond, hash_index); rlb_src_link(bond, hash_src, hash_index); } client_info->ip_src = arp->ip_src; client_info->ip_dst = arp->ip_dst; /* arp->mac_dst is broadcast for arp requests. * will be updated with clients actual unicast mac address * upon receiving an arp reply. */ ether_addr_copy(client_info->mac_dst, arp->mac_dst); ether_addr_copy(client_info->mac_src, arp->mac_src); client_info->slave = assigned_slave; if (is_valid_ether_addr(client_info->mac_dst)) { client_info->ntt = 1; bond->alb_info.rx_ntt = 1; } else { client_info->ntt = 0; } if (vlan_get_tag(skb, &client_info->vlan_id)) client_info->vlan_id = 0; if (!client_info->assigned) { u32 prev_tbl_head = bond_info->rx_hashtbl_used_head; bond_info->rx_hashtbl_used_head = hash_index; client_info->used_next = prev_tbl_head; if (prev_tbl_head != RLB_NULL_INDEX) { bond_info->rx_hashtbl[prev_tbl_head].used_prev = hash_index; } client_info->assigned = 1; } } spin_unlock(&bond->mode_lock); return assigned_slave; } /* chooses (and returns) transmit channel for arp reply * does not choose channel for other arp types since they are * sent on the curr_active_slave */ static struct slave *rlb_arp_xmit(struct sk_buff *skb, struct bonding *bond) { struct slave *tx_slave = NULL; struct net_device *dev; struct arp_pkt *arp; if (!pskb_network_may_pull(skb, sizeof(*arp))) return NULL; arp = (struct arp_pkt *)skb_network_header(skb); /* Don't modify or load balance ARPs that do not originate * from the bond itself or a VLAN directly above the bond. */ if (!bond_slave_has_mac_rcu(bond, arp->mac_src)) return NULL; dev = ip_dev_find(dev_net(bond->dev), arp->ip_src); if (dev) { if (netif_is_any_bridge_master(dev)) { dev_put(dev); return NULL; } dev_put(dev); } if (arp->op_code == htons(ARPOP_REPLY)) { /* the arp must be sent on the selected rx channel */ tx_slave = rlb_choose_channel(skb, bond, arp); if (tx_slave) bond_hw_addr_copy(arp->mac_src, tx_slave->dev->dev_addr, tx_slave->dev->addr_len); netdev_dbg(bond->dev, "(slave %s): Server sent ARP Reply packet\n", tx_slave ? tx_slave->dev->name : "NULL"); } else if (arp->op_code == htons(ARPOP_REQUEST)) { /* Create an entry in the rx_hashtbl for this client as a * place holder. * When the arp reply is received the entry will be updated * with the correct unicast address of the client. */ tx_slave = rlb_choose_channel(skb, bond, arp); /* The ARP reply packets must be delayed so that * they can cancel out the influence of the ARP request. */ bond->alb_info.rlb_update_delay_counter = RLB_UPDATE_DELAY; /* arp requests are broadcast and are sent on the primary * the arp request will collapse all clients on the subnet to * the primary slave. We must register these clients to be * updated with their assigned mac. */ rlb_req_update_subnet_clients(bond, arp->ip_src); netdev_dbg(bond->dev, "(slave %s): Server sent ARP Request packet\n", tx_slave ? tx_slave->dev->name : "NULL"); } return tx_slave; } static void rlb_rebalance(struct bonding *bond) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct slave *assigned_slave; struct rlb_client_info *client_info; int ntt; u32 hash_index; spin_lock_bh(&bond->mode_lock); ntt = 0; hash_index = bond_info->rx_hashtbl_used_head; for (; hash_index != RLB_NULL_INDEX; hash_index = client_info->used_next) { client_info = &(bond_info->rx_hashtbl[hash_index]); assigned_slave = __rlb_next_rx_slave(bond); if (assigned_slave && (client_info->slave != assigned_slave)) { client_info->slave = assigned_slave; if (!is_zero_ether_addr(client_info->mac_dst)) { client_info->ntt = 1; ntt = 1; } } } /* update the team's flag only after the whole iteration */ if (ntt) bond_info->rx_ntt = 1; spin_unlock_bh(&bond->mode_lock); } /* Caller must hold mode_lock */ static void rlb_init_table_entry_dst(struct rlb_client_info *entry) { entry->used_next = RLB_NULL_INDEX; entry->used_prev = RLB_NULL_INDEX; entry->assigned = 0; entry->slave = NULL; entry->vlan_id = 0; } static void rlb_init_table_entry_src(struct rlb_client_info *entry) { entry->src_first = RLB_NULL_INDEX; entry->src_prev = RLB_NULL_INDEX; entry->src_next = RLB_NULL_INDEX; } static void rlb_init_table_entry(struct rlb_client_info *entry) { memset(entry, 0, sizeof(struct rlb_client_info)); rlb_init_table_entry_dst(entry); rlb_init_table_entry_src(entry); } static void rlb_delete_table_entry_dst(struct bonding *bond, u32 index) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); u32 next_index = bond_info->rx_hashtbl[index].used_next; u32 prev_index = bond_info->rx_hashtbl[index].used_prev; if (index == bond_info->rx_hashtbl_used_head) bond_info->rx_hashtbl_used_head = next_index; if (prev_index != RLB_NULL_INDEX) bond_info->rx_hashtbl[prev_index].used_next = next_index; if (next_index != RLB_NULL_INDEX) bond_info->rx_hashtbl[next_index].used_prev = prev_index; } /* unlink a rlb hash table entry from the src list */ static void rlb_src_unlink(struct bonding *bond, u32 index) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); u32 next_index = bond_info->rx_hashtbl[index].src_next; u32 prev_index = bond_info->rx_hashtbl[index].src_prev; bond_info->rx_hashtbl[index].src_next = RLB_NULL_INDEX; bond_info->rx_hashtbl[index].src_prev = RLB_NULL_INDEX; if (next_index != RLB_NULL_INDEX) bond_info->rx_hashtbl[next_index].src_prev = prev_index; if (prev_index == RLB_NULL_INDEX) return; /* is prev_index pointing to the head of this list? */ if (bond_info->rx_hashtbl[prev_index].src_first == index) bond_info->rx_hashtbl[prev_index].src_first = next_index; else bond_info->rx_hashtbl[prev_index].src_next = next_index; } static void rlb_delete_table_entry(struct bonding *bond, u32 index) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct rlb_client_info *entry = &(bond_info->rx_hashtbl[index]); rlb_delete_table_entry_dst(bond, index); rlb_init_table_entry_dst(entry); rlb_src_unlink(bond, index); } /* add the rx_hashtbl[ip_dst_hash] entry to the list * of entries with identical ip_src_hash */ static void rlb_src_link(struct bonding *bond, u32 ip_src_hash, u32 ip_dst_hash) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); u32 next; bond_info->rx_hashtbl[ip_dst_hash].src_prev = ip_src_hash; next = bond_info->rx_hashtbl[ip_src_hash].src_first; bond_info->rx_hashtbl[ip_dst_hash].src_next = next; if (next != RLB_NULL_INDEX) bond_info->rx_hashtbl[next].src_prev = ip_dst_hash; bond_info->rx_hashtbl[ip_src_hash].src_first = ip_dst_hash; } /* deletes all rx_hashtbl entries with arp->ip_src if their mac_src does * not match arp->mac_src */ static void rlb_purge_src_ip(struct bonding *bond, struct arp_pkt *arp) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); u32 ip_src_hash = _simple_hash((u8 *)&(arp->ip_src), sizeof(arp->ip_src)); u32 index; spin_lock_bh(&bond->mode_lock); index = bond_info->rx_hashtbl[ip_src_hash].src_first; while (index != RLB_NULL_INDEX) { struct rlb_client_info *entry = &(bond_info->rx_hashtbl[index]); u32 next_index = entry->src_next; if (entry->ip_src == arp->ip_src && !ether_addr_equal_64bits(arp->mac_src, entry->mac_src)) rlb_delete_table_entry(bond, index); index = next_index; } spin_unlock_bh(&bond->mode_lock); } static int rlb_initialize(struct bonding *bond) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct rlb_client_info *new_hashtbl; int size = RLB_HASH_TABLE_SIZE * sizeof(struct rlb_client_info); int i; new_hashtbl = kmalloc(size, GFP_KERNEL); if (!new_hashtbl) return -1; spin_lock_bh(&bond->mode_lock); bond_info->rx_hashtbl = new_hashtbl; bond_info->rx_hashtbl_used_head = RLB_NULL_INDEX; for (i = 0; i < RLB_HASH_TABLE_SIZE; i++) rlb_init_table_entry(bond_info->rx_hashtbl + i); spin_unlock_bh(&bond->mode_lock); /* register to receive ARPs */ bond->recv_probe = rlb_arp_recv; return 0; } static void rlb_deinitialize(struct bonding *bond) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); spin_lock_bh(&bond->mode_lock); kfree(bond_info->rx_hashtbl); bond_info->rx_hashtbl = NULL; bond_info->rx_hashtbl_used_head = RLB_NULL_INDEX; spin_unlock_bh(&bond->mode_lock); } static void rlb_clear_vlan(struct bonding *bond, unsigned short vlan_id) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); u32 curr_index; spin_lock_bh(&bond->mode_lock); curr_index = bond_info->rx_hashtbl_used_head; while (curr_index != RLB_NULL_INDEX) { struct rlb_client_info *curr = &(bond_info->rx_hashtbl[curr_index]); u32 next_index = bond_info->rx_hashtbl[curr_index].used_next; if (curr->vlan_id == vlan_id) rlb_delete_table_entry(bond, curr_index); curr_index = next_index; } spin_unlock_bh(&bond->mode_lock); } /*********************** tlb/rlb shared functions *********************/ static void alb_send_lp_vid(struct slave *slave, const u8 mac_addr[], __be16 vlan_proto, u16 vid) { struct learning_pkt pkt; struct sk_buff *skb; int size = sizeof(struct learning_pkt); memset(&pkt, 0, size); ether_addr_copy(pkt.mac_dst, mac_addr); ether_addr_copy(pkt.mac_src, mac_addr); pkt.type = cpu_to_be16(ETH_P_LOOPBACK); skb = dev_alloc_skb(size); if (!skb) return; skb_put_data(skb, &pkt, size); skb_reset_mac_header(skb); skb->network_header = skb->mac_header + ETH_HLEN; skb->protocol = pkt.type; skb->priority = TC_PRIO_CONTROL; skb->dev = slave->dev; slave_dbg(slave->bond->dev, slave->dev, "Send learning packet: mac %pM vlan %d\n", mac_addr, vid); if (vid) __vlan_hwaccel_put_tag(skb, vlan_proto, vid); dev_queue_xmit(skb); } struct alb_walk_data { struct bonding *bond; struct slave *slave; const u8 *mac_addr; bool strict_match; }; static int alb_upper_dev_walk(struct net_device *upper, struct netdev_nested_priv *priv) { struct alb_walk_data *data = (struct alb_walk_data *)priv->data; bool strict_match = data->strict_match; const u8 *mac_addr = data->mac_addr; struct bonding *bond = data->bond; struct slave *slave = data->slave; struct bond_vlan_tag *tags; if (is_vlan_dev(upper) && bond->dev->lower_level == upper->lower_level - 1) { if (upper->addr_assign_type == NET_ADDR_STOLEN) { alb_send_lp_vid(slave, mac_addr, vlan_dev_vlan_proto(upper), vlan_dev_vlan_id(upper)); } else { alb_send_lp_vid(slave, upper->dev_addr, vlan_dev_vlan_proto(upper), vlan_dev_vlan_id(upper)); } } /* If this is a macvlan device, then only send updates * when strict_match is turned off. */ if (netif_is_macvlan(upper) && !strict_match) { tags = bond_verify_device_path(bond->dev, upper, 0); if (IS_ERR_OR_NULL(tags)) return -ENOMEM; alb_send_lp_vid(slave, upper->dev_addr, tags[0].vlan_proto, tags[0].vlan_id); kfree(tags); } return 0; } static void alb_send_learning_packets(struct slave *slave, const u8 mac_addr[], bool strict_match) { struct bonding *bond = bond_get_bond_by_slave(slave); struct netdev_nested_priv priv; struct alb_walk_data data = { .strict_match = strict_match, .mac_addr = mac_addr, .slave = slave, .bond = bond, }; priv.data = (void *)&data; /* send untagged */ alb_send_lp_vid(slave, mac_addr, 0, 0); /* loop through all devices and see if we need to send a packet * for that device. */ rcu_read_lock(); netdev_walk_all_upper_dev_rcu(bond->dev, alb_upper_dev_walk, &priv); rcu_read_unlock(); } static int alb_set_slave_mac_addr(struct slave *slave, const u8 addr[], unsigned int len) { struct net_device *dev = slave->dev; struct sockaddr_storage ss; if (BOND_MODE(slave->bond) == BOND_MODE_TLB) { __dev_addr_set(dev, addr, len); return 0; } /* for rlb each slave must have a unique hw mac addresses so that * each slave will receive packets destined to a different mac */ memcpy(ss.__data, addr, len); ss.ss_family = dev->type; if (dev_set_mac_address(dev, &ss, NULL)) { slave_err(slave->bond->dev, dev, "dev_set_mac_address on slave failed! ALB mode requires that the base driver support setting the hw address also when the network device's interface is open\n"); return -EOPNOTSUPP; } return 0; } /* Swap MAC addresses between two slaves. * * Called with RTNL held, and no other locks. */ static void alb_swap_mac_addr(struct slave *slave1, struct slave *slave2) { u8 tmp_mac_addr[MAX_ADDR_LEN]; bond_hw_addr_copy(tmp_mac_addr, slave1->dev->dev_addr, slave1->dev->addr_len); alb_set_slave_mac_addr(slave1, slave2->dev->dev_addr, slave2->dev->addr_len); alb_set_slave_mac_addr(slave2, tmp_mac_addr, slave1->dev->addr_len); } /* Send learning packets after MAC address swap. * * Called with RTNL and no other locks */ static void alb_fasten_mac_swap(struct bonding *bond, struct slave *slave1, struct slave *slave2) { int slaves_state_differ = (bond_slave_can_tx(slave1) != bond_slave_can_tx(slave2)); struct slave *disabled_slave = NULL; ASSERT_RTNL(); /* fasten the change in the switch */ if (bond_slave_can_tx(slave1)) { alb_send_learning_packets(slave1, slave1->dev->dev_addr, false); if (bond->alb_info.rlb_enabled) { /* inform the clients that the mac address * has changed */ rlb_req_update_slave_clients(bond, slave1); } } else { disabled_slave = slave1; } if (bond_slave_can_tx(slave2)) { alb_send_learning_packets(slave2, slave2->dev->dev_addr, false); if (bond->alb_info.rlb_enabled) { /* inform the clients that the mac address * has changed */ rlb_req_update_slave_clients(bond, slave2); } } else { disabled_slave = slave2; } if (bond->alb_info.rlb_enabled && slaves_state_differ) { /* A disabled slave was assigned an active mac addr */ rlb_teach_disabled_mac_on_primary(bond, disabled_slave->dev->dev_addr); } } /** * alb_change_hw_addr_on_detach * @bond: bonding we're working on * @slave: the slave that was just detached * * We assume that @slave was already detached from the slave list. * * If @slave's permanent hw address is different both from its current * address and from @bond's address, then somewhere in the bond there's * a slave that has @slave's permanet address as its current address. * We'll make sure that slave no longer uses @slave's permanent address. * * Caller must hold RTNL and no other locks */ static void alb_change_hw_addr_on_detach(struct bonding *bond, struct slave *slave) { int perm_curr_diff; int perm_bond_diff; struct slave *found_slave; perm_curr_diff = !ether_addr_equal_64bits(slave->perm_hwaddr, slave->dev->dev_addr); perm_bond_diff = !ether_addr_equal_64bits(slave->perm_hwaddr, bond->dev->dev_addr); if (perm_curr_diff && perm_bond_diff) { found_slave = bond_slave_has_mac(bond, slave->perm_hwaddr); if (found_slave) { alb_swap_mac_addr(slave, found_slave); alb_fasten_mac_swap(bond, slave, found_slave); } } } /** * alb_handle_addr_collision_on_attach * @bond: bonding we're working on * @slave: the slave that was just attached * * checks uniqueness of slave's mac address and handles the case the * new slave uses the bonds mac address. * * If the permanent hw address of @slave is @bond's hw address, we need to * find a different hw address to give @slave, that isn't in use by any other * slave in the bond. This address must be, of course, one of the permanent * addresses of the other slaves. * * We go over the slave list, and for each slave there we compare its * permanent hw address with the current address of all the other slaves. * If no match was found, then we've found a slave with a permanent address * that isn't used by any other slave in the bond, so we can assign it to * @slave. * * assumption: this function is called before @slave is attached to the * bond slave list. */ static int alb_handle_addr_collision_on_attach(struct bonding *bond, struct slave *slave) { struct slave *has_bond_addr = rcu_access_pointer(bond->curr_active_slave); struct slave *tmp_slave1, *free_mac_slave = NULL; struct list_head *iter; if (!bond_has_slaves(bond)) { /* this is the first slave */ return 0; } /* if slave's mac address differs from bond's mac address * check uniqueness of slave's mac address against the other * slaves in the bond. */ if (!ether_addr_equal_64bits(slave->perm_hwaddr, bond->dev->dev_addr)) { if (!bond_slave_has_mac(bond, slave->dev->dev_addr)) return 0; /* Try setting slave mac to bond address and fall-through * to code handling that situation below... */ alb_set_slave_mac_addr(slave, bond->dev->dev_addr, bond->dev->addr_len); } /* The slave's address is equal to the address of the bond. * Search for a spare address in the bond for this slave. */ bond_for_each_slave(bond, tmp_slave1, iter) { if (!bond_slave_has_mac(bond, tmp_slave1->perm_hwaddr)) { /* no slave has tmp_slave1's perm addr * as its curr addr */ free_mac_slave = tmp_slave1; break; } if (!has_bond_addr) { if (ether_addr_equal_64bits(tmp_slave1->dev->dev_addr, bond->dev->dev_addr)) { has_bond_addr = tmp_slave1; } } } if (free_mac_slave) { alb_set_slave_mac_addr(slave, free_mac_slave->perm_hwaddr, free_mac_slave->dev->addr_len); slave_warn(bond->dev, slave->dev, "the slave hw address is in use by the bond; giving it the hw address of %s\n", free_mac_slave->dev->name); } else if (has_bond_addr) { slave_err(bond->dev, slave->dev, "the slave hw address is in use by the bond; couldn't find a slave with a free hw address to give it (this should not have happened)\n"); return -EFAULT; } return 0; } /** * alb_set_mac_address * @bond: bonding we're working on * @addr: MAC address to set * * In TLB mode all slaves are configured to the bond's hw address, but set * their dev_addr field to different addresses (based on their permanent hw * addresses). * * For each slave, this function sets the interface to the new address and then * changes its dev_addr field to its previous value. * * Unwinding assumes bond's mac address has not yet changed. */ static int alb_set_mac_address(struct bonding *bond, void *addr) { struct slave *slave, *rollback_slave; struct list_head *iter; struct sockaddr_storage ss; char tmp_addr[MAX_ADDR_LEN]; int res; if (bond->alb_info.rlb_enabled) return 0; bond_for_each_slave(bond, slave, iter) { /* save net_device's current hw address */ bond_hw_addr_copy(tmp_addr, slave->dev->dev_addr, slave->dev->addr_len); res = dev_set_mac_address(slave->dev, addr, NULL); /* restore net_device's hw address */ dev_addr_set(slave->dev, tmp_addr); if (res) goto unwind; } return 0; unwind: memcpy(ss.__data, bond->dev->dev_addr, bond->dev->addr_len); ss.ss_family = bond->dev->type; /* unwind from head to the slave that failed */ bond_for_each_slave(bond, rollback_slave, iter) { if (rollback_slave == slave) break; bond_hw_addr_copy(tmp_addr, rollback_slave->dev->dev_addr, rollback_slave->dev->addr_len); dev_set_mac_address(rollback_slave->dev, &ss, NULL); dev_addr_set(rollback_slave->dev, tmp_addr); } return res; } /* determine if the packet is NA or NS */ static bool alb_determine_nd(struct sk_buff *skb, struct bonding *bond) { struct ipv6hdr *ip6hdr; struct icmp6hdr *hdr; if (!pskb_network_may_pull(skb, sizeof(*ip6hdr))) return true; ip6hdr = ipv6_hdr(skb); if (ip6hdr->nexthdr != IPPROTO_ICMPV6) return false; if (!pskb_network_may_pull(skb, sizeof(*ip6hdr) + sizeof(*hdr))) return true; hdr = icmp6_hdr(skb); return hdr->icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT || hdr->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION; } /************************ exported alb functions ************************/ int bond_alb_initialize(struct bonding *bond, int rlb_enabled) { int res; res = tlb_initialize(bond); if (res) return res; if (rlb_enabled) { res = rlb_initialize(bond); if (res) { tlb_deinitialize(bond); return res; } bond->alb_info.rlb_enabled = 1; } else { bond->alb_info.rlb_enabled = 0; } return 0; } void bond_alb_deinitialize(struct bonding *bond) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); tlb_deinitialize(bond); if (bond_info->rlb_enabled) rlb_deinitialize(bond); } static netdev_tx_t bond_do_alb_xmit(struct sk_buff *skb, struct bonding *bond, struct slave *tx_slave) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct ethhdr *eth_data = eth_hdr(skb); if (!tx_slave) { /* unbalanced or unassigned, send through primary */ tx_slave = rcu_dereference(bond->curr_active_slave); if (bond->params.tlb_dynamic_lb) bond_info->unbalanced_load += skb->len; } if (tx_slave && bond_slave_can_tx(tx_slave)) { if (tx_slave != rcu_access_pointer(bond->curr_active_slave)) { ether_addr_copy(eth_data->h_source, tx_slave->dev->dev_addr); } return bond_dev_queue_xmit(bond, skb, tx_slave->dev); } if (tx_slave && bond->params.tlb_dynamic_lb) { spin_lock(&bond->mode_lock); __tlb_clear_slave(bond, tx_slave, 0); spin_unlock(&bond->mode_lock); } /* no suitable interface, frame not sent */ return bond_tx_drop(bond->dev, skb); } struct slave *bond_xmit_tlb_slave_get(struct bonding *bond, struct sk_buff *skb) { struct slave *tx_slave = NULL; struct ethhdr *eth_data; u32 hash_index; skb_reset_mac_header(skb); eth_data = eth_hdr(skb); /* Do not TX balance any multicast or broadcast */ if (!is_multicast_ether_addr(eth_data->h_dest)) { switch (skb->protocol) { case htons(ETH_P_IPV6): if (alb_determine_nd(skb, bond)) break; fallthrough; case htons(ETH_P_IP): hash_index = bond_xmit_hash(bond, skb); if (bond->params.tlb_dynamic_lb) { tx_slave = tlb_choose_channel(bond, hash_index & 0xFF, skb->len); } else { struct bond_up_slave *slaves; unsigned int count; slaves = rcu_dereference(bond->usable_slaves); count = slaves ? READ_ONCE(slaves->count) : 0; if (likely(count)) tx_slave = slaves->arr[hash_index % count]; } break; } } return tx_slave; } netdev_tx_t bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct slave *tx_slave; tx_slave = bond_xmit_tlb_slave_get(bond, skb); return bond_do_alb_xmit(skb, bond, tx_slave); } struct slave *bond_xmit_alb_slave_get(struct bonding *bond, struct sk_buff *skb) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); static const __be32 ip_bcast = htonl(0xffffffff); struct slave *tx_slave = NULL; const u8 *hash_start = NULL; bool do_tx_balance = true; struct ethhdr *eth_data; u32 hash_index = 0; int hash_size = 0; skb_reset_mac_header(skb); eth_data = eth_hdr(skb); switch (ntohs(skb->protocol)) { case ETH_P_IP: { const struct iphdr *iph; if (is_broadcast_ether_addr(eth_data->h_dest) || !pskb_network_may_pull(skb, sizeof(*iph))) { do_tx_balance = false; break; } iph = ip_hdr(skb); if (iph->daddr == ip_bcast || iph->protocol == IPPROTO_IGMP) { do_tx_balance = false; break; } hash_start = (char *)&(iph->daddr); hash_size = sizeof(iph->daddr); break; } case ETH_P_IPV6: { const struct ipv6hdr *ip6hdr; /* IPv6 doesn't really use broadcast mac address, but leave * that here just in case. */ if (is_broadcast_ether_addr(eth_data->h_dest)) { do_tx_balance = false; break; } /* IPv6 uses all-nodes multicast as an equivalent to * broadcasts in IPv4. */ if (ether_addr_equal_64bits(eth_data->h_dest, mac_v6_allmcast)) { do_tx_balance = false; break; } if (alb_determine_nd(skb, bond)) { do_tx_balance = false; break; } /* The IPv6 header is pulled by alb_determine_nd */ /* Additionally, DAD probes should not be tx-balanced as that * will lead to false positives for duplicate addresses and * prevent address configuration from working. */ ip6hdr = ipv6_hdr(skb); if (ipv6_addr_any(&ip6hdr->saddr)) { do_tx_balance = false; break; } hash_start = (char *)&ip6hdr->daddr; hash_size = sizeof(ip6hdr->daddr); break; } case ETH_P_ARP: do_tx_balance = false; if (bond_info->rlb_enabled) tx_slave = rlb_arp_xmit(skb, bond); break; default: do_tx_balance = false; break; } if (do_tx_balance) { if (bond->params.tlb_dynamic_lb) { hash_index = _simple_hash(hash_start, hash_size); tx_slave = tlb_choose_channel(bond, hash_index, skb->len); } else { /* * do_tx_balance means we are free to select the tx_slave * So we do exactly what tlb would do for hash selection */ struct bond_up_slave *slaves; unsigned int count; slaves = rcu_dereference(bond->usable_slaves); count = slaves ? READ_ONCE(slaves->count) : 0; if (likely(count)) tx_slave = slaves->arr[bond_xmit_hash(bond, skb) % count]; } } return tx_slave; } netdev_tx_t bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct slave *tx_slave = NULL; tx_slave = bond_xmit_alb_slave_get(bond, skb); return bond_do_alb_xmit(skb, bond, tx_slave); } void bond_alb_monitor(struct work_struct *work) { struct bonding *bond = container_of(work, struct bonding, alb_work.work); struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); struct list_head *iter; struct slave *slave; if (!bond_has_slaves(bond)) { atomic_set(&bond_info->tx_rebalance_counter, 0); bond_info->lp_counter = 0; goto re_arm; } rcu_read_lock(); atomic_inc(&bond_info->tx_rebalance_counter); bond_info->lp_counter++; /* send learning packets */ if (bond_info->lp_counter >= BOND_ALB_LP_TICKS(bond)) { bool strict_match; bond_for_each_slave_rcu(bond, slave, iter) { /* If updating current_active, use all currently * user mac addresses (!strict_match). Otherwise, only * use mac of the slave device. * In RLB mode, we always use strict matches. */ strict_match = (slave != rcu_access_pointer(bond->curr_active_slave) || bond_info->rlb_enabled); alb_send_learning_packets(slave, slave->dev->dev_addr, strict_match); } bond_info->lp_counter = 0; } /* rebalance tx traffic */ if (atomic_read(&bond_info->tx_rebalance_counter) >= BOND_TLB_REBALANCE_TICKS) { bond_for_each_slave_rcu(bond, slave, iter) { tlb_clear_slave(bond, slave, 1); if (slave == rcu_access_pointer(bond->curr_active_slave)) { SLAVE_TLB_INFO(slave).load = bond_info->unbalanced_load / BOND_TLB_REBALANCE_INTERVAL; bond_info->unbalanced_load = 0; } } atomic_set(&bond_info->tx_rebalance_counter, 0); } if (bond_info->rlb_enabled) { if (bond_info->primary_is_promisc && (++bond_info->rlb_promisc_timeout_counter >= RLB_PROMISC_TIMEOUT)) { /* dev_set_promiscuity requires rtnl and * nothing else. Avoid race with bond_close. */ rcu_read_unlock(); if (!rtnl_trylock()) goto re_arm; bond_info->rlb_promisc_timeout_counter = 0; /* If the primary was set to promiscuous mode * because a slave was disabled then * it can now leave promiscuous mode. */ dev_set_promiscuity(rtnl_dereference(bond->curr_active_slave)->dev, -1); bond_info->primary_is_promisc = 0; rtnl_unlock(); rcu_read_lock(); } if (bond_info->rlb_rebalance) { bond_info->rlb_rebalance = 0; rlb_rebalance(bond); } /* check if clients need updating */ if (bond_info->rx_ntt) { if (bond_info->rlb_update_delay_counter) { --bond_info->rlb_update_delay_counter; } else { rlb_update_rx_clients(bond); if (bond_info->rlb_update_retry_counter) --bond_info->rlb_update_retry_counter; else bond_info->rx_ntt = 0; } } } rcu_read_unlock(); re_arm: queue_delayed_work(bond->wq, &bond->alb_work, alb_delta_in_ticks); } /* assumption: called before the slave is attached to the bond * and not locked by the bond lock */ int bond_alb_init_slave(struct bonding *bond, struct slave *slave) { int res; res = alb_set_slave_mac_addr(slave, slave->perm_hwaddr, slave->dev->addr_len); if (res) return res; res = alb_handle_addr_collision_on_attach(bond, slave); if (res) return res; tlb_init_slave(slave); /* order a rebalance ASAP */ atomic_set(&bond->alb_info.tx_rebalance_counter, BOND_TLB_REBALANCE_TICKS); if (bond->alb_info.rlb_enabled) bond->alb_info.rlb_rebalance = 1; return 0; } /* Remove slave from tlb and rlb hash tables, and fix up MAC addresses * if necessary. * * Caller must hold RTNL and no other locks */ void bond_alb_deinit_slave(struct bonding *bond, struct slave *slave) { if (bond_has_slaves(bond)) alb_change_hw_addr_on_detach(bond, slave); tlb_clear_slave(bond, slave, 0); if (bond->alb_info.rlb_enabled) { bond->alb_info.rx_slave = NULL; rlb_clear_slave(bond, slave); } } void bond_alb_handle_link_change(struct bonding *bond, struct slave *slave, char link) { struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond)); if (link == BOND_LINK_DOWN) { tlb_clear_slave(bond, slave, 0); if (bond->alb_info.rlb_enabled) rlb_clear_slave(bond, slave); } else if (link == BOND_LINK_UP) { /* order a rebalance ASAP */ atomic_set(&bond_info->tx_rebalance_counter, BOND_TLB_REBALANCE_TICKS); if (bond->alb_info.rlb_enabled) { bond->alb_info.rlb_rebalance = 1; /* If the updelay module parameter is smaller than the * forwarding delay of the switch the rebalance will * not work because the rebalance arp replies will * not be forwarded to the clients.. */ } } if (bond_is_nondyn_tlb(bond)) { if (bond_update_slave_arr(bond, NULL)) pr_err("Failed to build slave-array for TLB mode.\n"); } } /** * bond_alb_handle_active_change - assign new curr_active_slave * @bond: our bonding struct * @new_slave: new slave to assign * * Set the bond->curr_active_slave to @new_slave and handle * mac address swapping and promiscuity changes as needed. * * Caller must hold RTNL */ void bond_alb_handle_active_change(struct bonding *bond, struct slave *new_slave) { struct slave *swap_slave; struct slave *curr_active; curr_active = rtnl_dereference(bond->curr_active_slave); if (curr_active == new_slave) return; if (curr_active && bond->alb_info.primary_is_promisc) { dev_set_promiscuity(curr_active->dev, -1); bond->alb_info.primary_is_promisc = 0; bond->alb_info.rlb_promisc_timeout_counter = 0; } swap_slave = curr_active; rcu_assign_pointer(bond->curr_active_slave, new_slave); if (!new_slave || !bond_has_slaves(bond)) return; /* set the new curr_active_slave to the bonds mac address * i.e. swap mac addresses of old curr_active_slave and new curr_active_slave */ if (!swap_slave) swap_slave = bond_slave_has_mac(bond, bond->dev->dev_addr); /* Arrange for swap_slave and new_slave to temporarily be * ignored so we can mess with their MAC addresses without * fear of interference from transmit activity. */ if (swap_slave) tlb_clear_slave(bond, swap_slave, 1); tlb_clear_slave(bond, new_slave, 1); /* in TLB mode, the slave might flip down/up with the old dev_addr, * and thus filter bond->dev_addr's packets, so force bond's mac */ if (BOND_MODE(bond) == BOND_MODE_TLB) { struct sockaddr_storage ss; u8 tmp_addr[MAX_ADDR_LEN]; bond_hw_addr_copy(tmp_addr, new_slave->dev->dev_addr, new_slave->dev->addr_len); bond_hw_addr_copy(ss.__data, bond->dev->dev_addr, bond->dev->addr_len); ss.ss_family = bond->dev->type; /* we don't care if it can't change its mac, best effort */ dev_set_mac_address(new_slave->dev, &ss, NULL); dev_addr_set(new_slave->dev, tmp_addr); } /* curr_active_slave must be set before calling alb_swap_mac_addr */ if (swap_slave) { /* swap mac address */ alb_swap_mac_addr(swap_slave, new_slave); alb_fasten_mac_swap(bond, swap_slave, new_slave); } else { /* set the new_slave to the bond mac address */ alb_set_slave_mac_addr(new_slave, bond->dev->dev_addr, bond->dev->addr_len); alb_send_learning_packets(new_slave, bond->dev->dev_addr, false); } } /* Called with RTNL */ int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr) { struct bonding *bond = netdev_priv(bond_dev); struct sockaddr_storage *ss = addr; struct slave *curr_active; struct slave *swap_slave; int res; if (!is_valid_ether_addr(ss->__data)) return -EADDRNOTAVAIL; res = alb_set_mac_address(bond, addr); if (res) return res; dev_addr_set(bond_dev, ss->__data); /* If there is no curr_active_slave there is nothing else to do. * Otherwise we'll need to pass the new address to it and handle * duplications. */ curr_active = rtnl_dereference(bond->curr_active_slave); if (!curr_active) return 0; swap_slave = bond_slave_has_mac(bond, bond_dev->dev_addr); if (swap_slave) { alb_swap_mac_addr(swap_slave, curr_active); alb_fasten_mac_swap(bond, swap_slave, curr_active); } else { alb_set_slave_mac_addr(curr_active, bond_dev->dev_addr, bond_dev->addr_len); alb_send_learning_packets(curr_active, bond_dev->dev_addr, false); if (bond->alb_info.rlb_enabled) { /* inform clients mac address has changed */ rlb_req_update_slave_clients(bond, curr_active); } } return 0; } void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id) { if (bond->alb_info.rlb_enabled) rlb_clear_vlan(bond, vlan_id); }
3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 // SPDX-License-Identifier: GPL-2.0 #include <linux/export.h> #include <linux/kvm_types.h> #include <linux/percpu.h> #include <linux/preempt.h> #include <asm/msr.h> #define CREATE_TRACE_POINTS #include <asm/msr-trace.h> struct msr __percpu *msrs_alloc(void) { struct msr __percpu *msrs = NULL; msrs = alloc_percpu(struct msr); if (!msrs) { pr_warn("%s: error allocating msrs\n", __func__); return NULL; } return msrs; } EXPORT_SYMBOL(msrs_alloc); void msrs_free(struct msr __percpu *msrs) { free_percpu(msrs); } EXPORT_SYMBOL(msrs_free); /** * msr_read - Read an MSR with error handling * @msr: MSR to read * @m: value to read into * * It returns read data only on success, otherwise it doesn't change the output * argument @m. * * Return: %0 for success, otherwise an error code */ static int msr_read(u32 msr, struct msr *m) { int err; u64 val; err = rdmsrq_safe(msr, &val); if (!err) m->q = val; return err; } /** * msr_write - Write an MSR with error handling * * @msr: MSR to write * @m: value to write * * Return: %0 for success, otherwise an error code */ static int msr_write(u32 msr, struct msr *m) { return wrmsrq_safe(msr, m->q); } static inline int __flip_bit(u32 msr, u8 bit, bool set) { struct msr m, m1; int err = -EINVAL; if (bit > 63) return err; err = msr_read(msr, &m); if (err) return err; m1 = m; if (set) m1.q |= BIT_64(bit); else m1.q &= ~BIT_64(bit); if (m1.q == m.q) return 0; err = msr_write(msr, &m1); if (err) return err; return 1; } /** * msr_set_bit - Set @bit in a MSR @msr. * @msr: MSR to write * @bit: bit number to set * * Return: * * < 0: An error was encountered. * * = 0: Bit was already set. * * > 0: Hardware accepted the MSR write. */ int msr_set_bit(u32 msr, u8 bit) { return __flip_bit(msr, bit, true); } EXPORT_SYMBOL_FOR_KVM(msr_set_bit); /** * msr_clear_bit - Clear @bit in a MSR @msr. * @msr: MSR to write * @bit: bit number to clear * * Return: * * < 0: An error was encountered. * * = 0: Bit was already cleared. * * > 0: Hardware accepted the MSR write. */ int msr_clear_bit(u32 msr, u8 bit) { return __flip_bit(msr, bit, false); } EXPORT_SYMBOL_FOR_KVM(msr_clear_bit); #ifdef CONFIG_TRACEPOINTS void do_trace_write_msr(u32 msr, u64 val, int failed) { trace_write_msr(msr, val, failed); } EXPORT_SYMBOL(do_trace_write_msr); EXPORT_TRACEPOINT_SYMBOL(write_msr); void do_trace_read_msr(u32 msr, u64 val, int failed) { trace_read_msr(msr, val, failed); } EXPORT_SYMBOL(do_trace_read_msr); EXPORT_TRACEPOINT_SYMBOL(read_msr); void do_trace_rdpmc(u32 msr, u64 val, int failed) { trace_rdpmc(msr, val, failed); } EXPORT_SYMBOL(do_trace_rdpmc); EXPORT_TRACEPOINT_SYMBOL(rdpmc); #endif
5 5 146 93 33 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * AEAD: Authenticated Encryption with Associated Data * * Copyright (c) 2007-2015 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_AEAD_H #define _CRYPTO_AEAD_H #include <linux/atomic.h> #include <linux/container_of.h> #include <linux/crypto.h> #include <linux/slab.h> #include <linux/types.h> /** * DOC: Authenticated Encryption With Associated Data (AEAD) Cipher API * * The AEAD cipher API is used with the ciphers of type CRYPTO_ALG_TYPE_AEAD * (listed as type "aead" in /proc/crypto) * * The most prominent examples for this type of encryption is GCM and CCM. * However, the kernel supports other types of AEAD ciphers which are defined * with the following cipher string: * * authenc(keyed message digest, block cipher) * * For example: authenc(hmac(sha256), cbc(aes)) * * The example code provided for the symmetric key cipher operation applies * here as well. Naturally all *skcipher* symbols must be exchanged the *aead* * pendants discussed in the following. In addition, for the AEAD operation, * the aead_request_set_ad function must be used to set the pointer to the * associated data memory location before performing the encryption or * decryption operation. Another deviation from the asynchronous block cipher * operation is that the caller should explicitly check for -EBADMSG of the * crypto_aead_decrypt. That error indicates an authentication error, i.e. * a breach in the integrity of the message. In essence, that -EBADMSG error * code is the key bonus an AEAD cipher has over "standard" block chaining * modes. * * Memory Structure: * * The source scatterlist must contain the concatenation of * associated data || plaintext or ciphertext. * * The destination scatterlist has the same layout, except that the plaintext * (resp. ciphertext) will grow (resp. shrink) by the authentication tag size * during encryption (resp. decryption). The authentication tag is generated * during the encryption operation and appended to the ciphertext. During * decryption, the authentication tag is consumed along with the ciphertext and * used to verify the integrity of the plaintext and the associated data. * * In-place encryption/decryption is enabled by using the same scatterlist * pointer for both the source and destination. * * Even in the out-of-place case, space must be reserved in the destination for * the associated data, even though it won't be written to. This makes the * in-place and out-of-place cases more consistent. It is permissible for the * "destination" associated data to alias the "source" associated data. * * As with the other scatterlist crypto APIs, zero-length scatterlist elements * are not allowed in the used part of the scatterlist. Thus, if there is no * associated data, the first element must point to the plaintext/ciphertext. * * To meet the needs of IPsec, a special quirk applies to rfc4106, rfc4309, * rfc4543, and rfc7539esp ciphers. For these ciphers, the final 'ivsize' bytes * of the associated data buffer must contain a second copy of the IV. This is * in addition to the copy passed to aead_request_set_crypt(). These two IV * copies must not differ; different implementations of the same algorithm may * behave differently in that case. Note that the algorithm might not actually * treat the IV as associated data; nevertheless the length passed to * aead_request_set_ad() must include it. */ struct crypto_aead; struct scatterlist; /** * struct aead_request - AEAD request * @base: Common attributes for async crypto requests * @assoclen: Length in bytes of associated data for authentication * @cryptlen: Length of data to be encrypted or decrypted * @iv: Initialisation vector * @src: Source data * @dst: Destination data * @__ctx: Start of private context data */ struct aead_request { struct crypto_async_request base; unsigned int assoclen; unsigned int cryptlen; u8 *iv; struct scatterlist *src; struct scatterlist *dst; void *__ctx[] CRYPTO_MINALIGN_ATTR; }; /** * struct aead_alg - AEAD cipher definition * @maxauthsize: Set the maximum authentication tag size supported by the * transformation. A transformation may support smaller tag sizes. * As the authentication tag is a message digest to ensure the * integrity of the encrypted data, a consumer typically wants the * largest authentication tag possible as defined by this * variable. * @setauthsize: Set authentication size for the AEAD transformation. This * function is used to specify the consumer requested size of the * authentication tag to be either generated by the transformation * during encryption or the size of the authentication tag to be * supplied during the decryption operation. This function is also * responsible for checking the authentication tag size for * validity. * @setkey: see struct skcipher_alg * @encrypt: see struct skcipher_alg * @decrypt: see struct skcipher_alg * @ivsize: see struct skcipher_alg * @chunksize: see struct skcipher_alg * @init: Initialize the cryptographic transformation object. This function * is used to initialize the cryptographic transformation object. * This function is called only once at the instantiation time, right * after the transformation context was allocated. In case the * cryptographic hardware has some special requirements which need to * be handled by software, this function shall check for the precise * requirement of the transformation and put any software fallbacks * in place. * @exit: Deinitialize the cryptographic transformation object. This is a * counterpart to @init, used to remove various changes set in * @init. * @base: Definition of a generic crypto cipher algorithm. * * All fields except @ivsize is mandatory and must be filled. */ struct aead_alg { int (*setkey)(struct crypto_aead *tfm, const u8 *key, unsigned int keylen); int (*setauthsize)(struct crypto_aead *tfm, unsigned int authsize); int (*encrypt)(struct aead_request *req); int (*decrypt)(struct aead_request *req); int (*init)(struct crypto_aead *tfm); void (*exit)(struct crypto_aead *tfm); unsigned int ivsize; unsigned int maxauthsize; unsigned int chunksize; struct crypto_alg base; }; struct crypto_aead { unsigned int authsize; unsigned int reqsize; struct crypto_tfm base; }; struct crypto_sync_aead { struct crypto_aead base; }; #define MAX_SYNC_AEAD_REQSIZE 384 #define SYNC_AEAD_REQUEST_ON_STACK(name, _tfm) \ char __##name##_desc[sizeof(struct aead_request) + \ MAX_SYNC_AEAD_REQSIZE \ ] CRYPTO_MINALIGN_ATTR; \ struct aead_request *name = \ (((struct aead_request *)__##name##_desc)->base.tfm = \ crypto_sync_aead_tfm((_tfm)), \ (void *)__##name##_desc) static inline struct crypto_aead *__crypto_aead_cast(struct crypto_tfm *tfm) { return container_of(tfm, struct crypto_aead, base); } /** * crypto_alloc_aead() - allocate AEAD cipher handle * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * AEAD cipher * @type: specifies the type of the cipher * @mask: specifies the mask for the cipher * * Allocate a cipher handle for an AEAD. The returned struct * crypto_aead is the cipher handle that is required for any subsequent * API invocation for that AEAD. * * Return: allocated cipher handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. */ struct crypto_aead *crypto_alloc_aead(const char *alg_name, u32 type, u32 mask); struct crypto_sync_aead *crypto_alloc_sync_aead(const char *alg_name, u32 type, u32 mask); static inline struct crypto_tfm *crypto_aead_tfm(struct crypto_aead *tfm) { return &tfm->base; } static inline struct crypto_tfm *crypto_sync_aead_tfm(struct crypto_sync_aead *tfm) { return crypto_aead_tfm(&tfm->base); } /** * crypto_free_aead() - zeroize and free aead handle * @tfm: cipher handle to be freed * * If @tfm is a NULL or error pointer, this function does nothing. */ static inline void crypto_free_aead(struct crypto_aead *tfm) { crypto_destroy_tfm(tfm, crypto_aead_tfm(tfm)); } static inline void crypto_free_sync_aead(struct crypto_sync_aead *tfm) { crypto_free_aead(&tfm->base); } /** * crypto_has_aead() - Search for the availability of an aead. * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * aead * @type: specifies the type of the aead * @mask: specifies the mask for the aead * * Return: true when the aead is known to the kernel crypto API; false * otherwise */ int crypto_has_aead(const char *alg_name, u32 type, u32 mask); static inline const char *crypto_aead_driver_name(struct crypto_aead *tfm) { return crypto_tfm_alg_driver_name(crypto_aead_tfm(tfm)); } static inline struct aead_alg *crypto_aead_alg(struct crypto_aead *tfm) { return container_of(crypto_aead_tfm(tfm)->__crt_alg, struct aead_alg, base); } static inline unsigned int crypto_aead_alg_ivsize(struct aead_alg *alg) { return alg->ivsize; } /** * crypto_aead_ivsize() - obtain IV size * @tfm: cipher handle * * The size of the IV for the aead referenced by the cipher handle is * returned. This IV size may be zero if the cipher does not need an IV. * * Return: IV size in bytes */ static inline unsigned int crypto_aead_ivsize(struct crypto_aead *tfm) { return crypto_aead_alg_ivsize(crypto_aead_alg(tfm)); } static inline unsigned int crypto_sync_aead_ivsize(struct crypto_sync_aead *tfm) { return crypto_aead_ivsize(&tfm->base); } /** * crypto_aead_authsize() - obtain maximum authentication data size * @tfm: cipher handle * * The maximum size of the authentication data for the AEAD cipher referenced * by the AEAD cipher handle is returned. The authentication data size may be * zero if the cipher implements a hard-coded maximum. * * The authentication data may also be known as "tag value". * * Return: authentication data size / tag size in bytes */ static inline unsigned int crypto_aead_authsize(struct crypto_aead *tfm) { return tfm->authsize; } static inline unsigned int crypto_sync_aead_authsize(struct crypto_sync_aead *tfm) { return crypto_aead_authsize(&tfm->base); } static inline unsigned int crypto_aead_alg_maxauthsize(struct aead_alg *alg) { return alg->maxauthsize; } static inline unsigned int crypto_aead_maxauthsize(struct crypto_aead *aead) { return crypto_aead_alg_maxauthsize(crypto_aead_alg(aead)); } static inline unsigned int crypto_sync_aead_maxauthsize(struct crypto_sync_aead *tfm) { return crypto_aead_maxauthsize(&tfm->base); } /** * crypto_aead_blocksize() - obtain block size of cipher * @tfm: cipher handle * * The block size for the AEAD referenced with the cipher handle is returned. * The caller may use that information to allocate appropriate memory for the * data returned by the encryption or decryption operation * * Return: block size of cipher */ static inline unsigned int crypto_aead_blocksize(struct crypto_aead *tfm) { return crypto_tfm_alg_blocksize(crypto_aead_tfm(tfm)); } static inline unsigned int crypto_sync_aead_blocksize(struct crypto_sync_aead *tfm) { return crypto_aead_blocksize(&tfm->base); } static inline unsigned int crypto_aead_alignmask(struct crypto_aead *tfm) { return crypto_tfm_alg_alignmask(crypto_aead_tfm(tfm)); } static inline u32 crypto_aead_get_flags(struct crypto_aead *tfm) { return crypto_tfm_get_flags(crypto_aead_tfm(tfm)); } static inline void crypto_aead_set_flags(struct crypto_aead *tfm, u32 flags) { crypto_tfm_set_flags(crypto_aead_tfm(tfm), flags); } static inline void crypto_aead_clear_flags(struct crypto_aead *tfm, u32 flags) { crypto_tfm_clear_flags(crypto_aead_tfm(tfm), flags); } static inline u32 crypto_sync_aead_get_flags(struct crypto_sync_aead *tfm) { return crypto_aead_get_flags(&tfm->base); } static inline void crypto_sync_aead_set_flags(struct crypto_sync_aead *tfm, u32 flags) { crypto_aead_set_flags(&tfm->base, flags); } static inline void crypto_sync_aead_clear_flags(struct crypto_sync_aead *tfm, u32 flags) { crypto_aead_clear_flags(&tfm->base, flags); } /** * crypto_aead_setkey() - set key for cipher * @tfm: cipher handle * @key: buffer holding the key * @keylen: length of the key in bytes * * The caller provided key is set for the AEAD referenced by the cipher * handle. * * Note, the key length determines the cipher type. Many block ciphers implement * different cipher modes depending on the key size, such as AES-128 vs AES-192 * vs. AES-256. When providing a 16 byte key for an AES cipher handle, AES-128 * is performed. * * Return: 0 if the setting of the key was successful; < 0 if an error occurred */ int crypto_aead_setkey(struct crypto_aead *tfm, const u8 *key, unsigned int keylen); static inline int crypto_sync_aead_setkey(struct crypto_sync_aead *tfm, const u8 *key, unsigned int keylen) { return crypto_aead_setkey(&tfm->base, key, keylen); } /** * crypto_aead_setauthsize() - set authentication data size * @tfm: cipher handle * @authsize: size of the authentication data / tag in bytes * * Set the authentication data size / tag size. AEAD requires an authentication * tag (or MAC) in addition to the associated data. * * Return: 0 if the setting of the key was successful; < 0 if an error occurred */ int crypto_aead_setauthsize(struct crypto_aead *tfm, unsigned int authsize); static inline int crypto_sync_aead_setauthsize(struct crypto_sync_aead *tfm, unsigned int authsize) { return crypto_aead_setauthsize(&tfm->base, authsize); } static inline struct crypto_aead *crypto_aead_reqtfm(struct aead_request *req) { return __crypto_aead_cast(req->base.tfm); } static inline struct crypto_sync_aead *crypto_sync_aead_reqtfm(struct aead_request *req) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); return container_of(tfm, struct crypto_sync_aead, base); } /** * crypto_aead_encrypt() - encrypt plaintext * @req: reference to the aead_request handle that holds all information * needed to perform the cipher operation * * Encrypt plaintext data using the aead_request handle. That data structure * and how it is filled with data is discussed with the aead_request_* * functions. * * IMPORTANT NOTE The encryption operation creates the authentication data / * tag. That data is concatenated with the created ciphertext. * The ciphertext memory size is therefore the given number of * block cipher blocks + the size defined by the * crypto_aead_setauthsize invocation. The caller must ensure * that sufficient memory is available for the ciphertext and * the authentication tag. * * Return: 0 if the cipher operation was successful; < 0 if an error occurred */ int crypto_aead_encrypt(struct aead_request *req); /** * crypto_aead_decrypt() - decrypt ciphertext * @req: reference to the aead_request handle that holds all information * needed to perform the cipher operation * * Decrypt ciphertext data using the aead_request handle. That data structure * and how it is filled with data is discussed with the aead_request_* * functions. * * IMPORTANT NOTE The caller must concatenate the ciphertext followed by the * authentication data / tag. That authentication data / tag * must have the size defined by the crypto_aead_setauthsize * invocation. * * * Return: 0 if the cipher operation was successful; -EBADMSG: The AEAD * cipher operation performs the authentication of the data during the * decryption operation. Therefore, the function returns this error if * the authentication of the ciphertext was unsuccessful (i.e. the * integrity of the ciphertext or the associated data was violated); * < 0 if an error occurred. */ int crypto_aead_decrypt(struct aead_request *req); /** * DOC: Asynchronous AEAD Request Handle * * The aead_request data structure contains all pointers to data required for * the AEAD cipher operation. This includes the cipher handle (which can be * used by multiple aead_request instances), pointer to plaintext and * ciphertext, asynchronous callback function, etc. It acts as a handle to the * aead_request_* API calls in a similar way as AEAD handle to the * crypto_aead_* API calls. */ /** * crypto_aead_reqsize() - obtain size of the request data structure * @tfm: cipher handle * * Return: number of bytes */ static inline unsigned int crypto_aead_reqsize(struct crypto_aead *tfm) { return tfm->reqsize; } /** * aead_request_set_tfm() - update cipher handle reference in request * @req: request handle to be modified * @tfm: cipher handle that shall be added to the request handle * * Allow the caller to replace the existing aead handle in the request * data structure with a different one. */ static inline void aead_request_set_tfm(struct aead_request *req, struct crypto_aead *tfm) { req->base.tfm = crypto_aead_tfm(tfm); } static inline void aead_request_set_sync_tfm(struct aead_request *req, struct crypto_sync_aead *tfm) { aead_request_set_tfm(req, &tfm->base); } /** * aead_request_alloc() - allocate request data structure * @tfm: cipher handle to be registered with the request * @gfp: memory allocation flag that is handed to kmalloc by the API call. * * Allocate the request data structure that must be used with the AEAD * encrypt and decrypt API calls. During the allocation, the provided aead * handle is registered in the request data structure. * * Return: allocated request handle in case of success, or NULL if out of memory */ static inline struct aead_request *aead_request_alloc(struct crypto_aead *tfm, gfp_t gfp) { struct aead_request *req; req = kmalloc(sizeof(*req) + crypto_aead_reqsize(tfm), gfp); if (likely(req)) aead_request_set_tfm(req, tfm); return req; } /** * aead_request_free() - zeroize and free request data structure * @req: request data structure cipher handle to be freed */ static inline void aead_request_free(struct aead_request *req) { kfree_sensitive(req); } /** * aead_request_set_callback() - set asynchronous callback function * @req: request handle * @flags: specify zero or an ORing of the flags * CRYPTO_TFM_REQ_MAY_BACKLOG the request queue may back log and * increase the wait queue beyond the initial maximum size; * CRYPTO_TFM_REQ_MAY_SLEEP the request processing may sleep * @compl: callback function pointer to be registered with the request handle * @data: The data pointer refers to memory that is not used by the kernel * crypto API, but provided to the callback function for it to use. Here, * the caller can provide a reference to memory the callback function can * operate on. As the callback function is invoked asynchronously to the * related functionality, it may need to access data structures of the * related functionality which can be referenced using this pointer. The * callback function can access the memory via the "data" field in the * crypto_async_request data structure provided to the callback function. * * Setting the callback function that is triggered once the cipher operation * completes * * The callback function is registered with the aead_request handle and * must comply with the following template:: * * void callback_function(struct crypto_async_request *req, int error) */ static inline void aead_request_set_callback(struct aead_request *req, u32 flags, crypto_completion_t compl, void *data) { req->base.complete = compl; req->base.data = data; req->base.flags = flags; } /** * aead_request_set_crypt - set data buffers * @req: request handle * @src: source scatter / gather list * @dst: destination scatter / gather list * @cryptlen: number of bytes to process from @src * @iv: IV for the cipher operation which must comply with the IV size defined * by crypto_aead_ivsize() * * Setting the source data and destination data scatter / gather lists which * hold the associated data concatenated with the plaintext or ciphertext. See * below for the authentication tag. * * For encryption, the source is treated as the plaintext and the * destination is the ciphertext. For a decryption operation, the use is * reversed - the source is the ciphertext and the destination is the plaintext. * * The memory structure for cipher operation has the following structure: * * - AEAD encryption input: assoc data || plaintext * - AEAD encryption output: assoc data || ciphertext || auth tag * - AEAD decryption input: assoc data || ciphertext || auth tag * - AEAD decryption output: assoc data || plaintext * * Albeit the kernel requires the presence of the AAD buffer, however, * the kernel does not fill the AAD buffer in the output case. If the * caller wants to have that data buffer filled, the caller must either * use an in-place cipher operation (i.e. same memory location for * input/output memory location). */ static inline void aead_request_set_crypt(struct aead_request *req, struct scatterlist *src, struct scatterlist *dst, unsigned int cryptlen, u8 *iv) { req->src = src; req->dst = dst; req->cryptlen = cryptlen; req->iv = iv; } /** * aead_request_set_ad - set associated data information * @req: request handle * @assoclen: number of bytes in associated data * * Setting the AD information. This function sets the length of * the associated data. */ static inline void aead_request_set_ad(struct aead_request *req, unsigned int assoclen) { req->assoclen = assoclen; } #endif /* _CRYPTO_AEAD_H */
60 62 97 33 97 1 10 10 209 141 79 16 83 20 20 16 20 20 18 20 2 2 31 1 33 1 209 211 1 1 1 2 1 5 5 5 221 137 90 4 143 102 80 25 14 14 13 262 7 10 250 3 2 1 244 5 5 2 78 152 24 11 159 8 7 6 132 17 149 27 28 28 49 41 3 30 73 108 37 83 33 7 26 26 17 17 18 18 18 9 18 17 9 96 5 96 95 5 96 17 509 254 254 19 17 17 5 2 21 22 21 2 19 13 19 19 17 1 5 18 5 18 19 19 231 1 231 231 1 25 1 1 22 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 // SPDX-License-Identifier: GPL-2.0-or-later /* * Fast Userspace Mutexes (which I call "Futexes!"). * (C) Rusty Russell, IBM 2002 * * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar * (C) Copyright 2003 Red Hat Inc, All Rights Reserved * * Removed page pinning, fix privately mapped COW pages and other cleanups * (C) Copyright 2003, 2004 Jamie Lokier * * Robust futex support started by Ingo Molnar * (C) Copyright 2006 Red Hat Inc, All Rights Reserved * Thanks to Thomas Gleixner for suggestions, analysis and fixes. * * PI-futex support started by Ingo Molnar and Thomas Gleixner * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> * * PRIVATE futexes by Eric Dumazet * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> * * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> * Copyright (C) IBM Corporation, 2009 * Thanks to Thomas Gleixner for conceptual design and careful reviews. * * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly * enough at me, Linus for the original (flawed) idea, Matthew * Kirkwood for proof-of-concept implementation. * * "The futexes are also cursed." * "But they come in a choice of three flavours!" */ #include <linux/compat.h> #include <linux/jhash.h> #include <linux/pagemap.h> #include <linux/debugfs.h> #include <linux/plist.h> #include <linux/gfp.h> #include <linux/vmalloc.h> #include <linux/memblock.h> #include <linux/fault-inject.h> #include <linux/slab.h> #include <linux/prctl.h> #include <linux/mempolicy.h> #include <linux/mmap_lock.h> #include "futex.h" #include "../locking/rtmutex_common.h" /* * The base of the bucket array and its size are always used together * (after initialization only in futex_hash()), so ensure that they * reside in the same cacheline. */ static struct { unsigned long hashmask; unsigned int hashshift; struct futex_hash_bucket *queues[MAX_NUMNODES]; } __futex_data __read_mostly __aligned(2*sizeof(long)); #define futex_hashmask (__futex_data.hashmask) #define futex_hashshift (__futex_data.hashshift) #define futex_queues (__futex_data.queues) struct futex_private_hash { int state; unsigned int hash_mask; struct rcu_head rcu; void *mm; bool custom; struct futex_hash_bucket queues[]; }; /* * Fault injections for futexes. */ #ifdef CONFIG_FAIL_FUTEX static struct { struct fault_attr attr; bool ignore_private; } fail_futex = { .attr = FAULT_ATTR_INITIALIZER, .ignore_private = false, }; static int __init setup_fail_futex(char *str) { return setup_fault_attr(&fail_futex.attr, str); } __setup("fail_futex=", setup_fail_futex); bool should_fail_futex(bool fshared) { if (fail_futex.ignore_private && !fshared) return false; return should_fail(&fail_futex.attr, 1); } #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS static int __init fail_futex_debugfs(void) { umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; struct dentry *dir; dir = fault_create_debugfs_attr("fail_futex", NULL, &fail_futex.attr); if (IS_ERR(dir)) return PTR_ERR(dir); debugfs_create_bool("ignore-private", mode, dir, &fail_futex.ignore_private); return 0; } late_initcall(fail_futex_debugfs); #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ #endif /* CONFIG_FAIL_FUTEX */ static struct futex_hash_bucket * __futex_hash(union futex_key *key, struct futex_private_hash *fph); #ifdef CONFIG_FUTEX_PRIVATE_HASH static bool futex_ref_get(struct futex_private_hash *fph); static bool futex_ref_put(struct futex_private_hash *fph); static bool futex_ref_is_dead(struct futex_private_hash *fph); enum { FR_PERCPU = 0, FR_ATOMIC }; static inline bool futex_key_is_private(union futex_key *key) { /* * Relies on get_futex_key() to set either bit for shared * futexes -- see comment with union futex_key. */ return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED)); } static bool futex_private_hash_get(struct futex_private_hash *fph) { return futex_ref_get(fph); } void futex_private_hash_put(struct futex_private_hash *fph) { if (futex_ref_put(fph)) wake_up_var(fph->mm); } /** * futex_hash_get - Get an additional reference for the local hash. * @hb: ptr to the private local hash. * * Obtain an additional reference for the already obtained hash bucket. The * caller must already own an reference. */ void futex_hash_get(struct futex_hash_bucket *hb) { struct futex_private_hash *fph = hb->priv; if (!fph) return; WARN_ON_ONCE(!futex_private_hash_get(fph)); } void futex_hash_put(struct futex_hash_bucket *hb) { struct futex_private_hash *fph = hb->priv; if (!fph) return; futex_private_hash_put(fph); } static struct futex_hash_bucket * __futex_hash_private(union futex_key *key, struct futex_private_hash *fph) { u32 hash; if (!futex_key_is_private(key)) return NULL; if (!fph) fph = rcu_dereference(key->private.mm->futex_phash); if (!fph || !fph->hash_mask) return NULL; hash = jhash2((void *)&key->private.address, sizeof(key->private.address) / 4, key->both.offset); return &fph->queues[hash & fph->hash_mask]; } static void futex_rehash_private(struct futex_private_hash *old, struct futex_private_hash *new) { struct futex_hash_bucket *hb_old, *hb_new; unsigned int slots = old->hash_mask + 1; unsigned int i; for (i = 0; i < slots; i++) { struct futex_q *this, *tmp; hb_old = &old->queues[i]; spin_lock(&hb_old->lock); plist_for_each_entry_safe(this, tmp, &hb_old->chain, list) { plist_del(&this->list, &hb_old->chain); futex_hb_waiters_dec(hb_old); WARN_ON_ONCE(this->lock_ptr != &hb_old->lock); hb_new = __futex_hash(&this->key, new); futex_hb_waiters_inc(hb_new); /* * The new pointer isn't published yet but an already * moved user can be unqueued due to timeout or signal. */ spin_lock_nested(&hb_new->lock, SINGLE_DEPTH_NESTING); plist_add(&this->list, &hb_new->chain); this->lock_ptr = &hb_new->lock; spin_unlock(&hb_new->lock); } spin_unlock(&hb_old->lock); } } static bool __futex_pivot_hash(struct mm_struct *mm, struct futex_private_hash *new) { struct futex_private_hash *fph; WARN_ON_ONCE(mm->futex_phash_new); fph = rcu_dereference_protected(mm->futex_phash, lockdep_is_held(&mm->futex_hash_lock)); if (fph) { if (!futex_ref_is_dead(fph)) { mm->futex_phash_new = new; return false; } futex_rehash_private(fph, new); } new->state = FR_PERCPU; scoped_guard(rcu) { mm->futex_batches = get_state_synchronize_rcu(); rcu_assign_pointer(mm->futex_phash, new); } kvfree_rcu(fph, rcu); return true; } static void futex_pivot_hash(struct mm_struct *mm) { scoped_guard(mutex, &mm->futex_hash_lock) { struct futex_private_hash *fph; fph = mm->futex_phash_new; if (fph) { mm->futex_phash_new = NULL; __futex_pivot_hash(mm, fph); } } } struct futex_private_hash *futex_private_hash(void) { struct mm_struct *mm = current->mm; /* * Ideally we don't loop. If there is a replacement in progress * then a new private hash is already prepared and a reference can't be * obtained once the last user dropped it's. * In that case we block on mm_struct::futex_hash_lock and either have * to perform the replacement or wait while someone else is doing the * job. Eitherway, on the second iteration we acquire a reference on the * new private hash or loop again because a new replacement has been * requested. */ again: scoped_guard(rcu) { struct futex_private_hash *fph; fph = rcu_dereference(mm->futex_phash); if (!fph) return NULL; if (futex_private_hash_get(fph)) return fph; } futex_pivot_hash(mm); goto again; } struct futex_hash_bucket *futex_hash(union futex_key *key) { struct futex_private_hash *fph; struct futex_hash_bucket *hb; again: scoped_guard(rcu) { hb = __futex_hash(key, NULL); fph = hb->priv; if (!fph || futex_private_hash_get(fph)) return hb; } futex_pivot_hash(key->private.mm); goto again; } #else /* !CONFIG_FUTEX_PRIVATE_HASH */ static struct futex_hash_bucket * __futex_hash_private(union futex_key *key, struct futex_private_hash *fph) { return NULL; } struct futex_hash_bucket *futex_hash(union futex_key *key) { return __futex_hash(key, NULL); } #endif /* CONFIG_FUTEX_PRIVATE_HASH */ #ifdef CONFIG_FUTEX_MPOL static int __futex_key_to_node(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma = vma_lookup(mm, addr); struct mempolicy *mpol; int node = FUTEX_NO_NODE; if (!vma) return FUTEX_NO_NODE; mpol = vma_policy(vma); if (!mpol) return FUTEX_NO_NODE; switch (mpol->mode) { case MPOL_PREFERRED: node = first_node(mpol->nodes); break; case MPOL_PREFERRED_MANY: case MPOL_BIND: if (mpol->home_node != NUMA_NO_NODE) node = mpol->home_node; break; default: break; } return node; } static int futex_key_to_node_opt(struct mm_struct *mm, unsigned long addr) { int seq, node; guard(rcu)(); if (!mmap_lock_speculate_try_begin(mm, &seq)) return -EBUSY; node = __futex_key_to_node(mm, addr); if (mmap_lock_speculate_retry(mm, seq)) return -EAGAIN; return node; } static int futex_mpol(struct mm_struct *mm, unsigned long addr) { int node; node = futex_key_to_node_opt(mm, addr); if (node >= FUTEX_NO_NODE) return node; guard(mmap_read_lock)(mm); return __futex_key_to_node(mm, addr); } #else /* !CONFIG_FUTEX_MPOL */ static int futex_mpol(struct mm_struct *mm, unsigned long addr) { return FUTEX_NO_NODE; } #endif /* CONFIG_FUTEX_MPOL */ /** * __futex_hash - Return the hash bucket * @key: Pointer to the futex key for which the hash is calculated * @fph: Pointer to private hash if known * * We hash on the keys returned from get_futex_key (see below) and return the * corresponding hash bucket. * If the FUTEX is PROCESS_PRIVATE then a per-process hash bucket (from the * private hash) is returned if existing. Otherwise a hash bucket from the * global hash is returned. */ static struct futex_hash_bucket * __futex_hash(union futex_key *key, struct futex_private_hash *fph) { int node = key->both.node; u32 hash; if (node == FUTEX_NO_NODE) { struct futex_hash_bucket *hb; hb = __futex_hash_private(key, fph); if (hb) return hb; } hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / sizeof(u32), key->both.offset); if (node == FUTEX_NO_NODE) { /* * In case of !FLAGS_NUMA, use some unused hash bits to pick a * node -- this ensures regular futexes are interleaved across * the nodes and avoids having to allocate multiple * hash-tables. * * NOTE: this isn't perfectly uniform, but it is fast and * handles sparse node masks. */ node = (hash >> futex_hashshift) % nr_node_ids; if (!node_possible(node)) { node = find_next_bit_wrap(node_possible_map.bits, nr_node_ids, node); } } return &futex_queues[node][hash & futex_hashmask]; } /** * futex_setup_timer - set up the sleeping hrtimer. * @time: ptr to the given timeout value * @timeout: the hrtimer_sleeper structure to be set up * @flags: futex flags * @range_ns: optional range in ns * * Return: Initialized hrtimer_sleeper structure or NULL if no timeout * value given */ struct hrtimer_sleeper * futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, int flags, u64 range_ns) { if (!time) return NULL; hrtimer_setup_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ? CLOCK_REALTIME : CLOCK_MONOTONIC, HRTIMER_MODE_ABS); /* * If range_ns is 0, calling hrtimer_set_expires_range_ns() is * effectively the same as calling hrtimer_set_expires(). */ hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns); return timeout; } /* * Generate a machine wide unique identifier for this inode. * * This relies on u64 not wrapping in the life-time of the machine; which with * 1ns resolution means almost 585 years. * * This further relies on the fact that a well formed program will not unmap * the file while it has a (shared) futex waiting on it. This mapping will have * a file reference which pins the mount and inode. * * If for some reason an inode gets evicted and read back in again, it will get * a new sequence number and will _NOT_ match, even though it is the exact same * file. * * It is important that futex_match() will never have a false-positive, esp. * for PI futexes that can mess up the state. The above argues that false-negatives * are only possible for malformed programs. */ static u64 get_inode_sequence_number(struct inode *inode) { static atomic64_t i_seq; u64 old; /* Does the inode already have a sequence number? */ old = atomic64_read(&inode->i_sequence); if (likely(old)) return old; for (;;) { u64 new = atomic64_inc_return(&i_seq); if (WARN_ON_ONCE(!new)) continue; old = 0; if (!atomic64_try_cmpxchg_relaxed(&inode->i_sequence, &old, new)) return old; return new; } } /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex * @flags: FLAGS_* * @key: address where result is stored. * @rw: mapping needs to be read/write (values: FUTEX_READ, * FUTEX_WRITE) * * Return: a negative error code or 0 * * The key words are stored in @key on success. * * For shared mappings (when @fshared), the key is: * * ( inode->i_sequence, page offset within mapping, offset_within_page ) * * [ also see get_inode_sequence_number() ] * * For private mappings (or when !@fshared), the key is: * * ( current->mm, address, 0 ) * * This allows (cross process, where applicable) identification of the futex * without keeping the page pinned for the duration of the FUTEX_WAIT. * * lock_page() might sleep, the caller should not hold a spinlock. */ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, enum futex_access rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; struct page *page; struct folio *folio; struct address_space *mapping; int node, err, size, ro = 0; bool node_updated = false; bool fshared; fshared = flags & FLAGS_SHARED; size = futex_size(flags); if (flags & FLAGS_NUMA) size *= 2; /* * The futex address must be "naturally" aligned. */ key->both.offset = address % PAGE_SIZE; if (unlikely((address % size) != 0)) return -EINVAL; address -= key->both.offset; if (unlikely(!access_ok(uaddr, size))) return -EFAULT; if (unlikely(should_fail_futex(fshared))) return -EFAULT; node = FUTEX_NO_NODE; if (flags & FLAGS_NUMA) { u32 __user *naddr = (void *)uaddr + size / 2; if (get_user_inline(node, naddr)) return -EFAULT; if ((node != FUTEX_NO_NODE) && ((unsigned int)node >= MAX_NUMNODES || !node_possible(node))) return -EINVAL; } if (node == FUTEX_NO_NODE && (flags & FLAGS_MPOL)) { node = futex_mpol(mm, address); node_updated = true; } if (flags & FLAGS_NUMA) { u32 __user *naddr = (void *)uaddr + size / 2; if (node == FUTEX_NO_NODE) { node = numa_node_id(); node_updated = true; } if (node_updated && put_user_inline(node, naddr)) return -EFAULT; } key->both.node = node; /* * PROCESS_PRIVATE futexes are fast. * As the mm cannot disappear under us and the 'key' only needs * virtual address, we dont even have to find the underlying vma. * Note : We do have to check 'uaddr' is a valid user address, * but access_ok() should be faster than find_vma() */ if (!fshared) { /* * On no-MMU, shared futexes are treated as private, therefore * we must not include the current process in the key. Since * there is only one address space, the address is a unique key * on its own. */ if (IS_ENABLED(CONFIG_MMU)) key->private.mm = mm; else key->private.mm = NULL; key->private.address = address; return 0; } again: /* Ignore any VERIFY_READ mapping (futex common case) */ if (unlikely(should_fail_futex(true))) return -EFAULT; err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); /* * If write access is not required (eg. FUTEX_WAIT), try * and get read-only access. */ if (err == -EFAULT && rw == FUTEX_READ) { err = get_user_pages_fast(address, 1, 0, &page); ro = 1; } if (err < 0) return err; else err = 0; /* * The treatment of mapping from this point on is critical. The folio * lock protects many things but in this context the folio lock * stabilizes mapping, prevents inode freeing in the shared * file-backed region case and guards against movement to swap cache. * * Strictly speaking the folio lock is not needed in all cases being * considered here and folio lock forces unnecessarily serialization. * From this point on, mapping will be re-verified if necessary and * folio lock will be acquired only if it is unavoidable * * Mapping checks require the folio so it is looked up now. For * anonymous pages, it does not matter if the folio is split * in the future as the key is based on the address. For * filesystem-backed pages, the precise page is required as the * index of the page determines the key. */ folio = page_folio(page); mapping = READ_ONCE(folio->mapping); /* * If folio->mapping is NULL, then it cannot be an anonymous * page; but it might be the ZERO_PAGE or in the gate area or * in a special mapping (all cases which we are happy to fail); * or it may have been a good file page when get_user_pages_fast * found it, but truncated or holepunched or subjected to * invalidate_complete_page2 before we got the folio lock (also * cases which we are happy to fail). And we hold a reference, * so refcount care in invalidate_inode_page's remove_mapping * prevents drop_caches from setting mapping to NULL beneath us. * * The case we do have to guard against is when memory pressure made * shmem_writepage move it from filecache to swapcache beneath us: * an unlikely race, but we do need to retry for folio->mapping. */ if (unlikely(!mapping)) { int shmem_swizzled; /* * Folio lock is required to identify which special case above * applies. If this is really a shmem page then the folio lock * will prevent unexpected transitions. */ folio_lock(folio); shmem_swizzled = folio_test_swapcache(folio) || folio->mapping; folio_unlock(folio); folio_put(folio); if (shmem_swizzled) goto again; return -EFAULT; } /* * Private mappings are handled in a simple way. * * If the futex key is stored in anonymous memory, then the associated * object is the mm which is implicitly pinned by the calling process. * * NOTE: When userspace waits on a MAP_SHARED mapping, even if * it's a read-only handle, it's expected that futexes attach to * the object not the particular process. */ if (folio_test_anon(folio)) { /* * A RO anonymous page will never change and thus doesn't make * sense for futex operations. */ if (unlikely(should_fail_futex(true)) || ro) { err = -EFAULT; goto out; } key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ key->private.mm = mm; key->private.address = address; } else { struct inode *inode; /* * The associated futex object in this case is the inode and * the folio->mapping must be traversed. Ordinarily this should * be stabilised under folio lock but it's not strictly * necessary in this case as we just want to pin the inode, not * update i_pages or anything like that. * * The RCU read lock is taken as the inode is finally freed * under RCU. If the mapping still matches expectations then the * mapping->host can be safely accessed as being a valid inode. */ rcu_read_lock(); if (READ_ONCE(folio->mapping) != mapping) { rcu_read_unlock(); folio_put(folio); goto again; } inode = READ_ONCE(mapping->host); if (!inode) { rcu_read_unlock(); folio_put(folio); goto again; } key->both.offset |= FUT_OFF_INODE; /* inode-based key */ key->shared.i_seq = get_inode_sequence_number(inode); key->shared.pgoff = page_pgoff(folio, page); rcu_read_unlock(); } out: folio_put(folio); return err; } /** * fault_in_user_writeable() - Fault in user address and verify RW access * @uaddr: pointer to faulting user space address * * Slow path to fixup the fault we just took in the atomic write * access to @uaddr. * * We have no generic implementation of a non-destructive write to the * user address. We know that we faulted in the atomic pagefault * disabled section so we can as well avoid the #PF overhead by * calling get_user_pages() right away. */ int fault_in_user_writeable(u32 __user *uaddr) { struct mm_struct *mm = current->mm; int ret; mmap_read_lock(mm); ret = fixup_user_fault(mm, (unsigned long)uaddr, FAULT_FLAG_WRITE, NULL); mmap_read_unlock(mm); return ret < 0 ? ret : 0; } /** * futex_top_waiter() - Return the highest priority waiter on a futex * @hb: the hash bucket the futex_q's reside in * @key: the futex key (to distinguish it from other futex futex_q's) * * Must be called with the hb lock held. */ struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key) { struct futex_q *this; plist_for_each_entry(this, &hb->chain, list) { if (futex_match(&this->key, key)) return this; } return NULL; } /** * wait_for_owner_exiting - Block until the owner has exited * @ret: owner's current futex lock status * @exiting: Pointer to the exiting task * * Caller must hold a refcount on @exiting. */ void wait_for_owner_exiting(int ret, struct task_struct *exiting) { if (ret != -EBUSY) { WARN_ON_ONCE(exiting); return; } if (WARN_ON_ONCE(ret == -EBUSY && !exiting)) return; mutex_lock(&exiting->futex_exit_mutex); /* * No point in doing state checking here. If the waiter got here * while the task was in exec()->exec_futex_release() then it can * have any FUTEX_STATE_* value when the waiter has acquired the * mutex. OK, if running, EXITING or DEAD if it reached exit() * already. Highly unlikely and not a problem. Just one more round * through the futex maze. */ mutex_unlock(&exiting->futex_exit_mutex); put_task_struct(exiting); } /** * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket * @q: The futex_q to unqueue * * The q->lock_ptr must not be NULL and must be held by the caller. */ void __futex_unqueue(struct futex_q *q) { struct futex_hash_bucket *hb; if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list))) return; lockdep_assert_held(q->lock_ptr); hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); plist_del(&q->list, &hb->chain); futex_hb_waiters_dec(hb); } /* The key must be already stored in q->key. */ void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb) __acquires(&hb->lock) { /* * Increment the counter before taking the lock so that * a potential waker won't miss a to-be-slept task that is * waiting for the spinlock. This is safe as all futex_q_lock() * users end up calling futex_queue(). Similarly, for housekeeping, * decrement the counter at futex_q_unlock() when some error has * occurred and we don't end up adding the task to the list. */ futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */ q->lock_ptr = &hb->lock; spin_lock(&hb->lock); } void futex_q_unlock(struct futex_hash_bucket *hb) __releases(&hb->lock) { futex_hb_waiters_dec(hb); spin_unlock(&hb->lock); } void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, struct task_struct *task) { int prio; /* * The priority used to register this element is * - either the real thread-priority for the real-time threads * (i.e. threads with a priority lower than MAX_RT_PRIO) * - or MAX_RT_PRIO for non-RT threads. * Thus, all RT-threads are woken first in priority order, and * the others are woken last, in FIFO order. */ prio = min(current->normal_prio, MAX_RT_PRIO); plist_node_init(&q->list, prio); plist_add(&q->list, &hb->chain); q->task = task; } /** * futex_unqueue() - Remove the futex_q from its futex_hash_bucket * @q: The futex_q to unqueue * * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must * be paired with exactly one earlier call to futex_queue(). * * Return: * - 1 - if the futex_q was still queued (and we removed unqueued it); * - 0 - if the futex_q was already removed by the waking thread */ int futex_unqueue(struct futex_q *q) { spinlock_t *lock_ptr; int ret = 0; /* RCU so lock_ptr is not going away during locking. */ guard(rcu)(); /* In the common case we don't take the spinlock, which is nice. */ retry: /* * q->lock_ptr can change between this read and the following spin_lock. * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and * optimizing lock_ptr out of the logic below. */ lock_ptr = READ_ONCE(q->lock_ptr); if (lock_ptr != NULL) { spin_lock(lock_ptr); /* * q->lock_ptr can change between reading it and * spin_lock(), causing us to take the wrong lock. This * corrects the race condition. * * Reasoning goes like this: if we have the wrong lock, * q->lock_ptr must have changed (maybe several times) * between reading it and the spin_lock(). It can * change again after the spin_lock() but only if it was * already changed before the spin_lock(). It cannot, * however, change back to the original value. Therefore * we can detect whether we acquired the correct lock. */ if (unlikely(lock_ptr != q->lock_ptr)) { spin_unlock(lock_ptr); goto retry; } __futex_unqueue(q); BUG_ON(q->pi_state); spin_unlock(lock_ptr); ret = 1; } return ret; } void futex_q_lockptr_lock(struct futex_q *q) { spinlock_t *lock_ptr; /* * See futex_unqueue() why lock_ptr can change. */ guard(rcu)(); retry: lock_ptr = READ_ONCE(q->lock_ptr); spin_lock(lock_ptr); if (unlikely(lock_ptr != q->lock_ptr)) { spin_unlock(lock_ptr); goto retry; } } /* * PI futexes can not be requeued and must remove themselves from the hash * bucket. The hash bucket lock (i.e. lock_ptr) is held. */ void futex_unqueue_pi(struct futex_q *q) { /* * If the lock was not acquired (due to timeout or signal) then the * rt_waiter is removed before futex_q is. If this is observed by * an unlocker after dropping the rtmutex wait lock and before * acquiring the hash bucket lock, then the unlocker dequeues the * futex_q from the hash bucket list to guarantee consistent state * vs. userspace. Therefore the dequeue here must be conditional. */ if (!plist_node_empty(&q->list)) __futex_unqueue(q); BUG_ON(!q->pi_state); put_pi_state(q->pi_state); q->pi_state = NULL; } /* Constants for the pending_op argument of handle_futex_death */ #define HANDLE_DEATH_PENDING true #define HANDLE_DEATH_LIST false /* * Process a futex-list entry, check whether it's owned by the * dying task, and do notification if so: */ static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, bool pi, bool pending_op) { u32 uval, nval, mval; pid_t owner; int err; /* Futex address must be 32bit aligned */ if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) return -1; retry: if (get_user(uval, uaddr)) return -1; /* * Special case for regular (non PI) futexes. The unlock path in * user space has two race scenarios: * * 1. The unlock path releases the user space futex value and * before it can execute the futex() syscall to wake up * waiters it is killed. * * 2. A woken up waiter is killed before it can acquire the * futex in user space. * * In the second case, the wake up notification could be generated * by the unlock path in user space after setting the futex value * to zero or by the kernel after setting the OWNER_DIED bit below. * * In both cases the TID validation below prevents a wakeup of * potential waiters which can cause these waiters to block * forever. * * In both cases the following conditions are met: * * 1) task->robust_list->list_op_pending != NULL * @pending_op == true * 2) The owner part of user space futex value == 0 * 3) Regular futex: @pi == false * * If these conditions are met, it is safe to attempt waking up a * potential waiter without touching the user space futex value and * trying to set the OWNER_DIED bit. If the futex value is zero, * the rest of the user space mutex state is consistent, so a woken * waiter will just take over the uncontended futex. Setting the * OWNER_DIED bit would create inconsistent state and malfunction * of the user space owner died handling. Otherwise, the OWNER_DIED * bit is already set, and the woken waiter is expected to deal with * this. */ owner = uval & FUTEX_TID_MASK; if (pending_op && !pi && !owner) { futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1, FUTEX_BITSET_MATCH_ANY); return 0; } if (owner != task_pid_vnr(curr)) return 0; /* * Ok, this dying thread is truly holding a futex * of interest. Set the OWNER_DIED bit atomically * via cmpxchg, and if the value had FUTEX_WAITERS * set, wake up a waiter (if any). (We have to do a * futex_wake() even if OWNER_DIED is already set - * to handle the rare but possible case of recursive * thread-death.) The rest of the cleanup is done in * userspace. */ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; /* * We are not holding a lock here, but we want to have * the pagefault_disable/enable() protection because * we want to handle the fault gracefully. If the * access fails we try to fault in the futex with R/W * verification via get_user_pages. get_user() above * does not guarantee R/W access. If that fails we * give up and leave the futex locked. */ if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) { switch (err) { case -EFAULT: if (fault_in_user_writeable(uaddr)) return -1; goto retry; case -EAGAIN: cond_resched(); goto retry; default: WARN_ON_ONCE(1); return err; } } if (nval != uval) goto retry; /* * Wake robust non-PI futexes here. The wakeup of * PI futexes happens in exit_pi_state(): */ if (!pi && (uval & FUTEX_WAITERS)) { futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1, FUTEX_BITSET_MATCH_ANY); } return 0; } /* * Fetch a robust-list pointer. Bit 0 signals PI futexes: */ static inline int fetch_robust_entry(struct robust_list __user **entry, struct robust_list __user * __user *head, unsigned int *pi) { unsigned long uentry; if (get_user(uentry, (unsigned long __user *)head)) return -EFAULT; *entry = (void __user *)(uentry & ~1UL); *pi = uentry & 1; return 0; } /* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. * * We silently return on any sign of list-walking problem. */ static void exit_robust_list(struct task_struct *curr) { struct robust_list_head __user *head = curr->robust_list; struct robust_list __user *entry, *next_entry, *pending; unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; unsigned int next_pi; unsigned long futex_offset; int rc; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ if (fetch_robust_entry(&entry, &head->list.next, &pi)) return; /* * Fetch the relative futex offset: */ if (get_user(futex_offset, &head->futex_offset)) return; /* * Fetch any possibly pending lock-add first, and handle it * if it exists: */ if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) return; next_entry = NULL; /* avoid warning with gcc */ while (entry != &head->list) { /* * Fetch the next entry in the list before calling * handle_futex_death: */ rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); /* * A pending lock might already be on the list, so * don't process it twice: */ if (entry != pending) { if (handle_futex_death((void __user *)entry + futex_offset, curr, pi, HANDLE_DEATH_LIST)) return; } if (rc) return; entry = next_entry; pi = next_pi; /* * Avoid excessively long or circular lists: */ if (!--limit) break; cond_resched(); } if (pending) { handle_futex_death((void __user *)pending + futex_offset, curr, pip, HANDLE_DEATH_PENDING); } } #ifdef CONFIG_COMPAT static void __user *futex_uaddr(struct robust_list __user *entry, compat_long_t futex_offset) { compat_uptr_t base = ptr_to_compat(entry); void __user *uaddr = compat_ptr(base + futex_offset); return uaddr; } /* * Fetch a robust-list pointer. Bit 0 signals PI futexes: */ static inline int compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, compat_uptr_t __user *head, unsigned int *pi) { if (get_user(*uentry, head)) return -EFAULT; *entry = compat_ptr((*uentry) & ~1); *pi = (unsigned int)(*uentry) & 1; return 0; } /* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. * * We silently return on any sign of list-walking problem. */ static void compat_exit_robust_list(struct task_struct *curr) { struct compat_robust_list_head __user *head = curr->compat_robust_list; struct robust_list __user *entry, *next_entry, *pending; unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; unsigned int next_pi; compat_uptr_t uentry, next_uentry, upending; compat_long_t futex_offset; int rc; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) return; /* * Fetch the relative futex offset: */ if (get_user(futex_offset, &head->futex_offset)) return; /* * Fetch any possibly pending lock-add first, and handle it * if it exists: */ if (compat_fetch_robust_entry(&upending, &pending, &head->list_op_pending, &pip)) return; next_entry = NULL; /* avoid warning with gcc */ while (entry != (struct robust_list __user *) &head->list) { /* * Fetch the next entry in the list before calling * handle_futex_death: */ rc = compat_fetch_robust_entry(&next_uentry, &next_entry, (compat_uptr_t __user *)&entry->next, &next_pi); /* * A pending lock might already be on the list, so * dont process it twice: */ if (entry != pending) { void __user *uaddr = futex_uaddr(entry, futex_offset); if (handle_futex_death(uaddr, curr, pi, HANDLE_DEATH_LIST)) return; } if (rc) return; uentry = next_uentry; entry = next_entry; pi = next_pi; /* * Avoid excessively long or circular lists: */ if (!--limit) break; cond_resched(); } if (pending) { void __user *uaddr = futex_uaddr(pending, futex_offset); handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING); } } #endif #ifdef CONFIG_FUTEX_PI /* * This task is holding PI mutexes at exit time => bad. * Kernel cleans up PI-state, but userspace is likely hosed. * (Robust-futex cleanup is separate and might save the day for userspace.) */ static void exit_pi_state_list(struct task_struct *curr) { struct list_head *next, *head = &curr->pi_state_list; struct futex_pi_state *pi_state; union futex_key key = FUTEX_KEY_INIT; /* * The mutex mm_struct::futex_hash_lock might be acquired. */ might_sleep(); /* * Ensure the hash remains stable (no resize) during the while loop * below. The hb pointer is acquired under the pi_lock so we can't block * on the mutex. */ WARN_ON(curr != current); guard(private_hash)(); /* * We are a ZOMBIE and nobody can enqueue itself on * pi_state_list anymore, but we have to be careful * versus waiters unqueueing themselves: */ raw_spin_lock_irq(&curr->pi_lock); while (!list_empty(head)) { next = head->next; pi_state = list_entry(next, struct futex_pi_state, list); key = pi_state->key; if (1) { CLASS(hb, hb)(&key); /* * We can race against put_pi_state() removing itself from the * list (a waiter going away). put_pi_state() will first * decrement the reference count and then modify the list, so * its possible to see the list entry but fail this reference * acquire. * * In that case; drop the locks to let put_pi_state() make * progress and retry the loop. */ if (!refcount_inc_not_zero(&pi_state->refcount)) { raw_spin_unlock_irq(&curr->pi_lock); cpu_relax(); raw_spin_lock_irq(&curr->pi_lock); continue; } raw_spin_unlock_irq(&curr->pi_lock); spin_lock(&hb->lock); raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); raw_spin_lock(&curr->pi_lock); /* * We dropped the pi-lock, so re-check whether this * task still owns the PI-state: */ if (head->next != next) { /* retain curr->pi_lock for the loop invariant */ raw_spin_unlock(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); put_pi_state(pi_state); continue; } WARN_ON(pi_state->owner != curr); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); pi_state->owner = NULL; raw_spin_unlock(&curr->pi_lock); raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); } rt_mutex_futex_unlock(&pi_state->pi_mutex); put_pi_state(pi_state); raw_spin_lock_irq(&curr->pi_lock); } raw_spin_unlock_irq(&curr->pi_lock); } #else static inline void exit_pi_state_list(struct task_struct *curr) { } #endif static void futex_cleanup(struct task_struct *tsk) { if (unlikely(tsk->robust_list)) { exit_robust_list(tsk); tsk->robust_list = NULL; } #ifdef CONFIG_COMPAT if (unlikely(tsk->compat_robust_list)) { compat_exit_robust_list(tsk); tsk->compat_robust_list = NULL; } #endif if (unlikely(!list_empty(&tsk->pi_state_list))) exit_pi_state_list(tsk); } /** * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD * @tsk: task to set the state on * * Set the futex exit state of the task lockless. The futex waiter code * observes that state when a task is exiting and loops until the task has * actually finished the futex cleanup. The worst case for this is that the * waiter runs through the wait loop until the state becomes visible. * * This is called from the recursive fault handling path in make_task_dead(). * * This is best effort. Either the futex exit code has run already or * not. If the OWNER_DIED bit has been set on the futex then the waiter can * take it over. If not, the problem is pushed back to user space. If the * futex exit code did not run yet, then an already queued waiter might * block forever, but there is nothing which can be done about that. */ void futex_exit_recursive(struct task_struct *tsk) { /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */ if (tsk->futex_state == FUTEX_STATE_EXITING) mutex_unlock(&tsk->futex_exit_mutex); tsk->futex_state = FUTEX_STATE_DEAD; } static void futex_cleanup_begin(struct task_struct *tsk) { /* * Prevent various race issues against a concurrent incoming waiter * including live locks by forcing the waiter to block on * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in * attach_to_pi_owner(). */ mutex_lock(&tsk->futex_exit_mutex); /* * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock. * * This ensures that all subsequent checks of tsk->futex_state in * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with * tsk->pi_lock held. * * It guarantees also that a pi_state which was queued right before * the state change under tsk->pi_lock by a concurrent waiter must * be observed in exit_pi_state_list(). */ raw_spin_lock_irq(&tsk->pi_lock); tsk->futex_state = FUTEX_STATE_EXITING; raw_spin_unlock_irq(&tsk->pi_lock); } static void futex_cleanup_end(struct task_struct *tsk, int state) { /* * Lockless store. The only side effect is that an observer might * take another loop until it becomes visible. */ tsk->futex_state = state; /* * Drop the exit protection. This unblocks waiters which observed * FUTEX_STATE_EXITING to reevaluate the state. */ mutex_unlock(&tsk->futex_exit_mutex); } void futex_exec_release(struct task_struct *tsk) { /* * The state handling is done for consistency, but in the case of * exec() there is no way to prevent further damage as the PID stays * the same. But for the unlikely and arguably buggy case that a * futex is held on exec(), this provides at least as much state * consistency protection which is possible. */ futex_cleanup_begin(tsk); futex_cleanup(tsk); /* * Reset the state to FUTEX_STATE_OK. The task is alive and about * exec a new binary. */ futex_cleanup_end(tsk, FUTEX_STATE_OK); } void futex_exit_release(struct task_struct *tsk) { futex_cleanup_begin(tsk); futex_cleanup(tsk); futex_cleanup_end(tsk, FUTEX_STATE_DEAD); } static void futex_hash_bucket_init(struct futex_hash_bucket *fhb, struct futex_private_hash *fph) { #ifdef CONFIG_FUTEX_PRIVATE_HASH fhb->priv = fph; #endif atomic_set(&fhb->waiters, 0); plist_head_init(&fhb->chain); spin_lock_init(&fhb->lock); } #define FH_CUSTOM 0x01 #ifdef CONFIG_FUTEX_PRIVATE_HASH /* * futex-ref * * Heavily inspired by percpu-rwsem/percpu-refcount; not reusing any of that * code because it just doesn't fit right. * * Dual counter, per-cpu / atomic approach like percpu-refcount, except it * re-initializes the state automatically, such that the fph swizzle is also a * transition back to per-cpu. */ static void futex_ref_rcu(struct rcu_head *head); static void __futex_ref_atomic_begin(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; /* * The counter we're about to switch to must have fully switched; * otherwise it would be impossible for it to have reported success * from futex_ref_is_dead(). */ WARN_ON_ONCE(atomic_long_read(&mm->futex_atomic) != 0); /* * Set the atomic to the bias value such that futex_ref_{get,put}() * will never observe 0. Will be fixed up in __futex_ref_atomic_end() * when folding in the percpu count. */ atomic_long_set(&mm->futex_atomic, LONG_MAX); smp_store_release(&fph->state, FR_ATOMIC); call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu); } static void __futex_ref_atomic_end(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; unsigned int count = 0; long ret; int cpu; /* * Per __futex_ref_atomic_begin() the state of the fph must be ATOMIC * and per this RCU callback, everybody must now observe this state and * use the atomic variable. */ WARN_ON_ONCE(fph->state != FR_ATOMIC); /* * Therefore the per-cpu counter is now stable, sum and reset. */ for_each_possible_cpu(cpu) { unsigned int *ptr = per_cpu_ptr(mm->futex_ref, cpu); count += *ptr; *ptr = 0; } /* * Re-init for the next cycle. */ this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */ /* * Add actual count, subtract bias and initial refcount. * * The moment this atomic operation happens, futex_ref_is_dead() can * become true. */ ret = atomic_long_add_return(count - LONG_MAX - 1, &mm->futex_atomic); if (!ret) wake_up_var(mm); WARN_ON_ONCE(ret < 0); mmput_async(mm); } static void futex_ref_rcu(struct rcu_head *head) { struct mm_struct *mm = container_of(head, struct mm_struct, futex_rcu); struct futex_private_hash *fph = rcu_dereference_raw(mm->futex_phash); if (fph->state == FR_PERCPU) { /* * Per this extra grace-period, everybody must now observe * fph as the current fph and no previously observed fph's * are in-flight. * * Notably, nobody will now rely on the atomic * futex_ref_is_dead() state anymore so we can begin the * migration of the per-cpu counter into the atomic. */ __futex_ref_atomic_begin(fph); return; } __futex_ref_atomic_end(fph); } /* * Drop the initial refcount and transition to atomics. */ static void futex_ref_drop(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; /* * Can only transition the current fph; */ WARN_ON_ONCE(rcu_dereference_raw(mm->futex_phash) != fph); /* * We enqueue at least one RCU callback. Ensure mm stays if the task * exits before the transition is completed. */ mmget(mm); /* * In order to avoid the following scenario: * * futex_hash() __futex_pivot_hash() * guard(rcu); guard(mm->futex_hash_lock); * fph = mm->futex_phash; * rcu_assign_pointer(&mm->futex_phash, new); * futex_hash_allocate() * futex_ref_drop() * fph->state = FR_ATOMIC; * atomic_set(, BIAS); * * futex_private_hash_get(fph); // OOPS * * Where an old fph (which is FR_ATOMIC) and should fail on * inc_not_zero, will succeed because a new transition is started and * the atomic is bias'ed away from 0. * * There must be at least one full grace-period between publishing a * new fph and trying to replace it. */ if (poll_state_synchronize_rcu(mm->futex_batches)) { /* * There was a grace-period, we can begin now. */ __futex_ref_atomic_begin(fph); return; } call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu); } static bool futex_ref_get(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; guard(preempt)(); if (READ_ONCE(fph->state) == FR_PERCPU) { __this_cpu_inc(*mm->futex_ref); return true; } return atomic_long_inc_not_zero(&mm->futex_atomic); } static bool futex_ref_put(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; guard(preempt)(); if (READ_ONCE(fph->state) == FR_PERCPU) { __this_cpu_dec(*mm->futex_ref); return false; } return atomic_long_dec_and_test(&mm->futex_atomic); } static bool futex_ref_is_dead(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; guard(rcu)(); if (smp_load_acquire(&fph->state) == FR_PERCPU) return false; return atomic_long_read(&mm->futex_atomic) == 0; } int futex_mm_init(struct mm_struct *mm) { mutex_init(&mm->futex_hash_lock); RCU_INIT_POINTER(mm->futex_phash, NULL); mm->futex_phash_new = NULL; /* futex-ref */ mm->futex_ref = NULL; atomic_long_set(&mm->futex_atomic, 0); mm->futex_batches = get_state_synchronize_rcu(); return 0; } void futex_hash_free(struct mm_struct *mm) { struct futex_private_hash *fph; free_percpu(mm->futex_ref); kvfree(mm->futex_phash_new); fph = rcu_dereference_raw(mm->futex_phash); if (fph) kvfree(fph); } static bool futex_pivot_pending(struct mm_struct *mm) { struct futex_private_hash *fph; guard(rcu)(); if (!mm->futex_phash_new) return true; fph = rcu_dereference(mm->futex_phash); return futex_ref_is_dead(fph); } static bool futex_hash_less(struct futex_private_hash *a, struct futex_private_hash *b) { /* user provided always wins */ if (!a->custom && b->custom) return true; if (a->custom && !b->custom) return false; /* zero-sized hash wins */ if (!b->hash_mask) return true; if (!a->hash_mask) return false; /* keep the biggest */ if (a->hash_mask < b->hash_mask) return true; if (a->hash_mask > b->hash_mask) return false; return false; /* equal */ } static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) { struct mm_struct *mm = current->mm; struct futex_private_hash *fph; bool custom = flags & FH_CUSTOM; int i; if (hash_slots && (hash_slots == 1 || !is_power_of_2(hash_slots))) return -EINVAL; /* * Once we've disabled the global hash there is no way back. */ scoped_guard(rcu) { fph = rcu_dereference(mm->futex_phash); if (fph && !fph->hash_mask) { if (custom) return -EBUSY; return 0; } } if (!mm->futex_ref) { /* * This will always be allocated by the first thread and * therefore requires no locking. */ mm->futex_ref = alloc_percpu(unsigned int); if (!mm->futex_ref) return -ENOMEM; this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */ } fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!fph) return -ENOMEM; fph->hash_mask = hash_slots ? hash_slots - 1 : 0; fph->custom = custom; fph->mm = mm; for (i = 0; i < hash_slots; i++) futex_hash_bucket_init(&fph->queues[i], fph); if (custom) { /* * Only let prctl() wait / retry; don't unduly delay clone(). */ again: wait_var_event(mm, futex_pivot_pending(mm)); } scoped_guard(mutex, &mm->futex_hash_lock) { struct futex_private_hash *free __free(kvfree) = NULL; struct futex_private_hash *cur, *new; cur = rcu_dereference_protected(mm->futex_phash, lockdep_is_held(&mm->futex_hash_lock)); new = mm->futex_phash_new; mm->futex_phash_new = NULL; if (fph) { if (cur && !cur->hash_mask) { /* * If two threads simultaneously request the global * hash then the first one performs the switch, * the second one returns here. */ free = fph; mm->futex_phash_new = new; return -EBUSY; } if (cur && !new) { /* * If we have an existing hash, but do not yet have * allocated a replacement hash, drop the initial * reference on the existing hash. */ futex_ref_drop(cur); } if (new) { /* * Two updates raced; throw out the lesser one. */ if (futex_hash_less(new, fph)) { free = new; new = fph; } else { free = fph; } } else { new = fph; } fph = NULL; } if (new) { /* * Will set mm->futex_phash_new on failure; * futex_private_hash_get() will try again. */ if (!__futex_pivot_hash(mm, new) && custom) goto again; } } return 0; } int futex_hash_allocate_default(void) { unsigned int threads, buckets, current_buckets = 0; struct futex_private_hash *fph; if (!current->mm) return 0; scoped_guard(rcu) { threads = min_t(unsigned int, get_nr_threads(current), num_online_cpus()); fph = rcu_dereference(current->mm->futex_phash); if (fph) { if (fph->custom) return 0; current_buckets = fph->hash_mask + 1; } } /* * The default allocation will remain within * 16 <= threads * 4 <= global hash size */ buckets = roundup_pow_of_two(4 * threads); buckets = clamp(buckets, 16, futex_hashmask + 1); if (current_buckets >= buckets) return 0; return futex_hash_allocate(buckets, 0); } static int futex_hash_get_slots(void) { struct futex_private_hash *fph; guard(rcu)(); fph = rcu_dereference(current->mm->futex_phash); if (fph && fph->hash_mask) return fph->hash_mask + 1; return 0; } #else static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) { return -EINVAL; } static int futex_hash_get_slots(void) { return 0; } #endif int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4) { unsigned int flags = FH_CUSTOM; int ret; switch (arg2) { case PR_FUTEX_HASH_SET_SLOTS: if (arg4) return -EINVAL; ret = futex_hash_allocate(arg3, flags); break; case PR_FUTEX_HASH_GET_SLOTS: ret = futex_hash_get_slots(); break; default: ret = -EINVAL; break; } return ret; } static int __init futex_init(void) { unsigned long hashsize, i; unsigned int order, n; unsigned long size; #ifdef CONFIG_BASE_SMALL hashsize = 16; #else hashsize = 256 * num_possible_cpus(); hashsize /= num_possible_nodes(); hashsize = max(4, hashsize); hashsize = roundup_pow_of_two(hashsize); #endif futex_hashshift = ilog2(hashsize); size = sizeof(struct futex_hash_bucket) * hashsize; order = get_order(size); for_each_node(n) { struct futex_hash_bucket *table; if (order > MAX_PAGE_ORDER) table = vmalloc_huge_node(size, GFP_KERNEL, n); else table = alloc_pages_exact_nid(n, size, GFP_KERNEL); BUG_ON(!table); for (i = 0; i < hashsize; i++) futex_hash_bucket_init(&table[i], NULL); futex_queues[n] = table; } futex_hashmask = hashsize - 1; pr_info("futex hash table entries: %lu (%lu bytes on %d NUMA nodes, total %lu KiB, %s).\n", hashsize, size, num_possible_nodes(), size * num_possible_nodes() / 1024, order > MAX_PAGE_ORDER ? "vmalloc" : "linear"); return 0; } core_initcall(futex_init);
22 2269 1274 15 2298 2302 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/export.h> #include <linux/uaccess.h> #include <linux/mm.h> #include <linux/bitops.h> #include <asm/word-at-a-time.h> /* * Do a strnlen, return length of string *with* final '\0'. * 'count' is the user-supplied count, while 'max' is the * address space maximum. * * Return 0 for exceptions (which includes hitting the address * space maximum), or 'count+1' if hitting the user-supplied * maximum count. * * NOTE! We can sometimes overshoot the user-supplied maximum * if it fits in a aligned 'long'. The caller needs to check * the return value against "> max". */ static __always_inline long do_strnlen_user(const char __user *src, unsigned long count, unsigned long max) { const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; unsigned long align, res = 0; unsigned long c; /* * Do everything aligned. But that means that we * need to also expand the maximum.. */ align = (sizeof(unsigned long) - 1) & (unsigned long)src; src -= align; max += align; unsafe_get_user(c, (unsigned long __user *)src, efault); c |= aligned_byte_mask(align); for (;;) { unsigned long data; if (has_zero(c, &data, &constants)) { data = prep_zero_mask(c, data, &constants); data = create_zero_mask(data); return res + find_zero(data) + 1 - align; } res += sizeof(unsigned long); /* We already handled 'unsigned long' bytes. Did we do it all ? */ if (unlikely(max <= sizeof(unsigned long))) break; max -= sizeof(unsigned long); unsafe_get_user(c, (unsigned long __user *)(src+res), efault); } res -= align; /* * Uhhuh. We hit 'max'. But was that the user-specified maximum * too? If so, return the marker for "too long". */ if (res >= count) return count+1; /* * Nope: we hit the address space limit, and we still had more * characters the caller would have wanted. That's 0. */ efault: return 0; } /** * strnlen_user: - Get the size of a user string INCLUDING final NUL. * @str: The string to measure. * @count: Maximum count (including NUL character) * * Context: User context only. This function may sleep if pagefaults are * enabled. * * Get the size of a NUL-terminated string in user space. * * Returns the size of the string INCLUDING the terminating NUL. * If the string is too long, returns a number larger than @count. User * has to check the return value against "> count". * On exception (or invalid count), returns 0. * * NOTE! You should basically never use this function. There is * almost never any valid case for using the length of a user space * string, since the string can be changed at any time by other * threads. Use "strncpy_from_user()" instead to get a stable copy * of the string. */ long strnlen_user(const char __user *str, long count) { unsigned long max_addr, src_addr; if (unlikely(count <= 0)) return 0; if (can_do_masked_user_access()) { long retval; str = masked_user_read_access_begin(str); retval = do_strnlen_user(str, count, count); user_read_access_end(); return retval; } max_addr = TASK_SIZE_MAX; src_addr = (unsigned long)untagged_addr(str); if (likely(src_addr < max_addr)) { unsigned long max = max_addr - src_addr; long retval; /* * Truncate 'max' to the user-specified limit, so that * we only have one limit we need to check in the loop */ if (max > count) max = count; if (user_read_access_begin(str, max)) { retval = do_strnlen_user(str, count, max); user_read_access_end(); return retval; } } return 0; } EXPORT_SYMBOL(strnlen_user);
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Universal TUN/TAP device driver. * Copyright (C) 1999-2000 Maxim Krasnyansky <max_mk@yahoo.com> */ #ifndef __IF_TUN_H #define __IF_TUN_H #include <uapi/linux/if_tun.h> #include <uapi/linux/virtio_net.h> #define TUN_XDP_FLAG 0x1UL #define TUN_MSG_UBUF 1 #define TUN_MSG_PTR 2 struct tun_msg_ctl { unsigned short type; unsigned short num; void *ptr; }; #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) struct socket *tun_get_socket(struct file *); struct ptr_ring *tun_get_tx_ring(struct file *file); static inline bool tun_is_xdp_frame(void *ptr) { return (unsigned long)ptr & TUN_XDP_FLAG; } static inline void *tun_xdp_to_ptr(struct xdp_frame *xdp) { return (void *)((unsigned long)xdp | TUN_XDP_FLAG); } static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr) { return (void *)((unsigned long)ptr & ~TUN_XDP_FLAG); } void tun_ptr_free(void *ptr); #else #include <linux/err.h> #include <linux/errno.h> struct file; struct socket; static inline struct socket *tun_get_socket(struct file *f) { return ERR_PTR(-EINVAL); } static inline struct ptr_ring *tun_get_tx_ring(struct file *f) { return ERR_PTR(-EINVAL); } static inline bool tun_is_xdp_frame(void *ptr) { return false; } static inline void *tun_xdp_to_ptr(struct xdp_frame *xdp) { return NULL; } static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr) { return NULL; } static inline void tun_ptr_free(void *ptr) { } #endif /* CONFIG_TUN */ #endif /* __IF_TUN_H */
37 12 27 27 12 15 24 5 19 1 18 17 16 3 2 2 2 2 3 3 2 2 4 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2021 Facebook */ #include <linux/bitmap.h> #include <linux/bpf.h> #include <linux/btf.h> #include <linux/err.h> #include <linux/jhash.h> #include <linux/random.h> #include <linux/btf_ids.h> #define BLOOM_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_ZERO_SEED | BPF_F_ACCESS_MASK) struct bpf_bloom_filter { struct bpf_map map; u32 bitset_mask; u32 hash_seed; u32 nr_hash_funcs; unsigned long bitset[]; }; static u32 hash(struct bpf_bloom_filter *bloom, void *value, u32 value_size, u32 index) { u32 h; if (likely(value_size % 4 == 0)) h = jhash2(value, value_size / 4, bloom->hash_seed + index); else h = jhash(value, value_size, bloom->hash_seed + index); return h & bloom->bitset_mask; } static long bloom_map_peek_elem(struct bpf_map *map, void *value) { struct bpf_bloom_filter *bloom = container_of(map, struct bpf_bloom_filter, map); u32 i, h; for (i = 0; i < bloom->nr_hash_funcs; i++) { h = hash(bloom, value, map->value_size, i); if (!test_bit(h, bloom->bitset)) return -ENOENT; } return 0; } static long bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags) { struct bpf_bloom_filter *bloom = container_of(map, struct bpf_bloom_filter, map); u32 i, h; if (flags != BPF_ANY) return -EINVAL; for (i = 0; i < bloom->nr_hash_funcs; i++) { h = hash(bloom, value, map->value_size, i); set_bit(h, bloom->bitset); } return 0; } static long bloom_map_pop_elem(struct bpf_map *map, void *value) { return -EOPNOTSUPP; } static long bloom_map_delete_elem(struct bpf_map *map, void *value) { return -EOPNOTSUPP; } static int bloom_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { return -EOPNOTSUPP; } /* Called from syscall */ static int bloom_map_alloc_check(union bpf_attr *attr) { if (attr->value_size > KMALLOC_MAX_SIZE) /* if value_size is bigger, the user space won't be able to * access the elements. */ return -E2BIG; return 0; } static struct bpf_map *bloom_map_alloc(union bpf_attr *attr) { u32 bitset_bytes, bitset_mask, nr_hash_funcs, nr_bits; int numa_node = bpf_map_attr_numa_node(attr); struct bpf_bloom_filter *bloom; if (attr->key_size != 0 || attr->value_size == 0 || attr->max_entries == 0 || attr->map_flags & ~BLOOM_CREATE_FLAG_MASK || !bpf_map_flags_access_ok(attr->map_flags) || /* The lower 4 bits of map_extra (0xF) specify the number * of hash functions */ (attr->map_extra & ~0xF)) return ERR_PTR(-EINVAL); nr_hash_funcs = attr->map_extra; if (nr_hash_funcs == 0) /* Default to using 5 hash functions if unspecified */ nr_hash_funcs = 5; /* For the bloom filter, the optimal bit array size that minimizes the * false positive probability is n * k / ln(2) where n is the number of * expected entries in the bloom filter and k is the number of hash * functions. We use 7 / 5 to approximate 1 / ln(2). * * We round this up to the nearest power of two to enable more efficient * hashing using bitmasks. The bitmask will be the bit array size - 1. * * If this overflows a u32, the bit array size will have 2^32 (4 * GB) bits. */ if (check_mul_overflow(attr->max_entries, nr_hash_funcs, &nr_bits) || check_mul_overflow(nr_bits / 5, (u32)7, &nr_bits) || nr_bits > (1UL << 31)) { /* The bit array size is 2^32 bits but to avoid overflowing the * u32, we use U32_MAX, which will round up to the equivalent * number of bytes */ bitset_bytes = BITS_TO_BYTES(U32_MAX); bitset_mask = U32_MAX; } else { if (nr_bits <= BITS_PER_LONG) nr_bits = BITS_PER_LONG; else nr_bits = roundup_pow_of_two(nr_bits); bitset_bytes = BITS_TO_BYTES(nr_bits); bitset_mask = nr_bits - 1; } bitset_bytes = roundup(bitset_bytes, sizeof(unsigned long)); bloom = bpf_map_area_alloc(sizeof(*bloom) + bitset_bytes, numa_node); if (!bloom) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&bloom->map, attr); bloom->nr_hash_funcs = nr_hash_funcs; bloom->bitset_mask = bitset_mask; if (!(attr->map_flags & BPF_F_ZERO_SEED)) bloom->hash_seed = get_random_u32(); return &bloom->map; } static void bloom_map_free(struct bpf_map *map) { struct bpf_bloom_filter *bloom = container_of(map, struct bpf_bloom_filter, map); bpf_map_area_free(bloom); } static void *bloom_map_lookup_elem(struct bpf_map *map, void *key) { /* The eBPF program should use map_peek_elem instead */ return ERR_PTR(-EINVAL); } static long bloom_map_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { /* The eBPF program should use map_push_elem instead */ return -EINVAL; } static int bloom_map_check_btf(const struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { /* Bloom filter maps are keyless */ return btf_type_is_void(key_type) ? 0 : -EINVAL; } static u64 bloom_map_mem_usage(const struct bpf_map *map) { struct bpf_bloom_filter *bloom; u64 bitset_bytes; bloom = container_of(map, struct bpf_bloom_filter, map); bitset_bytes = BITS_TO_BYTES((u64)bloom->bitset_mask + 1); bitset_bytes = roundup(bitset_bytes, sizeof(unsigned long)); return sizeof(*bloom) + bitset_bytes; } BTF_ID_LIST_SINGLE(bpf_bloom_map_btf_ids, struct, bpf_bloom_filter) const struct bpf_map_ops bloom_filter_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bloom_map_alloc_check, .map_alloc = bloom_map_alloc, .map_free = bloom_map_free, .map_get_next_key = bloom_map_get_next_key, .map_push_elem = bloom_map_push_elem, .map_peek_elem = bloom_map_peek_elem, .map_pop_elem = bloom_map_pop_elem, .map_lookup_elem = bloom_map_lookup_elem, .map_update_elem = bloom_map_update_elem, .map_delete_elem = bloom_map_delete_elem, .map_check_btf = bloom_map_check_btf, .map_mem_usage = bloom_map_mem_usage, .map_btf_id = &bpf_bloom_map_btf_ids[0], };
1 17 6 22 2 417 325 24 1 23 24 24 24 24 376 366 34 440 441 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 // SPDX-License-Identifier: GPL-2.0-only /* * * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov> */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/pagemap.h> #include <linux/mount.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/statfs.h> #include <linux/magic.h> #include <linux/fscache.h> #include <linux/fs_context.h> #include <net/9p/9p.h> #include <net/9p/client.h> #include "v9fs.h" #include "v9fs_vfs.h" #include "fid.h" #include "xattr.h" #include "acl.h" static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl; static int v9fs_fill_super(struct super_block *sb) { int ret; struct v9fs_session_info *v9ses = v9ses = sb->s_fs_info; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize_bits = fls(v9ses->maxdata - 1); sb->s_blocksize = 1 << sb->s_blocksize_bits; sb->s_magic = V9FS_MAGIC; if (v9fs_proto_dotl(v9ses)) { sb->s_op = &v9fs_super_ops_dotl; if (!(v9ses->flags & V9FS_NO_XATTR)) sb->s_xattr = v9fs_xattr_handlers; } else { sb->s_op = &v9fs_super_ops; sb->s_time_max = U32_MAX; } sb->s_time_min = 0; ret = super_setup_bdi(sb); if (ret) return ret; if (!v9ses->cache) { sb->s_bdi->ra_pages = 0; sb->s_bdi->io_pages = 0; } else { sb->s_bdi->ra_pages = v9ses->maxdata >> PAGE_SHIFT; sb->s_bdi->io_pages = v9ses->maxdata >> PAGE_SHIFT; } sb->s_flags |= SB_ACTIVE; #ifdef CONFIG_9P_FS_POSIX_ACL if ((v9ses->flags & V9FS_ACL_MASK) == V9FS_POSIX_ACL) sb->s_flags |= SB_POSIXACL; #endif return 0; } /** * v9fs_get_tree - create the mountable root and superblock * @fc: the filesystem context * */ static int v9fs_get_tree(struct fs_context *fc) { struct super_block *sb = NULL; struct inode *inode = NULL; struct dentry *root = NULL; struct v9fs_session_info *v9ses = NULL; struct p9_fid *fid; int retval = 0; p9_debug(P9_DEBUG_VFS, "\n"); v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); if (!v9ses) return -ENOMEM; fid = v9fs_session_init(v9ses, fc); if (IS_ERR(fid)) { retval = PTR_ERR(fid); goto free_session; } fc->s_fs_info = v9ses; sb = sget_fc(fc, NULL, set_anon_super_fc); if (IS_ERR(sb)) { retval = PTR_ERR(sb); goto clunk_fid; } retval = v9fs_fill_super(sb); if (retval) goto release_sb; if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { set_default_d_op(sb, &v9fs_cached_dentry_operations); } else { set_default_d_op(sb, &v9fs_dentry_operations); sb->s_d_flags |= DCACHE_DONTCACHE; } inode = v9fs_get_new_inode_from_fid(v9ses, fid, sb); if (IS_ERR(inode)) { retval = PTR_ERR(inode); goto release_sb; } root = d_make_root(inode); if (!root) { retval = -ENOMEM; goto release_sb; } sb->s_root = root; retval = v9fs_get_acl(inode, fid); if (retval) goto release_sb; v9fs_fid_add(root, &fid); p9_debug(P9_DEBUG_VFS, " simple set mount, return 0\n"); fc->root = dget(sb->s_root); return 0; clunk_fid: p9_fid_put(fid); v9fs_session_close(v9ses); free_session: kfree(v9ses); return retval; release_sb: /* * we will do the session_close and root dentry release * in the below call. But we need to clunk fid, because we haven't * attached the fid to dentry so it won't get clunked * automatically. */ p9_fid_put(fid); deactivate_locked_super(sb); return retval; } /** * v9fs_kill_super - Kill Superblock * @s: superblock * */ static void v9fs_kill_super(struct super_block *s) { struct v9fs_session_info *v9ses = s->s_fs_info; p9_debug(P9_DEBUG_VFS, " %p\n", s); kill_anon_super(s); v9fs_session_cancel(v9ses); v9fs_session_close(v9ses); kfree(v9ses); s->s_fs_info = NULL; p9_debug(P9_DEBUG_VFS, "exiting kill_super\n"); } static void v9fs_umount_begin(struct super_block *sb) { struct v9fs_session_info *v9ses; v9ses = sb->s_fs_info; v9fs_session_begin_cancel(v9ses); } static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct v9fs_session_info *v9ses; struct p9_fid *fid; struct p9_rstatfs rs; int res; fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) { res = PTR_ERR(fid); goto done; } v9ses = v9fs_dentry2v9ses(dentry); if (v9fs_proto_dotl(v9ses)) { res = p9_client_statfs(fid, &rs); if (res == 0) { buf->f_type = rs.type; buf->f_bsize = rs.bsize; buf->f_blocks = rs.blocks; buf->f_bfree = rs.bfree; buf->f_bavail = rs.bavail; buf->f_files = rs.files; buf->f_ffree = rs.ffree; buf->f_fsid = u64_to_fsid(rs.fsid); buf->f_namelen = rs.namelen; } if (res != -ENOSYS) goto done; } res = simple_statfs(dentry, buf); done: p9_fid_put(fid); return res; } static int v9fs_drop_inode(struct inode *inode) { struct v9fs_session_info *v9ses; v9ses = v9fs_inode2v9ses(inode); if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) return inode_generic_drop(inode); /* * in case of non cached mode always drop the * inode because we want the inode attribute * to always match that on the server. */ return 1; } static int v9fs_write_inode(struct inode *inode, struct writeback_control *wbc) { /* * send an fsync request to server irrespective of * wbc->sync_mode. */ p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); return netfs_unpin_writeback(inode, wbc); } static int v9fs_write_inode_dotl(struct inode *inode, struct writeback_control *wbc) { p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); return netfs_unpin_writeback(inode, wbc); } static const struct super_operations v9fs_super_ops = { .alloc_inode = v9fs_alloc_inode, .free_inode = v9fs_free_inode, .statfs = simple_statfs, .drop_inode = v9fs_drop_inode, .evict_inode = v9fs_evict_inode, .show_options = v9fs_show_options, .umount_begin = v9fs_umount_begin, .write_inode = v9fs_write_inode, }; static const struct super_operations v9fs_super_ops_dotl = { .alloc_inode = v9fs_alloc_inode, .free_inode = v9fs_free_inode, .statfs = v9fs_statfs, .drop_inode = v9fs_drop_inode, .evict_inode = v9fs_evict_inode, .show_options = v9fs_show_options, .umount_begin = v9fs_umount_begin, .write_inode = v9fs_write_inode_dotl, }; static void v9fs_free_fc(struct fs_context *fc) { struct v9fs_context *ctx = fc->fs_private; if (!ctx) return; /* These should be NULL by now but guard against leaks */ kfree(ctx->session_opts.uname); kfree(ctx->session_opts.aname); #ifdef CONFIG_9P_FSCACHE kfree(ctx->session_opts.cachetag); #endif if (ctx->client_opts.trans_mod) v9fs_put_trans(ctx->client_opts.trans_mod); kfree(ctx); } static const struct fs_context_operations v9fs_context_ops = { .parse_param = v9fs_parse_param, .get_tree = v9fs_get_tree, .free = v9fs_free_fc, }; static int v9fs_init_fs_context(struct fs_context *fc) { struct v9fs_context *ctx; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; /* initialize core options */ ctx->session_opts.afid = ~0; ctx->session_opts.cache = CACHE_NONE; ctx->session_opts.session_lock_timeout = P9_LOCK_TIMEOUT; ctx->session_opts.uname = kstrdup(V9FS_DEFUSER, GFP_KERNEL); if (!ctx->session_opts.uname) goto error; ctx->session_opts.aname = kstrdup(V9FS_DEFANAME, GFP_KERNEL); if (!ctx->session_opts.aname) goto error; ctx->session_opts.uid = INVALID_UID; ctx->session_opts.dfltuid = V9FS_DEFUID; ctx->session_opts.dfltgid = V9FS_DEFGID; /* initialize client options */ ctx->client_opts.proto_version = p9_proto_2000L; ctx->client_opts.msize = DEFAULT_MSIZE; /* initialize fd transport options */ ctx->fd_opts.port = P9_FD_PORT; ctx->fd_opts.rfd = ~0; ctx->fd_opts.wfd = ~0; ctx->fd_opts.privport = false; /* initialize rdma transport options */ ctx->rdma_opts.port = P9_RDMA_PORT; ctx->rdma_opts.sq_depth = P9_RDMA_SQ_DEPTH; ctx->rdma_opts.rq_depth = P9_RDMA_RQ_DEPTH; ctx->rdma_opts.timeout = P9_RDMA_TIMEOUT; ctx->rdma_opts.privport = false; fc->ops = &v9fs_context_ops; fc->fs_private = ctx; return 0; error: fc->need_free = 1; return -ENOMEM; } struct file_system_type v9fs_fs_type = { .name = "9p", .kill_sb = v9fs_kill_super, .owner = THIS_MODULE, .fs_flags = FS_RENAME_DOES_D_MOVE, .init_fs_context = v9fs_init_fs_context, .parameters = v9fs_param_spec, }; MODULE_ALIAS_FS("9p");
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 // SPDX-License-Identifier: GPL-2.0 #include <linux/build_bug.h> #include <linux/errno.h> #include <linux/errname.h> #include <linux/kernel.h> #include <linux/math.h> /* * Ensure these tables do not accidentally become gigantic if some * huge errno makes it in. On most architectures, the first table will * only have about 140 entries, but mips and parisc have more sparsely * allocated errnos (with EHWPOISON = 257 on parisc, and EDQUOT = 1133 * on mips), so this wastes a bit of space on those - though we * special case the EDQUOT case. */ #define E(err) [err + BUILD_BUG_ON_ZERO(err <= 0 || err > 300)] = "-" #err static const char *names_0[] = { E(E2BIG), E(EACCES), E(EADDRINUSE), E(EADDRNOTAVAIL), E(EADV), E(EAFNOSUPPORT), E(EAGAIN), /* EWOULDBLOCK */ E(EALREADY), E(EBADE), E(EBADF), E(EBADFD), E(EBADMSG), E(EBADR), E(EBADRQC), E(EBADSLT), E(EBFONT), E(EBUSY), E(ECANCELED), /* ECANCELLED */ E(ECHILD), E(ECHRNG), E(ECOMM), E(ECONNABORTED), E(ECONNREFUSED), /* EREFUSED */ E(ECONNRESET), E(EDEADLK), /* EDEADLOCK */ #if EDEADLK != EDEADLOCK /* mips, sparc, powerpc */ E(EDEADLOCK), #endif E(EDESTADDRREQ), E(EDOM), E(EDOTDOT), #ifndef CONFIG_MIPS E(EDQUOT), #endif E(EEXIST), E(EFAULT), E(EFBIG), E(EHOSTDOWN), E(EHOSTUNREACH), E(EHWPOISON), E(EIDRM), E(EILSEQ), #ifdef EINIT E(EINIT), #endif E(EINPROGRESS), E(EINTR), E(EINVAL), E(EIO), E(EISCONN), E(EISDIR), E(EISNAM), E(EKEYEXPIRED), E(EKEYREJECTED), E(EKEYREVOKED), E(EL2HLT), E(EL2NSYNC), E(EL3HLT), E(EL3RST), E(ELIBACC), E(ELIBBAD), E(ELIBEXEC), E(ELIBMAX), E(ELIBSCN), E(ELNRNG), E(ELOOP), E(EMEDIUMTYPE), E(EMFILE), E(EMLINK), E(EMSGSIZE), E(EMULTIHOP), E(ENAMETOOLONG), E(ENAVAIL), E(ENETDOWN), E(ENETRESET), E(ENETUNREACH), E(ENFILE), E(ENOANO), E(ENOBUFS), E(ENOCSI), E(ENODATA), E(ENODEV), E(ENOENT), E(ENOEXEC), E(ENOKEY), E(ENOLCK), E(ENOLINK), E(ENOMEDIUM), E(ENOMEM), E(ENOMSG), E(ENONET), E(ENOPKG), E(ENOPROTOOPT), E(ENOSPC), E(ENOSR), E(ENOSTR), E(ENOSYS), E(ENOTBLK), E(ENOTCONN), E(ENOTDIR), E(ENOTEMPTY), E(ENOTNAM), E(ENOTRECOVERABLE), E(ENOTSOCK), E(ENOTTY), E(ENOTUNIQ), E(ENXIO), E(EOPNOTSUPP), E(EOVERFLOW), E(EOWNERDEAD), E(EPERM), E(EPFNOSUPPORT), E(EPIPE), #ifdef EPROCLIM E(EPROCLIM), #endif E(EPROTO), E(EPROTONOSUPPORT), E(EPROTOTYPE), E(ERANGE), E(EREMCHG), #ifdef EREMDEV E(EREMDEV), #endif E(EREMOTE), E(EREMOTEIO), E(ERESTART), E(ERFKILL), E(EROFS), #ifdef ERREMOTE E(ERREMOTE), #endif E(ESHUTDOWN), E(ESOCKTNOSUPPORT), E(ESPIPE), E(ESRCH), E(ESRMNT), E(ESTALE), E(ESTRPIPE), E(ETIME), E(ETIMEDOUT), E(ETOOMANYREFS), E(ETXTBSY), E(EUCLEAN), E(EUNATCH), E(EUSERS), E(EXDEV), E(EXFULL), }; #undef E #ifdef EREFUSED /* parisc */ static_assert(EREFUSED == ECONNREFUSED); #endif #ifdef ECANCELLED /* parisc */ static_assert(ECANCELLED == ECANCELED); #endif static_assert(EAGAIN == EWOULDBLOCK); /* everywhere */ #define E(err) [err - 512 + BUILD_BUG_ON_ZERO(err < 512 || err > 550)] = "-" #err static const char *names_512[] = { E(ERESTARTSYS), E(ERESTARTNOINTR), E(ERESTARTNOHAND), E(ENOIOCTLCMD), E(ERESTART_RESTARTBLOCK), E(EPROBE_DEFER), E(EOPENSTALE), E(ENOPARAM), E(EBADHANDLE), E(ENOTSYNC), E(EBADCOOKIE), E(ENOTSUPP), E(ETOOSMALL), E(ESERVERFAULT), E(EBADTYPE), E(EJUKEBOX), E(EIOCBQUEUED), E(ERECALLCONFLICT), }; #undef E static const char *__errname(unsigned err) { if (err < ARRAY_SIZE(names_0)) return names_0[err]; if (err >= 512 && err - 512 < ARRAY_SIZE(names_512)) return names_512[err - 512]; /* But why? */ if (IS_ENABLED(CONFIG_MIPS) && err == EDQUOT) /* 1133 */ return "-EDQUOT"; return NULL; } /* * errname(EIO) -> "EIO" * errname(-EIO) -> "-EIO" */ const char *errname(int err) { const char *name = __errname(abs(err)); if (!name) return NULL; return err > 0 ? name + 1 : name; } EXPORT_SYMBOL(errname);
64 64 3 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 // SPDX-License-Identifier: GPL-2.0-or-later // Copyright (c) 2020, Nikolay Aleksandrov <nikolay@nvidia.com> #include <linux/err.h> #include <linux/export.h> #include <linux/if_ether.h> #include <linux/igmp.h> #include <linux/in.h> #include <linux/jhash.h> #include <linux/kernel.h> #include <linux/log2.h> #include <linux/netdevice.h> #include <linux/netfilter_bridge.h> #include <linux/random.h> #include <linux/rculist.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/timer.h> #include <linux/inetdevice.h> #include <linux/mroute.h> #include <net/ip.h> #include <net/switchdev.h> #if IS_ENABLED(CONFIG_IPV6) #include <linux/icmpv6.h> #include <net/ipv6.h> #include <net/mld.h> #include <net/ip6_checksum.h> #include <net/addrconf.h> #endif #include "br_private.h" #include "br_private_mcast_eht.h" static bool br_multicast_del_eht_set_entry(struct net_bridge_port_group *pg, union net_bridge_eht_addr *src_addr, union net_bridge_eht_addr *h_addr); static void br_multicast_create_eht_set_entry(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, union net_bridge_eht_addr *src_addr, union net_bridge_eht_addr *h_addr, int filter_mode, bool allow_zero_src); static struct net_bridge_group_eht_host * br_multicast_eht_host_lookup(struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr) { struct rb_node *node = pg->eht_host_tree.rb_node; while (node) { struct net_bridge_group_eht_host *this; int result; this = rb_entry(node, struct net_bridge_group_eht_host, rb_node); result = memcmp(h_addr, &this->h_addr, sizeof(*h_addr)); if (result < 0) node = node->rb_left; else if (result > 0) node = node->rb_right; else return this; } return NULL; } static int br_multicast_eht_host_filter_mode(struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr) { struct net_bridge_group_eht_host *eht_host; eht_host = br_multicast_eht_host_lookup(pg, h_addr); if (!eht_host) return MCAST_INCLUDE; return eht_host->filter_mode; } static struct net_bridge_group_eht_set_entry * br_multicast_eht_set_entry_lookup(struct net_bridge_group_eht_set *eht_set, union net_bridge_eht_addr *h_addr) { struct rb_node *node = eht_set->entry_tree.rb_node; while (node) { struct net_bridge_group_eht_set_entry *this; int result; this = rb_entry(node, struct net_bridge_group_eht_set_entry, rb_node); result = memcmp(h_addr, &this->h_addr, sizeof(*h_addr)); if (result < 0) node = node->rb_left; else if (result > 0) node = node->rb_right; else return this; } return NULL; } static struct net_bridge_group_eht_set * br_multicast_eht_set_lookup(struct net_bridge_port_group *pg, union net_bridge_eht_addr *src_addr) { struct rb_node *node = pg->eht_set_tree.rb_node; while (node) { struct net_bridge_group_eht_set *this; int result; this = rb_entry(node, struct net_bridge_group_eht_set, rb_node); result = memcmp(src_addr, &this->src_addr, sizeof(*src_addr)); if (result < 0) node = node->rb_left; else if (result > 0) node = node->rb_right; else return this; } return NULL; } static void __eht_destroy_host(struct net_bridge_group_eht_host *eht_host) { WARN_ON(!hlist_empty(&eht_host->set_entries)); br_multicast_eht_hosts_dec(eht_host->pg); rb_erase(&eht_host->rb_node, &eht_host->pg->eht_host_tree); RB_CLEAR_NODE(&eht_host->rb_node); kfree(eht_host); } static void br_multicast_destroy_eht_set_entry(struct net_bridge_mcast_gc *gc) { struct net_bridge_group_eht_set_entry *set_h; set_h = container_of(gc, struct net_bridge_group_eht_set_entry, mcast_gc); WARN_ON(!RB_EMPTY_NODE(&set_h->rb_node)); timer_shutdown_sync(&set_h->timer); kfree(set_h); } static void br_multicast_destroy_eht_set(struct net_bridge_mcast_gc *gc) { struct net_bridge_group_eht_set *eht_set; eht_set = container_of(gc, struct net_bridge_group_eht_set, mcast_gc); WARN_ON(!RB_EMPTY_NODE(&eht_set->rb_node)); WARN_ON(!RB_EMPTY_ROOT(&eht_set->entry_tree)); timer_shutdown_sync(&eht_set->timer); kfree(eht_set); } static void __eht_del_set_entry(struct net_bridge_group_eht_set_entry *set_h) { struct net_bridge_group_eht_host *eht_host = set_h->h_parent; union net_bridge_eht_addr zero_addr; rb_erase(&set_h->rb_node, &set_h->eht_set->entry_tree); RB_CLEAR_NODE(&set_h->rb_node); hlist_del_init(&set_h->host_list); memset(&zero_addr, 0, sizeof(zero_addr)); if (memcmp(&set_h->h_addr, &zero_addr, sizeof(zero_addr))) eht_host->num_entries--; hlist_add_head(&set_h->mcast_gc.gc_node, &set_h->br->mcast_gc_list); queue_work(system_long_wq, &set_h->br->mcast_gc_work); if (hlist_empty(&eht_host->set_entries)) __eht_destroy_host(eht_host); } static void br_multicast_del_eht_set(struct net_bridge_group_eht_set *eht_set) { struct net_bridge_group_eht_set_entry *set_h; struct rb_node *node; while ((node = rb_first(&eht_set->entry_tree))) { set_h = rb_entry(node, struct net_bridge_group_eht_set_entry, rb_node); __eht_del_set_entry(set_h); } rb_erase(&eht_set->rb_node, &eht_set->pg->eht_set_tree); RB_CLEAR_NODE(&eht_set->rb_node); hlist_add_head(&eht_set->mcast_gc.gc_node, &eht_set->br->mcast_gc_list); queue_work(system_long_wq, &eht_set->br->mcast_gc_work); } void br_multicast_eht_clean_sets(struct net_bridge_port_group *pg) { struct net_bridge_group_eht_set *eht_set; struct rb_node *node; while ((node = rb_first(&pg->eht_set_tree))) { eht_set = rb_entry(node, struct net_bridge_group_eht_set, rb_node); br_multicast_del_eht_set(eht_set); } } static void br_multicast_eht_set_entry_expired(struct timer_list *t) { struct net_bridge_group_eht_set_entry *set_h = timer_container_of(set_h, t, timer); struct net_bridge *br = set_h->br; spin_lock(&br->multicast_lock); if (RB_EMPTY_NODE(&set_h->rb_node) || timer_pending(&set_h->timer)) goto out; br_multicast_del_eht_set_entry(set_h->eht_set->pg, &set_h->eht_set->src_addr, &set_h->h_addr); out: spin_unlock(&br->multicast_lock); } static void br_multicast_eht_set_expired(struct timer_list *t) { struct net_bridge_group_eht_set *eht_set = timer_container_of(eht_set, t, timer); struct net_bridge *br = eht_set->br; spin_lock(&br->multicast_lock); if (RB_EMPTY_NODE(&eht_set->rb_node) || timer_pending(&eht_set->timer)) goto out; br_multicast_del_eht_set(eht_set); out: spin_unlock(&br->multicast_lock); } static struct net_bridge_group_eht_host * __eht_lookup_create_host(struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, unsigned char filter_mode) { struct rb_node **link = &pg->eht_host_tree.rb_node, *parent = NULL; struct net_bridge_group_eht_host *eht_host; while (*link) { struct net_bridge_group_eht_host *this; int result; this = rb_entry(*link, struct net_bridge_group_eht_host, rb_node); result = memcmp(h_addr, &this->h_addr, sizeof(*h_addr)); parent = *link; if (result < 0) link = &((*link)->rb_left); else if (result > 0) link = &((*link)->rb_right); else return this; } if (br_multicast_eht_hosts_over_limit(pg)) return NULL; eht_host = kzalloc(sizeof(*eht_host), GFP_ATOMIC); if (!eht_host) return NULL; memcpy(&eht_host->h_addr, h_addr, sizeof(*h_addr)); INIT_HLIST_HEAD(&eht_host->set_entries); eht_host->pg = pg; eht_host->filter_mode = filter_mode; rb_link_node(&eht_host->rb_node, parent, link); rb_insert_color(&eht_host->rb_node, &pg->eht_host_tree); br_multicast_eht_hosts_inc(pg); return eht_host; } static struct net_bridge_group_eht_set_entry * __eht_lookup_create_set_entry(struct net_bridge *br, struct net_bridge_group_eht_set *eht_set, struct net_bridge_group_eht_host *eht_host, bool allow_zero_src) { struct rb_node **link = &eht_set->entry_tree.rb_node, *parent = NULL; struct net_bridge_group_eht_set_entry *set_h; while (*link) { struct net_bridge_group_eht_set_entry *this; int result; this = rb_entry(*link, struct net_bridge_group_eht_set_entry, rb_node); result = memcmp(&eht_host->h_addr, &this->h_addr, sizeof(union net_bridge_eht_addr)); parent = *link; if (result < 0) link = &((*link)->rb_left); else if (result > 0) link = &((*link)->rb_right); else return this; } /* always allow auto-created zero entry */ if (!allow_zero_src && eht_host->num_entries >= PG_SRC_ENT_LIMIT) return NULL; set_h = kzalloc(sizeof(*set_h), GFP_ATOMIC); if (!set_h) return NULL; memcpy(&set_h->h_addr, &eht_host->h_addr, sizeof(union net_bridge_eht_addr)); set_h->mcast_gc.destroy = br_multicast_destroy_eht_set_entry; set_h->eht_set = eht_set; set_h->h_parent = eht_host; set_h->br = br; timer_setup(&set_h->timer, br_multicast_eht_set_entry_expired, 0); hlist_add_head(&set_h->host_list, &eht_host->set_entries); rb_link_node(&set_h->rb_node, parent, link); rb_insert_color(&set_h->rb_node, &eht_set->entry_tree); /* we must not count the auto-created zero entry otherwise we won't be * able to track the full list of PG_SRC_ENT_LIMIT entries */ if (!allow_zero_src) eht_host->num_entries++; return set_h; } static struct net_bridge_group_eht_set * __eht_lookup_create_set(struct net_bridge_port_group *pg, union net_bridge_eht_addr *src_addr) { struct rb_node **link = &pg->eht_set_tree.rb_node, *parent = NULL; struct net_bridge_group_eht_set *eht_set; while (*link) { struct net_bridge_group_eht_set *this; int result; this = rb_entry(*link, struct net_bridge_group_eht_set, rb_node); result = memcmp(src_addr, &this->src_addr, sizeof(*src_addr)); parent = *link; if (result < 0) link = &((*link)->rb_left); else if (result > 0) link = &((*link)->rb_right); else return this; } eht_set = kzalloc(sizeof(*eht_set), GFP_ATOMIC); if (!eht_set) return NULL; memcpy(&eht_set->src_addr, src_addr, sizeof(*src_addr)); eht_set->mcast_gc.destroy = br_multicast_destroy_eht_set; eht_set->pg = pg; eht_set->br = pg->key.port->br; eht_set->entry_tree = RB_ROOT; timer_setup(&eht_set->timer, br_multicast_eht_set_expired, 0); rb_link_node(&eht_set->rb_node, parent, link); rb_insert_color(&eht_set->rb_node, &pg->eht_set_tree); return eht_set; } static void br_multicast_ip_src_to_eht_addr(const struct br_ip *src, union net_bridge_eht_addr *dest) { switch (src->proto) { case htons(ETH_P_IP): dest->ip4 = src->src.ip4; break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): memcpy(&dest->ip6, &src->src.ip6, sizeof(struct in6_addr)); break; #endif } } static void br_eht_convert_host_filter_mode(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, int filter_mode) { struct net_bridge_group_eht_host *eht_host; union net_bridge_eht_addr zero_addr; eht_host = br_multicast_eht_host_lookup(pg, h_addr); if (eht_host) eht_host->filter_mode = filter_mode; memset(&zero_addr, 0, sizeof(zero_addr)); switch (filter_mode) { case MCAST_INCLUDE: br_multicast_del_eht_set_entry(pg, &zero_addr, h_addr); break; case MCAST_EXCLUDE: br_multicast_create_eht_set_entry(brmctx, pg, &zero_addr, h_addr, MCAST_EXCLUDE, true); break; } } static void br_multicast_create_eht_set_entry(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, union net_bridge_eht_addr *src_addr, union net_bridge_eht_addr *h_addr, int filter_mode, bool allow_zero_src) { struct net_bridge_group_eht_set_entry *set_h; struct net_bridge_group_eht_host *eht_host; struct net_bridge *br = pg->key.port->br; struct net_bridge_group_eht_set *eht_set; union net_bridge_eht_addr zero_addr; memset(&zero_addr, 0, sizeof(zero_addr)); if (!allow_zero_src && !memcmp(src_addr, &zero_addr, sizeof(zero_addr))) return; eht_set = __eht_lookup_create_set(pg, src_addr); if (!eht_set) return; eht_host = __eht_lookup_create_host(pg, h_addr, filter_mode); if (!eht_host) goto fail_host; set_h = __eht_lookup_create_set_entry(br, eht_set, eht_host, allow_zero_src); if (!set_h) goto fail_set_entry; mod_timer(&set_h->timer, jiffies + br_multicast_gmi(brmctx)); mod_timer(&eht_set->timer, jiffies + br_multicast_gmi(brmctx)); return; fail_set_entry: if (hlist_empty(&eht_host->set_entries)) __eht_destroy_host(eht_host); fail_host: if (RB_EMPTY_ROOT(&eht_set->entry_tree)) br_multicast_del_eht_set(eht_set); } static bool br_multicast_del_eht_set_entry(struct net_bridge_port_group *pg, union net_bridge_eht_addr *src_addr, union net_bridge_eht_addr *h_addr) { struct net_bridge_group_eht_set_entry *set_h; struct net_bridge_group_eht_set *eht_set; bool set_deleted = false; eht_set = br_multicast_eht_set_lookup(pg, src_addr); if (!eht_set) goto out; set_h = br_multicast_eht_set_entry_lookup(eht_set, h_addr); if (!set_h) goto out; __eht_del_set_entry(set_h); if (RB_EMPTY_ROOT(&eht_set->entry_tree)) { br_multicast_del_eht_set(eht_set); set_deleted = true; } out: return set_deleted; } static void br_multicast_del_eht_host(struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr) { struct net_bridge_group_eht_set_entry *set_h; struct net_bridge_group_eht_host *eht_host; struct hlist_node *tmp; eht_host = br_multicast_eht_host_lookup(pg, h_addr); if (!eht_host) return; hlist_for_each_entry_safe(set_h, tmp, &eht_host->set_entries, host_list) br_multicast_del_eht_set_entry(set_h->eht_set->pg, &set_h->eht_set->src_addr, &set_h->h_addr); } /* create new set entries from reports */ static void __eht_create_set_entries(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int filter_mode) { union net_bridge_eht_addr eht_src_addr; u32 src_idx; memset(&eht_src_addr, 0, sizeof(eht_src_addr)); for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&eht_src_addr, srcs + (src_idx * addr_size), addr_size); br_multicast_create_eht_set_entry(brmctx, pg, &eht_src_addr, h_addr, filter_mode, false); } } /* delete existing set entries and their (S,G) entries if they were the last */ static bool __eht_del_set_entries(struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, size_t addr_size) { union net_bridge_eht_addr eht_src_addr; struct net_bridge_group_src *src_ent; bool changed = false; struct br_ip src_ip; u32 src_idx; memset(&eht_src_addr, 0, sizeof(eht_src_addr)); memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&eht_src_addr, srcs + (src_idx * addr_size), addr_size); if (!br_multicast_del_eht_set_entry(pg, &eht_src_addr, h_addr)) continue; memcpy(&src_ip, srcs + (src_idx * addr_size), addr_size); src_ent = br_multicast_find_group_src(pg, &src_ip); if (!src_ent) continue; br_multicast_del_group_src(src_ent, true); changed = true; } return changed; } static bool br_multicast_eht_allow(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, size_t addr_size) { bool changed = false; switch (br_multicast_eht_host_filter_mode(pg, h_addr)) { case MCAST_INCLUDE: __eht_create_set_entries(brmctx, pg, h_addr, srcs, nsrcs, addr_size, MCAST_INCLUDE); break; case MCAST_EXCLUDE: changed = __eht_del_set_entries(pg, h_addr, srcs, nsrcs, addr_size); break; } return changed; } static bool br_multicast_eht_block(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, size_t addr_size) { bool changed = false; switch (br_multicast_eht_host_filter_mode(pg, h_addr)) { case MCAST_INCLUDE: changed = __eht_del_set_entries(pg, h_addr, srcs, nsrcs, addr_size); break; case MCAST_EXCLUDE: __eht_create_set_entries(brmctx, pg, h_addr, srcs, nsrcs, addr_size, MCAST_EXCLUDE); break; } return changed; } /* flush_entries is true when changing mode */ static bool __eht_inc_exc(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, size_t addr_size, unsigned char filter_mode, bool to_report) { bool changed = false, flush_entries = to_report; union net_bridge_eht_addr eht_src_addr; if (br_multicast_eht_host_filter_mode(pg, h_addr) != filter_mode) flush_entries = true; memset(&eht_src_addr, 0, sizeof(eht_src_addr)); /* if we're changing mode del host and its entries */ if (flush_entries) br_multicast_del_eht_host(pg, h_addr); __eht_create_set_entries(brmctx, pg, h_addr, srcs, nsrcs, addr_size, filter_mode); /* we can be missing sets only if we've deleted some entries */ if (flush_entries) { struct net_bridge_group_eht_set *eht_set; struct net_bridge_group_src *src_ent; struct hlist_node *tmp; hlist_for_each_entry_safe(src_ent, tmp, &pg->src_list, node) { br_multicast_ip_src_to_eht_addr(&src_ent->addr, &eht_src_addr); if (!br_multicast_eht_set_lookup(pg, &eht_src_addr)) { br_multicast_del_group_src(src_ent, true); changed = true; continue; } /* this is an optimization for TO_INCLUDE where we lower * the set's timeout to LMQT to catch timeout hosts: * - host A (timing out): set entries X, Y * - host B: set entry Z (new from current TO_INCLUDE) * sends BLOCK Z after LMQT but host A's EHT * entries still exist (unless lowered to LMQT * so they can timeout with the S,Gs) * => we wait another LMQT, when we can just delete the * group immediately */ if (!(src_ent->flags & BR_SGRP_F_SEND) || filter_mode != MCAST_INCLUDE || !to_report) continue; eht_set = br_multicast_eht_set_lookup(pg, &eht_src_addr); if (!eht_set) continue; mod_timer(&eht_set->timer, jiffies + br_multicast_lmqt(brmctx)); } } return changed; } static bool br_multicast_eht_inc(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, size_t addr_size, bool to_report) { bool changed; changed = __eht_inc_exc(brmctx, pg, h_addr, srcs, nsrcs, addr_size, MCAST_INCLUDE, to_report); br_eht_convert_host_filter_mode(brmctx, pg, h_addr, MCAST_INCLUDE); return changed; } static bool br_multicast_eht_exc(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, size_t addr_size, bool to_report) { bool changed; changed = __eht_inc_exc(brmctx, pg, h_addr, srcs, nsrcs, addr_size, MCAST_EXCLUDE, to_report); br_eht_convert_host_filter_mode(brmctx, pg, h_addr, MCAST_EXCLUDE); return changed; } static bool __eht_ip4_handle(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, int grec_type) { bool changed = false, to_report = false; switch (grec_type) { case IGMPV3_ALLOW_NEW_SOURCES: br_multicast_eht_allow(brmctx, pg, h_addr, srcs, nsrcs, sizeof(__be32)); break; case IGMPV3_BLOCK_OLD_SOURCES: changed = br_multicast_eht_block(brmctx, pg, h_addr, srcs, nsrcs, sizeof(__be32)); break; case IGMPV3_CHANGE_TO_INCLUDE: to_report = true; fallthrough; case IGMPV3_MODE_IS_INCLUDE: changed = br_multicast_eht_inc(brmctx, pg, h_addr, srcs, nsrcs, sizeof(__be32), to_report); break; case IGMPV3_CHANGE_TO_EXCLUDE: to_report = true; fallthrough; case IGMPV3_MODE_IS_EXCLUDE: changed = br_multicast_eht_exc(brmctx, pg, h_addr, srcs, nsrcs, sizeof(__be32), to_report); break; } return changed; } #if IS_ENABLED(CONFIG_IPV6) static bool __eht_ip6_handle(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, int grec_type) { bool changed = false, to_report = false; switch (grec_type) { case MLD2_ALLOW_NEW_SOURCES: br_multicast_eht_allow(brmctx, pg, h_addr, srcs, nsrcs, sizeof(struct in6_addr)); break; case MLD2_BLOCK_OLD_SOURCES: changed = br_multicast_eht_block(brmctx, pg, h_addr, srcs, nsrcs, sizeof(struct in6_addr)); break; case MLD2_CHANGE_TO_INCLUDE: to_report = true; fallthrough; case MLD2_MODE_IS_INCLUDE: changed = br_multicast_eht_inc(brmctx, pg, h_addr, srcs, nsrcs, sizeof(struct in6_addr), to_report); break; case MLD2_CHANGE_TO_EXCLUDE: to_report = true; fallthrough; case MLD2_MODE_IS_EXCLUDE: changed = br_multicast_eht_exc(brmctx, pg, h_addr, srcs, nsrcs, sizeof(struct in6_addr), to_report); break; } return changed; } #endif /* true means an entry was deleted */ bool br_multicast_eht_handle(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { bool eht_enabled = !!(pg->key.port->flags & BR_MULTICAST_FAST_LEAVE); union net_bridge_eht_addr eht_host_addr; bool changed = false; if (!eht_enabled) goto out; memset(&eht_host_addr, 0, sizeof(eht_host_addr)); memcpy(&eht_host_addr, h_addr, addr_size); if (addr_size == sizeof(__be32)) changed = __eht_ip4_handle(brmctx, pg, &eht_host_addr, srcs, nsrcs, grec_type); #if IS_ENABLED(CONFIG_IPV6) else changed = __eht_ip6_handle(brmctx, pg, &eht_host_addr, srcs, nsrcs, grec_type); #endif out: return changed; } int br_multicast_eht_set_hosts_limit(struct net_bridge_port *p, u32 eht_hosts_limit) { struct net_bridge *br = p->br; if (!eht_hosts_limit) return -EINVAL; spin_lock_bh(&br->multicast_lock); p->multicast_eht_hosts_limit = eht_hosts_limit; spin_unlock_bh(&br->multicast_lock); return 0; }
4 3 3 1 1 3 3 3 3 3 1 1 3 1 1 4 4 4 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 Syncookies implementation for the Linux kernel * * Authors: * Glenn Griffin <ggriffin.kernel@gmail.com> * * Based on IPv4 implementation by Andi Kleen * linux/net/ipv4/syncookies.c */ #include <linux/tcp.h> #include <linux/random.h> #include <linux/siphash.h> #include <linux/kernel.h> #include <net/secure_seq.h> #include <net/ipv6.h> #include <net/tcp.h> #include <net/tcp_ecn.h> #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) static siphash_aligned_key_t syncookie6_secret[2]; /* RFC 2460, Section 8.3: * [ipv6 tcp] MSS must be computed as the maximum packet size minus 60 [..] * * Due to IPV6_MIN_MTU=1280 the lowest possible MSS is 1220, which allows * using higher values than ipv4 tcp syncookies. * The other values are chosen based on ethernet (1500 and 9k MTU), plus * one that accounts for common encap (PPPoe) overhead. Table must be sorted. */ static __u16 const msstab[] = { 1280 - 60, /* IPV6_MIN_MTU - 60 */ 1480 - 60, 1500 - 60, 9000 - 60, }; static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *daddr, __be16 sport, __be16 dport, u32 count, int c) { const struct { struct in6_addr saddr; struct in6_addr daddr; u32 count; __be16 sport; __be16 dport; } __aligned(SIPHASH_ALIGNMENT) combined = { .saddr = *saddr, .daddr = *daddr, .count = count, .sport = sport, .dport = dport }; net_get_random_once(syncookie6_secret, sizeof(syncookie6_secret)); return siphash(&combined, offsetofend(typeof(combined), dport), &syncookie6_secret[c]); } static __u32 secure_tcp_syn_cookie(const struct in6_addr *saddr, const struct in6_addr *daddr, __be16 sport, __be16 dport, __u32 sseq, __u32 data) { u32 count = tcp_cookie_time(); return (cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq + (count << COOKIEBITS) + ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data) & COOKIEMASK)); } static __u32 check_tcp_syn_cookie(__u32 cookie, const struct in6_addr *saddr, const struct in6_addr *daddr, __be16 sport, __be16 dport, __u32 sseq) { __u32 diff, count = tcp_cookie_time(); cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq; diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS); if (diff >= MAX_SYNCOOKIE_AGE) return (__u32)-1; return (cookie - cookie_hash(saddr, daddr, sport, dport, count - diff, 1)) & COOKIEMASK; } u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, const struct tcphdr *th, __u16 *mssp) { int mssind; const __u16 mss = *mssp; for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--) if (mss >= msstab[mssind]) break; *mssp = msstab[mssind]; return secure_tcp_syn_cookie(&iph->saddr, &iph->daddr, th->source, th->dest, ntohl(th->seq), mssind); } EXPORT_SYMBOL_GPL(__cookie_v6_init_sequence); __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mssp) { const struct ipv6hdr *iph = ipv6_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); return __cookie_v6_init_sequence(iph, th, mssp); } int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th) { __u32 cookie = ntohl(th->ack_seq) - 1; __u32 seq = ntohl(th->seq) - 1; __u32 mssind; mssind = check_tcp_syn_cookie(cookie, &iph->saddr, &iph->daddr, th->source, th->dest, seq); return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; } EXPORT_SYMBOL_GPL(__cookie_v6_check); static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk, struct sk_buff *skb) { struct tcp_options_received tcp_opt; u32 tsoff = 0; int mss; if (tcp_synq_no_recent_overflow(sk)) goto out; mss = __cookie_v6_check(ipv6_hdr(skb), tcp_hdr(skb)); if (!mss) { __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESFAILED); goto out; } __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESRECV); /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); tcp_parse_options(net, skb, &tcp_opt, 0, NULL); if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) { tsoff = secure_tcpv6_ts_off(net, ipv6_hdr(skb)->daddr.s6_addr32, ipv6_hdr(skb)->saddr.s6_addr32); tcp_opt.rcv_tsecr -= tsoff; } if (!cookie_timestamp_decode(net, &tcp_opt)) goto out; return cookie_tcp_reqsk_alloc(&tcp6_request_sock_ops, sk, skb, &tcp_opt, mss, tsoff); out: return ERR_PTR(-EINVAL); } struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct inet_request_sock *ireq; struct net *net = sock_net(sk); struct request_sock *req; struct dst_entry *dst; struct sock *ret = sk; __u8 rcv_wscale; int full_space; SKB_DR(reason); if (!READ_ONCE(net->ipv4.sysctl_tcp_syncookies) || !th->ack || th->rst) goto out; if (cookie_bpf_ok(skb)) { req = cookie_bpf_check(sk, skb); } else { req = cookie_tcp_check(net, sk, skb); if (IS_ERR(req)) goto out; } if (!req) { SKB_DR_SET(reason, NO_SOCKET); goto out_drop; } ireq = inet_rsk(req); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; if (security_inet_conn_request(sk, skb, req)) { SKB_DR_SET(reason, SECURITY_HOOK); goto out_free; } if (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { refcount_inc(&skb->users); ireq->pktopts = skb; } /* So that link locals have meaning */ if (!sk->sk_bound_dev_if && ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = tcp_v6_iif(skb); tcp_ao_syncookie(sk, skb, req, AF_INET6); /* * We need to lookup the dst_entry to get the correct window size. * This is taken from tcp_v6_syn_recv_sock. Somebody please enlighten * me if there is a preferred way. */ { struct in6_addr *final_p, final; struct flowi6 fl6; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_TCP; fl6.daddr = ireq->ir_v6_rmt_addr; final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); fl6.saddr = ireq->ir_v6_loc_addr; fl6.flowi6_oif = ireq->ir_iif; fl6.flowi6_mark = ireq->ir_mark; fl6.fl6_dport = ireq->ir_rmt_port; fl6.fl6_sport = inet_sk(sk)->inet_sport; fl6.flowi6_uid = sk_uid(sk); security_req_classify_flow(req, flowi6_to_flowi_common(&fl6)); dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p); if (IS_ERR(dst)) { SKB_DR_SET(reason, IP_OUTNOROUTES); goto out_free; } } req->rsk_window_clamp = READ_ONCE(tp->window_clamp) ? :dst_metric(dst, RTAX_WINDOW); /* limit the window selection if the user enforce a smaller rx buffer */ full_space = tcp_full_space(sk); if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) req->rsk_window_clamp = full_space; tcp_select_initial_window(sk, full_space, req->mss, &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(dst, RTAX_INITRWND)); /* req->syncookie is set true only if ACK is validated * by BPF kfunc, then, rcv_wscale is already configured. */ if (!req->syncookie) ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok &= cookie_ecn_ok(net, dst); tcp_rsk(req)->accecn_ok = ireq->ecn_ok && cookie_accecn_ok(th); ret = tcp_get_cookie_sock(sk, skb, req, dst); if (!ret) { SKB_DR_SET(reason, NO_SOCKET); goto out_drop; } out: return ret; out_free: reqsk_free(req); out_drop: sk_skb_reason_drop(sk, skb, reason); return NULL; }
3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 // SPDX-License-Identifier: GPL-2.0-or-later /* * X.25 Packet Layer release 002 * * This is ALPHA test software. This code may break your machine, randomly fail to work with new * releases, misbehave and/or generally screw up. It might even work. * * This code REQUIRES 2.1.15 or higher * * History * X.25 001 Jonathan Naylor Started coding. * 2000-09-04 Henner Eisen Prevent freeing a dangling skb. */ #define pr_fmt(fmt) "X25: " fmt #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/sock.h> #include <linux/if_arp.h> #include <net/x25.h> #include <net/x25device.h> static int x25_receive_data(struct sk_buff *skb, struct x25_neigh *nb) { struct sock *sk; unsigned short frametype; unsigned int lci; if (!pskb_may_pull(skb, X25_STD_MIN_LEN)) return 0; frametype = skb->data[2]; lci = ((skb->data[0] << 8) & 0xF00) + ((skb->data[1] << 0) & 0x0FF); /* * LCI of zero is always for us, and its always a link control * frame. */ if (lci == 0) { x25_link_control(skb, nb, frametype); return 0; } /* * Find an existing socket. */ if ((sk = x25_find_socket(lci, nb)) != NULL) { int queued = 1; skb_reset_transport_header(skb); bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { queued = x25_process_rx_frame(sk, skb); } else { queued = !sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)); } bh_unlock_sock(sk); sock_put(sk); return queued; } /* * Is is a Call Request ? if so process it. */ if (frametype == X25_CALL_REQUEST) return x25_rx_call_request(skb, nb, lci); /* * Its not a Call Request, nor is it a control frame. * Can we forward it? */ if (x25_forward_data(lci, nb, skb)) { if (frametype == X25_CLEAR_CONFIRMATION) { x25_clear_forward_by_lci(lci); } kfree_skb(skb); return 1; } /* x25_transmit_clear_request(nb, lci, 0x0D); */ if (frametype != X25_CLEAR_CONFIRMATION) pr_debug("x25_receive_data(): unknown frame type %2x\n",frametype); return 0; } int x25_lapb_receive_frame(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype, struct net_device *orig_dev) { struct sk_buff *nskb; struct x25_neigh *nb; if (!net_eq(dev_net(dev), &init_net)) goto drop; nskb = skb_copy(skb, GFP_ATOMIC); if (!nskb) goto drop; kfree_skb(skb); skb = nskb; /* * Packet received from unrecognised device, throw it away. */ nb = x25_get_neigh(dev); if (!nb) { pr_debug("unknown neighbour - %s\n", dev->name); goto drop; } if (!pskb_may_pull(skb, 1)) { x25_neigh_put(nb); goto drop; } switch (skb->data[0]) { case X25_IFACE_DATA: skb_pull(skb, 1); if (x25_receive_data(skb, nb)) { x25_neigh_put(nb); goto out; } break; case X25_IFACE_CONNECT: x25_link_established(nb); break; case X25_IFACE_DISCONNECT: x25_link_terminated(nb); break; } x25_neigh_put(nb); drop: kfree_skb(skb); out: return 0; } void x25_establish_link(struct x25_neigh *nb) { struct sk_buff *skb; unsigned char *ptr; switch (nb->dev->type) { case ARPHRD_X25: if ((skb = alloc_skb(1, GFP_ATOMIC)) == NULL) { pr_err("x25_dev: out of memory\n"); return; } ptr = skb_put(skb, 1); *ptr = X25_IFACE_CONNECT; break; default: return; } skb->protocol = htons(ETH_P_X25); skb->dev = nb->dev; dev_queue_xmit(skb); } void x25_send_frame(struct sk_buff *skb, struct x25_neigh *nb) { unsigned char *dptr; skb_reset_network_header(skb); switch (nb->dev->type) { case ARPHRD_X25: dptr = skb_push(skb, 1); *dptr = X25_IFACE_DATA; break; default: kfree_skb(skb); return; } skb->protocol = htons(ETH_P_X25); skb->dev = nb->dev; dev_queue_xmit(skb); }
390 61 329 115 320 408 354 349 127 7 911 847 843 305 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */ #include <linux/ns_common.h> #include <linux/nstree.h> #include <linux/proc_ns.h> #include <linux/user_namespace.h> #include <linux/vfsdebug.h> #ifdef CONFIG_DEBUG_VFS static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) { switch (ns->ns_type) { #ifdef CONFIG_CGROUPS case CLONE_NEWCGROUP: VFS_WARN_ON_ONCE(ops != &cgroupns_operations); break; #endif #ifdef CONFIG_IPC_NS case CLONE_NEWIPC: VFS_WARN_ON_ONCE(ops != &ipcns_operations); break; #endif case CLONE_NEWNS: VFS_WARN_ON_ONCE(ops != &mntns_operations); break; #ifdef CONFIG_NET_NS case CLONE_NEWNET: VFS_WARN_ON_ONCE(ops != &netns_operations); break; #endif #ifdef CONFIG_PID_NS case CLONE_NEWPID: VFS_WARN_ON_ONCE(ops != &pidns_operations); break; #endif #ifdef CONFIG_TIME_NS case CLONE_NEWTIME: VFS_WARN_ON_ONCE(ops != &timens_operations); break; #endif #ifdef CONFIG_USER_NS case CLONE_NEWUSER: VFS_WARN_ON_ONCE(ops != &userns_operations); break; #endif #ifdef CONFIG_UTS_NS case CLONE_NEWUTS: VFS_WARN_ON_ONCE(ops != &utsns_operations); break; #endif } } #endif int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum) { int ret = 0; refcount_set(&ns->__ns_ref, 1); ns->stashed = NULL; ns->ops = ops; ns->ns_id = 0; ns->ns_type = ns_type; ns_tree_node_init(&ns->ns_tree_node); ns_tree_node_init(&ns->ns_unified_node); ns_tree_node_init(&ns->ns_owner_node); ns_tree_root_init(&ns->ns_owner_root); #ifdef CONFIG_DEBUG_VFS ns_debug(ns, ops); #endif if (inum) ns->inum = inum; else ret = proc_alloc_inum(&ns->inum); if (ret) return ret; /* * Tree ref starts at 0. It's incremented when namespace enters * active use (installed in nsproxy) and decremented when all * active uses are gone. Initial namespaces are always active. */ if (is_ns_init_inum(ns)) atomic_set(&ns->__ns_ref_active, 1); else atomic_set(&ns->__ns_ref_active, 0); return 0; } void __ns_common_free(struct ns_common *ns) { proc_free_inum(ns->inum); } struct ns_common *__must_check ns_owner(struct ns_common *ns) { struct user_namespace *owner; if (unlikely(!ns->ops)) return NULL; VFS_WARN_ON_ONCE(!ns->ops->owner); owner = ns->ops->owner(ns); VFS_WARN_ON_ONCE(!owner && ns != to_ns_common(&init_user_ns)); if (!owner) return NULL; /* Skip init_user_ns as it's always active */ if (owner == &init_user_ns) return NULL; return to_ns_common(owner); } /* * The active reference count works by having each namespace that gets * created take a single active reference on its owning user namespace. * That single reference is only released once the child namespace's * active count itself goes down. * * A regular namespace tree might look as follow: * Legend: * + : adding active reference * - : dropping active reference * x : always active (initial namespace) * * * net_ns pid_ns * \ / * + + * user_ns1 (2) * | * ipc_ns | uts_ns * \ | / * + + + * user_ns2 (3) * | * cgroup_ns | mnt_ns * \ | / * x x x * init_user_ns (1) * * If both net_ns and pid_ns put their last active reference on * themselves it will cascade to user_ns1 dropping its own active * reference and dropping one active reference on user_ns2: * * net_ns pid_ns * \ / * - - * user_ns1 (0) * | * ipc_ns | uts_ns * \ | / * + - + * user_ns2 (2) * | * cgroup_ns | mnt_ns * \ | / * x x x * init_user_ns (1) * * The iteration stops once we reach a namespace that still has active * references. */ void __ns_ref_active_put(struct ns_common *ns) { /* Initial namespaces are always active. */ if (is_ns_init_id(ns)) return; if (!atomic_dec_and_test(&ns->__ns_ref_active)) { VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); return; } VFS_WARN_ON_ONCE(is_ns_init_id(ns)); VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); for (;;) { ns = ns_owner(ns); if (!ns) return; VFS_WARN_ON_ONCE(is_ns_init_id(ns)); if (!atomic_dec_and_test(&ns->__ns_ref_active)) { VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); return; } } } /* * The active reference count works by having each namespace that gets * created take a single active reference on its owning user namespace. * That single reference is only released once the child namespace's * active count itself goes down. This makes it possible to efficiently * resurrect a namespace tree: * * A regular namespace tree might look as follow: * Legend: * + : adding active reference * - : dropping active reference * x : always active (initial namespace) * * * net_ns pid_ns * \ / * + + * user_ns1 (2) * | * ipc_ns | uts_ns * \ | / * + + + * user_ns2 (3) * | * cgroup_ns | mnt_ns * \ | / * x x x * init_user_ns (1) * * If both net_ns and pid_ns put their last active reference on * themselves it will cascade to user_ns1 dropping its own active * reference and dropping one active reference on user_ns2: * * net_ns pid_ns * \ / * - - * user_ns1 (0) * | * ipc_ns | uts_ns * \ | / * + - + * user_ns2 (2) * | * cgroup_ns | mnt_ns * \ | / * x x x * init_user_ns (1) * * Assume the whole tree is dead but all namespaces are still active: * * net_ns pid_ns * \ / * - - * user_ns1 (0) * | * ipc_ns | uts_ns * \ | / * - - - * user_ns2 (0) * | * cgroup_ns | mnt_ns * \ | / * x x x * init_user_ns (1) * * Now assume the net_ns gets resurrected (.e.g., via the SIOCGSKNS ioctl()): * * net_ns pid_ns * \ / * + - * user_ns1 (0) * | * ipc_ns | uts_ns * \ | / * - + - * user_ns2 (0) * | * cgroup_ns | mnt_ns * \ | / * x x x * init_user_ns (1) * * If net_ns had a zero reference count and we bumped it we also need to * take another reference on its owning user namespace. Similarly, if * pid_ns had a zero reference count it also needs to take another * reference on its owning user namespace. So both net_ns and pid_ns * will each have their own reference on the owning user namespace. * * If the owning user namespace user_ns1 had a zero reference count then * it also needs to take another reference on its owning user namespace * and so on. */ void __ns_ref_active_get(struct ns_common *ns) { int prev; /* Initial namespaces are always active. */ if (is_ns_init_id(ns)) return; /* If we didn't resurrect the namespace we're done. */ prev = atomic_fetch_add(1, &ns->__ns_ref_active); VFS_WARN_ON_ONCE(prev < 0); if (likely(prev)) return; /* * We did resurrect it. Walk the ownership hierarchy upwards * until we found an owning user namespace that is active. */ for (;;) { ns = ns_owner(ns); if (!ns) return; VFS_WARN_ON_ONCE(is_ns_init_id(ns)); prev = atomic_fetch_add(1, &ns->__ns_ref_active); VFS_WARN_ON_ONCE(prev < 0); if (likely(prev)) return; } }
1 1 1 1 1 1 1 1 1 15 15 15 15 21 16 16 16 15 15 15 2 1 1 1 1 2 2 1 1 1 1 1 12 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 // SPDX-License-Identifier: GPL-2.0-only #include <linux/etherdevice.h> #include <linux/if_tap.h> #include <linux/if_vlan.h> #include <linux/interrupt.h> #include <linux/nsproxy.h> #include <linux/compat.h> #include <linux/if_tun.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/cache.h> #include <linux/sched/signal.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/wait.h> #include <linux/cdev.h> #include <linux/idr.h> #include <linux/fs.h> #include <linux/uio.h> #include <net/gso.h> #include <net/net_namespace.h> #include <net/rtnetlink.h> #include <net/sock.h> #include <net/xdp.h> #include <linux/virtio_net.h> #include <linux/skb_array.h> #include "tun_vnet.h" #define TAP_IFFEATURES (IFF_VNET_HDR | IFF_MULTI_QUEUE) static struct proto tap_proto = { .name = "tap", .owner = THIS_MODULE, .obj_size = sizeof(struct tap_queue), }; #define TAP_NUM_DEVS (1U << MINORBITS) static LIST_HEAD(major_list); struct major_info { struct rcu_head rcu; dev_t major; struct idr minor_idr; spinlock_t minor_lock; const char *device_name; struct list_head next; }; #define GOODCOPY_LEN 128 static const struct proto_ops tap_socket_ops; #define RX_OFFLOADS (NETIF_F_GRO | NETIF_F_LRO) #define TAP_FEATURES (NETIF_F_GSO | NETIF_F_SG | NETIF_F_FRAGLIST) static struct tap_dev *tap_dev_get_rcu(const struct net_device *dev) { return rcu_dereference(dev->rx_handler_data); } /* * RCU usage: * The tap_queue and the macvlan_dev are loosely coupled, the * pointers from one to the other can only be read while rcu_read_lock * or rtnl is held. * * Both the file and the macvlan_dev hold a reference on the tap_queue * through sock_hold(&q->sk). When the macvlan_dev goes away first, * q->vlan becomes inaccessible. When the files gets closed, * tap_get_queue() fails. * * There may still be references to the struct sock inside of the * queue from outbound SKBs, but these never reference back to the * file or the dev. The data structure is freed through __sk_free * when both our references and any pending SKBs are gone. */ static int tap_enable_queue(struct tap_dev *tap, struct file *file, struct tap_queue *q) { int err = -EINVAL; ASSERT_RTNL(); if (q->enabled) goto out; err = 0; rcu_assign_pointer(tap->taps[tap->numvtaps], q); q->queue_index = tap->numvtaps; q->enabled = true; tap->numvtaps++; out: return err; } /* Requires RTNL */ static int tap_set_queue(struct tap_dev *tap, struct file *file, struct tap_queue *q) { if (tap->numqueues == MAX_TAP_QUEUES) return -EBUSY; rcu_assign_pointer(q->tap, tap); rcu_assign_pointer(tap->taps[tap->numvtaps], q); sock_hold(&q->sk); q->file = file; q->queue_index = tap->numvtaps; q->enabled = true; file->private_data = q; list_add_tail(&q->next, &tap->queue_list); tap->numvtaps++; tap->numqueues++; return 0; } static int tap_disable_queue(struct tap_queue *q) { struct tap_dev *tap; struct tap_queue *nq; ASSERT_RTNL(); if (!q->enabled) return -EINVAL; tap = rtnl_dereference(q->tap); if (tap) { int index = q->queue_index; BUG_ON(index >= tap->numvtaps); nq = rtnl_dereference(tap->taps[tap->numvtaps - 1]); nq->queue_index = index; rcu_assign_pointer(tap->taps[index], nq); RCU_INIT_POINTER(tap->taps[tap->numvtaps - 1], NULL); q->enabled = false; tap->numvtaps--; } return 0; } /* * The file owning the queue got closed, give up both * the reference that the files holds as well as the * one from the macvlan_dev if that still exists. * * Using the spinlock makes sure that we don't get * to the queue again after destroying it. */ static void tap_put_queue(struct tap_queue *q) { struct tap_dev *tap; rtnl_lock(); tap = rtnl_dereference(q->tap); if (tap) { if (q->enabled) BUG_ON(tap_disable_queue(q)); tap->numqueues--; RCU_INIT_POINTER(q->tap, NULL); sock_put(&q->sk); list_del_init(&q->next); } rtnl_unlock(); synchronize_rcu(); sock_put(&q->sk); } /* * Select a queue based on the rxq of the device on which this packet * arrived. If the incoming device is not mq, calculate a flow hash * to select a queue. If all fails, find the first available queue. * Cache vlan->numvtaps since it can become zero during the execution * of this function. */ static struct tap_queue *tap_get_queue(struct tap_dev *tap, struct sk_buff *skb) { struct tap_queue *queue = NULL; /* Access to taps array is protected by rcu, but access to numvtaps * isn't. Below we use it to lookup a queue, but treat it as a hint * and validate that the result isn't NULL - in case we are * racing against queue removal. */ int numvtaps = READ_ONCE(tap->numvtaps); __u32 rxq; if (!numvtaps) goto out; if (numvtaps == 1) goto single; /* Check if we can use flow to select a queue */ rxq = skb_get_hash(skb); if (rxq) { queue = rcu_dereference(tap->taps[rxq % numvtaps]); goto out; } if (likely(skb_rx_queue_recorded(skb))) { rxq = skb_get_rx_queue(skb); while (unlikely(rxq >= numvtaps)) rxq -= numvtaps; queue = rcu_dereference(tap->taps[rxq]); goto out; } single: queue = rcu_dereference(tap->taps[0]); out: return queue; } /* * The net_device is going away, give up the reference * that it holds on all queues and safely set the pointer * from the queues to NULL. */ void tap_del_queues(struct tap_dev *tap) { struct tap_queue *q, *tmp; ASSERT_RTNL(); list_for_each_entry_safe(q, tmp, &tap->queue_list, next) { list_del_init(&q->next); RCU_INIT_POINTER(q->tap, NULL); if (q->enabled) tap->numvtaps--; tap->numqueues--; sock_put(&q->sk); } BUG_ON(tap->numvtaps); BUG_ON(tap->numqueues); /* guarantee that any future tap_set_queue will fail */ tap->numvtaps = MAX_TAP_QUEUES; } EXPORT_SYMBOL_GPL(tap_del_queues); rx_handler_result_t tap_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct net_device *dev = skb->dev; struct tap_dev *tap; struct tap_queue *q; netdev_features_t features = TAP_FEATURES; enum skb_drop_reason drop_reason; tap = tap_dev_get_rcu(dev); if (!tap) return RX_HANDLER_PASS; q = tap_get_queue(tap, skb); if (!q) return RX_HANDLER_PASS; skb_push(skb, ETH_HLEN); /* Apply the forward feature mask so that we perform segmentation * according to users wishes. This only works if VNET_HDR is * enabled. */ if (q->flags & IFF_VNET_HDR) features |= tap->tap_features; if (netif_needs_gso(skb, features)) { struct sk_buff *segs = __skb_gso_segment(skb, features, false); struct sk_buff *next; if (IS_ERR(segs)) { drop_reason = SKB_DROP_REASON_SKB_GSO_SEG; goto drop; } if (!segs) { if (ptr_ring_produce(&q->ring, skb)) { drop_reason = SKB_DROP_REASON_FULL_RING; goto drop; } goto wake_up; } consume_skb(skb); skb_list_walk_safe(segs, skb, next) { skb_mark_not_on_list(skb); if (ptr_ring_produce(&q->ring, skb)) { drop_reason = SKB_DROP_REASON_FULL_RING; kfree_skb_reason(skb, drop_reason); kfree_skb_list_reason(next, drop_reason); break; } } } else { /* If we receive a partial checksum and the tap side * doesn't support checksum offload, compute the checksum. * Note: it doesn't matter which checksum feature to * check, we either support them all or none. */ if (skb->ip_summed == CHECKSUM_PARTIAL && !(features & NETIF_F_CSUM_MASK) && skb_checksum_help(skb)) { drop_reason = SKB_DROP_REASON_SKB_CSUM; goto drop; } if (ptr_ring_produce(&q->ring, skb)) { drop_reason = SKB_DROP_REASON_FULL_RING; goto drop; } } wake_up: wake_up_interruptible_poll(sk_sleep(&q->sk), EPOLLIN | EPOLLRDNORM | EPOLLRDBAND); return RX_HANDLER_CONSUMED; drop: /* Count errors/drops only here, thus don't care about args. */ if (tap->count_rx_dropped) tap->count_rx_dropped(tap); kfree_skb_reason(skb, drop_reason); return RX_HANDLER_CONSUMED; } EXPORT_SYMBOL_GPL(tap_handle_frame); static struct major_info *tap_get_major(int major) { struct major_info *tap_major; list_for_each_entry_rcu(tap_major, &major_list, next) { if (tap_major->major == major) return tap_major; } return NULL; } int tap_get_minor(dev_t major, struct tap_dev *tap) { int retval = -ENOMEM; struct major_info *tap_major; rcu_read_lock(); tap_major = tap_get_major(MAJOR(major)); if (!tap_major) { retval = -EINVAL; goto unlock; } spin_lock(&tap_major->minor_lock); retval = idr_alloc(&tap_major->minor_idr, tap, 1, TAP_NUM_DEVS, GFP_ATOMIC); if (retval >= 0) { tap->minor = retval; } else if (retval == -ENOSPC) { netdev_err(tap->dev, "Too many tap devices\n"); retval = -EINVAL; } spin_unlock(&tap_major->minor_lock); unlock: rcu_read_unlock(); return retval < 0 ? retval : 0; } EXPORT_SYMBOL_GPL(tap_get_minor); void tap_free_minor(dev_t major, struct tap_dev *tap) { struct major_info *tap_major; rcu_read_lock(); tap_major = tap_get_major(MAJOR(major)); if (!tap_major) { goto unlock; } spin_lock(&tap_major->minor_lock); if (tap->minor) { idr_remove(&tap_major->minor_idr, tap->minor); tap->minor = 0; } spin_unlock(&tap_major->minor_lock); unlock: rcu_read_unlock(); } EXPORT_SYMBOL_GPL(tap_free_minor); static struct tap_dev *dev_get_by_tap_file(int major, int minor) { struct net_device *dev = NULL; struct tap_dev *tap; struct major_info *tap_major; rcu_read_lock(); tap_major = tap_get_major(major); if (!tap_major) { tap = NULL; goto unlock; } spin_lock(&tap_major->minor_lock); tap = idr_find(&tap_major->minor_idr, minor); if (tap) { dev = tap->dev; dev_hold(dev); } spin_unlock(&tap_major->minor_lock); unlock: rcu_read_unlock(); return tap; } static void tap_sock_write_space(struct sock *sk) { wait_queue_head_t *wqueue; if (!sock_writeable(sk) || !test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags)) return; wqueue = sk_sleep(sk); if (wqueue && waitqueue_active(wqueue)) wake_up_interruptible_poll(wqueue, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); } static void tap_sock_destruct(struct sock *sk) { struct tap_queue *q = container_of(sk, struct tap_queue, sk); ptr_ring_cleanup(&q->ring, __skb_array_destroy_skb); } static int tap_open(struct inode *inode, struct file *file) { struct net *net = current->nsproxy->net_ns; struct tap_dev *tap; struct tap_queue *q; int err = -ENODEV; rtnl_lock(); tap = dev_get_by_tap_file(imajor(inode), iminor(inode)); if (!tap) goto err; err = -ENOMEM; q = (struct tap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL, &tap_proto, 0); if (!q) goto err; if (ptr_ring_init(&q->ring, tap->dev->tx_queue_len, GFP_KERNEL)) { sk_free(&q->sk); goto err; } init_waitqueue_head(&q->sock.wq.wait); q->sock.type = SOCK_RAW; q->sock.state = SS_CONNECTED; q->sock.file = file; q->sock.ops = &tap_socket_ops; sock_init_data_uid(&q->sock, &q->sk, current_fsuid()); q->sk.sk_write_space = tap_sock_write_space; q->sk.sk_destruct = tap_sock_destruct; q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP; q->vnet_hdr_sz = sizeof(struct virtio_net_hdr); /* * so far only KVM virtio_net uses tap, enable zero copy between * guest kernel and host kernel when lower device supports zerocopy * * The macvlan supports zerocopy iff the lower device supports zero * copy so we don't have to look at the lower device directly. */ if ((tap->dev->features & NETIF_F_HIGHDMA) && (tap->dev->features & NETIF_F_SG)) sock_set_flag(&q->sk, SOCK_ZEROCOPY); err = tap_set_queue(tap, file, q); if (err) { /* tap_sock_destruct() will take care of freeing ptr_ring */ goto err_put; } /* tap groks IOCB_NOWAIT just fine, mark it as such */ file->f_mode |= FMODE_NOWAIT; dev_put(tap->dev); rtnl_unlock(); return err; err_put: sock_put(&q->sk); err: if (tap) dev_put(tap->dev); rtnl_unlock(); return err; } static int tap_release(struct inode *inode, struct file *file) { struct tap_queue *q = file->private_data; tap_put_queue(q); return 0; } static __poll_t tap_poll(struct file *file, poll_table *wait) { struct tap_queue *q = file->private_data; __poll_t mask = EPOLLERR; if (!q) goto out; mask = 0; poll_wait(file, &q->sock.wq.wait, wait); if (!ptr_ring_empty(&q->ring)) mask |= EPOLLIN | EPOLLRDNORM; if (sock_writeable(&q->sk) || (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &q->sock.flags) && sock_writeable(&q->sk))) mask |= EPOLLOUT | EPOLLWRNORM; out: return mask; } static inline struct sk_buff *tap_alloc_skb(struct sock *sk, size_t prepad, size_t len, size_t linear, int noblock, int *err) { struct sk_buff *skb; /* Under a page? Don't bother with paged skb. */ if (prepad + len < PAGE_SIZE || !linear) linear = len; if (len - linear > MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) linear = len - MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER); skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, err, PAGE_ALLOC_COSTLY_ORDER); if (!skb) return NULL; skb_reserve(skb, prepad); skb_put(skb, linear); skb->data_len = len - linear; skb->len += len - linear; return skb; } /* Neighbour code has some assumptions on HH_DATA_MOD alignment */ #define TAP_RESERVE HH_DATA_OFF(ETH_HLEN) /* Get packet from user space buffer */ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, struct iov_iter *from, int noblock) { int good_linear = SKB_MAX_HEAD(TAP_RESERVE); struct sk_buff *skb; struct tap_dev *tap; unsigned long total_len = iov_iter_count(from); unsigned long len = total_len; int err; struct virtio_net_hdr vnet_hdr = { 0 }; int vnet_hdr_len = 0; int hdr_len = 0; int copylen = 0; int depth; bool zerocopy = false; size_t linear; enum skb_drop_reason drop_reason; if (q->flags & IFF_VNET_HDR) { vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); hdr_len = tun_vnet_hdr_get(vnet_hdr_len, q->flags, from, &vnet_hdr); if (hdr_len < 0) { err = hdr_len; goto err; } len -= vnet_hdr_len; } err = -EINVAL; if (unlikely(len < ETH_HLEN)) goto err; if (msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) { struct iov_iter i; copylen = clamp(hdr_len ?: GOODCOPY_LEN, ETH_HLEN, good_linear); linear = copylen; i = *from; iov_iter_advance(&i, copylen); if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS) zerocopy = true; } if (!zerocopy) { copylen = len; linear = clamp(hdr_len, ETH_HLEN, good_linear); } skb = tap_alloc_skb(&q->sk, TAP_RESERVE, copylen, linear, noblock, &err); if (!skb) goto err; if (zerocopy) err = zerocopy_sg_from_iter(skb, from); else err = skb_copy_datagram_from_iter(skb, 0, from, len); if (err) { drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT; goto err_kfree; } skb_set_network_header(skb, ETH_HLEN); skb_reset_mac_header(skb); skb->protocol = eth_hdr(skb)->h_proto; rcu_read_lock(); tap = rcu_dereference(q->tap); if (!tap) { kfree_skb(skb); rcu_read_unlock(); return total_len; } skb->dev = tap->dev; if (vnet_hdr_len) { err = tun_vnet_hdr_to_skb(q->flags, skb, &vnet_hdr); if (err) { rcu_read_unlock(); drop_reason = SKB_DROP_REASON_DEV_HDR; goto err_kfree; } } skb_probe_transport_header(skb); /* Move network header to the right position for VLAN tagged packets */ if (eth_type_vlan(skb->protocol) && vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0) skb_set_network_header(skb, depth); /* copy skb_ubuf_info for callback when skb has no error */ if (zerocopy) { skb_zcopy_init(skb, msg_control); } else if (msg_control) { struct ubuf_info *uarg = msg_control; uarg->ops->complete(NULL, uarg, false); } dev_queue_xmit(skb); rcu_read_unlock(); return total_len; err_kfree: kfree_skb_reason(skb, drop_reason); err: rcu_read_lock(); tap = rcu_dereference(q->tap); if (tap && tap->count_tx_dropped) tap->count_tx_dropped(tap); rcu_read_unlock(); return err; } static ssize_t tap_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct tap_queue *q = file->private_data; int noblock = 0; if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) noblock = 1; return tap_get_user(q, NULL, from, noblock); } /* Put packet to the user space buffer */ static ssize_t tap_put_user(struct tap_queue *q, const struct sk_buff *skb, struct iov_iter *iter) { int ret; int vnet_hdr_len = 0; int vlan_offset = 0; int total; if (q->flags & IFF_VNET_HDR) { struct virtio_net_hdr vnet_hdr; vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); ret = tun_vnet_hdr_from_skb(q->flags, NULL, skb, &vnet_hdr); if (ret) return ret; ret = tun_vnet_hdr_put(vnet_hdr_len, iter, &vnet_hdr); if (ret) return ret; } total = vnet_hdr_len; total += skb->len; if (skb_vlan_tag_present(skb)) { struct { __be16 h_vlan_proto; __be16 h_vlan_TCI; } veth; veth.h_vlan_proto = skb->vlan_proto; veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb)); vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto); total += VLAN_HLEN; ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset); if (ret || !iov_iter_count(iter)) goto done; ret = copy_to_iter(&veth, sizeof(veth), iter); if (ret != sizeof(veth) || !iov_iter_count(iter)) goto done; } ret = skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset); done: return ret ? ret : total; } static ssize_t tap_do_read(struct tap_queue *q, struct iov_iter *to, int noblock, struct sk_buff *skb) { DEFINE_WAIT(wait); ssize_t ret = 0; if (!iov_iter_count(to)) { kfree_skb(skb); return 0; } if (skb) goto put; while (1) { if (!noblock) prepare_to_wait(sk_sleep(&q->sk), &wait, TASK_INTERRUPTIBLE); /* Read frames from the queue */ skb = ptr_ring_consume(&q->ring); if (skb) break; if (noblock) { ret = -EAGAIN; break; } if (signal_pending(current)) { ret = -ERESTARTSYS; break; } /* Nothing to read, let's sleep */ schedule(); } if (!noblock) finish_wait(sk_sleep(&q->sk), &wait); put: if (skb) { ret = tap_put_user(q, skb, to); if (unlikely(ret < 0)) kfree_skb(skb); else consume_skb(skb); } return ret; } static ssize_t tap_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct tap_queue *q = file->private_data; ssize_t len = iov_iter_count(to), ret; int noblock = 0; if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) noblock = 1; ret = tap_do_read(q, to, noblock, NULL); ret = min_t(ssize_t, ret, len); if (ret > 0) iocb->ki_pos = ret; return ret; } static struct tap_dev *tap_get_tap_dev(struct tap_queue *q) { struct tap_dev *tap; ASSERT_RTNL(); tap = rtnl_dereference(q->tap); if (tap) dev_hold(tap->dev); return tap; } static void tap_put_tap_dev(struct tap_dev *tap) { dev_put(tap->dev); } static int tap_ioctl_set_queue(struct file *file, unsigned int flags) { struct tap_queue *q = file->private_data; struct tap_dev *tap; int ret; tap = tap_get_tap_dev(q); if (!tap) return -EINVAL; if (flags & IFF_ATTACH_QUEUE) ret = tap_enable_queue(tap, file, q); else if (flags & IFF_DETACH_QUEUE) ret = tap_disable_queue(q); else ret = -EINVAL; tap_put_tap_dev(tap); return ret; } static int set_offload(struct tap_queue *q, unsigned long arg) { struct tap_dev *tap; netdev_features_t features; netdev_features_t feature_mask = 0; tap = rtnl_dereference(q->tap); if (!tap) return -ENOLINK; features = tap->dev->features; if (arg & TUN_F_CSUM) { feature_mask = NETIF_F_HW_CSUM; if (arg & (TUN_F_TSO4 | TUN_F_TSO6)) { if (arg & TUN_F_TSO_ECN) feature_mask |= NETIF_F_TSO_ECN; if (arg & TUN_F_TSO4) feature_mask |= NETIF_F_TSO; if (arg & TUN_F_TSO6) feature_mask |= NETIF_F_TSO6; } /* TODO: for now USO4 and USO6 should work simultaneously */ if ((arg & (TUN_F_USO4 | TUN_F_USO6)) == (TUN_F_USO4 | TUN_F_USO6)) features |= NETIF_F_GSO_UDP_L4; } /* tun/tap driver inverts the usage for TSO offloads, where * setting the TSO bit means that the userspace wants to * accept TSO frames and turning it off means that user space * does not support TSO. * For tap, we have to invert it to mean the same thing. * When user space turns off TSO, we turn off GSO/LRO so that * user-space will not receive TSO frames. */ if (feature_mask & (NETIF_F_TSO | NETIF_F_TSO6) || (feature_mask & (TUN_F_USO4 | TUN_F_USO6)) == (TUN_F_USO4 | TUN_F_USO6)) features |= RX_OFFLOADS; else features &= ~RX_OFFLOADS; /* tap_features are the same as features on tun/tap and * reflect user expectations. */ tap->tap_features = feature_mask; if (tap->update_features) tap->update_features(tap, features); return 0; } /* * provide compatibility with generic tun/tap interface */ static long tap_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct tap_queue *q = file->private_data; struct tap_dev *tap; void __user *argp = (void __user *)arg; struct ifreq __user *ifr = argp; unsigned int __user *up = argp; unsigned short u; int __user *sp = argp; struct sockaddr_storage ss; int s; int ret; switch (cmd) { case TUNSETIFF: /* ignore the name, just look at flags */ if (get_user(u, &ifr->ifr_flags)) return -EFAULT; ret = 0; if ((u & ~TAP_IFFEATURES) != (IFF_NO_PI | IFF_TAP)) ret = -EINVAL; else q->flags = (q->flags & ~TAP_IFFEATURES) | u; return ret; case TUNGETIFF: rtnl_lock(); tap = tap_get_tap_dev(q); if (!tap) { rtnl_unlock(); return -ENOLINK; } ret = 0; u = q->flags; if (copy_to_user(&ifr->ifr_name, tap->dev->name, IFNAMSIZ) || put_user(u, &ifr->ifr_flags)) ret = -EFAULT; tap_put_tap_dev(tap); rtnl_unlock(); return ret; case TUNSETQUEUE: if (get_user(u, &ifr->ifr_flags)) return -EFAULT; rtnl_lock(); ret = tap_ioctl_set_queue(file, u); rtnl_unlock(); return ret; case TUNGETFEATURES: if (put_user(IFF_TAP | IFF_NO_PI | TAP_IFFEATURES, up)) return -EFAULT; return 0; case TUNSETSNDBUF: if (get_user(s, sp)) return -EFAULT; if (s <= 0) return -EINVAL; q->sk.sk_sndbuf = s; return 0; case TUNSETOFFLOAD: /* let the user check for future flags */ if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | TUN_F_TSO_ECN | TUN_F_UFO | TUN_F_USO4 | TUN_F_USO6)) return -EINVAL; rtnl_lock(); ret = set_offload(q, arg); rtnl_unlock(); return ret; case SIOCGIFHWADDR: rtnl_lock(); tap = tap_get_tap_dev(q); if (!tap) { rtnl_unlock(); return -ENOLINK; } ret = 0; netif_get_mac_address((struct sockaddr *)&ss, dev_net(tap->dev), tap->dev->name); if (copy_to_user(&ifr->ifr_name, tap->dev->name, IFNAMSIZ) || copy_to_user(&ifr->ifr_hwaddr, &ss, sizeof(ifr->ifr_hwaddr))) ret = -EFAULT; tap_put_tap_dev(tap); rtnl_unlock(); return ret; case SIOCSIFHWADDR: if (copy_from_user(&ss, &ifr->ifr_hwaddr, sizeof(ifr->ifr_hwaddr))) return -EFAULT; rtnl_lock(); tap = tap_get_tap_dev(q); if (!tap) { rtnl_unlock(); return -ENOLINK; } if (tap->dev->addr_len > sizeof(ifr->ifr_hwaddr)) ret = -EINVAL; else ret = dev_set_mac_address_user(tap->dev, &ss, NULL); tap_put_tap_dev(tap); rtnl_unlock(); return ret; default: return tun_vnet_ioctl(&q->vnet_hdr_sz, &q->flags, cmd, sp); } } static const struct file_operations tap_fops = { .owner = THIS_MODULE, .open = tap_open, .release = tap_release, .read_iter = tap_read_iter, .write_iter = tap_write_iter, .poll = tap_poll, .unlocked_ioctl = tap_ioctl, .compat_ioctl = compat_ptr_ioctl, }; static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp) { struct virtio_net_hdr *gso = xdp->data_hard_start; int buflen = xdp->frame_sz; int vnet_hdr_len = 0; struct tap_dev *tap; struct sk_buff *skb; int err, depth; if (unlikely(xdp->data_end - xdp->data < ETH_HLEN)) { err = -EINVAL; goto err; } if (q->flags & IFF_VNET_HDR) vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); skb = build_skb(xdp->data_hard_start, buflen); if (!skb) { err = -ENOMEM; goto err; } skb_reserve(skb, xdp->data - xdp->data_hard_start); skb_put(skb, xdp->data_end - xdp->data); skb_set_network_header(skb, ETH_HLEN); skb_reset_mac_header(skb); skb->protocol = eth_hdr(skb)->h_proto; if (vnet_hdr_len) { err = tun_vnet_hdr_to_skb(q->flags, skb, gso); if (err) goto err_kfree; } /* Move network header to the right position for VLAN tagged packets */ if (eth_type_vlan(skb->protocol) && vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0) skb_set_network_header(skb, depth); rcu_read_lock(); tap = rcu_dereference(q->tap); if (tap) { skb->dev = tap->dev; skb_probe_transport_header(skb); dev_queue_xmit(skb); } else { kfree_skb(skb); } rcu_read_unlock(); return 0; err_kfree: kfree_skb(skb); err: rcu_read_lock(); tap = rcu_dereference(q->tap); if (tap && tap->count_tx_dropped) tap->count_tx_dropped(tap); rcu_read_unlock(); return err; } static int tap_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { struct tap_queue *q = container_of(sock, struct tap_queue, sock); struct tun_msg_ctl *ctl = m->msg_control; struct xdp_buff *xdp; int i; if (m->msg_controllen == sizeof(struct tun_msg_ctl) && ctl && ctl->type == TUN_MSG_PTR) { for (i = 0; i < ctl->num; i++) { xdp = &((struct xdp_buff *)ctl->ptr)[i]; tap_get_user_xdp(q, xdp); } return 0; } return tap_get_user(q, ctl ? ctl->ptr : NULL, &m->msg_iter, m->msg_flags & MSG_DONTWAIT); } static int tap_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len, int flags) { struct tap_queue *q = container_of(sock, struct tap_queue, sock); struct sk_buff *skb = m->msg_control; int ret; if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) { kfree_skb(skb); return -EINVAL; } ret = tap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT, skb); if (ret > total_len) { m->msg_flags |= MSG_TRUNC; ret = flags & MSG_TRUNC ? ret : total_len; } return ret; } static int tap_peek_len(struct socket *sock) { struct tap_queue *q = container_of(sock, struct tap_queue, sock); return PTR_RING_PEEK_CALL(&q->ring, __skb_array_len_with_tag); } /* Ops structure to mimic raw sockets with tun */ static const struct proto_ops tap_socket_ops = { .sendmsg = tap_sendmsg, .recvmsg = tap_recvmsg, .peek_len = tap_peek_len, }; /* Get an underlying socket object from tun file. Returns error unless file is * attached to a device. The returned object works like a packet socket, it * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for * holding a reference to the file for as long as the socket is in use. */ struct socket *tap_get_socket(struct file *file) { struct tap_queue *q; if (file->f_op != &tap_fops) return ERR_PTR(-EINVAL); q = file->private_data; if (!q) return ERR_PTR(-EBADFD); return &q->sock; } EXPORT_SYMBOL_GPL(tap_get_socket); struct ptr_ring *tap_get_ptr_ring(struct file *file) { struct tap_queue *q; if (file->f_op != &tap_fops) return ERR_PTR(-EINVAL); q = file->private_data; if (!q) return ERR_PTR(-EBADFD); return &q->ring; } EXPORT_SYMBOL_GPL(tap_get_ptr_ring); int tap_queue_resize(struct tap_dev *tap) { struct net_device *dev = tap->dev; struct tap_queue *q; struct ptr_ring **rings; int n = tap->numqueues; int ret, i = 0; rings = kmalloc_array(n, sizeof(*rings), GFP_KERNEL); if (!rings) return -ENOMEM; list_for_each_entry(q, &tap->queue_list, next) rings[i++] = &q->ring; ret = ptr_ring_resize_multiple_bh(rings, n, dev->tx_queue_len, GFP_KERNEL, __skb_array_destroy_skb); kfree(rings); return ret; } EXPORT_SYMBOL_GPL(tap_queue_resize); static int tap_list_add(dev_t major, const char *device_name) { struct major_info *tap_major; tap_major = kzalloc(sizeof(*tap_major), GFP_ATOMIC); if (!tap_major) return -ENOMEM; tap_major->major = MAJOR(major); idr_init(&tap_major->minor_idr); spin_lock_init(&tap_major->minor_lock); tap_major->device_name = device_name; list_add_tail_rcu(&tap_major->next, &major_list); return 0; } int tap_create_cdev(struct cdev *tap_cdev, dev_t *tap_major, const char *device_name, struct module *module) { int err; err = alloc_chrdev_region(tap_major, 0, TAP_NUM_DEVS, device_name); if (err) goto out1; cdev_init(tap_cdev, &tap_fops); tap_cdev->owner = module; err = cdev_add(tap_cdev, *tap_major, TAP_NUM_DEVS); if (err) goto out2; err = tap_list_add(*tap_major, device_name); if (err) goto out3; return 0; out3: cdev_del(tap_cdev); out2: unregister_chrdev_region(*tap_major, TAP_NUM_DEVS); out1: return err; } EXPORT_SYMBOL_GPL(tap_create_cdev); void tap_destroy_cdev(dev_t major, struct cdev *tap_cdev) { struct major_info *tap_major, *tmp; cdev_del(tap_cdev); unregister_chrdev_region(major, TAP_NUM_DEVS); list_for_each_entry_safe(tap_major, tmp, &major_list, next) { if (tap_major->major == MAJOR(major)) { idr_destroy(&tap_major->minor_idr); list_del_rcu(&tap_major->next); kfree_rcu(tap_major, rcu); } } } EXPORT_SYMBOL_GPL(tap_destroy_cdev); MODULE_DESCRIPTION("Common library for drivers implementing the TAP interface"); MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>"); MODULE_AUTHOR("Sainath Grandhi <sainath.grandhi@intel.com>"); MODULE_LICENSE("GPL"); MODULE_IMPORT_NS("NETDEV_INTERNAL");
10 1 42 119 119 1 1 1 40 1 40 1 40 32 8 1 1 1 40 39 40 40 68 68 1 79 79 33 68 49 42 2 10 1 4 7 24 30 16 15 14 1 8 9 9 8 1 15 13 2 2 16 15 1 15 16 16 10 10 6 5 10 2 14 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 // SPDX-License-Identifier: GPL-2.0-or-later /* * inet fragments management * * Authors: Pavel Emelyanov <xemul@openvz.org> * Started as consolidation of ipv4/ip_fragment.c, * ipv6/reassembly. and ipv6 nf conntrack reassembly */ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/module.h> #include <linux/timer.h> #include <linux/mm.h> #include <linux/random.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/rhashtable.h> #include <net/sock.h> #include <net/inet_frag.h> #include <net/inet_ecn.h> #include <net/ip.h> #include <net/ipv6.h> #include "../core/sock_destructor.h" /* Use skb->cb to track consecutive/adjacent fragments coming at * the end of the queue. Nodes in the rb-tree queue will * contain "runs" of one or more adjacent fragments. * * Invariants: * - next_frag is NULL at the tail of a "run"; * - the head of a "run" has the sum of all fragment lengths in frag_run_len. */ struct ipfrag_skb_cb { union { struct inet_skb_parm h4; struct inet6_skb_parm h6; }; struct sk_buff *next_frag; int frag_run_len; int ip_defrag_offset; }; #define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) static void fragcb_clear(struct sk_buff *skb) { RB_CLEAR_NODE(&skb->rbnode); FRAG_CB(skb)->next_frag = NULL; FRAG_CB(skb)->frag_run_len = skb->len; } /* Append skb to the last "run". */ static void fragrun_append_to_last(struct inet_frag_queue *q, struct sk_buff *skb) { fragcb_clear(skb); FRAG_CB(q->last_run_head)->frag_run_len += skb->len; FRAG_CB(q->fragments_tail)->next_frag = skb; q->fragments_tail = skb; } /* Create a new "run" with the skb. */ static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb) { BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb)); fragcb_clear(skb); if (q->last_run_head) rb_link_node(&skb->rbnode, &q->last_run_head->rbnode, &q->last_run_head->rbnode.rb_right); else rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node); rb_insert_color(&skb->rbnode, &q->rb_fragments); q->fragments_tail = skb; q->last_run_head = skb; } /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements * Value : 0xff if frame should be dropped. * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field */ const u8 ip_frag_ecn_table[16] = { /* at least one fragment had CE, and others ECT_0 or ECT_1 */ [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE, [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE, [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE, /* invalid combinations : drop frame */ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff, [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff, [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff, [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff, [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff, [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, }; EXPORT_SYMBOL(ip_frag_ecn_table); int inet_frags_init(struct inet_frags *f) { f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0, NULL); if (!f->frags_cachep) return -ENOMEM; refcount_set(&f->refcnt, 1); init_completion(&f->completion); return 0; } EXPORT_SYMBOL(inet_frags_init); void inet_frags_fini(struct inet_frags *f) { if (refcount_dec_and_test(&f->refcnt)) complete(&f->completion); wait_for_completion(&f->completion); kmem_cache_destroy(f->frags_cachep); f->frags_cachep = NULL; } EXPORT_SYMBOL(inet_frags_fini); /* called from rhashtable_free_and_destroy() at netns_frags dismantle */ static void inet_frags_free_cb(void *ptr, void *arg) { struct inet_frag_queue *fq = ptr; int count; count = timer_delete_sync(&fq->timer) ? 1 : 0; spin_lock_bh(&fq->lock); fq->flags |= INET_FRAG_DROP; if (!(fq->flags & INET_FRAG_COMPLETE)) { fq->flags |= INET_FRAG_COMPLETE; count++; } else if (fq->flags & INET_FRAG_HASH_DEAD) { count++; } spin_unlock_bh(&fq->lock); inet_frag_putn(fq, count); } static LLIST_HEAD(fqdir_free_list); static void fqdir_free_fn(struct work_struct *work) { struct llist_node *kill_list; struct fqdir *fqdir, *tmp; struct inet_frags *f; /* Atomically snapshot the list of fqdirs to free */ kill_list = llist_del_all(&fqdir_free_list); /* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu) * have completed, since they need to dereference fqdir. * Would it not be nice to have kfree_rcu_barrier() ? :) */ rcu_barrier(); llist_for_each_entry_safe(fqdir, tmp, kill_list, free_list) { f = fqdir->f; if (refcount_dec_and_test(&f->refcnt)) complete(&f->completion); kfree(fqdir); } } static DECLARE_DELAYED_WORK(fqdir_free_work, fqdir_free_fn); static void fqdir_work_fn(struct work_struct *work) { struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work); rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL); if (llist_add(&fqdir->free_list, &fqdir_free_list)) queue_delayed_work(system_percpu_wq, &fqdir_free_work, HZ); } int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net) { struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL); int res; if (!fqdir) return -ENOMEM; fqdir->f = f; fqdir->net = net; res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params); if (res < 0) { kfree(fqdir); return res; } refcount_inc(&f->refcnt); *fqdirp = fqdir; return 0; } EXPORT_SYMBOL(fqdir_init); static struct workqueue_struct *inet_frag_wq; static int __init inet_frag_wq_init(void) { inet_frag_wq = create_workqueue("inet_frag_wq"); if (!inet_frag_wq) panic("Could not create inet frag workq"); return 0; } pure_initcall(inet_frag_wq_init); void fqdir_pre_exit(struct fqdir *fqdir) { struct inet_frag_queue *fq; struct rhashtable_iter hti; /* Prevent creation of new frags. * Pairs with READ_ONCE() in inet_frag_find(). */ WRITE_ONCE(fqdir->high_thresh, 0); /* Pairs with READ_ONCE() in inet_frag_kill(), ip_expire() * and ip6frag_expire_frag_queue(). */ WRITE_ONCE(fqdir->dead, true); rhashtable_walk_enter(&fqdir->rhashtable, &hti); rhashtable_walk_start(&hti); while ((fq = rhashtable_walk_next(&hti))) { if (IS_ERR(fq)) { if (PTR_ERR(fq) != -EAGAIN) break; continue; } spin_lock_bh(&fq->lock); if (!(fq->flags & INET_FRAG_COMPLETE)) inet_frag_queue_flush(fq, 0); spin_unlock_bh(&fq->lock); } rhashtable_walk_stop(&hti); rhashtable_walk_exit(&hti); } EXPORT_SYMBOL(fqdir_pre_exit); void fqdir_exit(struct fqdir *fqdir) { INIT_WORK(&fqdir->destroy_work, fqdir_work_fn); queue_work(inet_frag_wq, &fqdir->destroy_work); } EXPORT_SYMBOL(fqdir_exit); void inet_frag_kill(struct inet_frag_queue *fq, int *refs) { if (timer_delete(&fq->timer)) (*refs)++; if (!(fq->flags & INET_FRAG_COMPLETE)) { struct fqdir *fqdir = fq->fqdir; fq->flags |= INET_FRAG_COMPLETE; rcu_read_lock(); /* The RCU read lock provides a memory barrier * guaranteeing that if fqdir->dead is false then * the hash table destruction will not start until * after we unlock. Paired with fqdir_pre_exit(). */ if (!READ_ONCE(fqdir->dead)) { rhashtable_remove_fast(&fqdir->rhashtable, &fq->node, fqdir->f->rhash_params); (*refs)++; } else { fq->flags |= INET_FRAG_HASH_DEAD; } rcu_read_unlock(); } } EXPORT_SYMBOL(inet_frag_kill); static void inet_frag_destroy_rcu(struct rcu_head *head) { struct inet_frag_queue *q = container_of(head, struct inet_frag_queue, rcu); struct inet_frags *f = q->fqdir->f; if (f->destructor) f->destructor(q); kmem_cache_free(f->frags_cachep, q); } static unsigned int inet_frag_rbtree_purge(struct rb_root *root, enum skb_drop_reason reason) { struct rb_node *p = rb_first(root); unsigned int sum = 0; while (p) { struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); p = rb_next(p); rb_erase(&skb->rbnode, root); while (skb) { struct sk_buff *next = FRAG_CB(skb)->next_frag; sum += skb->truesize; kfree_skb_reason(skb, reason); skb = next; } } return sum; } void inet_frag_queue_flush(struct inet_frag_queue *q, enum skb_drop_reason reason) { unsigned int sum; reason = reason ?: SKB_DROP_REASON_FRAG_REASM_TIMEOUT; sum = inet_frag_rbtree_purge(&q->rb_fragments, reason); sub_frag_mem_limit(q->fqdir, sum); } EXPORT_SYMBOL(inet_frag_queue_flush); void inet_frag_destroy(struct inet_frag_queue *q) { unsigned int sum, sum_truesize = 0; enum skb_drop_reason reason; struct inet_frags *f; struct fqdir *fqdir; WARN_ON(!(q->flags & INET_FRAG_COMPLETE)); reason = (q->flags & INET_FRAG_DROP) ? SKB_DROP_REASON_FRAG_REASM_TIMEOUT : SKB_CONSUMED; WARN_ON(timer_delete(&q->timer) != 0); /* Release all fragment data. */ fqdir = q->fqdir; f = fqdir->f; sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments, reason); sum = sum_truesize + f->qsize; call_rcu(&q->rcu, inet_frag_destroy_rcu); sub_frag_mem_limit(fqdir, sum); } EXPORT_SYMBOL(inet_frag_destroy); static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir, struct inet_frags *f, void *arg) { struct inet_frag_queue *q; q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC); if (!q) return NULL; q->fqdir = fqdir; f->constructor(q, arg); add_frag_mem_limit(fqdir, f->qsize); timer_setup(&q->timer, f->frag_expire, 0); spin_lock_init(&q->lock); /* One reference for the timer, one for the hash table. * We never take any extra references, only decrement this field. */ refcount_set(&q->refcnt, 2); return q; } static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir, void *arg, struct inet_frag_queue **prev) { struct inet_frags *f = fqdir->f; struct inet_frag_queue *q; q = inet_frag_alloc(fqdir, f, arg); if (!q) { *prev = ERR_PTR(-ENOMEM); return NULL; } mod_timer(&q->timer, jiffies + fqdir->timeout); *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key, &q->node, f->rhash_params); if (*prev) { /* We could not insert in the hash table, * we need to cancel what inet_frag_alloc() * anticipated. */ int refs = 1; q->flags |= INET_FRAG_COMPLETE; inet_frag_kill(q, &refs); inet_frag_putn(q, refs); return NULL; } return q; } struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key) { /* This pairs with WRITE_ONCE() in fqdir_pre_exit(). */ long high_thresh = READ_ONCE(fqdir->high_thresh); struct inet_frag_queue *fq = NULL, *prev; if (!high_thresh || frag_mem_limit(fqdir) > high_thresh) return NULL; prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params); if (!prev) fq = inet_frag_create(fqdir, key, &prev); if (!IS_ERR_OR_NULL(prev)) fq = prev; return fq; } EXPORT_SYMBOL(inet_frag_find); int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, int offset, int end) { struct sk_buff *last = q->fragments_tail; /* RFC5722, Section 4, amended by Errata ID : 3089 * When reassembling an IPv6 datagram, if * one or more its constituent fragments is determined to be an * overlapping fragment, the entire datagram (and any constituent * fragments) MUST be silently discarded. * * Duplicates, however, should be ignored (i.e. skb dropped, but the * queue/fragments kept for later reassembly). */ if (!last) fragrun_create(q, skb); /* First fragment. */ else if (FRAG_CB(last)->ip_defrag_offset + last->len < end) { /* This is the common case: skb goes to the end. */ /* Detect and discard overlaps. */ if (offset < FRAG_CB(last)->ip_defrag_offset + last->len) return IPFRAG_OVERLAP; if (offset == FRAG_CB(last)->ip_defrag_offset + last->len) fragrun_append_to_last(q, skb); else fragrun_create(q, skb); } else { /* Binary search. Note that skb can become the first fragment, * but not the last (covered above). */ struct rb_node **rbn, *parent; rbn = &q->rb_fragments.rb_node; do { struct sk_buff *curr; int curr_run_end; parent = *rbn; curr = rb_to_skb(parent); curr_run_end = FRAG_CB(curr)->ip_defrag_offset + FRAG_CB(curr)->frag_run_len; if (end <= FRAG_CB(curr)->ip_defrag_offset) rbn = &parent->rb_left; else if (offset >= curr_run_end) rbn = &parent->rb_right; else if (offset >= FRAG_CB(curr)->ip_defrag_offset && end <= curr_run_end) return IPFRAG_DUP; else return IPFRAG_OVERLAP; } while (*rbn); /* Here we have parent properly set, and rbn pointing to * one of its NULL left/right children. Insert skb. */ fragcb_clear(skb); rb_link_node(&skb->rbnode, parent, rbn); rb_insert_color(&skb->rbnode, &q->rb_fragments); } FRAG_CB(skb)->ip_defrag_offset = offset; if (offset) nf_reset_ct(skb); return IPFRAG_OK; } EXPORT_SYMBOL(inet_frag_queue_insert); void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, struct sk_buff *parent) { struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments); void (*destructor)(struct sk_buff *); unsigned int orig_truesize = 0; struct sk_buff **nextp = NULL; struct sock *sk = skb->sk; int delta; if (sk && is_skb_wmem(skb)) { /* TX: skb->sk might have been passed as argument to * dst->output and must remain valid until tx completes. * * Move sk to reassembled skb and fix up wmem accounting. */ orig_truesize = skb->truesize; destructor = skb->destructor; } if (head != skb) { fp = skb_clone(skb, GFP_ATOMIC); if (!fp) { head = skb; goto out_restore_sk; } FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag; if (RB_EMPTY_NODE(&skb->rbnode)) FRAG_CB(parent)->next_frag = fp; else rb_replace_node(&skb->rbnode, &fp->rbnode, &q->rb_fragments); if (q->fragments_tail == skb) q->fragments_tail = fp; if (orig_truesize) { /* prevent skb_morph from releasing sk */ skb->sk = NULL; skb->destructor = NULL; } skb_morph(skb, head); FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag; rb_replace_node(&head->rbnode, &skb->rbnode, &q->rb_fragments); consume_skb(head); head = skb; } WARN_ON(FRAG_CB(head)->ip_defrag_offset != 0); delta = -head->truesize; /* Head of list must not be cloned. */ if (skb_unclone(head, GFP_ATOMIC)) goto out_restore_sk; delta += head->truesize; if (delta) add_frag_mem_limit(q->fqdir, delta); /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ if (skb_has_frag_list(head)) { struct sk_buff *clone; int i, plen = 0; clone = alloc_skb(0, GFP_ATOMIC); if (!clone) goto out_restore_sk; skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; skb_frag_list_init(head); for (i = 0; i < skb_shinfo(head)->nr_frags; i++) plen += skb_frag_size(&skb_shinfo(head)->frags[i]); clone->data_len = head->data_len - plen; clone->len = clone->data_len; head->truesize += clone->truesize; clone->csum = 0; clone->ip_summed = head->ip_summed; add_frag_mem_limit(q->fqdir, clone->truesize); skb_shinfo(head)->frag_list = clone; nextp = &clone->next; } else { nextp = &skb_shinfo(head)->frag_list; } out_restore_sk: if (orig_truesize) { int ts_delta = head->truesize - orig_truesize; /* if this reassembled skb is fragmented later, * fraglist skbs will get skb->sk assigned from head->sk, * and each frag skb will be released via sock_wfree. * * Update sk_wmem_alloc. */ head->sk = sk; head->destructor = destructor; refcount_add(ts_delta, &sk->sk_wmem_alloc); } return nextp; } EXPORT_SYMBOL(inet_frag_reasm_prepare); void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, void *reasm_data, bool try_coalesce) { struct sock *sk = is_skb_wmem(head) ? head->sk : NULL; const unsigned int head_truesize = head->truesize; struct sk_buff **nextp = reasm_data; struct rb_node *rbn; struct sk_buff *fp; int sum_truesize; skb_push(head, head->data - skb_network_header(head)); /* Traverse the tree in order, to build frag_list. */ fp = FRAG_CB(head)->next_frag; rbn = rb_next(&head->rbnode); rb_erase(&head->rbnode, &q->rb_fragments); sum_truesize = head->truesize; while (rbn || fp) { /* fp points to the next sk_buff in the current run; * rbn points to the next run. */ /* Go through the current run. */ while (fp) { struct sk_buff *next_frag = FRAG_CB(fp)->next_frag; bool stolen; int delta; sum_truesize += fp->truesize; if (head->ip_summed != fp->ip_summed) head->ip_summed = CHECKSUM_NONE; else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); if (try_coalesce && skb_try_coalesce(head, fp, &stolen, &delta)) { kfree_skb_partial(fp, stolen); } else { fp->prev = NULL; memset(&fp->rbnode, 0, sizeof(fp->rbnode)); fp->sk = NULL; head->data_len += fp->len; head->len += fp->len; head->truesize += fp->truesize; *nextp = fp; nextp = &fp->next; } fp = next_frag; } /* Move to the next run. */ if (rbn) { struct rb_node *rbnext = rb_next(rbn); fp = rb_to_skb(rbn); rb_erase(rbn, &q->rb_fragments); rbn = rbnext; } } sub_frag_mem_limit(q->fqdir, sum_truesize); *nextp = NULL; skb_mark_not_on_list(head); head->prev = NULL; head->tstamp = q->stamp; head->tstamp_type = q->tstamp_type; if (sk) refcount_add(sum_truesize - head_truesize, &sk->sk_wmem_alloc); } EXPORT_SYMBOL(inet_frag_reasm_finish); struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q) { struct sk_buff *head, *skb; head = skb_rb_first(&q->rb_fragments); if (!head) return NULL; skb = FRAG_CB(head)->next_frag; if (skb) rb_replace_node(&head->rbnode, &skb->rbnode, &q->rb_fragments); else rb_erase(&head->rbnode, &q->rb_fragments); memset(&head->rbnode, 0, sizeof(head->rbnode)); barrier(); if (head == q->fragments_tail) q->fragments_tail = NULL; sub_frag_mem_limit(q->fqdir, head->truesize); return head; } EXPORT_SYMBOL(inet_frag_pull_head);
986 58036 4371 4380 3683 3683 3684 3698 3683 26959 26959 26932 26986 4370 1162 3325 4362 4366 2897 3406 4367 4368 4371 1162 3330 231 599 599 368 598 599 232 213 338 368 368 368 116 118 936 939 939 940 935 939 941 29709 57733 57840 29727 46336 940 938 937 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 1994 Linus Torvalds * * Pentium III FXSR, SSE support * General FPU state handling cleanups * Gareth Hughes <gareth@valinux.com>, May 2000 */ #include <asm/fpu/api.h> #include <asm/fpu/regset.h> #include <asm/fpu/sched.h> #include <asm/fpu/signal.h> #include <asm/fpu/types.h> #include <asm/msr.h> #include <asm/traps.h> #include <asm/irq_regs.h> #include <uapi/asm/kvm.h> #include <linux/hardirq.h> #include <linux/kvm_types.h> #include <linux/pkeys.h> #include <linux/vmalloc.h> #include "context.h" #include "internal.h" #include "legacy.h" #include "xstate.h" #define CREATE_TRACE_POINTS #include <asm/trace/fpu.h> #ifdef CONFIG_X86_64 DEFINE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); DEFINE_PER_CPU(u64, xfd_state); #endif /* The FPU state configuration data for kernel and user space */ struct fpu_state_config fpu_kernel_cfg __ro_after_init; struct fpu_state_config fpu_user_cfg __ro_after_init; struct vcpu_fpu_config guest_default_cfg __ro_after_init; /* * Represents the initial FPU state. It's mostly (but not completely) zeroes, * depending on the FPU hardware format: */ struct fpstate init_fpstate __ro_after_init; /* * Track FPU initialization and kernel-mode usage. 'true' means the FPU is * initialized and is not currently being used by the kernel: */ DEFINE_PER_CPU(bool, kernel_fpu_allowed); /* * Track which context is using the FPU on the CPU: */ DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); #ifdef CONFIG_X86_DEBUG_FPU struct fpu *x86_task_fpu(struct task_struct *task) { if (WARN_ON_ONCE(task->flags & PF_KTHREAD)) return NULL; return (void *)task + sizeof(*task); } #endif /* * Can we use the FPU in kernel mode with the * whole "kernel_fpu_begin/end()" sequence? */ bool irq_fpu_usable(void) { if (WARN_ON_ONCE(in_nmi())) return false; /* * Return false in the following cases: * * - FPU is not yet initialized. This can happen only when the call is * coming from CPU onlining, for example for microcode checksumming. * - The kernel is already using the FPU, either because of explicit * nesting (which should never be done), or because of implicit * nesting when a hardirq interrupted a kernel-mode FPU section. * * The single boolean check below handles both cases: */ if (!this_cpu_read(kernel_fpu_allowed)) return false; /* * When not in NMI or hard interrupt context, FPU can be used in: * * - Task context except from within fpregs_lock()'ed critical * regions. * * - Soft interrupt processing context which cannot happen * while in a fpregs_lock()'ed critical region. */ if (!in_hardirq()) return true; /* * In hard interrupt context it's safe when soft interrupts * are enabled, which means the interrupt did not hit in * a fpregs_lock()'ed critical region. */ return !softirq_count(); } EXPORT_SYMBOL(irq_fpu_usable); /* * Track AVX512 state use because it is known to slow the max clock * speed of the core. */ static void update_avx_timestamp(struct fpu *fpu) { #define AVX512_TRACKING_MASK (XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM) if (fpu->fpstate->regs.xsave.header.xfeatures & AVX512_TRACKING_MASK) fpu->avx512_timestamp = jiffies; } /* * Save the FPU register state in fpu->fpstate->regs. The register state is * preserved. * * Must be called with fpregs_lock() held. * * The legacy FNSAVE instruction clears all FPU state unconditionally, so * register state has to be reloaded. That might be a pointless exercise * when the FPU is going to be used by another task right after that. But * this only affects 20+ years old 32bit systems and avoids conditionals all * over the place. * * FXSAVE and all XSAVE variants preserve the FPU register state. */ void save_fpregs_to_fpstate(struct fpu *fpu) { if (likely(use_xsave())) { os_xsave(fpu->fpstate); update_avx_timestamp(fpu); return; } if (likely(use_fxsr())) { fxsave(&fpu->fpstate->regs.fxsave); return; } /* * Legacy FPU register saving, FNSAVE always clears FPU registers, * so we have to reload them from the memory state. */ asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->fpstate->regs.fsave)); frstor(&fpu->fpstate->regs.fsave); } void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask) { /* * AMD K7/K8 and later CPUs up to Zen don't save/restore * FDP/FIP/FOP unless an exception is pending. Clear the x87 state * here by setting it to fixed values. "m" is a random variable * that should be in L1. */ if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) { asm volatile( "fnclex\n\t" "emms\n\t" "fildl %[addr]" /* set F?P to defined value */ : : [addr] "m" (*fpstate)); } if (use_xsave()) { /* * Dynamically enabled features are enabled in XCR0, but * usage requires also that the corresponding bits in XFD * are cleared. If the bits are set then using a related * instruction will raise #NM. This allows to do the * allocation of the larger FPU buffer lazy from #NM or if * the task has no permission to kill it which would happen * via #UD if the feature is disabled in XCR0. * * XFD state is following the same life time rules as * XSTATE and to restore state correctly XFD has to be * updated before XRSTORS otherwise the component would * stay in or go into init state even if the bits are set * in fpstate::regs::xsave::xfeatures. */ xfd_update_state(fpstate); /* * Restoring state always needs to modify all features * which are in @mask even if the current task cannot use * extended features. * * So fpstate->xfeatures cannot be used here, because then * a feature for which the task has no permission but was * used by the previous task would not go into init state. */ mask = fpu_kernel_cfg.max_features & mask; os_xrstor(fpstate, mask); } else { if (use_fxsr()) fxrstor(&fpstate->regs.fxsave); else frstor(&fpstate->regs.fsave); } } void fpu_reset_from_exception_fixup(void) { restore_fpregs_from_fpstate(&init_fpstate, XFEATURE_MASK_FPSTATE); } #if IS_ENABLED(CONFIG_KVM) static void __fpstate_reset(struct fpstate *fpstate); static void fpu_lock_guest_permissions(void) { struct fpu_state_perm *fpuperm; u64 perm; if (!IS_ENABLED(CONFIG_X86_64)) return; spin_lock_irq(&current->sighand->siglock); fpuperm = &x86_task_fpu(current->group_leader)->guest_perm; perm = fpuperm->__state_perm; /* First fpstate allocation locks down permissions. */ WRITE_ONCE(fpuperm->__state_perm, perm | FPU_GUEST_PERM_LOCKED); spin_unlock_irq(&current->sighand->siglock); } bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) { struct fpstate *fpstate; unsigned int size; size = guest_default_cfg.size + ALIGN(offsetof(struct fpstate, regs), 64); fpstate = vzalloc(size); if (!fpstate) return false; /* Initialize indicators to reflect properties of the fpstate */ fpstate->is_valloc = true; fpstate->is_guest = true; __fpstate_reset(fpstate); fpstate_init_user(fpstate); gfpu->fpstate = fpstate; gfpu->xfeatures = guest_default_cfg.features; /* * KVM sets the FP+SSE bits in the XSAVE header when copying FPU state * to userspace, even when XSAVE is unsupported, so that restoring FPU * state on a different CPU that does support XSAVE can cleanly load * the incoming state using its natural XSAVE. In other words, KVM's * uABI size may be larger than this host's default size. Conversely, * the default size should never be larger than KVM's base uABI size; * all features that can expand the uABI size must be opt-in. */ gfpu->uabi_size = sizeof(struct kvm_xsave); if (WARN_ON_ONCE(fpu_user_cfg.default_size > gfpu->uabi_size)) gfpu->uabi_size = fpu_user_cfg.default_size; fpu_lock_guest_permissions(); return true; } EXPORT_SYMBOL_FOR_KVM(fpu_alloc_guest_fpstate); void fpu_free_guest_fpstate(struct fpu_guest *gfpu) { struct fpstate *fpstate = gfpu->fpstate; if (!fpstate) return; if (WARN_ON_ONCE(!fpstate->is_valloc || !fpstate->is_guest || fpstate->in_use)) return; gfpu->fpstate = NULL; vfree(fpstate); } EXPORT_SYMBOL_FOR_KVM(fpu_free_guest_fpstate); /* * fpu_enable_guest_xfd_features - Check xfeatures against guest perm and enable * @guest_fpu: Pointer to the guest FPU container * @xfeatures: Features requested by guest CPUID * * Enable all dynamic xfeatures according to guest perm and requested CPUID. * * Return: 0 on success, error code otherwise */ int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures) { lockdep_assert_preemption_enabled(); /* Nothing to do if all requested features are already enabled. */ xfeatures &= ~guest_fpu->xfeatures; if (!xfeatures) return 0; return __xfd_enable_feature(xfeatures, guest_fpu); } EXPORT_SYMBOL_FOR_KVM(fpu_enable_guest_xfd_features); #ifdef CONFIG_X86_64 void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) { struct fpstate *fpstate = guest_fpu->fpstate; fpregs_lock(); /* * KVM's guest ABI is that setting XFD[i]=1 *can* immediately revert the * save state to its initial configuration. Likewise, KVM_GET_XSAVE does * the same as XSAVE and returns XSTATE_BV[i]=0 whenever XFD[i]=1. * * If the guest's FPU state is in hardware, just update XFD: the XSAVE * in fpu_swap_kvm_fpstate will clear XSTATE_BV[i] whenever XFD[i]=1. * * If however the guest's FPU state is NOT resident in hardware, clear * disabled components in XSTATE_BV now, or a subsequent XRSTOR will * attempt to load disabled components and generate #NM _in the host_. */ if (xfd && test_thread_flag(TIF_NEED_FPU_LOAD)) fpstate->regs.xsave.header.xfeatures &= ~xfd; fpstate->xfd = xfd; if (fpstate->in_use) xfd_update_state(fpstate); fpregs_unlock(); } EXPORT_SYMBOL_FOR_KVM(fpu_update_guest_xfd); /** * fpu_sync_guest_vmexit_xfd_state - Synchronize XFD MSR and software state * * Must be invoked from KVM after a VMEXIT before enabling interrupts when * XFD write emulation is disabled. This is required because the guest can * freely modify XFD and the state at VMEXIT is not guaranteed to be the * same as the state on VMENTER. So software state has to be updated before * any operation which depends on it can take place. * * Note: It can be invoked unconditionally even when write emulation is * enabled for the price of a then pointless MSR read. */ void fpu_sync_guest_vmexit_xfd_state(void) { struct fpstate *fpstate = x86_task_fpu(current)->fpstate; lockdep_assert_irqs_disabled(); if (fpu_state_size_dynamic()) { rdmsrq(MSR_IA32_XFD, fpstate->xfd); __this_cpu_write(xfd_state, fpstate->xfd); } } EXPORT_SYMBOL_FOR_KVM(fpu_sync_guest_vmexit_xfd_state); #endif /* CONFIG_X86_64 */ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) { struct fpstate *guest_fps = guest_fpu->fpstate; struct fpu *fpu = x86_task_fpu(current); struct fpstate *cur_fps = fpu->fpstate; fpregs_lock(); if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD)) save_fpregs_to_fpstate(fpu); /* Swap fpstate */ if (enter_guest) { fpu->__task_fpstate = cur_fps; fpu->fpstate = guest_fps; guest_fps->in_use = true; } else { guest_fps->in_use = false; fpu->fpstate = fpu->__task_fpstate; fpu->__task_fpstate = NULL; } cur_fps = fpu->fpstate; if (!cur_fps->is_confidential) { /* Includes XFD update */ restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE); } else { /* * XSTATE is restored by firmware from encrypted * memory. Make sure XFD state is correct while * running with guest fpstate */ xfd_update_state(cur_fps); } fpregs_mark_activate(); fpregs_unlock(); return 0; } EXPORT_SYMBOL_FOR_KVM(fpu_swap_kvm_fpstate); void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u64 xfeatures, u32 pkru) { struct fpstate *kstate = gfpu->fpstate; union fpregs_state *ustate = buf; struct membuf mb = { .p = buf, .left = size }; if (cpu_feature_enabled(X86_FEATURE_XSAVE)) { __copy_xstate_to_uabi_buf(mb, kstate, xfeatures, pkru, XSTATE_COPY_XSAVE); } else { memcpy(&ustate->fxsave, &kstate->regs.fxsave, sizeof(ustate->fxsave)); /* Make it restorable on a XSAVE enabled host */ ustate->xsave.header.xfeatures = XFEATURE_MASK_FPSSE; } } EXPORT_SYMBOL_FOR_KVM(fpu_copy_guest_fpstate_to_uabi); int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru) { struct fpstate *kstate = gfpu->fpstate; const union fpregs_state *ustate = buf; if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) { if (ustate->xsave.header.xfeatures & ~XFEATURE_MASK_FPSSE) return -EINVAL; if (ustate->fxsave.mxcsr & ~mxcsr_feature_mask) return -EINVAL; memcpy(&kstate->regs.fxsave, &ustate->fxsave, sizeof(ustate->fxsave)); return 0; } if (ustate->xsave.header.xfeatures & ~xcr0) return -EINVAL; /* * Disabled features must be in their initial state, otherwise XRSTOR * causes an exception. */ if (WARN_ON_ONCE(ustate->xsave.header.xfeatures & kstate->xfd)) return -EINVAL; /* * Nullify @vpkru to preserve its current value if PKRU's bit isn't set * in the header. KVM's odd ABI is to leave PKRU untouched in this * case (all other components are eventually re-initialized). */ if (!(ustate->xsave.header.xfeatures & XFEATURE_MASK_PKRU)) vpkru = NULL; return copy_uabi_from_kernel_to_xstate(kstate, ustate, vpkru); } EXPORT_SYMBOL_FOR_KVM(fpu_copy_uabi_to_guest_fpstate); #endif /* CONFIG_KVM */ void kernel_fpu_begin_mask(unsigned int kfpu_mask) { if (!irqs_disabled()) fpregs_lock(); WARN_ON_FPU(!irq_fpu_usable()); /* Toggle kernel_fpu_allowed to false: */ WARN_ON_FPU(!this_cpu_read(kernel_fpu_allowed)); this_cpu_write(kernel_fpu_allowed, false); if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER)) && !test_thread_flag(TIF_NEED_FPU_LOAD)) { set_thread_flag(TIF_NEED_FPU_LOAD); save_fpregs_to_fpstate(x86_task_fpu(current)); } __cpu_invalidate_fpregs_state(); /* Put sane initial values into the control registers. */ if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM)) ldmxcsr(MXCSR_DEFAULT); if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU)) asm volatile ("fninit"); } EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask); void kernel_fpu_end(void) { /* Toggle kernel_fpu_allowed back to true: */ WARN_ON_FPU(this_cpu_read(kernel_fpu_allowed)); this_cpu_write(kernel_fpu_allowed, true); if (!irqs_disabled()) fpregs_unlock(); } EXPORT_SYMBOL_GPL(kernel_fpu_end); /* * Sync the FPU register state to current's memory register state when the * current task owns the FPU. The hardware register state is preserved. */ void fpu_sync_fpstate(struct fpu *fpu) { WARN_ON_FPU(fpu != x86_task_fpu(current)); fpregs_lock(); trace_x86_fpu_before_save(fpu); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) save_fpregs_to_fpstate(fpu); trace_x86_fpu_after_save(fpu); fpregs_unlock(); } static inline unsigned int init_fpstate_copy_size(void) { if (!use_xsave()) return fpu_kernel_cfg.default_size; /* XSAVE(S) just needs the legacy and the xstate header part */ return sizeof(init_fpstate.regs.xsave); } static inline void fpstate_init_fxstate(struct fpstate *fpstate) { fpstate->regs.fxsave.cwd = 0x37f; fpstate->regs.fxsave.mxcsr = MXCSR_DEFAULT; } /* * Legacy x87 fpstate state init: */ static inline void fpstate_init_fstate(struct fpstate *fpstate) { fpstate->regs.fsave.cwd = 0xffff037fu; fpstate->regs.fsave.swd = 0xffff0000u; fpstate->regs.fsave.twd = 0xffffffffu; fpstate->regs.fsave.fos = 0xffff0000u; } /* * Used in two places: * 1) Early boot to setup init_fpstate for non XSAVE systems * 2) fpu_alloc_guest_fpstate() which is invoked from KVM */ void fpstate_init_user(struct fpstate *fpstate) { if (!cpu_feature_enabled(X86_FEATURE_FPU)) { fpstate_init_soft(&fpstate->regs.soft); return; } xstate_init_xcomp_bv(&fpstate->regs.xsave, fpstate->xfeatures); if (cpu_feature_enabled(X86_FEATURE_FXSR)) fpstate_init_fxstate(fpstate); else fpstate_init_fstate(fpstate); } static void __fpstate_reset(struct fpstate *fpstate) { /* * Supervisor features (and thus sizes) may diverge between guest * FPUs and host FPUs, as some supervisor features are supported * for guests despite not being utilized by the host. User * features and sizes are always identical, which allows for * common guest and userspace ABI. * * For the host, set XFD to the kernel's desired initialization * value. For guests, set XFD to its architectural RESET value. */ if (fpstate->is_guest) { fpstate->size = guest_default_cfg.size; fpstate->xfeatures = guest_default_cfg.features; fpstate->xfd = 0; } else { fpstate->size = fpu_kernel_cfg.default_size; fpstate->xfeatures = fpu_kernel_cfg.default_features; fpstate->xfd = init_fpstate.xfd; } fpstate->user_size = fpu_user_cfg.default_size; fpstate->user_xfeatures = fpu_user_cfg.default_features; } void fpstate_reset(struct fpu *fpu) { /* Set the fpstate pointer to the default fpstate */ fpu->fpstate = &fpu->__fpstate; __fpstate_reset(fpu->fpstate); /* Initialize the permission related info in fpu */ fpu->perm.__state_perm = fpu_kernel_cfg.default_features; fpu->perm.__state_size = fpu_kernel_cfg.default_size; fpu->perm.__user_state_size = fpu_user_cfg.default_size; fpu->guest_perm.__state_perm = guest_default_cfg.features; fpu->guest_perm.__state_size = guest_default_cfg.size; /* * User features and sizes are always identical between host and * guest FPUs, which allows for common guest and userspace ABI. */ fpu->guest_perm.__user_state_size = fpu_user_cfg.default_size; } static inline void fpu_inherit_perms(struct fpu *dst_fpu) { if (fpu_state_size_dynamic()) { struct fpu *src_fpu = x86_task_fpu(current->group_leader); spin_lock_irq(&current->sighand->siglock); /* Fork also inherits the permissions of the parent */ dst_fpu->perm = src_fpu->perm; dst_fpu->guest_perm = src_fpu->guest_perm; spin_unlock_irq(&current->sighand->siglock); } } /* A passed ssp of zero will not cause any update */ static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp) { #ifdef CONFIG_X86_USER_SHADOW_STACK struct cet_user_state *xstate; /* If ssp update is not needed. */ if (!ssp) return 0; xstate = get_xsave_addr(&x86_task_fpu(dst)->fpstate->regs.xsave, XFEATURE_CET_USER); /* * If there is a non-zero ssp, then 'dst' must be configured with a shadow * stack and the fpu state should be up to date since it was just copied * from the parent in fpu_clone(). So there must be a valid non-init CET * state location in the buffer. */ if (WARN_ON_ONCE(!xstate)) return 1; xstate->user_ssp = (u64)ssp; #endif return 0; } /* Clone current's FPU state on fork */ int fpu_clone(struct task_struct *dst, u64 clone_flags, bool minimal, unsigned long ssp) { /* * We allocate the new FPU structure right after the end of the task struct. * task allocation size already took this into account. * * This is safe because task_struct size is a multiple of cacheline size, * thus x86_task_fpu() will always be cacheline aligned as well. */ struct fpu *dst_fpu = (void *)dst + sizeof(*dst); BUILD_BUG_ON(sizeof(*dst) % SMP_CACHE_BYTES != 0); /* The new task's FPU state cannot be valid in the hardware. */ dst_fpu->last_cpu = -1; fpstate_reset(dst_fpu); if (!cpu_feature_enabled(X86_FEATURE_FPU)) return 0; /* * Enforce reload for user space tasks and prevent kernel threads * from trying to save the FPU registers on context switch. */ set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD); /* * No FPU state inheritance for kernel threads and IO * worker threads. */ if (minimal) { /* Clear out the minimal state */ memcpy(&dst_fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size()); return 0; } /* * If a new feature is added, ensure all dynamic features are * caller-saved from here! */ BUILD_BUG_ON(XFEATURE_MASK_USER_DYNAMIC != XFEATURE_MASK_XTILE_DATA); /* * Save the default portion of the current FPU state into the * clone. Assume all dynamic features to be defined as caller- * saved, which enables skipping both the expansion of fpstate * and the copying of any dynamic state. * * Do not use memcpy() when TIF_NEED_FPU_LOAD is set because * copying is not valid when current uses non-default states. */ fpregs_lock(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) fpregs_restore_userregs(); save_fpregs_to_fpstate(dst_fpu); fpregs_unlock(); if (!(clone_flags & CLONE_THREAD)) fpu_inherit_perms(dst_fpu); /* * Children never inherit PASID state. * Force it to have its init value: */ if (use_xsave()) dst_fpu->fpstate->regs.xsave.header.xfeatures &= ~XFEATURE_MASK_PASID; /* * Update shadow stack pointer, in case it changed during clone. */ if (update_fpu_shstk(dst, ssp)) return 1; trace_x86_fpu_copy_dst(dst_fpu); return 0; } /* * While struct fpu is no longer part of struct thread_struct, it is still * allocated after struct task_struct in the "task_struct" kmem cache. But * since FPU is expected to be part of struct thread_struct, we have to * adjust for it here. */ void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { /* The allocation follows struct task_struct. */ *offset = sizeof(struct task_struct) - offsetof(struct task_struct, thread); *offset += offsetof(struct fpu, __fpstate.regs); *size = fpu_kernel_cfg.default_size; } /* * Drops current FPU state: deactivates the fpregs and * the fpstate. NOTE: it still leaves previous contents * in the fpregs in the eager-FPU case. * * This function can be used in cases where we know that * a state-restore is coming: either an explicit one, * or a reschedule. */ void fpu__drop(struct task_struct *tsk) { struct fpu *fpu; if (test_tsk_thread_flag(tsk, TIF_NEED_FPU_LOAD)) return; fpu = x86_task_fpu(tsk); preempt_disable(); if (fpu == x86_task_fpu(current)) { /* Ignore delayed exceptions from user space */ asm volatile("1: fwait\n" "2:\n" _ASM_EXTABLE(1b, 2b)); fpregs_deactivate(fpu); } trace_x86_fpu_dropped(fpu); preempt_enable(); } /* * Clear FPU registers by setting them up from the init fpstate. * Caller must do fpregs_[un]lock() around it. */ static inline void restore_fpregs_from_init_fpstate(u64 features_mask) { if (use_xsave()) os_xrstor(&init_fpstate, features_mask); else if (use_fxsr()) fxrstor(&init_fpstate.regs.fxsave); else frstor(&init_fpstate.regs.fsave); pkru_write_default(); } /* * Reset current->fpu memory state to the init values. */ static void fpu_reset_fpstate_regs(void) { struct fpu *fpu = x86_task_fpu(current); fpregs_lock(); __fpu_invalidate_fpregs_state(fpu); /* * This does not change the actual hardware registers. It just * resets the memory image and sets TIF_NEED_FPU_LOAD so a * subsequent return to usermode will reload the registers from the * task's memory image. * * Do not use fpstate_init() here. Just copy init_fpstate which has * the correct content already except for PKRU. * * PKRU handling does not rely on the xstate when restoring for * user space as PKRU is eagerly written in switch_to() and * flush_thread(). */ memcpy(&fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size()); set_thread_flag(TIF_NEED_FPU_LOAD); fpregs_unlock(); } /* * Reset current's user FPU states to the init states. current's * supervisor states, if any, are not modified by this function. The * caller guarantees that the XSTATE header in memory is intact. */ void fpu__clear_user_states(struct fpu *fpu) { WARN_ON_FPU(fpu != x86_task_fpu(current)); fpregs_lock(); if (!cpu_feature_enabled(X86_FEATURE_FPU)) { fpu_reset_fpstate_regs(); fpregs_unlock(); return; } /* * Ensure that current's supervisor states are loaded into their * corresponding registers. */ if (xfeatures_mask_supervisor() && !fpregs_state_valid(fpu, smp_processor_id())) os_xrstor_supervisor(fpu->fpstate); /* Ensure XFD state is in sync before reloading XSTATE */ xfd_update_state(fpu->fpstate); /* Reset user states in registers. */ restore_fpregs_from_init_fpstate(XFEATURE_MASK_USER_RESTORE); /* * Now all FPU registers have their desired values. Inform the FPU * state machine that current's FPU registers are in the hardware * registers. The memory image does not need to be updated because * any operation relying on it has to save the registers first when * current's FPU is marked active. */ fpregs_mark_activate(); fpregs_unlock(); } void fpu_flush_thread(void) { fpstate_reset(x86_task_fpu(current)); fpu_reset_fpstate_regs(); } /* * Load FPU context before returning to userspace. */ void switch_fpu_return(void) { if (!static_cpu_has(X86_FEATURE_FPU)) return; fpregs_restore_userregs(); } EXPORT_SYMBOL_FOR_KVM(switch_fpu_return); void fpregs_lock_and_load(void) { /* * fpregs_lock() only disables preemption (mostly). So modifying state * in an interrupt could screw up some in progress fpregs operation. * Warn about it. */ WARN_ON_ONCE(!irq_fpu_usable()); WARN_ON_ONCE(current->flags & PF_KTHREAD); fpregs_lock(); fpregs_assert_state_consistent(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) fpregs_restore_userregs(); } #ifdef CONFIG_X86_DEBUG_FPU /* * If current FPU state according to its tracking (loaded FPU context on this * CPU) is not valid then we must have TIF_NEED_FPU_LOAD set so the context is * loaded on return to userland. */ void fpregs_assert_state_consistent(void) { struct fpu *fpu = x86_task_fpu(current); if (test_thread_flag(TIF_NEED_FPU_LOAD)) return; WARN_ON_FPU(!fpregs_state_valid(fpu, smp_processor_id())); } EXPORT_SYMBOL_FOR_KVM(fpregs_assert_state_consistent); #endif void fpregs_mark_activate(void) { struct fpu *fpu = x86_task_fpu(current); fpregs_activate(fpu); fpu->last_cpu = smp_processor_id(); clear_thread_flag(TIF_NEED_FPU_LOAD); } /* * x87 math exception handling: */ int fpu__exception_code(struct fpu *fpu, int trap_nr) { int err; if (trap_nr == X86_TRAP_MF) { unsigned short cwd, swd; /* * (~cwd & swd) will mask out exceptions that are not set to unmasked * status. 0x3f is the exception bits in these regs, 0x200 is the * C1 reg you need in case of a stack fault, 0x040 is the stack * fault bit. We should only be taking one exception at a time, * so if this combination doesn't produce any single exception, * then we have a bad program that isn't synchronizing its FPU usage * and it will suffer the consequences since we won't be able to * fully reproduce the context of the exception. */ if (boot_cpu_has(X86_FEATURE_FXSR)) { cwd = fpu->fpstate->regs.fxsave.cwd; swd = fpu->fpstate->regs.fxsave.swd; } else { cwd = (unsigned short)fpu->fpstate->regs.fsave.cwd; swd = (unsigned short)fpu->fpstate->regs.fsave.swd; } err = swd & ~cwd; } else { /* * The SIMD FPU exceptions are handled a little differently, as there * is only a single status/control register. Thus, to determine which * unmasked exception was caught we must mask the exception mask bits * at 0x1f80, and then use these to mask the exception bits at 0x3f. */ unsigned short mxcsr = MXCSR_DEFAULT; if (boot_cpu_has(X86_FEATURE_XMM)) mxcsr = fpu->fpstate->regs.fxsave.mxcsr; err = ~(mxcsr >> 7) & mxcsr; } if (err & 0x001) { /* Invalid op */ /* * swd & 0x240 == 0x040: Stack Underflow * swd & 0x240 == 0x240: Stack Overflow * User must clear the SF bit (0x40) if set */ return FPE_FLTINV; } else if (err & 0x004) { /* Divide by Zero */ return FPE_FLTDIV; } else if (err & 0x008) { /* Overflow */ return FPE_FLTOVF; } else if (err & 0x012) { /* Denormal, Underflow */ return FPE_FLTUND; } else if (err & 0x020) { /* Precision */ return FPE_FLTRES; } /* * If we're using IRQ 13, or supposedly even some trap * X86_TRAP_MF implementations, it's possible * we get a spurious trap, which is not an error. */ return 0; } /* * Initialize register state that may prevent from entering low-power idle. * This function will be invoked from the cpuidle driver only when needed. */ noinstr void fpu_idle_fpregs(void) { /* Note: AMX_TILE being enabled implies XGETBV1 support */ if (cpu_feature_enabled(X86_FEATURE_AMX_TILE) && (xfeatures_in_use() & XFEATURE_MASK_XTILE)) { tile_release(); __this_cpu_write(fpu_fpregs_owner_ctx, NULL); } }
2 3 3 3 3 7 2 2 4 2 3 3 2 2 2 22 23 3 10 7 2 11 8 5 8 4 4 2 1 1 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 // SPDX-License-Identifier: GPL-2.0-or-later /* * dir.c - Operations for configfs directories. * * Based on sysfs: * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel * * configfs Copyright (C) 2005 Oracle. All rights reserved. */ #undef DEBUG #include <linux/fs.h> #include <linux/fsnotify.h> #include <linux/mount.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/configfs.h> #include "configfs_internal.h" /* * Protects mutations of configfs_dirent linkage together with proper i_mutex * Also protects mutations of symlinks linkage to target configfs_dirent * Mutators of configfs_dirent linkage must *both* have the proper inode locked * and configfs_dirent_lock locked, in that order. * This allows one to safely traverse configfs_dirent trees and symlinks without * having to lock inodes. * * Protects setting of CONFIGFS_USET_DROPPING: checking the flag * unlocked is not reliable unless in detach_groups() called from * rmdir()/unregister() and from configfs_attach_group() */ DEFINE_SPINLOCK(configfs_dirent_lock); /* * All of link_obj/unlink_obj/link_group/unlink_group require that * subsys->su_mutex is held. * But parent configfs_subsystem is NULL when config_item is root. * Use this mutex when config_item is root. */ static DEFINE_MUTEX(configfs_subsystem_mutex); static void configfs_d_iput(struct dentry * dentry, struct inode * inode) { struct configfs_dirent *sd = dentry->d_fsdata; if (sd) { /* Coordinate with configfs_readdir */ spin_lock(&configfs_dirent_lock); /* * Set sd->s_dentry to null only when this dentry is the one * that is going to be killed. Otherwise configfs_d_iput may * run just after configfs_lookup and set sd->s_dentry to * NULL even it's still in use. */ if (sd->s_dentry == dentry) sd->s_dentry = NULL; spin_unlock(&configfs_dirent_lock); configfs_put(sd); } iput(inode); } const struct dentry_operations configfs_dentry_ops = { .d_iput = configfs_d_iput, }; #ifdef CONFIG_LOCKDEP /* * Helpers to make lockdep happy with our recursive locking of default groups' * inodes (see configfs_attach_group() and configfs_detach_group()). * We put default groups i_mutexes in separate classes according to their depth * from the youngest non-default group ancestor. * * For a non-default group A having default groups A/B, A/C, and A/C/D, default * groups A/B and A/C will have their inode's mutex in class * default_group_class[0], and default group A/C/D will be in * default_group_class[1]. * * The lock classes are declared and assigned in inode.c, according to the * s_depth value. * The s_depth value is initialized to -1, adjusted to >= 0 when attaching * default groups, and reset to -1 when all default groups are attached. During * attachment, if configfs_create() sees s_depth > 0, the lock class of the new * inode's mutex is set to default_group_class[s_depth - 1]. */ static void configfs_init_dirent_depth(struct configfs_dirent *sd) { sd->s_depth = -1; } static void configfs_set_dir_dirent_depth(struct configfs_dirent *parent_sd, struct configfs_dirent *sd) { int parent_depth = parent_sd->s_depth; if (parent_depth >= 0) sd->s_depth = parent_depth + 1; } static void configfs_adjust_dir_dirent_depth_before_populate(struct configfs_dirent *sd) { /* * item's i_mutex class is already setup, so s_depth is now only * used to set new sub-directories s_depth, which is always done * with item's i_mutex locked. */ /* * sd->s_depth == -1 iff we are a non default group. * else (we are a default group) sd->s_depth > 0 (see * create_dir()). */ if (sd->s_depth == -1) /* * We are a non default group and we are going to create * default groups. */ sd->s_depth = 0; } static void configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd) { /* We will not create default groups anymore. */ sd->s_depth = -1; } #else /* CONFIG_LOCKDEP */ static void configfs_init_dirent_depth(struct configfs_dirent *sd) { } static void configfs_set_dir_dirent_depth(struct configfs_dirent *parent_sd, struct configfs_dirent *sd) { } static void configfs_adjust_dir_dirent_depth_before_populate(struct configfs_dirent *sd) { } static void configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd) { } #endif /* CONFIG_LOCKDEP */ static struct configfs_fragment *new_fragment(void) { struct configfs_fragment *p; p = kmalloc(sizeof(struct configfs_fragment), GFP_KERNEL); if (p) { atomic_set(&p->frag_count, 1); init_rwsem(&p->frag_sem); p->frag_dead = false; } return p; } void put_fragment(struct configfs_fragment *frag) { if (frag && atomic_dec_and_test(&frag->frag_count)) kfree(frag); } struct configfs_fragment *get_fragment(struct configfs_fragment *frag) { if (likely(frag)) atomic_inc(&frag->frag_count); return frag; } /* * Allocates a new configfs_dirent and links it to the parent configfs_dirent */ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *parent_sd, void *element, int type, struct configfs_fragment *frag) { struct configfs_dirent * sd; sd = kmem_cache_zalloc(configfs_dir_cachep, GFP_KERNEL); if (!sd) return ERR_PTR(-ENOMEM); atomic_set(&sd->s_count, 1); INIT_LIST_HEAD(&sd->s_children); sd->s_element = element; sd->s_type = type; configfs_init_dirent_depth(sd); spin_lock(&configfs_dirent_lock); if (parent_sd->s_type & CONFIGFS_USET_DROPPING) { spin_unlock(&configfs_dirent_lock); kmem_cache_free(configfs_dir_cachep, sd); return ERR_PTR(-ENOENT); } sd->s_frag = get_fragment(frag); /* * configfs_lookup scans only for unpinned items. s_children is * partitioned so that configfs_lookup can bail out early. * CONFIGFS_PINNED and CONFIGFS_NOT_PINNED are not symmetrical. readdir * cursors still need to be inserted at the front of the list. */ if (sd->s_type & CONFIGFS_PINNED) list_add_tail(&sd->s_sibling, &parent_sd->s_children); else list_add(&sd->s_sibling, &parent_sd->s_children); spin_unlock(&configfs_dirent_lock); return sd; } /* * * Return -EEXIST if there is already a configfs element with the same * name for the same parent. * * called with parent inode's i_mutex held */ static int configfs_dirent_exists(struct dentry *dentry) { struct configfs_dirent *parent_sd = dentry->d_parent->d_fsdata; const unsigned char *new = dentry->d_name.name; struct configfs_dirent *sd; list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { if (sd->s_element) { const unsigned char *existing = configfs_get_name(sd); if (strcmp(existing, new)) continue; else return -EEXIST; } } return 0; } int configfs_make_dirent(struct configfs_dirent * parent_sd, struct dentry * dentry, void * element, umode_t mode, int type, struct configfs_fragment *frag) { struct configfs_dirent * sd; sd = configfs_new_dirent(parent_sd, element, type, frag); if (IS_ERR(sd)) return PTR_ERR(sd); sd->s_mode = mode; sd->s_dentry = dentry; if (dentry) dentry->d_fsdata = configfs_get(sd); return 0; } static void configfs_remove_dirent(struct dentry *dentry) { struct configfs_dirent *sd = dentry->d_fsdata; if (!sd) return; spin_lock(&configfs_dirent_lock); list_del_init(&sd->s_sibling); spin_unlock(&configfs_dirent_lock); configfs_put(sd); } /** * configfs_create_dir - create a directory for an config_item. * @item: config_itemwe're creating directory for. * @dentry: config_item's dentry. * @frag: config_item's fragment. * * Note: user-created entries won't be allowed under this new directory * until it is validated by configfs_dir_set_ready() */ static int configfs_create_dir(struct config_item *item, struct dentry *dentry, struct configfs_fragment *frag) { int error; umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; struct dentry *p = dentry->d_parent; struct inode *inode; BUG_ON(!item); error = configfs_make_dirent(p->d_fsdata, dentry, item, mode, CONFIGFS_DIR | CONFIGFS_USET_CREATING, frag); if (unlikely(error)) return error; configfs_set_dir_dirent_depth(p->d_fsdata, dentry->d_fsdata); inode = configfs_create(dentry, mode); if (IS_ERR(inode)) goto out_remove; inode->i_op = &configfs_dir_inode_operations; inode->i_fop = &configfs_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); d_instantiate(dentry, inode); /* already hashed */ dget(dentry); /* pin directory dentries in core */ inc_nlink(d_inode(p)); item->ci_dentry = dentry; return 0; out_remove: configfs_put(dentry->d_fsdata); configfs_remove_dirent(dentry); return PTR_ERR(inode); } /* * Allow userspace to create new entries under a new directory created with * configfs_create_dir(), and under all of its chidlren directories recursively. * @sd configfs_dirent of the new directory to validate * * Caller must hold configfs_dirent_lock. */ static void configfs_dir_set_ready(struct configfs_dirent *sd) { struct configfs_dirent *child_sd; sd->s_type &= ~CONFIGFS_USET_CREATING; list_for_each_entry(child_sd, &sd->s_children, s_sibling) if (child_sd->s_type & CONFIGFS_USET_CREATING) configfs_dir_set_ready(child_sd); } /* * Check that a directory does not belong to a directory hierarchy being * attached and not validated yet. * @sd configfs_dirent of the directory to check * * @return non-zero iff the directory was validated * * Note: takes configfs_dirent_lock, so the result may change from false to true * in two consecutive calls, but never from true to false. */ int configfs_dirent_is_ready(struct configfs_dirent *sd) { int ret; spin_lock(&configfs_dirent_lock); ret = !(sd->s_type & CONFIGFS_USET_CREATING); spin_unlock(&configfs_dirent_lock); return ret; } int configfs_create_link(struct configfs_dirent *target, struct dentry *parent, struct dentry *dentry, char *body) { int err = 0; umode_t mode = S_IFLNK | S_IRWXUGO; struct configfs_dirent *p = parent->d_fsdata; struct inode *inode; err = configfs_make_dirent(p, dentry, target, mode, CONFIGFS_ITEM_LINK, p->s_frag); if (err) return err; inode = configfs_create(dentry, mode); if (IS_ERR(inode)) goto out_remove; inode->i_link = body; inode->i_op = &configfs_symlink_inode_operations; d_instantiate(dentry, inode); dget(dentry); /* pin link dentries in core */ return 0; out_remove: configfs_put(dentry->d_fsdata); configfs_remove_dirent(dentry); return PTR_ERR(inode); } static void remove_dir(struct dentry * d) { struct dentry * parent = dget(d->d_parent); configfs_remove_dirent(d); if (d_really_is_positive(d)) { if (likely(simple_empty(d))) { __simple_rmdir(d_inode(parent),d); dput(d); } else { pr_warn("remove_dir (%pd): attributes remain", d); } } pr_debug(" o %pd removing done (%d)\n", d, d_count(d)); dput(parent); } /** * configfs_remove_dir - remove an config_item's directory. * @item: config_item we're removing. * * The only thing special about this is that we remove any files in * the directory before we remove the directory, and we've inlined * what used to be configfs_rmdir() below, instead of calling separately. * * Caller holds the mutex of the item's inode */ static void configfs_remove_dir(struct config_item * item) { struct dentry * dentry = dget(item->ci_dentry); if (!dentry) return; remove_dir(dentry); /** * Drop reference from dget() on entrance. */ dput(dentry); } static struct dentry * configfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata; struct configfs_dirent * sd; struct inode *inode = NULL; if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); /* * Fake invisibility if dir belongs to a group/default groups hierarchy * being attached * * This forbids userspace to read/write attributes of items which may * not complete their initialization, since the dentries of the * attributes won't be instantiated. */ if (!configfs_dirent_is_ready(parent_sd)) return ERR_PTR(-ENOENT); spin_lock(&configfs_dirent_lock); list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { /* * s_children is partitioned, see configfs_new_dirent. The first * pinned item indicates we can stop scanning. */ if (sd->s_type & CONFIGFS_PINNED) break; /* * Note: CONFIGFS_PINNED and CONFIGFS_NOT_PINNED are asymmetric. * there may be a readdir cursor in this list */ if ((sd->s_type & CONFIGFS_NOT_PINNED) && !strcmp(configfs_get_name(sd), dentry->d_name.name)) { struct configfs_attribute *attr = sd->s_element; umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG; dentry->d_fsdata = configfs_get(sd); sd->s_dentry = dentry; spin_unlock(&configfs_dirent_lock); inode = configfs_create(dentry, mode); if (IS_ERR(inode)) { configfs_put(sd); return ERR_CAST(inode); } if (sd->s_type & CONFIGFS_ITEM_BIN_ATTR) { inode->i_size = 0; inode->i_fop = &configfs_bin_file_operations; } else { inode->i_size = PAGE_SIZE; inode->i_fop = &configfs_file_operations; } goto done; } } spin_unlock(&configfs_dirent_lock); done: d_add(dentry, inode); return NULL; } /* * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are * attributes and are removed by rmdir(). We recurse, setting * CONFIGFS_USET_DROPPING on all children that are candidates for * default detach. * If there is an error, the caller will reset the flags via * configfs_detach_rollback(). */ static int configfs_detach_prep(struct dentry *dentry, struct dentry **wait) { struct configfs_dirent *parent_sd = dentry->d_fsdata; struct configfs_dirent *sd; int ret; /* Mark that we're trying to drop the group */ parent_sd->s_type |= CONFIGFS_USET_DROPPING; ret = -EBUSY; if (parent_sd->s_links) goto out; ret = 0; list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { if (!sd->s_element || (sd->s_type & CONFIGFS_NOT_PINNED)) continue; if (sd->s_type & CONFIGFS_USET_DEFAULT) { /* Abort if racing with mkdir() */ if (sd->s_type & CONFIGFS_USET_IN_MKDIR) { if (wait) *wait= dget(sd->s_dentry); return -EAGAIN; } /* * Yup, recursive. If there's a problem, blame * deep nesting of default_groups */ ret = configfs_detach_prep(sd->s_dentry, wait); if (!ret) continue; } else ret = -ENOTEMPTY; break; } out: return ret; } /* * Walk the tree, resetting CONFIGFS_USET_DROPPING wherever it was * set. */ static void configfs_detach_rollback(struct dentry *dentry) { struct configfs_dirent *parent_sd = dentry->d_fsdata; struct configfs_dirent *sd; parent_sd->s_type &= ~CONFIGFS_USET_DROPPING; list_for_each_entry(sd, &parent_sd->s_children, s_sibling) if (sd->s_type & CONFIGFS_USET_DEFAULT) configfs_detach_rollback(sd->s_dentry); } static void detach_attrs(struct config_item * item) { struct dentry * dentry = dget(item->ci_dentry); struct configfs_dirent * parent_sd; struct configfs_dirent * sd, * tmp; if (!dentry) return; pr_debug("configfs %s: dropping attrs for dir\n", dentry->d_name.name); parent_sd = dentry->d_fsdata; list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) { if (!sd->s_element || !(sd->s_type & CONFIGFS_NOT_PINNED)) continue; spin_lock(&configfs_dirent_lock); list_del_init(&sd->s_sibling); spin_unlock(&configfs_dirent_lock); configfs_drop_dentry(sd, dentry); configfs_put(sd); } /** * Drop reference from dget() on entrance. */ dput(dentry); } static int populate_attrs(struct config_item *item) { const struct config_item_type *t = item->ci_type; const struct configfs_group_operations *ops; struct configfs_attribute *attr; struct configfs_bin_attribute *bin_attr; int error = 0; int i; if (!t) return -EINVAL; ops = t->ct_group_ops; if (t->ct_attrs) { for (i = 0; (attr = t->ct_attrs[i]) != NULL; i++) { if (ops && ops->is_visible && !ops->is_visible(item, attr, i)) continue; if ((error = configfs_create_file(item, attr))) break; } } if (!error && t->ct_bin_attrs) { for (i = 0; (bin_attr = t->ct_bin_attrs[i]) != NULL; i++) { if (ops && ops->is_bin_visible && !ops->is_bin_visible(item, bin_attr, i)) continue; error = configfs_create_bin_file(item, bin_attr); if (error) break; } } if (error) detach_attrs(item); return error; } static int configfs_attach_group(struct config_item *parent_item, struct config_item *item, struct dentry *dentry, struct configfs_fragment *frag); static void configfs_detach_group(struct config_item *item); static void detach_groups(struct config_group *group) { struct dentry * dentry = dget(group->cg_item.ci_dentry); struct dentry *child; struct configfs_dirent *parent_sd; struct configfs_dirent *sd, *tmp; if (!dentry) return; parent_sd = dentry->d_fsdata; list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) { if (!sd->s_element || !(sd->s_type & CONFIGFS_USET_DEFAULT)) continue; child = sd->s_dentry; inode_lock(d_inode(child)); configfs_detach_group(sd->s_element); d_inode(child)->i_flags |= S_DEAD; dont_mount(child); inode_unlock(d_inode(child)); d_delete(child); dput(child); } /** * Drop reference from dget() on entrance. */ dput(dentry); } /* * This fakes mkdir(2) on a default_groups[] entry. It * creates a dentry, attachs it, and then does fixup * on the sd->s_type. * * We could, perhaps, tweak our parent's ->mkdir for a minute and * try using vfs_mkdir. Just a thought. */ static int create_default_group(struct config_group *parent_group, struct config_group *group, struct configfs_fragment *frag) { int ret; struct configfs_dirent *sd; /* We trust the caller holds a reference to parent */ struct dentry *child, *parent = parent_group->cg_item.ci_dentry; if (!group->cg_item.ci_name) group->cg_item.ci_name = group->cg_item.ci_namebuf; ret = -ENOMEM; child = d_alloc_name(parent, group->cg_item.ci_name); if (child) { d_add(child, NULL); ret = configfs_attach_group(&parent_group->cg_item, &group->cg_item, child, frag); if (!ret) { sd = child->d_fsdata; sd->s_type |= CONFIGFS_USET_DEFAULT; } else { BUG_ON(d_inode(child)); d_drop(child); dput(child); } } return ret; } static int populate_groups(struct config_group *group, struct configfs_fragment *frag) { struct config_group *new_group; int ret = 0; list_for_each_entry(new_group, &group->default_groups, group_entry) { ret = create_default_group(group, new_group, frag); if (ret) { detach_groups(group); break; } } return ret; } void configfs_remove_default_groups(struct config_group *group) { struct config_group *g, *n; list_for_each_entry_safe(g, n, &group->default_groups, group_entry) { list_del(&g->group_entry); config_item_put(&g->cg_item); } } EXPORT_SYMBOL(configfs_remove_default_groups); /* * All of link_obj/unlink_obj/link_group/unlink_group require that * subsys->su_mutex is held. */ static void unlink_obj(struct config_item *item) { struct config_group *group; group = item->ci_group; if (group) { list_del_init(&item->ci_entry); item->ci_group = NULL; item->ci_parent = NULL; /* Drop the reference for ci_entry */ config_item_put(item); /* Drop the reference for ci_parent */ config_group_put(group); } } static void link_obj(struct config_item *parent_item, struct config_item *item) { /* * Parent seems redundant with group, but it makes certain * traversals much nicer. */ item->ci_parent = parent_item; /* * We hold a reference on the parent for the child's ci_parent * link. */ item->ci_group = config_group_get(to_config_group(parent_item)); list_add_tail(&item->ci_entry, &item->ci_group->cg_children); /* * We hold a reference on the child for ci_entry on the parent's * cg_children */ config_item_get(item); } static void unlink_group(struct config_group *group) { struct config_group *new_group; list_for_each_entry(new_group, &group->default_groups, group_entry) unlink_group(new_group); group->cg_subsys = NULL; unlink_obj(&group->cg_item); } static void link_group(struct config_group *parent_group, struct config_group *group) { struct config_group *new_group; struct configfs_subsystem *subsys = NULL; /* gcc is a turd */ link_obj(&parent_group->cg_item, &group->cg_item); if (parent_group->cg_subsys) subsys = parent_group->cg_subsys; else if (configfs_is_root(&parent_group->cg_item)) subsys = to_configfs_subsystem(group); else BUG(); group->cg_subsys = subsys; list_for_each_entry(new_group, &group->default_groups, group_entry) link_group(group, new_group); } /* * The goal is that configfs_attach_item() (and * configfs_attach_group()) can be called from either the VFS or this * module. That is, they assume that the items have been created, * the dentry allocated, and the dcache is all ready to go. * * If they fail, they must clean up after themselves as if they * had never been called. The caller (VFS or local function) will * handle cleaning up the dcache bits. * * configfs_detach_group() and configfs_detach_item() behave similarly on * the way out. They assume that the proper semaphores are held, they * clean up the configfs items, and they expect their callers will * handle the dcache bits. */ static int configfs_attach_item(struct config_item *parent_item, struct config_item *item, struct dentry *dentry, struct configfs_fragment *frag) { int ret; ret = configfs_create_dir(item, dentry, frag); if (!ret) { ret = populate_attrs(item); if (ret) { /* * We are going to remove an inode and its dentry but * the VFS may already have hit and used them. Thus, * we must lock them as rmdir() would. */ inode_lock(d_inode(dentry)); configfs_remove_dir(item); d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); inode_unlock(d_inode(dentry)); d_delete(dentry); } } return ret; } /* Caller holds the mutex of the item's inode */ static void configfs_detach_item(struct config_item *item) { detach_attrs(item); configfs_remove_dir(item); } static int configfs_attach_group(struct config_item *parent_item, struct config_item *item, struct dentry *dentry, struct configfs_fragment *frag) { int ret; struct configfs_dirent *sd; ret = configfs_attach_item(parent_item, item, dentry, frag); if (!ret) { sd = dentry->d_fsdata; sd->s_type |= CONFIGFS_USET_DIR; /* * FYI, we're faking mkdir in populate_groups() * We must lock the group's inode to avoid races with the VFS * which can already hit the inode and try to add/remove entries * under it. * * We must also lock the inode to remove it safely in case of * error, as rmdir() would. */ inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); configfs_adjust_dir_dirent_depth_before_populate(sd); ret = populate_groups(to_config_group(item), frag); if (ret) { configfs_detach_item(item); d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); } configfs_adjust_dir_dirent_depth_after_populate(sd); inode_unlock(d_inode(dentry)); if (ret) d_delete(dentry); } return ret; } /* Caller holds the mutex of the group's inode */ static void configfs_detach_group(struct config_item *item) { detach_groups(to_config_group(item)); configfs_detach_item(item); } /* * After the item has been detached from the filesystem view, we are * ready to tear it out of the hierarchy. Notify the client before * we do that so they can perform any cleanup that requires * navigating the hierarchy. A client does not need to provide this * callback. The subsystem semaphore MUST be held by the caller, and * references must be valid for both items. It also assumes the * caller has validated ci_type. */ static void client_disconnect_notify(struct config_item *parent_item, struct config_item *item) { const struct config_item_type *type; type = parent_item->ci_type; BUG_ON(!type); if (type->ct_group_ops && type->ct_group_ops->disconnect_notify) type->ct_group_ops->disconnect_notify(to_config_group(parent_item), item); } /* * Drop the initial reference from make_item()/make_group() * This function assumes that reference is held on item * and that item holds a valid reference to the parent. Also, it * assumes the caller has validated ci_type. */ static void client_drop_item(struct config_item *parent_item, struct config_item *item) { const struct config_item_type *type; type = parent_item->ci_type; BUG_ON(!type); /* * If ->drop_item() exists, it is responsible for the * config_item_put(). */ if (type->ct_group_ops && type->ct_group_ops->drop_item) type->ct_group_ops->drop_item(to_config_group(parent_item), item); else config_item_put(item); } #ifdef DEBUG static void configfs_dump_one(struct configfs_dirent *sd, int level) { pr_info("%*s\"%s\":\n", level, " ", configfs_get_name(sd)); #define type_print(_type) if (sd->s_type & _type) pr_info("%*s %s\n", level, " ", #_type) type_print(CONFIGFS_ROOT); type_print(CONFIGFS_DIR); type_print(CONFIGFS_ITEM_ATTR); type_print(CONFIGFS_ITEM_LINK); type_print(CONFIGFS_USET_DIR); type_print(CONFIGFS_USET_DEFAULT); type_print(CONFIGFS_USET_DROPPING); #undef type_print } static int configfs_dump(struct configfs_dirent *sd, int level) { struct configfs_dirent *child_sd; int ret = 0; configfs_dump_one(sd, level); if (!(sd->s_type & (CONFIGFS_DIR|CONFIGFS_ROOT))) return 0; list_for_each_entry(child_sd, &sd->s_children, s_sibling) { ret = configfs_dump(child_sd, level + 2); if (ret) break; } return ret; } #endif /* * configfs_depend_item() and configfs_undepend_item() * * WARNING: Do not call these from a configfs callback! * * This describes these functions and their helpers. * * Allow another kernel system to depend on a config_item. If this * happens, the item cannot go away until the dependent can live without * it. The idea is to give client modules as simple an interface as * possible. When a system asks them to depend on an item, they just * call configfs_depend_item(). If the item is live and the client * driver is in good shape, we'll happily do the work for them. * * Why is the locking complex? Because configfs uses the VFS to handle * all locking, but this function is called outside the normal * VFS->configfs path. So it must take VFS locks to prevent the * VFS->configfs stuff (configfs_mkdir(), configfs_rmdir(), etc). This is * why you can't call these functions underneath configfs callbacks. * * Note, btw, that this can be called at *any* time, even when a configfs * subsystem isn't registered, or when configfs is loading or unloading. * Just like configfs_register_subsystem(). So we take the same * precautions. We pin the filesystem. We lock configfs_dirent_lock. * If we can find the target item in the * configfs tree, it must be part of the subsystem tree as well, so we * do not need the subsystem semaphore. Holding configfs_dirent_lock helps * locking out mkdir() and rmdir(), who might be racing us. */ /* * configfs_depend_prep() * * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are * attributes. This is similar but not the same to configfs_detach_prep(). * Note that configfs_detach_prep() expects the parent to be locked when it * is called, but we lock the parent *inside* configfs_depend_prep(). We * do that so we can unlock it if we find nothing. * * Here we do a depth-first search of the dentry hierarchy looking for * our object. * We deliberately ignore items tagged as dropping since they are virtually * dead, as well as items in the middle of attachment since they virtually * do not exist yet. This completes the locking out of racing mkdir() and * rmdir(). * Note: subdirectories in the middle of attachment start with s_type = * CONFIGFS_DIR|CONFIGFS_USET_CREATING set by create_dir(). When * CONFIGFS_USET_CREATING is set, we ignore the item. The actual set of * s_type is in configfs_new_dirent(), which has configfs_dirent_lock. * * If the target is not found, -ENOENT is bubbled up. * * This adds a requirement that all config_items be unique! * * This is recursive. There isn't * much on the stack, though, so folks that need this function - be careful * about your stack! Patches will be accepted to make it iterative. */ static int configfs_depend_prep(struct dentry *origin, struct config_item *target) { struct configfs_dirent *child_sd, *sd; int ret = 0; BUG_ON(!origin || !origin->d_fsdata); sd = origin->d_fsdata; if (sd->s_element == target) /* Boo-yah */ goto out; list_for_each_entry(child_sd, &sd->s_children, s_sibling) { if ((child_sd->s_type & CONFIGFS_DIR) && !(child_sd->s_type & CONFIGFS_USET_DROPPING) && !(child_sd->s_type & CONFIGFS_USET_CREATING)) { ret = configfs_depend_prep(child_sd->s_dentry, target); if (!ret) goto out; /* Child path boo-yah */ } } /* We looped all our children and didn't find target */ ret = -ENOENT; out: return ret; } static int configfs_do_depend_item(struct dentry *subsys_dentry, struct config_item *target) { struct configfs_dirent *p; int ret; spin_lock(&configfs_dirent_lock); /* Scan the tree, return 0 if found */ ret = configfs_depend_prep(subsys_dentry, target); if (ret) goto out_unlock_dirent_lock; /* * We are sure that the item is not about to be removed by rmdir(), and * not in the middle of attachment by mkdir(). */ p = target->ci_dentry->d_fsdata; p->s_dependent_count += 1; out_unlock_dirent_lock: spin_unlock(&configfs_dirent_lock); return ret; } static inline struct configfs_dirent * configfs_find_subsys_dentry(struct configfs_dirent *root_sd, struct config_item *subsys_item) { struct configfs_dirent *p; struct configfs_dirent *ret = NULL; list_for_each_entry(p, &root_sd->s_children, s_sibling) { if (p->s_type & CONFIGFS_DIR && p->s_element == subsys_item) { ret = p; break; } } return ret; } int configfs_depend_item(struct configfs_subsystem *subsys, struct config_item *target) { int ret; struct configfs_dirent *subsys_sd; struct config_item *s_item = &subsys->su_group.cg_item; struct dentry *root; /* * Pin the configfs filesystem. This means we can safely access * the root of the configfs filesystem. */ root = configfs_pin_fs(); if (IS_ERR(root)) return PTR_ERR(root); /* * Next, lock the root directory. We're going to check that the * subsystem is really registered, and so we need to lock out * configfs_[un]register_subsystem(). */ inode_lock(d_inode(root)); subsys_sd = configfs_find_subsys_dentry(root->d_fsdata, s_item); if (!subsys_sd) { ret = -ENOENT; goto out_unlock_fs; } /* Ok, now we can trust subsys/s_item */ ret = configfs_do_depend_item(subsys_sd->s_dentry, target); out_unlock_fs: inode_unlock(d_inode(root)); /* * If we succeeded, the fs is pinned via other methods. If not, * we're done with it anyway. So release_fs() is always right. */ configfs_release_fs(); return ret; } EXPORT_SYMBOL(configfs_depend_item); /* * Release the dependent linkage. This is much simpler than * configfs_depend_item() because we know that the client driver is * pinned, thus the subsystem is pinned, and therefore configfs is pinned. */ void configfs_undepend_item(struct config_item *target) { struct configfs_dirent *sd; /* * Since we can trust everything is pinned, we just need * configfs_dirent_lock. */ spin_lock(&configfs_dirent_lock); sd = target->ci_dentry->d_fsdata; BUG_ON(sd->s_dependent_count < 1); sd->s_dependent_count -= 1; /* * After this unlock, we cannot trust the item to stay alive! * DO NOT REFERENCE item after this unlock. */ spin_unlock(&configfs_dirent_lock); } EXPORT_SYMBOL(configfs_undepend_item); /* * caller_subsys is a caller's subsystem not target's. This is used to * determine if we should lock root and check subsys or not. When we are * in the same subsystem as our target there is no need to do locking as * we know that subsys is valid and is not unregistered during this function * as we are called from callback of one of his children and VFS holds a lock * on some inode. Otherwise we have to lock our root to ensure that target's * subsystem it is not unregistered during this function. */ int configfs_depend_item_unlocked(struct configfs_subsystem *caller_subsys, struct config_item *target) { struct configfs_subsystem *target_subsys; struct config_group *root, *parent; struct configfs_dirent *subsys_sd; int ret = -ENOENT; /* Disallow this function for configfs root */ if (configfs_is_root(target)) return -EINVAL; parent = target->ci_group; /* * This may happen when someone is trying to depend root * directory of some subsystem */ if (configfs_is_root(&parent->cg_item)) { target_subsys = to_configfs_subsystem(to_config_group(target)); root = parent; } else { target_subsys = parent->cg_subsys; /* Find a cofnigfs root as we may need it for locking */ for (root = parent; !configfs_is_root(&root->cg_item); root = root->cg_item.ci_group) ; } if (target_subsys != caller_subsys) { /* * We are in other configfs subsystem, so we have to do * additional locking to prevent other subsystem from being * unregistered */ inode_lock(d_inode(root->cg_item.ci_dentry)); /* * As we are trying to depend item from other subsystem * we have to check if this subsystem is still registered */ subsys_sd = configfs_find_subsys_dentry( root->cg_item.ci_dentry->d_fsdata, &target_subsys->su_group.cg_item); if (!subsys_sd) goto out_root_unlock; } else { subsys_sd = target_subsys->su_group.cg_item.ci_dentry->d_fsdata; } /* Now we can execute core of depend item */ ret = configfs_do_depend_item(subsys_sd->s_dentry, target); if (target_subsys != caller_subsys) out_root_unlock: /* * We were called from subsystem other than our target so we * took some locks so now it's time to release them */ inode_unlock(d_inode(root->cg_item.ci_dentry)); return ret; } EXPORT_SYMBOL(configfs_depend_item_unlocked); static struct dentry *configfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int ret = 0; int module_got = 0; struct config_group *group = NULL; struct config_item *item = NULL; struct config_item *parent_item; struct configfs_subsystem *subsys; struct configfs_dirent *sd; const struct config_item_type *type; struct module *subsys_owner = NULL, *new_item_owner = NULL; struct configfs_fragment *frag; char *name; sd = dentry->d_parent->d_fsdata; /* * Fake invisibility if dir belongs to a group/default groups hierarchy * being attached */ if (!configfs_dirent_is_ready(sd)) { ret = -ENOENT; goto out; } if (!(sd->s_type & CONFIGFS_USET_DIR)) { ret = -EPERM; goto out; } frag = new_fragment(); if (!frag) { ret = -ENOMEM; goto out; } /* Get a working ref for the duration of this function */ parent_item = configfs_get_config_item(dentry->d_parent); type = parent_item->ci_type; subsys = to_config_group(parent_item)->cg_subsys; BUG_ON(!subsys); if (!type || !type->ct_group_ops || (!type->ct_group_ops->make_group && !type->ct_group_ops->make_item)) { ret = -EPERM; /* Lack-of-mkdir returns -EPERM */ goto out_put; } /* * The subsystem may belong to a different module than the item * being created. We don't want to safely pin the new item but * fail to pin the subsystem it sits under. */ if (!subsys->su_group.cg_item.ci_type) { ret = -EINVAL; goto out_put; } subsys_owner = subsys->su_group.cg_item.ci_type->ct_owner; if (!try_module_get(subsys_owner)) { ret = -EINVAL; goto out_put; } name = kmalloc(dentry->d_name.len + 1, GFP_KERNEL); if (!name) { ret = -ENOMEM; goto out_subsys_put; } snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name); mutex_lock(&subsys->su_mutex); if (type->ct_group_ops->make_group) { group = type->ct_group_ops->make_group(to_config_group(parent_item), name); if (!group) group = ERR_PTR(-ENOMEM); if (!IS_ERR(group)) { link_group(to_config_group(parent_item), group); item = &group->cg_item; } else ret = PTR_ERR(group); } else { item = type->ct_group_ops->make_item(to_config_group(parent_item), name); if (!item) item = ERR_PTR(-ENOMEM); if (!IS_ERR(item)) link_obj(parent_item, item); else ret = PTR_ERR(item); } mutex_unlock(&subsys->su_mutex); kfree(name); if (ret) { /* * If ret != 0, then link_obj() was never called. * There are no extra references to clean up. */ goto out_subsys_put; } /* * link_obj() has been called (via link_group() for groups). * From here on out, errors must clean that up. */ type = item->ci_type; if (!type) { ret = -EINVAL; goto out_unlink; } new_item_owner = type->ct_owner; if (!try_module_get(new_item_owner)) { ret = -EINVAL; goto out_unlink; } /* * I hate doing it this way, but if there is * an error, module_put() probably should * happen after any cleanup. */ module_got = 1; /* * Make racing rmdir() fail if it did not tag parent with * CONFIGFS_USET_DROPPING * Note: if CONFIGFS_USET_DROPPING is already set, attach_group() will * fail and let rmdir() terminate correctly */ spin_lock(&configfs_dirent_lock); /* This will make configfs_detach_prep() fail */ sd->s_type |= CONFIGFS_USET_IN_MKDIR; spin_unlock(&configfs_dirent_lock); if (group) ret = configfs_attach_group(parent_item, item, dentry, frag); else ret = configfs_attach_item(parent_item, item, dentry, frag); spin_lock(&configfs_dirent_lock); sd->s_type &= ~CONFIGFS_USET_IN_MKDIR; if (!ret) configfs_dir_set_ready(dentry->d_fsdata); spin_unlock(&configfs_dirent_lock); out_unlink: if (ret) { /* Tear down everything we built up */ mutex_lock(&subsys->su_mutex); client_disconnect_notify(parent_item, item); if (group) unlink_group(group); else unlink_obj(item); client_drop_item(parent_item, item); mutex_unlock(&subsys->su_mutex); if (module_got) module_put(new_item_owner); } out_subsys_put: if (ret) module_put(subsys_owner); out_put: /* * link_obj()/link_group() took a reference from child->parent, * so the parent is safely pinned. We can drop our working * reference. */ config_item_put(parent_item); put_fragment(frag); out: return ERR_PTR(ret); } static int configfs_rmdir(struct inode *dir, struct dentry *dentry) { struct config_item *parent_item; struct config_item *item; struct configfs_subsystem *subsys; struct configfs_dirent *sd; struct configfs_fragment *frag; struct module *subsys_owner = NULL, *dead_item_owner = NULL; int ret; sd = dentry->d_fsdata; if (sd->s_type & CONFIGFS_USET_DEFAULT) return -EPERM; /* Get a working ref until we have the child */ parent_item = configfs_get_config_item(dentry->d_parent); subsys = to_config_group(parent_item)->cg_subsys; BUG_ON(!subsys); if (!parent_item->ci_type) { config_item_put(parent_item); return -EINVAL; } /* configfs_mkdir() shouldn't have allowed this */ BUG_ON(!subsys->su_group.cg_item.ci_type); subsys_owner = subsys->su_group.cg_item.ci_type->ct_owner; /* * Ensure that no racing symlink() will make detach_prep() fail while * the new link is temporarily attached */ do { struct dentry *wait; mutex_lock(&configfs_symlink_mutex); spin_lock(&configfs_dirent_lock); /* * Here's where we check for dependents. We're protected by * configfs_dirent_lock. * If no dependent, atomically tag the item as dropping. */ ret = sd->s_dependent_count ? -EBUSY : 0; if (!ret) { ret = configfs_detach_prep(dentry, &wait); if (ret) configfs_detach_rollback(dentry); } spin_unlock(&configfs_dirent_lock); mutex_unlock(&configfs_symlink_mutex); if (ret) { if (ret != -EAGAIN) { config_item_put(parent_item); return ret; } /* Wait until the racing operation terminates */ inode_lock(d_inode(wait)); inode_unlock(d_inode(wait)); dput(wait); } } while (ret == -EAGAIN); frag = sd->s_frag; if (down_write_killable(&frag->frag_sem)) { spin_lock(&configfs_dirent_lock); configfs_detach_rollback(dentry); spin_unlock(&configfs_dirent_lock); config_item_put(parent_item); return -EINTR; } frag->frag_dead = true; up_write(&frag->frag_sem); /* Get a working ref for the duration of this function */ item = configfs_get_config_item(dentry); /* Drop reference from above, item already holds one. */ config_item_put(parent_item); if (item->ci_type) dead_item_owner = item->ci_type->ct_owner; if (sd->s_type & CONFIGFS_USET_DIR) { configfs_detach_group(item); mutex_lock(&subsys->su_mutex); client_disconnect_notify(parent_item, item); unlink_group(to_config_group(item)); } else { configfs_detach_item(item); mutex_lock(&subsys->su_mutex); client_disconnect_notify(parent_item, item); unlink_obj(item); } client_drop_item(parent_item, item); mutex_unlock(&subsys->su_mutex); /* Drop our reference from above */ config_item_put(item); module_put(dead_item_owner); module_put(subsys_owner); return 0; } const struct inode_operations configfs_dir_inode_operations = { .mkdir = configfs_mkdir, .rmdir = configfs_rmdir, .symlink = configfs_symlink, .unlink = configfs_unlink, .lookup = configfs_lookup, .setattr = configfs_setattr, }; const struct inode_operations configfs_root_inode_operations = { .lookup = configfs_lookup, .setattr = configfs_setattr, }; static int configfs_dir_open(struct inode *inode, struct file *file) { struct dentry * dentry = file->f_path.dentry; struct configfs_dirent * parent_sd = dentry->d_fsdata; int err; inode_lock(d_inode(dentry)); /* * Fake invisibility if dir belongs to a group/default groups hierarchy * being attached */ err = -ENOENT; if (configfs_dirent_is_ready(parent_sd)) { file->private_data = configfs_new_dirent(parent_sd, NULL, 0, NULL); err = PTR_ERR_OR_ZERO(file->private_data); } inode_unlock(d_inode(dentry)); return err; } static int configfs_dir_close(struct inode *inode, struct file *file) { struct dentry * dentry = file->f_path.dentry; struct configfs_dirent * cursor = file->private_data; inode_lock(d_inode(dentry)); spin_lock(&configfs_dirent_lock); list_del_init(&cursor->s_sibling); spin_unlock(&configfs_dirent_lock); inode_unlock(d_inode(dentry)); release_configfs_dirent(cursor); return 0; } static int configfs_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; struct super_block *sb = dentry->d_sb; struct configfs_dirent * parent_sd = dentry->d_fsdata; struct configfs_dirent *cursor = file->private_data; struct list_head *p, *q = &cursor->s_sibling; ino_t ino = 0; if (!dir_emit_dots(file, ctx)) return 0; spin_lock(&configfs_dirent_lock); if (ctx->pos == 2) list_move(q, &parent_sd->s_children); for (p = q->next; p != &parent_sd->s_children; p = p->next) { struct configfs_dirent *next; const char *name; int len; struct inode *inode = NULL; next = list_entry(p, struct configfs_dirent, s_sibling); if (!next->s_element) continue; /* * We'll have a dentry and an inode for * PINNED items and for open attribute * files. We lock here to prevent a race * with configfs_d_iput() clearing * s_dentry before calling iput(). * * Why do we go to the trouble? If * someone has an attribute file open, * the inode number should match until * they close it. Beyond that, we don't * care. */ dentry = next->s_dentry; if (dentry) inode = d_inode(dentry); if (inode) ino = inode->i_ino; spin_unlock(&configfs_dirent_lock); if (!inode) ino = iunique(sb, 2); name = configfs_get_name(next); len = strlen(name); if (!dir_emit(ctx, name, len, ino, fs_umode_to_dtype(next->s_mode))) return 0; spin_lock(&configfs_dirent_lock); list_move(q, p); p = q; ctx->pos++; } spin_unlock(&configfs_dirent_lock); return 0; } static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry * dentry = file->f_path.dentry; switch (whence) { case 1: offset += file->f_pos; fallthrough; case 0: if (offset >= 0) break; fallthrough; default: return -EINVAL; } if (offset != file->f_pos) { file->f_pos = offset; if (file->f_pos >= 2) { struct configfs_dirent *sd = dentry->d_fsdata; struct configfs_dirent *cursor = file->private_data; struct list_head *p; loff_t n = file->f_pos - 2; spin_lock(&configfs_dirent_lock); list_del(&cursor->s_sibling); p = sd->s_children.next; while (n && p != &sd->s_children) { struct configfs_dirent *next; next = list_entry(p, struct configfs_dirent, s_sibling); if (next->s_element) n--; p = p->next; } list_add_tail(&cursor->s_sibling, p); spin_unlock(&configfs_dirent_lock); } } return offset; } const struct file_operations configfs_dir_operations = { .open = configfs_dir_open, .release = configfs_dir_close, .llseek = configfs_dir_lseek, .read = generic_read_dir, .iterate_shared = configfs_readdir, }; /** * configfs_register_group - creates a parent-child relation between two groups * @parent_group: parent group * @group: child group * * link groups, creates dentry for the child and attaches it to the * parent dentry. * * Return: 0 on success, negative errno code on error */ int configfs_register_group(struct config_group *parent_group, struct config_group *group) { struct configfs_subsystem *subsys = parent_group->cg_subsys; struct dentry *parent; struct configfs_fragment *frag; int ret; frag = new_fragment(); if (!frag) return -ENOMEM; mutex_lock(&subsys->su_mutex); link_group(parent_group, group); mutex_unlock(&subsys->su_mutex); parent = parent_group->cg_item.ci_dentry; inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); ret = create_default_group(parent_group, group, frag); if (ret) goto err_out; spin_lock(&configfs_dirent_lock); configfs_dir_set_ready(group->cg_item.ci_dentry->d_fsdata); spin_unlock(&configfs_dirent_lock); inode_unlock(d_inode(parent)); put_fragment(frag); return 0; err_out: inode_unlock(d_inode(parent)); mutex_lock(&subsys->su_mutex); unlink_group(group); mutex_unlock(&subsys->su_mutex); put_fragment(frag); return ret; } EXPORT_SYMBOL(configfs_register_group); /** * configfs_unregister_group() - unregisters a child group from its parent * @group: parent group to be unregistered * * Undoes configfs_register_group() */ void configfs_unregister_group(struct config_group *group) { struct configfs_subsystem *subsys = group->cg_subsys; struct dentry *dentry = group->cg_item.ci_dentry; struct dentry *parent = group->cg_item.ci_parent->ci_dentry; struct configfs_dirent *sd = dentry->d_fsdata; struct configfs_fragment *frag = sd->s_frag; down_write(&frag->frag_sem); frag->frag_dead = true; up_write(&frag->frag_sem); inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); spin_lock(&configfs_dirent_lock); configfs_detach_prep(dentry, NULL); spin_unlock(&configfs_dirent_lock); configfs_detach_group(&group->cg_item); d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); d_drop(dentry); fsnotify_rmdir(d_inode(parent), dentry); inode_unlock(d_inode(parent)); dput(dentry); mutex_lock(&subsys->su_mutex); unlink_group(group); mutex_unlock(&subsys->su_mutex); } EXPORT_SYMBOL(configfs_unregister_group); /** * configfs_register_default_group() - allocates and registers a child group * @parent_group: parent group * @name: child group name * @item_type: child item type description * * boilerplate to allocate and register a child group with its parent. We need * kzalloc'ed memory because child's default_group is initially empty. * * Return: allocated config group or ERR_PTR() on error */ struct config_group * configfs_register_default_group(struct config_group *parent_group, const char *name, const struct config_item_type *item_type) { int ret; struct config_group *group; group = kzalloc(sizeof(*group), GFP_KERNEL); if (!group) return ERR_PTR(-ENOMEM); config_group_init_type_name(group, name, item_type); ret = configfs_register_group(parent_group, group); if (ret) { kfree(group); return ERR_PTR(ret); } return group; } EXPORT_SYMBOL(configfs_register_default_group); /** * configfs_unregister_default_group() - unregisters and frees a child group * @group: the group to act on */ void configfs_unregister_default_group(struct config_group *group) { configfs_unregister_group(group); kfree(group); } EXPORT_SYMBOL(configfs_unregister_default_group); int configfs_register_subsystem(struct configfs_subsystem *subsys) { int err; struct config_group *group = &subsys->su_group; struct dentry *dentry; struct dentry *root; struct configfs_dirent *sd; struct configfs_fragment *frag; frag = new_fragment(); if (!frag) return -ENOMEM; root = configfs_pin_fs(); if (IS_ERR(root)) { put_fragment(frag); return PTR_ERR(root); } if (!group->cg_item.ci_name) group->cg_item.ci_name = group->cg_item.ci_namebuf; sd = root->d_fsdata; mutex_lock(&configfs_subsystem_mutex); link_group(to_config_group(sd->s_element), group); mutex_unlock(&configfs_subsystem_mutex); inode_lock_nested(d_inode(root), I_MUTEX_PARENT); err = -ENOMEM; dentry = d_alloc_name(root, group->cg_item.ci_name); if (dentry) { d_add(dentry, NULL); err = configfs_dirent_exists(dentry); if (!err) err = configfs_attach_group(sd->s_element, &group->cg_item, dentry, frag); if (err) { BUG_ON(d_inode(dentry)); d_drop(dentry); dput(dentry); } else { spin_lock(&configfs_dirent_lock); configfs_dir_set_ready(dentry->d_fsdata); spin_unlock(&configfs_dirent_lock); } } inode_unlock(d_inode(root)); if (err) { mutex_lock(&configfs_subsystem_mutex); unlink_group(group); mutex_unlock(&configfs_subsystem_mutex); configfs_release_fs(); } put_fragment(frag); return err; } void configfs_unregister_subsystem(struct configfs_subsystem *subsys) { struct config_group *group = &subsys->su_group; struct dentry *dentry = group->cg_item.ci_dentry; struct dentry *root = dentry->d_sb->s_root; struct configfs_dirent *sd = dentry->d_fsdata; struct configfs_fragment *frag = sd->s_frag; if (dentry->d_parent != root) { pr_err("Tried to unregister non-subsystem!\n"); return; } down_write(&frag->frag_sem); frag->frag_dead = true; up_write(&frag->frag_sem); inode_lock_nested(d_inode(root), I_MUTEX_PARENT); inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); mutex_lock(&configfs_symlink_mutex); spin_lock(&configfs_dirent_lock); if (configfs_detach_prep(dentry, NULL)) { pr_err("Tried to unregister non-empty subsystem!\n"); } spin_unlock(&configfs_dirent_lock); mutex_unlock(&configfs_symlink_mutex); configfs_detach_group(&group->cg_item); d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); inode_unlock(d_inode(dentry)); d_drop(dentry); fsnotify_rmdir(d_inode(root), dentry); inode_unlock(d_inode(root)); dput(dentry); mutex_lock(&configfs_subsystem_mutex); unlink_group(group); mutex_unlock(&configfs_subsystem_mutex); configfs_release_fs(); } EXPORT_SYMBOL(configfs_register_subsystem); EXPORT_SYMBOL(configfs_unregister_subsystem);
216 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _SCSI_SCSI_CMND_H #define _SCSI_SCSI_CMND_H #include <linux/dma-mapping.h> #include <linux/blkdev.h> #include <linux/t10-pi.h> #include <linux/list.h> #include <linux/types.h> #include <linux/timer.h> #include <linux/scatterlist.h> #include <scsi/scsi_device.h> struct Scsi_Host; /* * MAX_COMMAND_SIZE is: * The longest fixed-length SCSI CDB as per the SCSI standard. * fixed-length means: commands that their size can be determined * by their opcode and the CDB does not carry a length specifier, (unlike * the VARIABLE_LENGTH_CMD(0x7f) command). This is actually not exactly * true and the SCSI standard also defines extended commands and * vendor specific commands that can be bigger than 16 bytes. The kernel * will support these using the same infrastructure used for VARLEN CDB's. * So in effect MAX_COMMAND_SIZE means the maximum size command scsi-ml * supports without specifying a cmd_len by ULD's */ #define MAX_COMMAND_SIZE 16 struct scsi_data_buffer { struct sg_table table; unsigned length; }; /* embedded in scsi_cmnd */ struct scsi_pointer { char *ptr; /* data pointer */ int this_residual; /* left in this buffer */ struct scatterlist *buffer; /* which buffer */ int buffers_residual; /* how many buffers left */ dma_addr_t dma_handle; volatile int Status; volatile int Message; volatile int have_data_in; volatile int sent_command; volatile int phase; }; /* for scmd->flags */ #define SCMD_TAGGED (1 << 0) #define SCMD_INITIALIZED (1 << 1) #define SCMD_LAST (1 << 2) /* * libata uses SCSI EH to fetch sense data for successful commands. * SCSI EH should not overwrite scmd->result when SCMD_FORCE_EH_SUCCESS is set. */ #define SCMD_FORCE_EH_SUCCESS (1 << 3) #define SCMD_FAIL_IF_RECOVERING (1 << 4) /* flags preserved across unprep / reprep */ #define SCMD_PRESERVED_FLAGS (SCMD_INITIALIZED | SCMD_FAIL_IF_RECOVERING) /* for scmd->state */ #define SCMD_STATE_COMPLETE 0 #define SCMD_STATE_INFLIGHT 1 enum scsi_cmnd_submitter { SUBMITTED_BY_BLOCK_LAYER = 0, SUBMITTED_BY_SCSI_ERROR_HANDLER = 1, SUBMITTED_BY_SCSI_RESET_IOCTL = 2, } __packed; struct scsi_cmnd { struct scsi_device *device; struct list_head eh_entry; /* entry for the host eh_abort_list/eh_cmd_q */ struct delayed_work abort_work; struct rcu_head rcu; int eh_eflags; /* Used by error handlr */ int budget_token; /* * This is set to jiffies as it was when the command was first * allocated. It is used to time how long the command has * been outstanding */ unsigned long jiffies_at_alloc; int retries; int allowed; unsigned char prot_op; unsigned char prot_type; unsigned char prot_flags; enum scsi_cmnd_submitter submitter; unsigned short cmd_len; enum dma_data_direction sc_data_direction; unsigned char cmnd[32]; /* SCSI CDB */ /* These elements define the operation we ultimately want to perform */ struct scsi_data_buffer sdb; struct scsi_data_buffer *prot_sdb; unsigned underflow; /* Return error if less than this amount is transferred */ unsigned transfersize; /* How much we are guaranteed to transfer with each SCSI transfer (ie, between disconnect / reconnects. Probably == sector size */ unsigned resid_len; /* residual count */ unsigned sense_len; unsigned char *sense_buffer; /* obtained by REQUEST SENSE when * CHECK CONDITION is received on original * command (auto-sense). Length must be * SCSI_SENSE_BUFFERSIZE bytes. */ int flags; /* Command flags */ unsigned long state; /* Command completion state */ unsigned int extra_len; /* length of alignment and padding */ /* * The fields below can be modified by the LLD but the fields above * must not be modified. */ unsigned char *host_scribble; /* The host adapter is allowed to * call scsi_malloc and get some memory * and hang it here. The host adapter * is also expected to call scsi_free * to release this memory. (The memory * obtained by scsi_malloc is guaranteed * to be at an address < 16Mb). */ int result; /* Status code from lower level driver */ }; /* Variant of blk_mq_rq_from_pdu() that verifies the type of its argument. */ static inline struct request *scsi_cmd_to_rq(struct scsi_cmnd *scmd) { return blk_mq_rq_from_pdu(scmd); } /* * Return the driver private allocation behind the command. * Only works if cmd_size is set in the host template. */ static inline void *scsi_cmd_priv(struct scsi_cmnd *cmd) { return cmd + 1; } void scsi_done(struct scsi_cmnd *cmd); void scsi_done_direct(struct scsi_cmnd *cmd); extern void scsi_finish_command(struct scsi_cmnd *cmd); extern void *scsi_kmap_atomic_sg(struct scatterlist *sg, int sg_count, size_t *offset, size_t *len); extern void scsi_kunmap_atomic_sg(void *virt); blk_status_t scsi_alloc_sgtables(struct scsi_cmnd *cmd); void scsi_free_sgtables(struct scsi_cmnd *cmd); #ifdef CONFIG_SCSI_DMA extern int scsi_dma_map(struct scsi_cmnd *cmd); extern void scsi_dma_unmap(struct scsi_cmnd *cmd); #else /* !CONFIG_SCSI_DMA */ static inline int scsi_dma_map(struct scsi_cmnd *cmd) { return -ENOSYS; } static inline void scsi_dma_unmap(struct scsi_cmnd *cmd) { } #endif /* !CONFIG_SCSI_DMA */ static inline unsigned scsi_sg_count(struct scsi_cmnd *cmd) { return cmd->sdb.table.nents; } static inline struct scatterlist *scsi_sglist(struct scsi_cmnd *cmd) { return cmd->sdb.table.sgl; } static inline unsigned scsi_bufflen(struct scsi_cmnd *cmd) { return cmd->sdb.length; } static inline void scsi_set_resid(struct scsi_cmnd *cmd, unsigned int resid) { cmd->resid_len = resid; } static inline unsigned int scsi_get_resid(struct scsi_cmnd *cmd) { return cmd->resid_len; } #define scsi_for_each_sg(cmd, sg, nseg, __i) \ for_each_sg(scsi_sglist(cmd), sg, nseg, __i) static inline int scsi_sg_copy_from_buffer(struct scsi_cmnd *cmd, const void *buf, int buflen) { return sg_copy_from_buffer(scsi_sglist(cmd), scsi_sg_count(cmd), buf, buflen); } static inline int scsi_sg_copy_to_buffer(struct scsi_cmnd *cmd, void *buf, int buflen) { return sg_copy_to_buffer(scsi_sglist(cmd), scsi_sg_count(cmd), buf, buflen); } static inline sector_t scsi_get_sector(struct scsi_cmnd *scmd) { return blk_rq_pos(scsi_cmd_to_rq(scmd)); } static inline sector_t scsi_get_lba(struct scsi_cmnd *scmd) { unsigned int shift = ilog2(scmd->device->sector_size) - SECTOR_SHIFT; return blk_rq_pos(scsi_cmd_to_rq(scmd)) >> shift; } static inline unsigned int scsi_logical_block_count(struct scsi_cmnd *scmd) { unsigned int shift = ilog2(scmd->device->sector_size); return blk_rq_bytes(scsi_cmd_to_rq(scmd)) >> shift; } /* * The operations below are hints that tell the controller driver how * to handle I/Os with DIF or similar types of protection information. */ enum scsi_prot_operations { /* Normal I/O */ SCSI_PROT_NORMAL = 0, /* OS-HBA: Protected, HBA-Target: Unprotected */ SCSI_PROT_READ_INSERT, SCSI_PROT_WRITE_STRIP, /* OS-HBA: Unprotected, HBA-Target: Protected */ SCSI_PROT_READ_STRIP, SCSI_PROT_WRITE_INSERT, /* OS-HBA: Protected, HBA-Target: Protected */ SCSI_PROT_READ_PASS, SCSI_PROT_WRITE_PASS, }; static inline void scsi_set_prot_op(struct scsi_cmnd *scmd, unsigned char op) { scmd->prot_op = op; } static inline unsigned char scsi_get_prot_op(struct scsi_cmnd *scmd) { return scmd->prot_op; } enum scsi_prot_flags { SCSI_PROT_TRANSFER_PI = 1 << 0, SCSI_PROT_GUARD_CHECK = 1 << 1, SCSI_PROT_REF_CHECK = 1 << 2, SCSI_PROT_REF_INCREMENT = 1 << 3, SCSI_PROT_IP_CHECKSUM = 1 << 4, }; /* * The controller usually does not know anything about the target it * is communicating with. However, when DIX is enabled the controller * must be know target type so it can verify the protection * information passed along with the I/O. */ enum scsi_prot_target_type { SCSI_PROT_DIF_TYPE0 = 0, SCSI_PROT_DIF_TYPE1, SCSI_PROT_DIF_TYPE2, SCSI_PROT_DIF_TYPE3, }; static inline void scsi_set_prot_type(struct scsi_cmnd *scmd, unsigned char type) { scmd->prot_type = type; } static inline unsigned char scsi_get_prot_type(struct scsi_cmnd *scmd) { return scmd->prot_type; } static inline u32 scsi_prot_ref_tag(struct scsi_cmnd *scmd) { struct request *rq = blk_mq_rq_from_pdu(scmd); return t10_pi_ref_tag(rq); } static inline unsigned int scsi_prot_interval(struct scsi_cmnd *scmd) { return scmd->device->sector_size; } static inline unsigned scsi_prot_sg_count(struct scsi_cmnd *cmd) { return cmd->prot_sdb ? cmd->prot_sdb->table.nents : 0; } static inline struct scatterlist *scsi_prot_sglist(struct scsi_cmnd *cmd) { return cmd->prot_sdb ? cmd->prot_sdb->table.sgl : NULL; } static inline struct scsi_data_buffer *scsi_prot(struct scsi_cmnd *cmd) { return cmd->prot_sdb; } #define scsi_for_each_prot_sg(cmd, sg, nseg, __i) \ for_each_sg(scsi_prot_sglist(cmd), sg, nseg, __i) static inline void set_status_byte(struct scsi_cmnd *cmd, char status) { cmd->result = (cmd->result & 0xffffff00) | status; } static inline u8 get_status_byte(struct scsi_cmnd *cmd) { return cmd->result & 0xff; } static inline void set_host_byte(struct scsi_cmnd *cmd, char status) { cmd->result = (cmd->result & 0xff00ffff) | (status << 16); } static inline u8 get_host_byte(struct scsi_cmnd *cmd) { return (cmd->result >> 16) & 0xff; } /** * scsi_msg_to_host_byte() - translate message byte * @cmd: the SCSI command * @msg: the SCSI parallel message byte to translate * * Translate the SCSI parallel message byte to a matching * host byte setting. A message of COMMAND_COMPLETE indicates * a successful command execution, any other message indicate * an error. As the messages themselves only have a meaning * for the SCSI parallel protocol this function translates * them into a matching host byte value for SCSI EH. */ static inline void scsi_msg_to_host_byte(struct scsi_cmnd *cmd, u8 msg) { switch (msg) { case COMMAND_COMPLETE: break; case ABORT_TASK_SET: set_host_byte(cmd, DID_ABORT); break; case TARGET_RESET: set_host_byte(cmd, DID_RESET); break; default: set_host_byte(cmd, DID_ERROR); break; } } static inline unsigned scsi_transfer_length(struct scsi_cmnd *scmd) { unsigned int xfer_len = scmd->sdb.length; unsigned int prot_interval = scsi_prot_interval(scmd); if (scmd->prot_flags & SCSI_PROT_TRANSFER_PI) xfer_len += (xfer_len >> ilog2(prot_interval)) * 8; return xfer_len; } extern void scsi_build_sense(struct scsi_cmnd *scmd, int desc, u8 key, u8 asc, u8 ascq); struct request *scsi_alloc_request(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags); #endif /* _SCSI_SCSI_CMND_H */
23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_DEBUGREG_H #define _ASM_X86_DEBUGREG_H #include <linux/bug.h> #include <linux/percpu.h> #include <uapi/asm/debugreg.h> #include <asm/cpufeature.h> #include <asm/msr.h> /* * Define bits that are always set to 1 in DR7, only bit 10 is * architecturally reserved to '1'. * * This is also the init/reset value for DR7. */ #define DR7_FIXED_1 0x00000400 DECLARE_PER_CPU(unsigned long, cpu_dr7); #ifndef CONFIG_PARAVIRT_XXL /* * These special macros can be used to get or set a debugging register */ #define get_debugreg(var, register) \ (var) = native_get_debugreg(register) #define set_debugreg(value, register) \ native_set_debugreg(register, value) #endif static __always_inline unsigned long native_get_debugreg(int regno) { unsigned long val; switch (regno) { case 0: asm("mov %%db0, %0" :"=r" (val)); break; case 1: asm("mov %%db1, %0" :"=r" (val)); break; case 2: asm("mov %%db2, %0" :"=r" (val)); break; case 3: asm("mov %%db3, %0" :"=r" (val)); break; case 6: asm("mov %%db6, %0" :"=r" (val)); break; case 7: /* * Use "asm volatile" for DR7 reads to forbid re-ordering them * with other code. * * This is needed because a DR7 access can cause a #VC exception * when running under SEV-ES. Taking a #VC exception is not a * safe thing to do just anywhere in the entry code and * re-ordering might place the access into an unsafe location. * * This happened in the NMI handler, where the DR7 read was * re-ordered to happen before the call to sev_es_ist_enter(), * causing stack recursion. */ asm volatile("mov %%db7, %0" : "=r" (val)); break; default: BUG(); } return val; } static __always_inline void native_set_debugreg(int regno, unsigned long value) { switch (regno) { case 0: asm("mov %0, %%db0" ::"r" (value)); break; case 1: asm("mov %0, %%db1" ::"r" (value)); break; case 2: asm("mov %0, %%db2" ::"r" (value)); break; case 3: asm("mov %0, %%db3" ::"r" (value)); break; case 6: asm("mov %0, %%db6" ::"r" (value)); break; case 7: /* * Use "asm volatile" for DR7 writes to forbid re-ordering them * with other code. * * While is didn't happen with a DR7 write (see the DR7 read * comment above which explains where it happened), add the * "asm volatile" here too to avoid similar problems in the * future. */ asm volatile("mov %0, %%db7" ::"r" (value)); break; default: BUG(); } } static inline void hw_breakpoint_disable(void) { /* Reset the control register for HW Breakpoint */ set_debugreg(DR7_FIXED_1, 7); /* Zero-out the individual HW breakpoint address registers */ set_debugreg(0UL, 0); set_debugreg(0UL, 1); set_debugreg(0UL, 2); set_debugreg(0UL, 3); } static __always_inline bool hw_breakpoint_active(void) { return __this_cpu_read(cpu_dr7) & DR_GLOBAL_ENABLE_MASK; } extern void hw_breakpoint_restore(void); static __always_inline unsigned long local_db_save(void) { unsigned long dr7; if (static_cpu_has(X86_FEATURE_HYPERVISOR) && !hw_breakpoint_active()) return 0; get_debugreg(dr7, 7); /* Architecturally set bit */ dr7 &= ~DR7_FIXED_1; if (dr7) set_debugreg(DR7_FIXED_1, 7); /* * Ensure the compiler doesn't lower the above statements into * the critical section; disabling breakpoints late would not * be good. */ barrier(); return dr7; } static __always_inline void local_db_restore(unsigned long dr7) { /* * Ensure the compiler doesn't raise this statement into * the critical section; enabling breakpoints early would * not be good. */ barrier(); if (dr7) set_debugreg(dr7, 7); } #ifdef CONFIG_CPU_SUP_AMD extern void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr); extern unsigned long amd_get_dr_addr_mask(unsigned int dr); #else static inline void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr) { } static inline unsigned long amd_get_dr_addr_mask(unsigned int dr) { return 0; } #endif static inline unsigned long get_debugctlmsr(void) { unsigned long debugctlmsr = 0; #ifndef CONFIG_X86_DEBUGCTLMSR if (boot_cpu_data.x86 < 6) return 0; #endif rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctlmsr); return debugctlmsr; } static inline void update_debugctlmsr(unsigned long debugctlmsr) { #ifndef CONFIG_X86_DEBUGCTLMSR if (boot_cpu_data.x86 < 6) return; #endif wrmsrq(MSR_IA32_DEBUGCTLMSR, debugctlmsr); } #endif /* _ASM_X86_DEBUGREG_H */
11 11 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 /* * Copyright (c) 2004, 2005 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2004, 2005 Infinicon Corporation. All rights reserved. * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved. * Copyright (c) 2004, 2005 Topspin Corporation. All rights reserved. * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/slab.h> #include <linux/string.h> #include "agent.h" #include "smi.h" #include "mad_priv.h" #define SPFX "ib_agent: " struct ib_agent_port_private { struct list_head port_list; struct ib_mad_agent *agent[2]; }; static DEFINE_SPINLOCK(ib_agent_port_list_lock); static LIST_HEAD(ib_agent_port_list); static struct ib_agent_port_private * __ib_get_agent_port(const struct ib_device *device, int port_num) { struct ib_agent_port_private *entry; list_for_each_entry(entry, &ib_agent_port_list, port_list) { /* Need to check both agent[0] and agent[1], as an agent port * may only have one of them */ if (entry->agent[0] && entry->agent[0]->device == device && entry->agent[0]->port_num == port_num) return entry; if (entry->agent[1] && entry->agent[1]->device == device && entry->agent[1]->port_num == port_num) return entry; } return NULL; } static struct ib_agent_port_private * ib_get_agent_port(const struct ib_device *device, int port_num) { struct ib_agent_port_private *entry; unsigned long flags; spin_lock_irqsave(&ib_agent_port_list_lock, flags); entry = __ib_get_agent_port(device, port_num); spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); return entry; } void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh *grh, const struct ib_wc *wc, const struct ib_device *device, int port_num, int qpn, size_t resp_mad_len, bool opa) { struct ib_agent_port_private *port_priv; struct ib_mad_agent *agent; struct ib_mad_send_buf *send_buf; struct ib_ah *ah; struct ib_mad_send_wr_private *mad_send_wr; if (rdma_cap_ib_switch(device)) port_priv = ib_get_agent_port(device, 0); else port_priv = ib_get_agent_port(device, port_num); if (!port_priv) { dev_err(&device->dev, "Unable to find port agent\n"); return; } agent = port_priv->agent[qpn]; ah = ib_create_ah_from_wc(agent->qp->pd, wc, grh, port_num); if (IS_ERR(ah)) { dev_err(&device->dev, "ib_create_ah_from_wc error %pe\n", ah); return; } if (opa && mad_hdr->base_version != OPA_MGMT_BASE_VERSION) resp_mad_len = IB_MGMT_MAD_SIZE; send_buf = ib_create_send_mad(agent, wc->src_qp, wc->pkey_index, 0, IB_MGMT_MAD_HDR, resp_mad_len - IB_MGMT_MAD_HDR, GFP_KERNEL, mad_hdr->base_version); if (IS_ERR(send_buf)) { dev_err(&device->dev, "ib_create_send_mad error\n"); goto err1; } memcpy(send_buf->mad, mad_hdr, resp_mad_len); send_buf->ah = ah; if (rdma_cap_ib_switch(device)) { mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private, send_buf); mad_send_wr->send_wr.port_num = port_num; } if (ib_post_send_mad(send_buf, NULL)) { dev_err(&device->dev, "ib_post_send_mad error\n"); goto err2; } return; err2: ib_free_send_mad(send_buf); err1: rdma_destroy_ah(ah, RDMA_DESTROY_AH_SLEEPABLE); } static void agent_send_handler(struct ib_mad_agent *mad_agent, struct ib_mad_send_wc *mad_send_wc) { rdma_destroy_ah(mad_send_wc->send_buf->ah, RDMA_DESTROY_AH_SLEEPABLE); ib_free_send_mad(mad_send_wc->send_buf); } int ib_agent_port_open(struct ib_device *device, int port_num) { struct ib_agent_port_private *port_priv; unsigned long flags; int ret; /* Create new device info */ port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL); if (!port_priv) { ret = -ENOMEM; goto error1; } if (rdma_cap_ib_smi(device, port_num)) { /* Obtain send only MAD agent for SMI QP */ port_priv->agent[0] = ib_register_mad_agent(device, port_num, IB_QPT_SMI, NULL, 0, &agent_send_handler, NULL, NULL, 0); if (IS_ERR(port_priv->agent[0])) { ret = PTR_ERR(port_priv->agent[0]); goto error2; } } if (rdma_cap_ib_cm(device, port_num)) { /* Obtain send only MAD agent for GSI QP */ port_priv->agent[1] = ib_register_mad_agent(device, port_num, IB_QPT_GSI, NULL, 0, &agent_send_handler, NULL, NULL, 0); if (IS_ERR(port_priv->agent[1])) { ret = PTR_ERR(port_priv->agent[1]); goto error3; } } spin_lock_irqsave(&ib_agent_port_list_lock, flags); list_add_tail(&port_priv->port_list, &ib_agent_port_list); spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); return 0; error3: if (port_priv->agent[0]) ib_unregister_mad_agent(port_priv->agent[0]); error2: kfree(port_priv); error1: return ret; } int ib_agent_port_close(struct ib_device *device, int port_num) { struct ib_agent_port_private *port_priv; unsigned long flags; spin_lock_irqsave(&ib_agent_port_list_lock, flags); port_priv = __ib_get_agent_port(device, port_num); if (port_priv == NULL) { spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); dev_err(&device->dev, "Port %d not found\n", port_num); return -ENODEV; } list_del(&port_priv->port_list); spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); if (port_priv->agent[1]) ib_unregister_mad_agent(port_priv->agent[1]); if (port_priv->agent[0]) ib_unregister_mad_agent(port_priv->agent[0]); kfree(port_priv); return 0; }
143 1 40 102 9 1 439 439 43 89 29 89 86 71 6 4 5 76 1 88 31 31 20 2 2 2 81 42 14 73 7 3 65 6 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_IP_TUNNELS_H #define __NET_IP_TUNNELS_H 1 #include <linux/if_tunnel.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/socket.h> #include <linux/types.h> #include <linux/u64_stats_sync.h> #include <linux/bitops.h> #include <net/dsfield.h> #include <net/flow.h> #include <net/gro_cells.h> #include <net/inet_dscp.h> #include <net/inet_ecn.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/lwtunnel.h> #include <net/dst_cache.h> #include <net/netdev_lock.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #endif /* Keep error state on tunnel for 30 sec */ #define IPTUNNEL_ERR_TIMEO (30*HZ) /* Used to memset ip_tunnel padding. */ #define IP_TUNNEL_KEY_SIZE offsetofend(struct ip_tunnel_key, tp_dst) /* Used to memset ipv4 address padding. */ #define IP_TUNNEL_KEY_IPV4_PAD offsetofend(struct ip_tunnel_key, u.ipv4.dst) #define IP_TUNNEL_KEY_IPV4_PAD_LEN \ (sizeof_field(struct ip_tunnel_key, u) - \ sizeof_field(struct ip_tunnel_key, u.ipv4)) #define __ipt_flag_op(op, ...) \ op(__VA_ARGS__, __IP_TUNNEL_FLAG_NUM) #define IP_TUNNEL_DECLARE_FLAGS(...) \ __ipt_flag_op(DECLARE_BITMAP, __VA_ARGS__) #define ip_tunnel_flags_zero(...) __ipt_flag_op(bitmap_zero, __VA_ARGS__) #define ip_tunnel_flags_copy(...) __ipt_flag_op(bitmap_copy, __VA_ARGS__) #define ip_tunnel_flags_and(...) __ipt_flag_op(bitmap_and, __VA_ARGS__) #define ip_tunnel_flags_or(...) __ipt_flag_op(bitmap_or, __VA_ARGS__) #define ip_tunnel_flags_empty(...) \ __ipt_flag_op(bitmap_empty, __VA_ARGS__) #define ip_tunnel_flags_intersect(...) \ __ipt_flag_op(bitmap_intersects, __VA_ARGS__) #define ip_tunnel_flags_subset(...) \ __ipt_flag_op(bitmap_subset, __VA_ARGS__) struct ip_tunnel_key { __be64 tun_id; union { struct { __be32 src; __be32 dst; } ipv4; struct { struct in6_addr src; struct in6_addr dst; } ipv6; } u; IP_TUNNEL_DECLARE_FLAGS(tun_flags); __be32 label; /* Flow Label for IPv6 */ u32 nhid; u8 tos; /* TOS for IPv4, TC for IPv6 */ u8 ttl; /* TTL for IPv4, HL for IPv6 */ __be16 tp_src; __be16 tp_dst; __u8 flow_flags; }; struct ip_tunnel_encap { u16 type; u16 flags; __be16 sport; __be16 dport; }; /* Flags for ip_tunnel_info mode. */ #define IP_TUNNEL_INFO_TX 0x01 /* represents tx tunnel parameters */ #define IP_TUNNEL_INFO_IPV6 0x02 /* key contains IPv6 addresses */ #define IP_TUNNEL_INFO_BRIDGE 0x04 /* represents a bridged tunnel id */ /* Maximum tunnel options length. */ #define IP_TUNNEL_OPTS_MAX \ GENMASK((sizeof_field(struct ip_tunnel_info, \ options_len) * BITS_PER_BYTE) - 1, 0) #define ip_tunnel_info_opts(info) \ _Generic(info, \ const struct ip_tunnel_info * : ((const void *)(info)->options),\ struct ip_tunnel_info * : ((void *)(info)->options)\ ) struct ip_tunnel_info { struct ip_tunnel_key key; struct ip_tunnel_encap encap; #ifdef CONFIG_DST_CACHE struct dst_cache dst_cache; #endif u8 options_len; u8 mode; u8 options[] __aligned_largest __counted_by(options_len); }; /* 6rd prefix/relay information */ #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd_parm { struct in6_addr prefix; __be32 relay_prefix; u16 prefixlen; u16 relay_prefixlen; }; #endif struct ip_tunnel_prl_entry { struct ip_tunnel_prl_entry __rcu *next; __be32 addr; u16 flags; struct rcu_head rcu_head; }; struct metadata_dst; /* Kernel-side variant of ip_tunnel_parm */ struct ip_tunnel_parm_kern { char name[IFNAMSIZ]; IP_TUNNEL_DECLARE_FLAGS(i_flags); IP_TUNNEL_DECLARE_FLAGS(o_flags); __be32 i_key; __be32 o_key; int link; struct iphdr iph; }; struct ip_tunnel { struct ip_tunnel __rcu *next; struct hlist_node hash_node; struct net_device *dev; netdevice_tracker dev_tracker; struct net *net; /* netns for packet i/o */ unsigned long err_time; /* Time when the last ICMP error * arrived */ int err_count; /* Number of arrived ICMP errors */ /* These four fields used only by GRE */ u32 i_seqno; /* The last seen seqno */ atomic_t o_seqno; /* The last output seqno */ int tun_hlen; /* Precalculated header length */ /* These four fields used only by ERSPAN */ u32 index; /* ERSPAN type II index */ u8 erspan_ver; /* ERSPAN version */ u8 dir; /* ERSPAN direction */ u16 hwid; /* ERSPAN hardware ID */ struct dst_cache dst_cache; struct ip_tunnel_parm_kern parms; int mlink; int encap_hlen; /* Encap header length (FOU,GUE) */ int hlen; /* tun_hlen + encap_hlen */ struct ip_tunnel_encap encap; /* for SIT */ #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd_parm ip6rd; #endif struct ip_tunnel_prl_entry __rcu *prl; /* potential router list */ unsigned int prl_count; /* # of entries in PRL */ unsigned int ip_tnl_net_id; struct gro_cells gro_cells; __u32 fwmark; bool collect_md; bool ignore_df; }; struct tnl_ptk_info { IP_TUNNEL_DECLARE_FLAGS(flags); __be16 proto; __be32 key; __be32 seq; int hdr_len; }; #define PACKET_RCVD 0 #define PACKET_REJECT 1 #define PACKET_NEXT 2 #define IP_TNL_HASH_BITS 7 #define IP_TNL_HASH_SIZE (1 << IP_TNL_HASH_BITS) struct ip_tunnel_net { struct net_device *fb_tunnel_dev; struct rtnl_link_ops *rtnl_link_ops; struct hlist_head tunnels[IP_TNL_HASH_SIZE]; struct ip_tunnel __rcu *collect_md_tun; int type; }; static inline void ip_tunnel_set_options_present(unsigned long *flags) { IP_TUNNEL_DECLARE_FLAGS(present) = { }; __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, present); __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present); __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present); __set_bit(IP_TUNNEL_GTP_OPT_BIT, present); __set_bit(IP_TUNNEL_PFCP_OPT_BIT, present); ip_tunnel_flags_or(flags, flags, present); } static inline void ip_tunnel_clear_options_present(unsigned long *flags) { IP_TUNNEL_DECLARE_FLAGS(present) = { }; __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, present); __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present); __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present); __set_bit(IP_TUNNEL_GTP_OPT_BIT, present); __set_bit(IP_TUNNEL_PFCP_OPT_BIT, present); __ipt_flag_op(bitmap_andnot, flags, flags, present); } static inline bool ip_tunnel_is_options_present(const unsigned long *flags) { IP_TUNNEL_DECLARE_FLAGS(present) = { }; __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, present); __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present); __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present); __set_bit(IP_TUNNEL_GTP_OPT_BIT, present); __set_bit(IP_TUNNEL_PFCP_OPT_BIT, present); return ip_tunnel_flags_intersect(flags, present); } static inline bool ip_tunnel_flags_is_be16_compat(const unsigned long *flags) { IP_TUNNEL_DECLARE_FLAGS(supp) = { }; bitmap_set(supp, 0, BITS_PER_TYPE(__be16)); __set_bit(IP_TUNNEL_VTI_BIT, supp); return ip_tunnel_flags_subset(flags, supp); } static inline void ip_tunnel_flags_from_be16(unsigned long *dst, __be16 flags) { ip_tunnel_flags_zero(dst); bitmap_write(dst, be16_to_cpu(flags), 0, BITS_PER_TYPE(__be16)); __assign_bit(IP_TUNNEL_VTI_BIT, dst, flags & VTI_ISVTI); } static inline __be16 ip_tunnel_flags_to_be16(const unsigned long *flags) { __be16 ret; ret = cpu_to_be16(bitmap_read(flags, 0, BITS_PER_TYPE(__be16))); if (test_bit(IP_TUNNEL_VTI_BIT, flags)) ret |= VTI_ISVTI; return ret; } static inline void ip_tunnel_key_init(struct ip_tunnel_key *key, __be32 saddr, __be32 daddr, u8 tos, u8 ttl, __be32 label, __be16 tp_src, __be16 tp_dst, __be64 tun_id, const unsigned long *tun_flags) { key->tun_id = tun_id; key->u.ipv4.src = saddr; key->u.ipv4.dst = daddr; memset((unsigned char *)key + IP_TUNNEL_KEY_IPV4_PAD, 0, IP_TUNNEL_KEY_IPV4_PAD_LEN); key->tos = tos; key->ttl = ttl; key->label = label; ip_tunnel_flags_copy(key->tun_flags, tun_flags); /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of * the upper tunnel are used. * E.g: GRE over IPSEC, the tp_src and tp_port are zero. */ key->tp_src = tp_src; key->tp_dst = tp_dst; /* Clear struct padding. */ if (sizeof(*key) != IP_TUNNEL_KEY_SIZE) memset((unsigned char *)key + IP_TUNNEL_KEY_SIZE, 0, sizeof(*key) - IP_TUNNEL_KEY_SIZE); } static inline bool ip_tunnel_dst_cache_usable(const struct sk_buff *skb, const struct ip_tunnel_info *info) { if (skb->mark) return false; return !info || !test_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags); } static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info *tun_info) { return tun_info->mode & IP_TUNNEL_INFO_IPV6 ? AF_INET6 : AF_INET; } static inline __be64 key32_to_tunnel_id(__be32 key) { #ifdef __BIG_ENDIAN return (__force __be64)key; #else return (__force __be64)((__force u64)key << 32); #endif } /* Returns the least-significant 32 bits of a __be64. */ static inline __be32 tunnel_id_to_key32(__be64 tun_id) { #ifdef __BIG_ENDIAN return (__force __be32)tun_id; #else return (__force __be32)((__force u64)tun_id >> 32); #endif } #ifdef CONFIG_INET static inline void ip_tunnel_init_flow(struct flowi4 *fl4, int proto, __be32 daddr, __be32 saddr, __be32 key, __u8 tos, struct net *net, int oif, __u32 mark, __u32 tun_inner_hash, __u8 flow_flags) { memset(fl4, 0, sizeof(*fl4)); if (oif) { fl4->flowi4_l3mdev = l3mdev_master_upper_ifindex_by_index(net, oif); /* Legacy VRF/l3mdev use case */ fl4->flowi4_oif = fl4->flowi4_l3mdev ? 0 : oif; } fl4->daddr = daddr; fl4->saddr = saddr; fl4->flowi4_dscp = inet_dsfield_to_dscp(tos); fl4->flowi4_proto = proto; fl4->fl4_gre_key = key; fl4->flowi4_mark = mark; fl4->flowi4_multipath_hash = tun_inner_hash; fl4->flowi4_flags = flow_flags; } int __ip_tunnel_init(struct net_device *dev); #define ip_tunnel_init(DEV) \ ({ \ struct net_device *__dev = (DEV); \ int __res = __ip_tunnel_init(__dev); \ \ if (!__res) \ netdev_lockdep_set_classes(__dev);\ __res; \ }) void ip_tunnel_uninit(struct net_device *dev); void ip_tunnel_dellink(struct net_device *dev, struct list_head *head); struct net *ip_tunnel_get_link_net(const struct net_device *dev); int ip_tunnel_get_iflink(const struct net_device *dev); int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname); void ip_tunnel_delete_net(struct net *net, unsigned int id, struct rtnl_link_ops *ops, struct list_head *dev_to_kill); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const u8 proto, int tunnel_hlen); int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd); bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp, const void __user *data); bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp); int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd); int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict); int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu); struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, int link, const unsigned long *flags, __be32 remote, __be32 local, __be32 key); void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info); int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_error); int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm_kern *p, __u32 fwmark); int ip_tunnel_newlink(struct net *net, struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm_kern *p, __u32 fwmark); void ip_tunnel_setup(struct net_device *dev, unsigned int net_id); bool ip_tunnel_netlink_encap_parms(struct nlattr *data[], struct ip_tunnel_encap *encap); void ip_tunnel_netlink_parms(struct nlattr *data[], struct ip_tunnel_parm_kern *parms); extern const struct header_ops ip_tunnel_header_ops; __be16 ip_tunnel_parse_protocol(const struct sk_buff *skb); struct ip_tunnel_encap_ops { size_t (*encap_hlen)(struct ip_tunnel_encap *e); int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4); int (*err_handler)(struct sk_buff *skb, u32 info); }; #define MAX_IPTUN_ENCAP_OPS 8 extern const struct ip_tunnel_encap_ops __rcu * iptun_encaps[MAX_IPTUN_ENCAP_OPS]; int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *op, unsigned int num); int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op, unsigned int num); int ip_tunnel_encap_setup(struct ip_tunnel *t, struct ip_tunnel_encap *ipencap); static inline enum skb_drop_reason pskb_inet_may_pull_reason(struct sk_buff *skb) { int nhlen; switch (skb->protocol) { #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): nhlen = sizeof(struct ipv6hdr); break; #endif case htons(ETH_P_IP): nhlen = sizeof(struct iphdr); break; default: nhlen = 0; } return pskb_network_may_pull_reason(skb, nhlen); } static inline bool pskb_inet_may_pull(struct sk_buff *skb) { return pskb_inet_may_pull_reason(skb) == SKB_NOT_DROPPED_YET; } /* Variant of pskb_inet_may_pull(). */ static inline enum skb_drop_reason skb_vlan_inet_prepare(struct sk_buff *skb, bool inner_proto_inherit) { int nhlen = 0, maclen = inner_proto_inherit ? 0 : ETH_HLEN; __be16 type = skb->protocol; enum skb_drop_reason reason; /* Essentially this is skb_protocol(skb, true) * And we get MAC len. */ if (eth_type_vlan(type)) type = __vlan_get_protocol(skb, type, &maclen); switch (type) { #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): nhlen = sizeof(struct ipv6hdr); break; #endif case htons(ETH_P_IP): nhlen = sizeof(struct iphdr); break; } /* For ETH_P_IPV6/ETH_P_IP we make sure to pull * a base network header in skb->head. */ reason = pskb_may_pull_reason(skb, maclen + nhlen); if (reason) return reason; skb_set_network_header(skb, maclen); return SKB_NOT_DROPPED_YET; } static inline int ip_encap_hlen(struct ip_tunnel_encap *e) { const struct ip_tunnel_encap_ops *ops; int hlen = -EINVAL; if (e->type == TUNNEL_ENCAP_NONE) return 0; if (e->type >= MAX_IPTUN_ENCAP_OPS) return -EINVAL; rcu_read_lock(); ops = rcu_dereference(iptun_encaps[e->type]); if (likely(ops && ops->encap_hlen)) hlen = ops->encap_hlen(e); rcu_read_unlock(); return hlen; } static inline int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4) { const struct ip_tunnel_encap_ops *ops; int ret = -EINVAL; if (e->type == TUNNEL_ENCAP_NONE) return 0; if (e->type >= MAX_IPTUN_ENCAP_OPS) return -EINVAL; rcu_read_lock(); ops = rcu_dereference(iptun_encaps[e->type]); if (likely(ops && ops->build_header)) ret = ops->build_header(skb, e, protocol, fl4); rcu_read_unlock(); return ret; } /* Extract dsfield from inner protocol */ static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph, const struct sk_buff *skb) { __be16 payload_protocol = skb_protocol(skb, true); if (payload_protocol == htons(ETH_P_IP)) return iph->tos; else if (payload_protocol == htons(ETH_P_IPV6)) return ipv6_get_dsfield((const struct ipv6hdr *)iph); else return 0; } static inline __be32 ip_tunnel_get_flowlabel(const struct iphdr *iph, const struct sk_buff *skb) { __be16 payload_protocol = skb_protocol(skb, true); if (payload_protocol == htons(ETH_P_IPV6)) return ip6_flowlabel((const struct ipv6hdr *)iph); else return 0; } static inline u8 ip_tunnel_get_ttl(const struct iphdr *iph, const struct sk_buff *skb) { __be16 payload_protocol = skb_protocol(skb, true); if (payload_protocol == htons(ETH_P_IP)) return iph->ttl; else if (payload_protocol == htons(ETH_P_IPV6)) return ((const struct ipv6hdr *)iph)->hop_limit; else return 0; } /* Propagate ECN bits out */ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph, const struct sk_buff *skb) { u8 inner = ip_tunnel_get_dsfield(iph, skb); return INET_ECN_encapsulate(tos, inner); } int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, bool raw_proto, bool xnet); static inline int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, bool xnet) { return __iptunnel_pull_header(skb, hdr_len, inner_proto, false, xnet); } void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, u8 proto, u8 tos, u8 ttl, __be16 df, bool xnet, u16 ipcb_flags); struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, gfp_t flags); int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, int headroom, bool reply); static inline void ip_tunnel_adj_headroom(struct net_device *dev, unsigned int headroom) { /* we must cap headroom to some upperlimit, else pskb_expand_head * will overflow header offsets in skb_headers_offset_update(). */ const unsigned int max_allowed = 512; if (headroom > max_allowed) headroom = max_allowed; if (headroom > READ_ONCE(dev->needed_headroom)) WRITE_ONCE(dev->needed_headroom, headroom); } int iptunnel_handle_offloads(struct sk_buff *skb, int gso_type_mask); static inline int iptunnel_pull_offloads(struct sk_buff *skb) { if (skb_is_gso(skb)) { int err; err = skb_unclone(skb, GFP_ATOMIC); if (unlikely(err)) return err; skb_shinfo(skb)->gso_type &= ~(NETIF_F_GSO_ENCAP_ALL >> NETIF_F_GSO_SHIFT); } skb->encapsulation = 0; return 0; } static inline void iptunnel_xmit_stats(struct net_device *dev, int pkt_len) { if (pkt_len > 0) { struct pcpu_sw_netstats *tstats = get_cpu_ptr(dev->tstats); u64_stats_update_begin(&tstats->syncp); u64_stats_add(&tstats->tx_bytes, pkt_len); u64_stats_inc(&tstats->tx_packets); u64_stats_update_end(&tstats->syncp); put_cpu_ptr(tstats); return; } if (pkt_len < 0) { DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_aborted_errors); } else { DEV_STATS_INC(dev, tx_dropped); } } static inline void ip_tunnel_info_opts_get(void *to, const struct ip_tunnel_info *info) { memcpy(to, ip_tunnel_info_opts(info), info->options_len); } static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, const void *from, int len, const unsigned long *flags) { info->options_len = len; if (len > 0) { memcpy(ip_tunnel_info_opts(info), from, len); ip_tunnel_flags_or(info->key.tun_flags, info->key.tun_flags, flags); } } static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate) { return (struct ip_tunnel_info *)lwtstate->data; } DECLARE_STATIC_KEY_FALSE(ip_tunnel_metadata_cnt); /* Returns > 0 if metadata should be collected */ static inline int ip_tunnel_collect_metadata(void) { return static_branch_unlikely(&ip_tunnel_metadata_cnt); } void __init ip_tunnel_core_init(void); void ip_tunnel_need_metadata(void); void ip_tunnel_unneed_metadata(void); #else /* CONFIG_INET */ static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate) { return NULL; } static inline void ip_tunnel_need_metadata(void) { } static inline void ip_tunnel_unneed_metadata(void) { } static inline void ip_tunnel_info_opts_get(void *to, const struct ip_tunnel_info *info) { } static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, const void *from, int len, const unsigned long *flags) { info->options_len = 0; } #endif /* CONFIG_INET */ #endif /* __NET_IP_TUNNELS_H */
2 55 15 59 43 626 94 190 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_WAIT_H #define _LINUX_WAIT_H /* * Linux wait queue related types and methods */ #include <linux/list.h> #include <linux/stddef.h> #include <linux/spinlock.h> #include <asm/current.h> typedef struct wait_queue_entry wait_queue_entry_t; typedef int (*wait_queue_func_t)(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key); int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key); /* wait_queue_entry::flags */ #define WQ_FLAG_EXCLUSIVE 0x01 #define WQ_FLAG_WOKEN 0x02 #define WQ_FLAG_CUSTOM 0x04 #define WQ_FLAG_DONE 0x08 #define WQ_FLAG_PRIORITY 0x10 /* * A single wait-queue entry structure: */ struct wait_queue_entry { unsigned int flags; void *private; wait_queue_func_t func; struct list_head entry; }; struct wait_queue_head { spinlock_t lock; struct list_head head; }; typedef struct wait_queue_head wait_queue_head_t; struct task_struct; /* * Macros for declaration and initialisaton of the datatypes */ #define __WAITQUEUE_INITIALIZER(name, tsk) { \ .private = tsk, \ .func = default_wake_function, \ .entry = { NULL, NULL } } #define DECLARE_WAITQUEUE(name, tsk) \ struct wait_queue_entry name = __WAITQUEUE_INITIALIZER(name, tsk) #define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ .head = LIST_HEAD_INIT(name.head) } #define DECLARE_WAIT_QUEUE_HEAD(name) \ struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name) extern void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *); #define init_waitqueue_head(wq_head) \ do { \ static struct lock_class_key __key; \ \ __init_waitqueue_head((wq_head), #wq_head, &__key); \ } while (0) #ifdef CONFIG_LOCKDEP # define __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) \ ({ init_waitqueue_head(&name); name; }) # define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) \ struct wait_queue_head name = __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) #else # define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) DECLARE_WAIT_QUEUE_HEAD(name) #endif static inline void init_waitqueue_entry(struct wait_queue_entry *wq_entry, struct task_struct *p) { wq_entry->flags = 0; wq_entry->private = p; wq_entry->func = default_wake_function; } static inline void init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t func) { wq_entry->flags = 0; wq_entry->private = NULL; wq_entry->func = func; } /** * waitqueue_active -- locklessly test for waiters on the queue * @wq_head: the waitqueue to test for waiters * * returns true if the wait list is not empty * * NOTE: this function is lockless and requires care, incorrect usage _will_ * lead to sporadic and non-obvious failure. * * Use either while holding wait_queue_head::lock or when used for wakeups * with an extra smp_mb() like:: * * CPU0 - waker CPU1 - waiter * * for (;;) { * @cond = true; prepare_to_wait(&wq_head, &wait, state); * smp_mb(); // smp_mb() from set_current_state() * if (waitqueue_active(wq_head)) if (@cond) * wake_up(wq_head); break; * schedule(); * } * finish_wait(&wq_head, &wait); * * Because without the explicit smp_mb() it's possible for the * waitqueue_active() load to get hoisted over the @cond store such that we'll * observe an empty wait list while the waiter might not observe @cond. * * Also note that this 'optimization' trades a spin_lock() for an smp_mb(), * which (when the lock is uncontended) are of roughly equal cost. */ static inline int waitqueue_active(struct wait_queue_head *wq_head) { return !list_empty(&wq_head->head); } /** * wq_has_single_sleeper - check if there is only one sleeper * @wq_head: wait queue head * * Returns true of wq_head has only one sleeper on the list. * * Please refer to the comment for waitqueue_active. */ static inline bool wq_has_single_sleeper(struct wait_queue_head *wq_head) { return list_is_singular(&wq_head->head); } /** * wq_has_sleeper - check if there are any waiting processes * @wq_head: wait queue head * * Returns true if wq_head has waiting processes * * Please refer to the comment for waitqueue_active. */ static inline bool wq_has_sleeper(struct wait_queue_head *wq_head) { /* * We need to be sure we are in sync with the * add_wait_queue modifications to the wait queue. * * This memory barrier should be paired with one on the * waiting side. */ smp_mb(); return waitqueue_active(wq_head); } extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern int add_wait_queue_priority_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { struct list_head *head = &wq_head->head; struct wait_queue_entry *wq; list_for_each_entry(wq, &wq_head->head, entry) { if (!(wq->flags & WQ_FLAG_PRIORITY)) break; head = &wq->entry; } list_add(&wq_entry->entry, head); } /* * Used for wake-one threads: */ static inline void __add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { wq_entry->flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue(wq_head, wq_entry); } static inline void __add_wait_queue_entry_tail(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { list_add_tail(&wq_entry->entry, &wq_head->head); } static inline void __add_wait_queue_entry_tail_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { wq_entry->flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue_entry_tail(wq_head, wq_entry); } static inline void __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { list_del(&wq_entry->entry); } int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key); void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr); void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode); void __wake_up_pollfree(struct wait_queue_head *wq_head); #define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL) #define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL) #define wake_up_all(x) __wake_up(x, TASK_NORMAL, 0, NULL) #define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL, 1) #define wake_up_all_locked(x) __wake_up_locked((x), TASK_NORMAL, 0) #define wake_up_sync(x) __wake_up_sync(x, TASK_NORMAL) #define wake_up_interruptible(x) __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL) #define wake_up_interruptible_nr(x, nr) __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL) #define wake_up_interruptible_all(x) __wake_up(x, TASK_INTERRUPTIBLE, 0, NULL) #define wake_up_interruptible_sync(x) __wake_up_sync((x), TASK_INTERRUPTIBLE) /* * Wakeup macros to be used to report events to the targets. */ #define poll_to_key(m) ((void *)(__force uintptr_t)(__poll_t)(m)) #define key_to_poll(m) ((__force __poll_t)(uintptr_t)(void *)(m)) #define wake_up_poll(x, m) \ __wake_up(x, TASK_NORMAL, 1, poll_to_key(m)) #define wake_up_poll_on_current_cpu(x, m) \ __wake_up_on_current_cpu(x, TASK_NORMAL, poll_to_key(m)) #define wake_up_locked_poll(x, m) \ __wake_up_locked_key((x), TASK_NORMAL, poll_to_key(m)) #define wake_up_interruptible_poll(x, m) \ __wake_up(x, TASK_INTERRUPTIBLE, 1, poll_to_key(m)) #define wake_up_interruptible_sync_poll(x, m) \ __wake_up_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m)) #define wake_up_interruptible_sync_poll_locked(x, m) \ __wake_up_locked_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m)) /** * wake_up_pollfree - signal that a polled waitqueue is going away * @wq_head: the wait queue head * * In the very rare cases where a ->poll() implementation uses a waitqueue whose * lifetime is tied to a task rather than to the 'struct file' being polled, * this function must be called before the waitqueue is freed so that * non-blocking polls (e.g. epoll) are notified that the queue is going away. * * The caller must also RCU-delay the freeing of the wait_queue_head, e.g. via * an explicit synchronize_rcu() or call_rcu(), or via SLAB_TYPESAFE_BY_RCU. */ static inline void wake_up_pollfree(struct wait_queue_head *wq_head) { /* * For performance reasons, we don't always take the queue lock here. * Therefore, we might race with someone removing the last entry from * the queue, and proceed while they still hold the queue lock. * However, rcu_read_lock() is required to be held in such cases, so we * can safely proceed with an RCU-delayed free. */ if (waitqueue_active(wq_head)) __wake_up_pollfree(wq_head); } #define ___wait_cond_timeout(condition) \ ({ \ bool __cond = (condition); \ if (__cond && !__ret) \ __ret = 1; \ __cond || !__ret; \ }) #define ___wait_is_interruptible(state) \ (!__builtin_constant_p(state) || \ (state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL))) extern void init_wait_entry(struct wait_queue_entry *wq_entry, int flags); /* * The below macro ___wait_event() has an explicit shadow of the __ret * variable when used from the wait_event_*() macros. * * This is so that both can use the ___wait_cond_timeout() construct * to wrap the condition. * * The type inconsistency of the wait_event_*() __ret variable is also * on purpose; we use long where we can return timeout values and int * otherwise. */ #define ___wait_event(wq_head, condition, state, exclusive, ret, cmd) \ ({ \ __label__ __out; \ struct wait_queue_entry __wq_entry; \ long __ret = ret; /* explicit shadow */ \ \ init_wait_entry(&__wq_entry, exclusive ? WQ_FLAG_EXCLUSIVE : 0); \ for (;;) { \ long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state);\ \ if (condition) \ break; \ \ if (___wait_is_interruptible(state) && __int) { \ __ret = __int; \ goto __out; \ } \ \ cmd; \ \ if (condition) \ break; \ } \ finish_wait(&wq_head, &__wq_entry); \ __out: __ret; \ }) #define __wait_event(wq_head, condition) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ schedule()) /** * wait_event - sleep until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. */ #define wait_event(wq_head, condition) \ do { \ might_sleep(); \ if (condition) \ break; \ __wait_event(wq_head, condition); \ } while (0) #define __io_wait_event(wq_head, condition) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ io_schedule()) /* * io_wait_event() -- like wait_event() but with io_schedule() */ #define io_wait_event(wq_head, condition) \ do { \ might_sleep(); \ if (condition) \ break; \ __io_wait_event(wq_head, condition); \ } while (0) #define __wait_event_freezable(wq_head, condition) \ ___wait_event(wq_head, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE), \ 0, 0, schedule()) /** * wait_event_freezable - sleep (or freeze) until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE -- so as not to contribute * to system load) until the @condition evaluates to true. The * @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. */ #define wait_event_freezable(wq_head, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_freezable(wq_head, condition); \ __ret; \ }) #define __wait_event_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_UNINTERRUPTIBLE, 0, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_timeout - sleep until a condition gets true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * Returns: * 0 if the @condition evaluated to %false after the @timeout elapsed, * 1 if the @condition evaluated to %true after the @timeout elapsed, * or the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed. */ #define wait_event_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_timeout(wq_head, condition, timeout); \ __ret; \ }) #define __wait_event_freezable_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 0, timeout, \ __ret = schedule_timeout(__ret)) /* * like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid * increasing load and is freezable. */ #define wait_event_freezable_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_freezable_timeout(wq_head, condition, timeout); \ __ret; \ }) #define __wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 1, 0, \ cmd1; schedule(); cmd2) /* * Just like wait_event_cmd(), except it sets exclusive flag */ #define wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2) \ do { \ if (condition) \ break; \ __wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2); \ } while (0) #define __wait_event_cmd(wq_head, condition, cmd1, cmd2) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ cmd1; schedule(); cmd2) /** * wait_event_cmd - sleep until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @cmd1: the command will be executed before sleep * @cmd2: the command will be executed after sleep * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. */ #define wait_event_cmd(wq_head, condition, cmd1, cmd2) \ do { \ if (condition) \ break; \ __wait_event_cmd(wq_head, condition, cmd1, cmd2); \ } while (0) #define __wait_event_interruptible(wq_head, condition) \ ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \ schedule()) /** * wait_event_interruptible - sleep until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible(wq_head, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_interruptible(wq_head, condition); \ __ret; \ }) #define __wait_event_interruptible_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_INTERRUPTIBLE, 0, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_interruptible_timeout - sleep until a condition gets true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * Returns: * 0 if the @condition evaluated to %false after the @timeout elapsed, * 1 if the @condition evaluated to %true after the @timeout elapsed, * the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was * interrupted by a signal. */ #define wait_event_interruptible_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_interruptible_timeout(wq_head, \ condition, timeout); \ __ret; \ }) #define __wait_event_hrtimeout(wq_head, condition, timeout, state) \ ({ \ int __ret = 0; \ struct hrtimer_sleeper __t; \ \ hrtimer_setup_sleeper_on_stack(&__t, CLOCK_MONOTONIC, \ HRTIMER_MODE_REL); \ if ((timeout) != KTIME_MAX) { \ hrtimer_set_expires_range_ns(&__t.timer, timeout, \ current->timer_slack_ns); \ hrtimer_sleeper_start_expires(&__t, HRTIMER_MODE_REL); \ } \ \ __ret = ___wait_event(wq_head, condition, state, 0, 0, \ if (!__t.task) { \ __ret = -ETIME; \ break; \ } \ schedule()); \ \ hrtimer_cancel(&__t.timer); \ destroy_hrtimer_on_stack(&__t.timer); \ __ret; \ }) /** * wait_event_hrtimeout - sleep until a condition gets true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, as a ktime_t * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function returns 0 if @condition became true, or -ETIME if the timeout * elapsed. */ #define wait_event_hrtimeout(wq_head, condition, timeout) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_hrtimeout(wq_head, condition, timeout, \ TASK_UNINTERRUPTIBLE); \ __ret; \ }) /** * wait_event_interruptible_hrtimeout - sleep until a condition gets true or a timeout elapses * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, as a ktime_t * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function returns 0 if @condition became true, -ERESTARTSYS if it was * interrupted by a signal, or -ETIME if the timeout elapsed. */ #define wait_event_interruptible_hrtimeout(wq, condition, timeout) \ ({ \ long __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_hrtimeout(wq, condition, timeout, \ TASK_INTERRUPTIBLE); \ __ret; \ }) #define __wait_event_interruptible_exclusive(wq, condition) \ ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \ schedule()) #define wait_event_interruptible_exclusive(wq, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_interruptible_exclusive(wq, condition); \ __ret; \ }) #define __wait_event_killable_exclusive(wq, condition) \ ___wait_event(wq, condition, TASK_KILLABLE, 1, 0, \ schedule()) #define wait_event_killable_exclusive(wq, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_killable_exclusive(wq, condition); \ __ret; \ }) #define __wait_event_freezable_exclusive(wq, condition) \ ___wait_event(wq, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 1, 0,\ schedule()) #define wait_event_freezable_exclusive(wq, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_freezable_exclusive(wq, condition); \ __ret; \ }) /** * wait_event_idle - wait for a condition without contributing to system load * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_IDLE) until the * @condition evaluates to true. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * */ #define wait_event_idle(wq_head, condition) \ do { \ might_sleep(); \ if (!(condition)) \ ___wait_event(wq_head, condition, TASK_IDLE, 0, 0, schedule()); \ } while (0) /** * wait_event_idle_exclusive - wait for a condition with contributing to system load * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_IDLE) until the * @condition evaluates to true. * The @condition is checked each time the waitqueue @wq_head is woken up. * * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag * set thus if other processes wait on the same list, when this * process is woken further processes are not considered. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * */ #define wait_event_idle_exclusive(wq_head, condition) \ do { \ might_sleep(); \ if (!(condition)) \ ___wait_event(wq_head, condition, TASK_IDLE, 1, 0, schedule()); \ } while (0) #define __wait_event_idle_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_IDLE, 0, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_idle_timeout - sleep without load until a condition becomes true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_IDLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * Returns: * 0 if the @condition evaluated to %false after the @timeout elapsed, * 1 if the @condition evaluated to %true after the @timeout elapsed, * or the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed. */ #define wait_event_idle_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_idle_timeout(wq_head, condition, timeout); \ __ret; \ }) #define __wait_event_idle_exclusive_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_IDLE, 1, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_idle_exclusive_timeout - sleep without load until a condition becomes true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_IDLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag * set thus if other processes wait on the same list, when this * process is woken further processes are not considered. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * Returns: * 0 if the @condition evaluated to %false after the @timeout elapsed, * 1 if the @condition evaluated to %true after the @timeout elapsed, * or the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed. */ #define wait_event_idle_exclusive_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_idle_exclusive_timeout(wq_head, condition, timeout);\ __ret; \ }) extern int do_wait_intr(wait_queue_head_t *, wait_queue_entry_t *); extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *); #define __wait_event_interruptible_locked(wq, condition, exclusive, fn) \ ({ \ int __ret; \ DEFINE_WAIT(__wait); \ if (exclusive) \ __wait.flags |= WQ_FLAG_EXCLUSIVE; \ do { \ __ret = fn(&(wq), &__wait); \ if (__ret) \ break; \ } while (!(condition)); \ __remove_wait_queue(&(wq), &__wait); \ __set_current_state(TASK_RUNNING); \ __ret; \ }) /** * wait_event_interruptible_locked - sleep until a condition gets true * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * It must be called with wq.lock being held. This spinlock is * unlocked while sleeping but @condition testing is done while lock * is held and when this macro exits the lock is held. * * The lock is locked/unlocked using spin_lock()/spin_unlock() * functions which must match the way they are locked/unlocked outside * of this macro. * * wake_up_locked() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible_locked(wq, condition) \ ((condition) \ ? 0 : __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr)) /** * wait_event_interruptible_locked_irq - sleep until a condition gets true * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * It must be called with wq.lock being held. This spinlock is * unlocked while sleeping but @condition testing is done while lock * is held and when this macro exits the lock is held. * * The lock is locked/unlocked using spin_lock_irq()/spin_unlock_irq() * functions which must match the way they are locked/unlocked outside * of this macro. * * wake_up_locked() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible_locked_irq(wq, condition) \ ((condition) \ ? 0 : __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr_irq)) /** * wait_event_interruptible_exclusive_locked - sleep exclusively until a condition gets true * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * It must be called with wq.lock being held. This spinlock is * unlocked while sleeping but @condition testing is done while lock * is held and when this macro exits the lock is held. * * The lock is locked/unlocked using spin_lock()/spin_unlock() * functions which must match the way they are locked/unlocked outside * of this macro. * * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag * set thus when other process waits process on the list if this * process is awaken further processes are not considered. * * wake_up_locked() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible_exclusive_locked(wq, condition) \ ((condition) \ ? 0 : __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr)) /** * wait_event_interruptible_exclusive_locked_irq - sleep until a condition gets true * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * It must be called with wq.lock being held. This spinlock is * unlocked while sleeping but @condition testing is done while lock * is held and when this macro exits the lock is held. * * The lock is locked/unlocked using spin_lock_irq()/spin_unlock_irq() * functions which must match the way they are locked/unlocked outside * of this macro. * * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag * set thus when other process waits process on the list if this * process is awaken further processes are not considered. * * wake_up_locked() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible_exclusive_locked_irq(wq, condition) \ ((condition) \ ? 0 : __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr_irq)) #define __wait_event_killable(wq, condition) \ ___wait_event(wq, condition, TASK_KILLABLE, 0, 0, schedule()) /** * wait_event_killable - sleep until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_KILLABLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_killable(wq_head, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_killable(wq_head, condition); \ __ret; \ }) #define __wait_event_state(wq, condition, state) \ ___wait_event(wq, condition, state, 0, 0, schedule()) /** * wait_event_state - sleep until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @state: state to sleep in * * The process is put to sleep (@state) until the @condition evaluates to true * or a signal is received (when allowed by @state). The @condition is checked * each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a signal * (when allowed by @state) and 0 if @condition evaluated to true. */ #define wait_event_state(wq_head, condition, state) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_state(wq_head, condition, state); \ __ret; \ }) #define __wait_event_state_exclusive(wq, condition, state) \ ___wait_event(wq, condition, state, 1, 0, schedule()) #define wait_event_state_exclusive(wq, condition, state) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_state_exclusive(wq, condition, state); \ __ret; \ }) #define __wait_event_killable_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_KILLABLE, 0, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_killable_timeout - sleep until a condition gets true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_KILLABLE) until the * @condition evaluates to true or a kill signal is received. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * Returns: * 0 if the @condition evaluated to %false after the @timeout elapsed, * 1 if the @condition evaluated to %true after the @timeout elapsed, * the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was * interrupted by a kill signal. * * Only kill signals interrupt this process. */ #define wait_event_killable_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_killable_timeout(wq_head, \ condition, timeout); \ __ret; \ }) #define __wait_event_lock_irq(wq_head, condition, lock, cmd) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ spin_unlock_irq(&lock); \ cmd; \ schedule(); \ spin_lock_irq(&lock)) /** * wait_event_lock_irq_cmd - sleep until a condition gets true. The * condition is checked under the lock. This * is expected to be called with the lock * taken. * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before cmd * and schedule() and reacquired afterwards. * @cmd: a command which is invoked outside the critical section before * sleep * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * This is supposed to be called while holding the lock. The lock is * dropped before invoking the cmd and going to sleep and is reacquired * afterwards. */ #define wait_event_lock_irq_cmd(wq_head, condition, lock, cmd) \ do { \ if (condition) \ break; \ __wait_event_lock_irq(wq_head, condition, lock, cmd); \ } while (0) /** * wait_event_lock_irq - sleep until a condition gets true. The * condition is checked under the lock. This * is expected to be called with the lock * taken. * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before schedule() * and reacquired afterwards. * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * This is supposed to be called while holding the lock. The lock is * dropped before going to sleep and is reacquired afterwards. */ #define wait_event_lock_irq(wq_head, condition, lock) \ do { \ if (condition) \ break; \ __wait_event_lock_irq(wq_head, condition, lock, ); \ } while (0) #define __wait_event_interruptible_lock_irq(wq_head, condition, lock, cmd) \ ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \ spin_unlock_irq(&lock); \ cmd; \ schedule(); \ spin_lock_irq(&lock)) /** * wait_event_interruptible_lock_irq_cmd - sleep until a condition gets true. * The condition is checked under the lock. This is expected to * be called with the lock taken. * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before cmd and * schedule() and reacquired afterwards. * @cmd: a command which is invoked outside the critical section before * sleep * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. The @condition is * checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * This is supposed to be called while holding the lock. The lock is * dropped before invoking the cmd and going to sleep and is reacquired * afterwards. * * The macro will return -ERESTARTSYS if it was interrupted by a signal * and 0 if @condition evaluated to true. */ #define wait_event_interruptible_lock_irq_cmd(wq_head, condition, lock, cmd) \ ({ \ int __ret = 0; \ if (!(condition)) \ __ret = __wait_event_interruptible_lock_irq(wq_head, \ condition, lock, cmd); \ __ret; \ }) /** * wait_event_interruptible_lock_irq - sleep until a condition gets true. * The condition is checked under the lock. This is expected * to be called with the lock taken. * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before schedule() * and reacquired afterwards. * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or signal is received. The @condition is * checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * This is supposed to be called while holding the lock. The lock is * dropped before going to sleep and is reacquired afterwards. * * The macro will return -ERESTARTSYS if it was interrupted by a signal * and 0 if @condition evaluated to true. */ #define wait_event_interruptible_lock_irq(wq_head, condition, lock) \ ({ \ int __ret = 0; \ if (!(condition)) \ __ret = __wait_event_interruptible_lock_irq(wq_head, \ condition, lock,); \ __ret; \ }) #define __wait_event_lock_irq_timeout(wq_head, condition, lock, timeout, state) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ state, 0, timeout, \ spin_unlock_irq(&lock); \ __ret = schedule_timeout(__ret); \ spin_lock_irq(&lock)); /** * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets * true or a timeout elapses. The condition is checked under * the lock. This is expected to be called with the lock taken. * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before schedule() * and reacquired afterwards. * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or signal is received. The @condition is * checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * This is supposed to be called while holding the lock. The lock is * dropped before going to sleep and is reacquired afterwards. * * The function returns 0 if the @timeout elapsed, -ERESTARTSYS if it * was interrupted by a signal, and the remaining jiffies otherwise * if the condition evaluated to true before the timeout elapsed. */ #define wait_event_interruptible_lock_irq_timeout(wq_head, condition, lock, \ timeout) \ ({ \ long __ret = timeout; \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_lock_irq_timeout( \ wq_head, condition, lock, timeout, \ TASK_INTERRUPTIBLE); \ __ret; \ }) #define wait_event_lock_irq_timeout(wq_head, condition, lock, timeout) \ ({ \ long __ret = timeout; \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_lock_irq_timeout( \ wq_head, condition, lock, timeout, \ TASK_UNINTERRUPTIBLE); \ __ret; \ }) /* * Waitqueues which are removed from the waitqueue_head at wakeup time */ void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout); int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); #define DEFINE_WAIT_FUNC(name, function) \ struct wait_queue_entry name = { \ .private = current, \ .func = function, \ .entry = LIST_HEAD_INIT((name).entry), \ } #define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function) #define init_wait_func(wait, function) \ do { \ (wait)->private = current; \ (wait)->func = function; \ INIT_LIST_HEAD(&(wait)->entry); \ (wait)->flags = 0; \ } while (0) #define init_wait(wait) init_wait_func(wait, autoremove_wake_function) typedef int (*task_call_f)(struct task_struct *p, void *arg); extern int task_call_func(struct task_struct *p, task_call_f func, void *arg); #endif /* _LINUX_WAIT_H */
1131 1136 1131 1 1 11 1 1 1 1 1 12 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 // SPDX-License-Identifier: GPL-2.0-or-later /* Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com> */ #include <net/ip.h> #include "ipvlan.h" static unsigned int ipvlan_netid __read_mostly; struct ipvlan_netns { unsigned int ipvl_nf_hook_refcnt; }; static struct ipvl_addr *ipvlan_skb_to_addr(struct sk_buff *skb, struct net_device *dev) { struct ipvl_addr *addr = NULL; struct ipvl_port *port; int addr_type; void *lyr3h; if (!dev || !netif_is_ipvlan_port(dev)) goto out; port = ipvlan_port_get_rcu(dev); if (!port || port->mode != IPVLAN_MODE_L3S) goto out; lyr3h = ipvlan_get_L3_hdr(port, skb, &addr_type); if (!lyr3h) goto out; addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true); out: return addr; } static struct sk_buff *ipvlan_l3_rcv(struct net_device *dev, struct sk_buff *skb, u16 proto) { struct ipvl_addr *addr; struct net_device *sdev; addr = ipvlan_skb_to_addr(skb, dev); if (!addr) goto out; sdev = addr->master->dev; switch (proto) { case AF_INET: { const struct iphdr *ip4h = ip_hdr(skb); int err; err = ip_route_input_noref(skb, ip4h->daddr, ip4h->saddr, ip4h_dscp(ip4h), sdev); if (unlikely(err)) goto out; break; } #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: { struct dst_entry *dst; struct ipv6hdr *ip6h = ipv6_hdr(skb); int flags = RT6_LOOKUP_F_HAS_SADDR; struct flowi6 fl6 = { .flowi6_iif = sdev->ifindex, .daddr = ip6h->daddr, .saddr = ip6h->saddr, .flowlabel = ip6_flowinfo(ip6h), .flowi6_mark = skb->mark, .flowi6_proto = ip6h->nexthdr, }; skb_dst_drop(skb); dst = ip6_route_input_lookup(dev_net(sdev), sdev, &fl6, skb, flags); skb_dst_set(skb, dst); break; } #endif default: break; } out: return skb; } static const struct l3mdev_ops ipvl_l3mdev_ops = { .l3mdev_l3_rcv = ipvlan_l3_rcv, }; static unsigned int ipvlan_nf_input(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct ipvl_addr *addr; unsigned int len; addr = ipvlan_skb_to_addr(skb, skb->dev); if (!addr) goto out; skb->dev = addr->master->dev; skb->skb_iif = skb->dev->ifindex; #if IS_ENABLED(CONFIG_IPV6) if (addr->atype == IPVL_IPV6) IP6CB(skb)->iif = skb->dev->ifindex; #endif len = skb->len + ETH_HLEN; ipvlan_count_rx(addr->master, len, true, false); out: return NF_ACCEPT; } static const struct nf_hook_ops ipvl_nfops[] = { { .hook = ipvlan_nf_input, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = INT_MAX, }, #if IS_ENABLED(CONFIG_IPV6) { .hook = ipvlan_nf_input, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = INT_MAX, }, #endif }; static int ipvlan_register_nf_hook(struct net *net) { struct ipvlan_netns *vnet = net_generic(net, ipvlan_netid); int err = 0; if (!vnet->ipvl_nf_hook_refcnt) { err = nf_register_net_hooks(net, ipvl_nfops, ARRAY_SIZE(ipvl_nfops)); if (!err) vnet->ipvl_nf_hook_refcnt = 1; } else { vnet->ipvl_nf_hook_refcnt++; } return err; } static void ipvlan_unregister_nf_hook(struct net *net) { struct ipvlan_netns *vnet = net_generic(net, ipvlan_netid); if (WARN_ON(!vnet->ipvl_nf_hook_refcnt)) return; vnet->ipvl_nf_hook_refcnt--; if (!vnet->ipvl_nf_hook_refcnt) nf_unregister_net_hooks(net, ipvl_nfops, ARRAY_SIZE(ipvl_nfops)); } void ipvlan_migrate_l3s_hook(struct net *oldnet, struct net *newnet) { struct ipvlan_netns *old_vnet; ASSERT_RTNL(); old_vnet = net_generic(oldnet, ipvlan_netid); if (!old_vnet->ipvl_nf_hook_refcnt) return; ipvlan_register_nf_hook(newnet); ipvlan_unregister_nf_hook(oldnet); } static void ipvlan_ns_exit(struct net *net) { struct ipvlan_netns *vnet = net_generic(net, ipvlan_netid); if (WARN_ON_ONCE(vnet->ipvl_nf_hook_refcnt)) { vnet->ipvl_nf_hook_refcnt = 0; nf_unregister_net_hooks(net, ipvl_nfops, ARRAY_SIZE(ipvl_nfops)); } } static struct pernet_operations ipvlan_net_ops = { .id = &ipvlan_netid, .size = sizeof(struct ipvlan_netns), .exit = ipvlan_ns_exit, }; int ipvlan_l3s_init(void) { return register_pernet_subsys(&ipvlan_net_ops); } void ipvlan_l3s_cleanup(void) { unregister_pernet_subsys(&ipvlan_net_ops); } int ipvlan_l3s_register(struct ipvl_port *port) { struct net_device *dev = port->dev; int ret; ASSERT_RTNL(); ret = ipvlan_register_nf_hook(read_pnet(&port->pnet)); if (!ret) { dev->l3mdev_ops = &ipvl_l3mdev_ops; dev->priv_flags |= IFF_L3MDEV_RX_HANDLER; } return ret; } void ipvlan_l3s_unregister(struct ipvl_port *port) { struct net_device *dev = port->dev; ASSERT_RTNL(); dev->priv_flags &= ~IFF_L3MDEV_RX_HANDLER; ipvlan_unregister_nf_hook(read_pnet(&port->pnet)); }
20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 19 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 21 21 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 19 1 1 20 20 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. */ #include <rdma/uverbs_ioctl.h> #include <rdma/rdma_user_ioctl.h> #include <linux/bitops.h> #include "rdma_core.h" #include "uverbs.h" static int ib_uverbs_notsupp(struct uverbs_attr_bundle *attrs) { return -EOPNOTSUPP; } static void *uapi_add_elm(struct uverbs_api *uapi, u32 key, size_t alloc_size) { void *elm; int rc; if (key == UVERBS_API_KEY_ERR) return ERR_PTR(-EOVERFLOW); elm = kzalloc(alloc_size, GFP_KERNEL); if (!elm) return ERR_PTR(-ENOMEM); rc = radix_tree_insert(&uapi->radix, key, elm); if (rc) { kfree(elm); return ERR_PTR(rc); } return elm; } static void *uapi_add_get_elm(struct uverbs_api *uapi, u32 key, size_t alloc_size, bool *exists) { void *elm; elm = uapi_add_elm(uapi, key, alloc_size); if (!IS_ERR(elm)) { *exists = false; return elm; } if (elm != ERR_PTR(-EEXIST)) return elm; elm = radix_tree_lookup(&uapi->radix, key); if (WARN_ON(!elm)) return ERR_PTR(-EINVAL); *exists = true; return elm; } static int uapi_create_write(struct uverbs_api *uapi, struct ib_device *ibdev, const struct uapi_definition *def, u32 obj_key, u32 *cur_method_key) { struct uverbs_api_write_method *method_elm; u32 method_key = obj_key; bool exists; if (def->write.is_ex) method_key |= uapi_key_write_ex_method(def->write.command_num); else method_key |= uapi_key_write_method(def->write.command_num); method_elm = uapi_add_get_elm(uapi, method_key, sizeof(*method_elm), &exists); if (IS_ERR(method_elm)) return PTR_ERR(method_elm); if (WARN_ON(exists && (def->write.is_ex != method_elm->is_ex))) return -EINVAL; method_elm->is_ex = def->write.is_ex; method_elm->handler = def->func_write; if (!def->write.is_ex) method_elm->disabled = !(ibdev->uverbs_cmd_mask & BIT_ULL(def->write.command_num)); if (!def->write.is_ex && def->func_write) { method_elm->has_udata = def->write.has_udata; method_elm->has_resp = def->write.has_resp; method_elm->req_size = def->write.req_size; method_elm->resp_size = def->write.resp_size; } *cur_method_key = method_key; return 0; } static int uapi_merge_method(struct uverbs_api *uapi, struct uverbs_api_object *obj_elm, u32 obj_key, const struct uverbs_method_def *method, bool is_driver) { u32 method_key = obj_key | uapi_key_ioctl_method(method->id); struct uverbs_api_ioctl_method *method_elm; unsigned int i; bool exists; if (!method->attrs) return 0; method_elm = uapi_add_get_elm(uapi, method_key, sizeof(*method_elm), &exists); if (IS_ERR(method_elm)) return PTR_ERR(method_elm); if (exists) { /* * This occurs when a driver uses ADD_UVERBS_ATTRIBUTES_SIMPLE */ if (WARN_ON(method->handler)) return -EINVAL; } else { WARN_ON(!method->handler); rcu_assign_pointer(method_elm->handler, method->handler); if (method->handler != uverbs_destroy_def_handler) method_elm->driver_method = is_driver; } for (i = 0; i != method->num_attrs; i++) { const struct uverbs_attr_def *attr = (*method->attrs)[i]; struct uverbs_api_attr *attr_slot; if (!attr) continue; /* * ENUM_IN contains the 'ids' pointer to the driver's .rodata, * so if it is specified by a driver then it always makes this * into a driver method. */ if (attr->attr.type == UVERBS_ATTR_TYPE_ENUM_IN) method_elm->driver_method |= is_driver; /* * Like other uobject based things we only support a single * uobject being NEW'd or DESTROY'd */ if (attr->attr.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) { u8 access = attr->attr.u2.objs_arr.access; if (WARN_ON(access == UVERBS_ACCESS_NEW || access == UVERBS_ACCESS_DESTROY)) return -EINVAL; } attr_slot = uapi_add_elm(uapi, method_key | uapi_key_attr(attr->id), sizeof(*attr_slot)); /* Attributes are not allowed to be modified by drivers */ if (IS_ERR(attr_slot)) return PTR_ERR(attr_slot); attr_slot->spec = attr->attr; } return 0; } static int uapi_merge_obj_tree(struct uverbs_api *uapi, const struct uverbs_object_def *obj, bool is_driver) { struct uverbs_api_object *obj_elm; unsigned int i; u32 obj_key; bool exists; int rc; obj_key = uapi_key_obj(obj->id); obj_elm = uapi_add_get_elm(uapi, obj_key, sizeof(*obj_elm), &exists); if (IS_ERR(obj_elm)) return PTR_ERR(obj_elm); if (obj->type_attrs) { if (WARN_ON(obj_elm->type_attrs)) return -EINVAL; obj_elm->id = obj->id; obj_elm->type_attrs = obj->type_attrs; obj_elm->type_class = obj->type_attrs->type_class; /* * Today drivers are only permitted to use idr_class and * fd_class types. We can revoke the IDR types during * disassociation, and the FD types require the driver to use * struct file_operations.owner to prevent the driver module * code from unloading while the file is open. This provides * enough safety that uverbs_uobject_fd_release() will * continue to work. Drivers using FD are responsible to * handle disassociation of the device on their own. */ if (WARN_ON(is_driver && obj->type_attrs->type_class != &uverbs_idr_class && obj->type_attrs->type_class != &uverbs_fd_class)) return -EINVAL; } if (!obj->methods) return 0; for (i = 0; i != obj->num_methods; i++) { const struct uverbs_method_def *method = (*obj->methods)[i]; if (!method) continue; rc = uapi_merge_method(uapi, obj_elm, obj_key, method, is_driver); if (rc) return rc; } return 0; } static int uapi_disable_elm(struct uverbs_api *uapi, const struct uapi_definition *def, u32 obj_key, u32 method_key) { bool exists; if (def->scope == UAPI_SCOPE_OBJECT) { struct uverbs_api_object *obj_elm; obj_elm = uapi_add_get_elm( uapi, obj_key, sizeof(*obj_elm), &exists); if (IS_ERR(obj_elm)) return PTR_ERR(obj_elm); obj_elm->disabled = 1; return 0; } if (def->scope == UAPI_SCOPE_METHOD && uapi_key_is_ioctl_method(method_key)) { struct uverbs_api_ioctl_method *method_elm; method_elm = uapi_add_get_elm(uapi, method_key, sizeof(*method_elm), &exists); if (IS_ERR(method_elm)) return PTR_ERR(method_elm); method_elm->disabled = 1; return 0; } if (def->scope == UAPI_SCOPE_METHOD && (uapi_key_is_write_method(method_key) || uapi_key_is_write_ex_method(method_key))) { struct uverbs_api_write_method *write_elm; write_elm = uapi_add_get_elm(uapi, method_key, sizeof(*write_elm), &exists); if (IS_ERR(write_elm)) return PTR_ERR(write_elm); write_elm->disabled = 1; return 0; } WARN_ON(true); return -EINVAL; } static int uapi_merge_def(struct uverbs_api *uapi, struct ib_device *ibdev, const struct uapi_definition *def_list, bool is_driver) { const struct uapi_definition *def = def_list; u32 cur_obj_key = UVERBS_API_KEY_ERR; u32 cur_method_key = UVERBS_API_KEY_ERR; bool exists; int rc; if (!def_list) return 0; for (;; def++) { switch ((enum uapi_definition_kind)def->kind) { case UAPI_DEF_CHAIN: rc = uapi_merge_def(uapi, ibdev, def->chain, is_driver); if (rc) return rc; continue; case UAPI_DEF_CHAIN_OBJ_TREE: if (WARN_ON(def->object_start.object_id != def->chain_obj_tree->id)) return -EINVAL; cur_obj_key = uapi_key_obj(def->object_start.object_id); rc = uapi_merge_obj_tree(uapi, def->chain_obj_tree, is_driver); if (rc) return rc; continue; case UAPI_DEF_END: return 0; case UAPI_DEF_IS_SUPPORTED_DEV_FN: { void **ibdev_fn = (void *)(&ibdev->ops) + def->needs_fn_offset; if (*ibdev_fn) continue; rc = uapi_disable_elm( uapi, def, cur_obj_key, cur_method_key); if (rc) return rc; continue; } case UAPI_DEF_IS_SUPPORTED_FUNC: if (def->func_is_supported(ibdev)) continue; rc = uapi_disable_elm( uapi, def, cur_obj_key, cur_method_key); if (rc) return rc; continue; case UAPI_DEF_OBJECT_START: { struct uverbs_api_object *obj_elm; cur_obj_key = uapi_key_obj(def->object_start.object_id); obj_elm = uapi_add_get_elm(uapi, cur_obj_key, sizeof(*obj_elm), &exists); if (IS_ERR(obj_elm)) return PTR_ERR(obj_elm); continue; } case UAPI_DEF_WRITE: rc = uapi_create_write( uapi, ibdev, def, cur_obj_key, &cur_method_key); if (rc) return rc; continue; } WARN_ON(true); return -EINVAL; } } static int uapi_finalize_ioctl_method(struct uverbs_api *uapi, struct uverbs_api_ioctl_method *method_elm, u32 method_key) { struct radix_tree_iter iter; unsigned int num_attrs = 0; unsigned int max_bkey = 0; bool single_uobj = false; void __rcu **slot; method_elm->destroy_bkey = UVERBS_API_ATTR_BKEY_LEN; radix_tree_for_each_slot (slot, &uapi->radix, &iter, uapi_key_attrs_start(method_key)) { struct uverbs_api_attr *elm = rcu_dereference_protected(*slot, true); u32 attr_key = iter.index & UVERBS_API_ATTR_KEY_MASK; u32 attr_bkey = uapi_bkey_attr(attr_key); u8 type = elm->spec.type; if (uapi_key_attr_to_ioctl_method(iter.index) != uapi_key_attr_to_ioctl_method(method_key)) break; if (elm->spec.mandatory) __set_bit(attr_bkey, method_elm->attr_mandatory); if (elm->spec.is_udata) method_elm->has_udata = true; if (type == UVERBS_ATTR_TYPE_IDR || type == UVERBS_ATTR_TYPE_FD) { u8 access = elm->spec.u.obj.access; /* * Verbs specs may only have one NEW/DESTROY, we don't * have the infrastructure to abort multiple NEW's or * cope with multiple DESTROY failure. */ if (access == UVERBS_ACCESS_NEW || access == UVERBS_ACCESS_DESTROY) { if (WARN_ON(single_uobj)) return -EINVAL; single_uobj = true; if (WARN_ON(!elm->spec.mandatory)) return -EINVAL; } if (access == UVERBS_ACCESS_DESTROY) method_elm->destroy_bkey = attr_bkey; } max_bkey = max(max_bkey, attr_bkey); num_attrs++; } method_elm->key_bitmap_len = max_bkey + 1; WARN_ON(method_elm->key_bitmap_len > UVERBS_API_ATTR_BKEY_LEN); uapi_compute_bundle_size(method_elm, num_attrs); return 0; } static int uapi_finalize(struct uverbs_api *uapi) { const struct uverbs_api_write_method **data; unsigned long max_write_ex = 0; unsigned long max_write = 0; struct radix_tree_iter iter; void __rcu **slot; int rc; int i; radix_tree_for_each_slot (slot, &uapi->radix, &iter, 0) { struct uverbs_api_ioctl_method *method_elm = rcu_dereference_protected(*slot, true); if (uapi_key_is_ioctl_method(iter.index)) { rc = uapi_finalize_ioctl_method(uapi, method_elm, iter.index); if (rc) return rc; } if (uapi_key_is_write_method(iter.index)) max_write = max(max_write, iter.index & UVERBS_API_ATTR_KEY_MASK); if (uapi_key_is_write_ex_method(iter.index)) max_write_ex = max(max_write_ex, iter.index & UVERBS_API_ATTR_KEY_MASK); } uapi->notsupp_method.handler = ib_uverbs_notsupp; uapi->num_write = max_write + 1; uapi->num_write_ex = max_write_ex + 1; data = kmalloc_array(uapi->num_write + uapi->num_write_ex, sizeof(*uapi->write_methods), GFP_KERNEL); if (!data) return -ENOMEM; for (i = 0; i != uapi->num_write + uapi->num_write_ex; i++) data[i] = &uapi->notsupp_method; uapi->write_methods = data; uapi->write_ex_methods = data + uapi->num_write; radix_tree_for_each_slot (slot, &uapi->radix, &iter, 0) { if (uapi_key_is_write_method(iter.index)) uapi->write_methods[iter.index & UVERBS_API_ATTR_KEY_MASK] = rcu_dereference_protected(*slot, true); if (uapi_key_is_write_ex_method(iter.index)) uapi->write_ex_methods[iter.index & UVERBS_API_ATTR_KEY_MASK] = rcu_dereference_protected(*slot, true); } return 0; } static void uapi_remove_range(struct uverbs_api *uapi, u32 start, u32 last) { struct radix_tree_iter iter; void __rcu **slot; radix_tree_for_each_slot (slot, &uapi->radix, &iter, start) { if (iter.index > last) return; kfree(rcu_dereference_protected(*slot, true)); radix_tree_iter_delete(&uapi->radix, &iter, slot); } } static void uapi_remove_object(struct uverbs_api *uapi, u32 obj_key) { uapi_remove_range(uapi, obj_key, obj_key | UVERBS_API_METHOD_KEY_MASK | UVERBS_API_ATTR_KEY_MASK); } static void uapi_remove_method(struct uverbs_api *uapi, u32 method_key) { uapi_remove_range(uapi, method_key, method_key | UVERBS_API_ATTR_KEY_MASK); } static u32 uapi_get_obj_id(struct uverbs_attr_spec *spec) { if (spec->type == UVERBS_ATTR_TYPE_IDR || spec->type == UVERBS_ATTR_TYPE_FD) return spec->u.obj.obj_type; if (spec->type == UVERBS_ATTR_TYPE_IDRS_ARRAY) return spec->u2.objs_arr.obj_type; return UVERBS_API_KEY_ERR; } static void uapi_key_okay(u32 key) { unsigned int count = 0; if (uapi_key_is_object(key)) count++; if (uapi_key_is_ioctl_method(key)) count++; if (uapi_key_is_write_method(key)) count++; if (uapi_key_is_write_ex_method(key)) count++; if (uapi_key_is_attr(key)) count++; WARN(count != 1, "Bad count %u key=%x", count, key); } static void uapi_finalize_disable(struct uverbs_api *uapi) { struct radix_tree_iter iter; u32 starting_key = 0; bool scan_again = false; void __rcu **slot; again: radix_tree_for_each_slot (slot, &uapi->radix, &iter, starting_key) { uapi_key_okay(iter.index); if (uapi_key_is_object(iter.index)) { struct uverbs_api_object *obj_elm = rcu_dereference_protected(*slot, true); if (obj_elm->disabled) { /* Have to check all the attrs again */ scan_again = true; starting_key = iter.index; uapi_remove_object(uapi, iter.index); goto again; } continue; } if (uapi_key_is_ioctl_method(iter.index)) { struct uverbs_api_ioctl_method *method_elm = rcu_dereference_protected(*slot, true); if (method_elm->disabled) { starting_key = iter.index; uapi_remove_method(uapi, iter.index); goto again; } continue; } if (uapi_key_is_write_method(iter.index) || uapi_key_is_write_ex_method(iter.index)) { struct uverbs_api_write_method *method_elm = rcu_dereference_protected(*slot, true); if (method_elm->disabled) { kfree(method_elm); radix_tree_iter_delete(&uapi->radix, &iter, slot); } continue; } if (uapi_key_is_attr(iter.index)) { struct uverbs_api_attr *attr_elm = rcu_dereference_protected(*slot, true); const struct uverbs_api_object *tmp_obj; u32 obj_key; /* * If the method has a mandatory object handle * attribute which relies on an object which is not * present then the entire method is uncallable. */ if (!attr_elm->spec.mandatory) continue; obj_key = uapi_get_obj_id(&attr_elm->spec); if (obj_key == UVERBS_API_KEY_ERR) continue; tmp_obj = uapi_get_object(uapi, obj_key); if (IS_ERR(tmp_obj)) { if (PTR_ERR(tmp_obj) == -ENOMSG) continue; } else { if (!tmp_obj->disabled) continue; } starting_key = iter.index; uapi_remove_method( uapi, iter.index & (UVERBS_API_OBJ_KEY_MASK | UVERBS_API_METHOD_KEY_MASK)); goto again; } WARN_ON(false); } if (!scan_again) return; scan_again = false; starting_key = 0; goto again; } void uverbs_destroy_api(struct uverbs_api *uapi) { if (!uapi) return; uapi_remove_range(uapi, 0, U32_MAX); kfree(uapi->write_methods); kfree(uapi); } static const struct uapi_definition uverbs_core_api[] = { UAPI_DEF_CHAIN(uverbs_def_obj_async_fd), UAPI_DEF_CHAIN(uverbs_def_obj_counters), UAPI_DEF_CHAIN(uverbs_def_obj_cq), UAPI_DEF_CHAIN(uverbs_def_obj_device), UAPI_DEF_CHAIN(uverbs_def_obj_dm), UAPI_DEF_CHAIN(uverbs_def_obj_dmah), UAPI_DEF_CHAIN(uverbs_def_obj_flow_action), UAPI_DEF_CHAIN(uverbs_def_obj_intf), UAPI_DEF_CHAIN(uverbs_def_obj_mr), UAPI_DEF_CHAIN(uverbs_def_obj_qp), UAPI_DEF_CHAIN(uverbs_def_obj_srq), UAPI_DEF_CHAIN(uverbs_def_obj_wq), UAPI_DEF_CHAIN(uverbs_def_write_intf), {}, }; struct uverbs_api *uverbs_alloc_api(struct ib_device *ibdev) { struct uverbs_api *uapi; int rc; uapi = kzalloc(sizeof(*uapi), GFP_KERNEL); if (!uapi) return ERR_PTR(-ENOMEM); INIT_RADIX_TREE(&uapi->radix, GFP_KERNEL); uapi->driver_id = ibdev->ops.driver_id; rc = uapi_merge_def(uapi, ibdev, uverbs_core_api, false); if (rc) goto err; rc = uapi_merge_def(uapi, ibdev, ibdev->driver_def, true); if (rc) goto err; uapi_finalize_disable(uapi); rc = uapi_finalize(uapi); if (rc) goto err; return uapi; err: if (rc != -ENOMEM) dev_err(&ibdev->dev, "Setup of uverbs_api failed, kernel parsing tree description is not valid (%d)??\n", rc); uverbs_destroy_api(uapi); return ERR_PTR(rc); } /* * The pre version is done before destroying the HW objects, it only blocks * off method access. All methods that require the ib_dev or the module data * must test one of these assignments prior to continuing. */ void uverbs_disassociate_api_pre(struct ib_uverbs_device *uverbs_dev) { struct uverbs_api *uapi = uverbs_dev->uapi; struct radix_tree_iter iter; void __rcu **slot; rcu_assign_pointer(uverbs_dev->ib_dev, NULL); radix_tree_for_each_slot (slot, &uapi->radix, &iter, 0) { if (uapi_key_is_ioctl_method(iter.index)) { struct uverbs_api_ioctl_method *method_elm = rcu_dereference_protected(*slot, true); if (method_elm->driver_method) rcu_assign_pointer(method_elm->handler, NULL); } } synchronize_srcu(&uverbs_dev->disassociate_srcu); } /* * Called when a driver disassociates from the ib_uverbs_device. The * assumption is that the driver module will unload after. Replace everything * related to the driver with NULL as a safety measure. */ void uverbs_disassociate_api(struct uverbs_api *uapi) { struct radix_tree_iter iter; void __rcu **slot; radix_tree_for_each_slot (slot, &uapi->radix, &iter, 0) { if (uapi_key_is_object(iter.index)) { struct uverbs_api_object *object_elm = rcu_dereference_protected(*slot, true); /* * Some type_attrs are in the driver module. We don't * bother to keep track of which since there should be * no use of this after disassociate. */ object_elm->type_attrs = NULL; } else if (uapi_key_is_attr(iter.index)) { struct uverbs_api_attr *elm = rcu_dereference_protected(*slot, true); if (elm->spec.type == UVERBS_ATTR_TYPE_ENUM_IN) elm->spec.u2.enum_def.ids = NULL; } } }
1 1 3 3 1 2 3 1 1 14 14 2 4 1 1 4 3 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> * Martin Josefsson <gandalf@wlug.westbo.se> */ /* Kernel module implementing an IP set type: the bitmap:ip,mac type */ #include <linux/module.h> #include <linux/ip.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/if_ether.h> #include <linux/netlink.h> #include <linux/jiffies.h> #include <linux/timer.h> #include <net/netlink.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_bitmap.h> #define IPSET_TYPE_REV_MIN 0 /* 1 Counter support added */ /* 2 Comment support added */ #define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("bitmap:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_bitmap:ip,mac"); #define MTYPE bitmap_ipmac #define HOST_MASK 32 #define IP_SET_BITMAP_STORED_TIMEOUT enum { MAC_UNSET, /* element is set, without MAC */ MAC_FILLED, /* element is set with MAC */ }; /* Type structure */ struct bitmap_ipmac { unsigned long *members; /* the set members */ u32 first_ip; /* host byte order, included in range */ u32 last_ip; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ size_t memsize; /* members size */ struct timer_list gc; /* garbage collector */ struct ip_set *set; /* attached to this ip_set */ unsigned char extensions[] /* MAC + data extensions */ __aligned(__alignof__(u64)); }; /* ADT structure for generic function args */ struct bitmap_ipmac_adt_elem { unsigned char ether[ETH_ALEN] __aligned(2); u16 id; u16 add_mac; }; struct bitmap_ipmac_elem { unsigned char ether[ETH_ALEN]; unsigned char filled; } __aligned(__alignof__(u64)); static u32 ip_to_id(const struct bitmap_ipmac *m, u32 ip) { return ip - m->first_ip; } #define get_elem(extensions, id, dsize) \ (struct bitmap_ipmac_elem *)(extensions + (id) * (dsize)) #define get_const_elem(extensions, id, dsize) \ (const struct bitmap_ipmac_elem *)(extensions + (id) * (dsize)) /* Common functions */ static int bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e, const struct bitmap_ipmac *map, size_t dsize) { const struct bitmap_ipmac_elem *elem; if (!test_bit(e->id, map->members)) return 0; elem = get_const_elem(map->extensions, e->id, dsize); if (e->add_mac && elem->filled == MAC_FILLED) return ether_addr_equal(e->ether, elem->ether); /* Trigger kernel to fill out the ethernet address */ return -EAGAIN; } static int bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize) { const struct bitmap_ipmac_elem *elem; if (!test_bit(id, map->members)) return 0; elem = get_const_elem(map->extensions, id, dsize); /* Timer not started for the incomplete elements */ return elem->filled == MAC_FILLED; } static int bitmap_ipmac_is_filled(const struct bitmap_ipmac_elem *elem) { return elem->filled == MAC_FILLED; } static int bitmap_ipmac_add_timeout(unsigned long *timeout, const struct bitmap_ipmac_adt_elem *e, const struct ip_set_ext *ext, struct ip_set *set, struct bitmap_ipmac *map, int mode) { u32 t = ext->timeout; if (mode == IPSET_ADD_START_STORED_TIMEOUT) { if (t == set->timeout) /* Timeout was not specified, get stored one */ t = *timeout; ip_set_timeout_set(timeout, t); } else { /* If MAC is unset yet, we store plain timeout value * because the timer is not activated yet * and we can reuse it later when MAC is filled out, * possibly by the kernel */ if (e->add_mac) ip_set_timeout_set(timeout, t); else *timeout = t; } return 0; } static int bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, struct bitmap_ipmac *map, u32 flags, size_t dsize) { struct bitmap_ipmac_elem *elem; elem = get_elem(map->extensions, e->id, dsize); if (test_bit(e->id, map->members)) { if (elem->filled == MAC_FILLED) { if (e->add_mac && (flags & IPSET_FLAG_EXIST) && !ether_addr_equal(e->ether, elem->ether)) { /* memcpy isn't atomic */ clear_bit(e->id, map->members); smp_mb__after_atomic(); ether_addr_copy(elem->ether, e->ether); } return IPSET_ADD_FAILED; } else if (!e->add_mac) /* Already added without ethernet address */ return IPSET_ADD_FAILED; /* Fill the MAC address and trigger the timer activation */ clear_bit(e->id, map->members); smp_mb__after_atomic(); ether_addr_copy(elem->ether, e->ether); elem->filled = MAC_FILLED; return IPSET_ADD_START_STORED_TIMEOUT; } else if (e->add_mac) { /* We can store MAC too */ ether_addr_copy(elem->ether, e->ether); elem->filled = MAC_FILLED; return 0; } elem->filled = MAC_UNSET; /* MAC is not stored yet, don't start timer */ return IPSET_ADD_STORE_PLAIN_TIMEOUT; } static int bitmap_ipmac_do_del(const struct bitmap_ipmac_adt_elem *e, struct bitmap_ipmac *map) { return !test_and_clear_bit(e->id, map->members); } static int bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map, u32 id, size_t dsize) { const struct bitmap_ipmac_elem *elem = get_const_elem(map->extensions, id, dsize); return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip + id)) || (elem->filled == MAC_FILLED && nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, elem->ether)); } static int bitmap_ipmac_do_head(struct sk_buff *skb, const struct bitmap_ipmac *map) { return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) || nla_put_ipaddr4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)); } static int bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { struct bitmap_ipmac *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct bitmap_ipmac_adt_elem e = { .id = 0, .add_mac = 1 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); u32 ip; ip = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC)); if (ip < map->first_ip || ip > map->last_ip) return -IPSET_ERR_BITMAP_RANGE; /* Backward compatibility: we don't check the second flag */ if (skb_mac_header(skb) < skb->head || (skb_mac_header(skb) + ETH_HLEN) > skb->data) return -EINVAL; e.id = ip_to_id(map, ip); if (opt->flags & IPSET_DIM_TWO_SRC) ether_addr_copy(e.ether, eth_hdr(skb)->h_source); else ether_addr_copy(e.ether, eth_hdr(skb)->h_dest); if (is_zero_ether_addr(e.ether)) return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct bitmap_ipmac *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct bitmap_ipmac_adt_elem e = { .id = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0; int ret = 0; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP])) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (ip < map->first_ip || ip > map->last_ip) return -IPSET_ERR_BITMAP_RANGE; e.id = ip_to_id(map, ip); if (tb[IPSET_ATTR_ETHER]) { if (nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN) return -IPSET_ERR_PROTOCOL; memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN); e.add_mac = 1; } ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } static bool bitmap_ipmac_same_set(const struct ip_set *a, const struct ip_set *b) { const struct bitmap_ipmac *x = a->data; const struct bitmap_ipmac *y = b->data; return x->first_ip == y->first_ip && x->last_ip == y->last_ip && a->timeout == b->timeout && a->extensions == b->extensions; } /* Plain variant */ #include "ip_set_bitmap_gen.h" /* Create bitmap:ip,mac type of sets */ static bool init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map, u32 first_ip, u32 last_ip, u32 elements) { map->members = bitmap_zalloc(elements, GFP_KERNEL | __GFP_NOWARN); if (!map->members) return false; map->first_ip = first_ip; map->last_ip = last_ip; map->elements = elements; set->timeout = IPSET_NO_TIMEOUT; map->set = set; set->data = map; set->family = NFPROTO_IPV4; return true; } static int bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[], u32 flags) { u32 first_ip = 0, last_ip = 0; u64 elements; struct bitmap_ipmac *map; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip); if (ret) return ret; if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &last_ip); if (ret) return ret; if (first_ip > last_ip) swap(first_ip, last_ip); } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (cidr >= HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(first_ip, last_ip, cidr); } else { return -IPSET_ERR_PROTOCOL; } elements = (u64)last_ip - first_ip + 1; if (elements > IPSET_BITMAP_MAX_RANGE + 1) return -IPSET_ERR_BITMAP_RANGE_SIZE; set->dsize = ip_set_elem_len(set, tb, sizeof(struct bitmap_ipmac_elem), __alignof__(struct bitmap_ipmac_elem)); map = ip_set_alloc(sizeof(*map) + elements * set->dsize); if (!map) return -ENOMEM; map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long); set->variant = &bitmap_ipmac; if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) { ip_set_free(map); return -ENOMEM; } if (tb[IPSET_ATTR_TIMEOUT]) { set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); bitmap_ipmac_gc_init(set, bitmap_ipmac_gc); } return 0; } static struct ip_set_type bitmap_ipmac_type = { .name = "bitmap:ip,mac", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_MAC, .dimension = IPSET_DIM_TWO, .family = NFPROTO_IPV4, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create = bitmap_ipmac_create, .create_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_ETHER] = { .type = NLA_BINARY, .len = ETH_ALEN }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init bitmap_ipmac_init(void) { return ip_set_type_register(&bitmap_ipmac_type); } static void __exit bitmap_ipmac_fini(void) { rcu_barrier(); ip_set_type_unregister(&bitmap_ipmac_type); } module_init(bitmap_ipmac_init); module_exit(bitmap_ipmac_fini);
16 22 68 5 2 9 7 45 44 45 45 45 9 10 11 45 5 43 44 23 23 11 1 1 7 7 7 7 7 7 42 42 7 7 23 12 35 35 12 35 23 12 35 35 35 35 35 1 35 35 35 35 23 12 23 24 13 11 11 24 6 18 7 2 5 7 7 7 7 7 7 7 7 7 7 7 50 58 11 11 5 6 11 11 7 7 7 5 2 7 7 7 7 2 2 2 7 6 5 4 1 3 3 1 14 14 14 12 2 7 7 52 3 55 37 3 17 55 54 3 3 3 52 52 3 52 3 3 3 3 3 1 1 30 1 2 3 2 3 2 2 1 3 2 1 1 3 3 2 1 1 1 1 1 3 1 2 3 7 3 13 13 13 13 12 1 34 35 34 3 3 3 3 3 3 1 3 1 2 3 3 8 9 9 8 9 1 1 1 3 1 2 2 1 1 1 1 1 1 1 1 1 1 3 9 3 9 3 9 9 8 8 1 3 3 3 3 3 3 3 2 1 1 1 3 2 1 3 10 10 10 10 9 10 10 9 10 9 10 9 10 10 9 10 1 25 25 10 25 10 25 25 2 2 37 37 36 35 8 8 3 3 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 // SPDX-License-Identifier: GPL-2.0 /* * USB hub driver. * * (C) Copyright 1999 Linus Torvalds * (C) Copyright 1999 Johannes Erdfelt * (C) Copyright 1999 Gregory P. Smith * (C) Copyright 2001 Brad Hards (bhards@bigpond.net.au) * * Released under the GPLv2 only. */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/completion.h> #include <linux/sched/mm.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/string_choices.h> #include <linux/kcov.h> #include <linux/ioctl.h> #include <linux/usb.h> #include <linux/usbdevice_fs.h> #include <linux/usb/hcd.h> #include <linux/usb/onboard_dev.h> #include <linux/usb/otg.h> #include <linux/usb/quirks.h> #include <linux/workqueue.h> #include <linux/minmax.h> #include <linux/mutex.h> #include <linux/random.h> #include <linux/pm_qos.h> #include <linux/kobject.h> #include <linux/bitfield.h> #include <linux/uaccess.h> #include <asm/byteorder.h> #include "hub.h" #include "phy.h" #include "otg_productlist.h" #include "trace.h" #define USB_VENDOR_GENESYS_LOGIC 0x05e3 #define USB_VENDOR_SMSC 0x0424 #define USB_PRODUCT_USB5534B 0x5534 #define USB_VENDOR_CYPRESS 0x04b4 #define USB_PRODUCT_CY7C65632 0x6570 #define USB_VENDOR_TEXAS_INSTRUMENTS 0x0451 #define USB_PRODUCT_TUSB8041_USB3 0x8140 #define USB_PRODUCT_TUSB8041_USB2 0x8142 #define USB_VENDOR_MICROCHIP 0x0424 #define USB_PRODUCT_USB4913 0x4913 #define USB_PRODUCT_USB4914 0x4914 #define USB_PRODUCT_USB4915 0x4915 #define HUB_QUIRK_CHECK_PORT_AUTOSUSPEND BIT(0) #define HUB_QUIRK_DISABLE_AUTOSUSPEND BIT(1) #define HUB_QUIRK_REDUCE_FRAME_INTR_BINTERVAL BIT(2) #define USB_TP_TRANSMISSION_DELAY 40 /* ns */ #define USB_TP_TRANSMISSION_DELAY_MAX 65535 /* ns */ #define USB_PING_RESPONSE_TIME 400 /* ns */ #define USB_REDUCE_FRAME_INTR_BINTERVAL 9 /* * The SET_ADDRESS request timeout will be 500 ms when * USB_QUIRK_SHORT_SET_ADDRESS_REQ_TIMEOUT quirk flag is set. */ #define USB_SHORT_SET_ADDRESS_REQ_TIMEOUT 500 /* ms */ /* * Give SS hubs 200ms time after wake to train downstream links before * assuming no port activity and allowing hub to runtime suspend back. */ #define USB_SS_PORT_U0_WAKE_TIME 200 /* ms */ /* Protect struct usb_device->state and ->children members * Note: Both are also protected by ->dev.sem, except that ->state can * change to USB_STATE_NOTATTACHED even when the semaphore isn't held. */ static DEFINE_SPINLOCK(device_state_lock); /* workqueue to process hub events */ static struct workqueue_struct *hub_wq; static void hub_event(struct work_struct *work); /* synchronize hub-port add/remove and peering operations */ DEFINE_MUTEX(usb_port_peer_mutex); /* cycle leds on hubs that aren't blinking for attention */ static bool blinkenlights; module_param(blinkenlights, bool, S_IRUGO); MODULE_PARM_DESC(blinkenlights, "true to cycle leds on hubs"); /* * Device SATA8000 FW1.0 from DATAST0R Technology Corp requires about * 10 seconds to send reply for the initial 64-byte descriptor request. */ /* define initial 64-byte descriptor request timeout in milliseconds */ static int initial_descriptor_timeout = USB_CTRL_GET_TIMEOUT; module_param(initial_descriptor_timeout, int, S_IRUGO|S_IWUSR); MODULE_PARM_DESC(initial_descriptor_timeout, "initial 64-byte descriptor request timeout in milliseconds " "(default 5000 - 5.0 seconds)"); /* * As of 2.6.10 we introduce a new USB device initialization scheme which * closely resembles the way Windows works. Hopefully it will be compatible * with a wider range of devices than the old scheme. However some previously * working devices may start giving rise to "device not accepting address" * errors; if that happens the user can try the old scheme by adjusting the * following module parameters. * * For maximum flexibility there are two boolean parameters to control the * hub driver's behavior. On the first initialization attempt, if the * "old_scheme_first" parameter is set then the old scheme will be used, * otherwise the new scheme is used. If that fails and "use_both_schemes" * is set, then the driver will make another attempt, using the other scheme. */ static bool old_scheme_first; module_param(old_scheme_first, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(old_scheme_first, "start with the old device initialization scheme"); static bool use_both_schemes = true; module_param(use_both_schemes, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(use_both_schemes, "try the other device initialization scheme if the " "first one fails"); /* Mutual exclusion for EHCI CF initialization. This interferes with * port reset on some companion controllers. */ DECLARE_RWSEM(ehci_cf_port_reset_rwsem); EXPORT_SYMBOL_GPL(ehci_cf_port_reset_rwsem); #define HUB_DEBOUNCE_TIMEOUT 2000 #define HUB_DEBOUNCE_STEP 25 #define HUB_DEBOUNCE_STABLE 100 static int usb_reset_and_verify_device(struct usb_device *udev); static int hub_port_disable(struct usb_hub *hub, int port1, int set_state); static bool hub_port_warm_reset_required(struct usb_hub *hub, int port1, u16 portstatus); static inline char *portspeed(struct usb_hub *hub, int portstatus) { if (hub_is_superspeedplus(hub->hdev)) return "10.0 Gb/s"; if (hub_is_superspeed(hub->hdev)) return "5.0 Gb/s"; if (portstatus & USB_PORT_STAT_HIGH_SPEED) return "480 Mb/s"; else if (portstatus & USB_PORT_STAT_LOW_SPEED) return "1.5 Mb/s"; else return "12 Mb/s"; } /* Note that hdev or one of its children must be locked! */ struct usb_hub *usb_hub_to_struct_hub(struct usb_device *hdev) { if (!hdev || !hdev->actconfig || !hdev->maxchild) return NULL; return usb_get_intfdata(hdev->actconfig->interface[0]); } int usb_device_supports_lpm(struct usb_device *udev) { /* Some devices have trouble with LPM */ if (udev->quirks & USB_QUIRK_NO_LPM) return 0; /* Skip if the device BOS descriptor couldn't be read */ if (!udev->bos) return 0; /* USB 2.1 (and greater) devices indicate LPM support through * their USB 2.0 Extended Capabilities BOS descriptor. */ if (udev->speed == USB_SPEED_HIGH || udev->speed == USB_SPEED_FULL) { if (udev->bos->ext_cap && (USB_LPM_SUPPORT & le32_to_cpu(udev->bos->ext_cap->bmAttributes))) return 1; return 0; } /* * According to the USB 3.0 spec, all USB 3.0 devices must support LPM. * However, there are some that don't, and they set the U1/U2 exit * latencies to zero. */ if (!udev->bos->ss_cap) { dev_info(&udev->dev, "No LPM exit latency info found, disabling LPM.\n"); return 0; } if (udev->bos->ss_cap->bU1devExitLat == 0 && udev->bos->ss_cap->bU2DevExitLat == 0) { if (udev->parent) dev_info(&udev->dev, "LPM exit latency is zeroed, disabling LPM.\n"); else dev_info(&udev->dev, "We don't know the algorithms for LPM for this host, disabling LPM.\n"); return 0; } if (!udev->parent || udev->parent->lpm_capable) return 1; return 0; } /* * Set the Maximum Exit Latency (MEL) for the host to wakup up the path from * U1/U2, send a PING to the device and receive a PING_RESPONSE. * See USB 3.1 section C.1.5.2 */ static void usb_set_lpm_mel(struct usb_device *udev, struct usb3_lpm_parameters *udev_lpm_params, unsigned int udev_exit_latency, struct usb_hub *hub, struct usb3_lpm_parameters *hub_lpm_params, unsigned int hub_exit_latency) { unsigned int total_mel; /* * tMEL1. time to transition path from host to device into U0. * MEL for parent already contains the delay up to parent, so only add * the exit latency for the last link (pick the slower exit latency), * and the hub header decode latency. See USB 3.1 section C 2.2.1 * Store MEL in nanoseconds */ total_mel = hub_lpm_params->mel + max(udev_exit_latency, hub_exit_latency) * 1000 + hub->descriptor->u.ss.bHubHdrDecLat * 100; /* * tMEL2. Time to submit PING packet. Sum of tTPTransmissionDelay for * each link + wHubDelay for each hub. Add only for last link. * tMEL4, the time for PING_RESPONSE to traverse upstream is similar. * Multiply by 2 to include it as well. */ total_mel += (__le16_to_cpu(hub->descriptor->u.ss.wHubDelay) + USB_TP_TRANSMISSION_DELAY) * 2; /* * tMEL3, tPingResponse. Time taken by device to generate PING_RESPONSE * after receiving PING. Also add 2100ns as stated in USB 3.1 C 1.5.2.4 * to cover the delay if the PING_RESPONSE is queued behind a Max Packet * Size DP. * Note these delays should be added only once for the entire path, so * add them to the MEL of the device connected to the roothub. */ if (!hub->hdev->parent) total_mel += USB_PING_RESPONSE_TIME + 2100; udev_lpm_params->mel = total_mel; } /* * Set the maximum Device to Host Exit Latency (PEL) for the device to initiate * a transition from either U1 or U2. */ static void usb_set_lpm_pel(struct usb_device *udev, struct usb3_lpm_parameters *udev_lpm_params, unsigned int udev_exit_latency, struct usb_hub *hub, struct usb3_lpm_parameters *hub_lpm_params, unsigned int hub_exit_latency, unsigned int port_to_port_exit_latency) { unsigned int first_link_pel; unsigned int hub_pel; /* * First, the device sends an LFPS to transition the link between the * device and the parent hub into U0. The exit latency is the bigger of * the device exit latency or the hub exit latency. */ first_link_pel = max(udev_exit_latency, hub_exit_latency) * 1000; /* * When the hub starts to receive the LFPS, there is a slight delay for * it to figure out that one of the ports is sending an LFPS. Then it * will forward the LFPS to its upstream link. The exit latency is the * delay, plus the PEL that we calculated for this hub. */ hub_pel = port_to_port_exit_latency * 1000 + hub_lpm_params->pel; /* * According to figure C-7 in the USB 3.0 spec, the PEL for this device * is the greater of the two exit latencies. */ udev_lpm_params->pel = max(first_link_pel, hub_pel); } /* * Set the System Exit Latency (SEL) to indicate the total worst-case time from * when a device initiates a transition to U0, until when it will receive the * first packet from the host controller. * * Section C.1.5.1 describes the four components to this: * - t1: device PEL * - t2: time for the ERDY to make it from the device to the host. * - t3: a host-specific delay to process the ERDY. * - t4: time for the packet to make it from the host to the device. * * t3 is specific to both the xHCI host and the platform the host is integrated * into. The Intel HW folks have said it's negligible, FIXME if a different * vendor says otherwise. */ static void usb_set_lpm_sel(struct usb_device *udev, struct usb3_lpm_parameters *udev_lpm_params) { struct usb_device *parent; unsigned int num_hubs; unsigned int total_sel; /* t1 = device PEL */ total_sel = udev_lpm_params->pel; /* How many external hubs are in between the device & the root port. */ for (parent = udev->parent, num_hubs = 0; parent->parent; parent = parent->parent) num_hubs++; /* t2 = 2.1us + 250ns * (num_hubs - 1) */ if (num_hubs > 0) total_sel += 2100 + 250 * (num_hubs - 1); /* t4 = 250ns * num_hubs */ total_sel += 250 * num_hubs; udev_lpm_params->sel = total_sel; } static void usb_set_lpm_parameters(struct usb_device *udev) { struct usb_hub *hub; unsigned int port_to_port_delay; unsigned int udev_u1_del; unsigned int udev_u2_del; unsigned int hub_u1_del; unsigned int hub_u2_del; if (!udev->lpm_capable || udev->speed < USB_SPEED_SUPER) return; /* Skip if the device BOS descriptor couldn't be read */ if (!udev->bos) return; hub = usb_hub_to_struct_hub(udev->parent); /* It doesn't take time to transition the roothub into U0, since it * doesn't have an upstream link. */ if (!hub) return; udev_u1_del = udev->bos->ss_cap->bU1devExitLat; udev_u2_del = le16_to_cpu(udev->bos->ss_cap->bU2DevExitLat); hub_u1_del = udev->parent->bos->ss_cap->bU1devExitLat; hub_u2_del = le16_to_cpu(udev->parent->bos->ss_cap->bU2DevExitLat); usb_set_lpm_mel(udev, &udev->u1_params, udev_u1_del, hub, &udev->parent->u1_params, hub_u1_del); usb_set_lpm_mel(udev, &udev->u2_params, udev_u2_del, hub, &udev->parent->u2_params, hub_u2_del); /* * Appendix C, section C.2.2.2, says that there is a slight delay from * when the parent hub notices the downstream port is trying to * transition to U0 to when the hub initiates a U0 transition on its * upstream port. The section says the delays are tPort2PortU1EL and * tPort2PortU2EL, but it doesn't define what they are. * * The hub chapter, sections 10.4.2.4 and 10.4.2.5 seem to be talking * about the same delays. Use the maximum delay calculations from those * sections. For U1, it's tHubPort2PortExitLat, which is 1us max. For * U2, it's tHubPort2PortExitLat + U2DevExitLat - U1DevExitLat. I * assume the device exit latencies they are talking about are the hub * exit latencies. * * What do we do if the U2 exit latency is less than the U1 exit * latency? It's possible, although not likely... */ port_to_port_delay = 1; usb_set_lpm_pel(udev, &udev->u1_params, udev_u1_del, hub, &udev->parent->u1_params, hub_u1_del, port_to_port_delay); if (hub_u2_del > hub_u1_del) port_to_port_delay = 1 + hub_u2_del - hub_u1_del; else port_to_port_delay = 1 + hub_u1_del; usb_set_lpm_pel(udev, &udev->u2_params, udev_u2_del, hub, &udev->parent->u2_params, hub_u2_del, port_to_port_delay); /* Now that we've got PEL, calculate SEL. */ usb_set_lpm_sel(udev, &udev->u1_params); usb_set_lpm_sel(udev, &udev->u2_params); } /* USB 2.0 spec Section 11.24.4.5 */ static int get_hub_descriptor(struct usb_device *hdev, struct usb_hub_descriptor *desc) { int i, ret, size; unsigned dtype; if (hub_is_superspeed(hdev)) { dtype = USB_DT_SS_HUB; size = USB_DT_SS_HUB_SIZE; } else { dtype = USB_DT_HUB; size = sizeof(struct usb_hub_descriptor); } for (i = 0; i < 3; i++) { ret = usb_control_msg(hdev, usb_rcvctrlpipe(hdev, 0), USB_REQ_GET_DESCRIPTOR, USB_DIR_IN | USB_RT_HUB, dtype << 8, 0, desc, size, USB_CTRL_GET_TIMEOUT); if (hub_is_superspeed(hdev)) { if (ret == size) return ret; } else if (ret >= USB_DT_HUB_NONVAR_SIZE + 2) { /* Make sure we have the DeviceRemovable field. */ size = USB_DT_HUB_NONVAR_SIZE + desc->bNbrPorts / 8 + 1; if (ret < size) return -EMSGSIZE; return ret; } } return -EINVAL; } /* * USB 2.0 spec Section 11.24.2.1 */ static int clear_hub_feature(struct usb_device *hdev, int feature) { return usb_control_msg(hdev, usb_sndctrlpipe(hdev, 0), USB_REQ_CLEAR_FEATURE, USB_RT_HUB, feature, 0, NULL, 0, 1000); } /* * USB 2.0 spec Section 11.24.2.2 */ int usb_clear_port_feature(struct usb_device *hdev, int port1, int feature) { return usb_control_msg(hdev, usb_sndctrlpipe(hdev, 0), USB_REQ_CLEAR_FEATURE, USB_RT_PORT, feature, port1, NULL, 0, 1000); } /* * USB 2.0 spec Section 11.24.2.13 */ static int set_port_feature(struct usb_device *hdev, int port1, int feature) { return usb_control_msg(hdev, usb_sndctrlpipe(hdev, 0), USB_REQ_SET_FEATURE, USB_RT_PORT, feature, port1, NULL, 0, 1000); } static char *to_led_name(int selector) { switch (selector) { case HUB_LED_AMBER: return "amber"; case HUB_LED_GREEN: return "green"; case HUB_LED_OFF: return "off"; case HUB_LED_AUTO: return "auto"; default: return "??"; } } /* * USB 2.0 spec Section 11.24.2.7.1.10 and table 11-7 * for info about using port indicators */ static void set_port_led(struct usb_hub *hub, int port1, int selector) { struct usb_port *port_dev = hub->ports[port1 - 1]; int status; status = set_port_feature(hub->hdev, (selector << 8) | port1, USB_PORT_FEAT_INDICATOR); dev_dbg(&port_dev->dev, "indicator %s status %d\n", to_led_name(selector), status); } #define LED_CYCLE_PERIOD ((2*HZ)/3) static void led_work(struct work_struct *work) { struct usb_hub *hub = container_of(work, struct usb_hub, leds.work); struct usb_device *hdev = hub->hdev; unsigned i; unsigned changed = 0; int cursor = -1; if (hdev->state != USB_STATE_CONFIGURED || hub->quiescing) return; for (i = 0; i < hdev->maxchild; i++) { unsigned selector, mode; /* 30%-50% duty cycle */ switch (hub->indicator[i]) { /* cycle marker */ case INDICATOR_CYCLE: cursor = i; selector = HUB_LED_AUTO; mode = INDICATOR_AUTO; break; /* blinking green = sw attention */ case INDICATOR_GREEN_BLINK: selector = HUB_LED_GREEN; mode = INDICATOR_GREEN_BLINK_OFF; break; case INDICATOR_GREEN_BLINK_OFF: selector = HUB_LED_OFF; mode = INDICATOR_GREEN_BLINK; break; /* blinking amber = hw attention */ case INDICATOR_AMBER_BLINK: selector = HUB_LED_AMBER; mode = INDICATOR_AMBER_BLINK_OFF; break; case INDICATOR_AMBER_BLINK_OFF: selector = HUB_LED_OFF; mode = INDICATOR_AMBER_BLINK; break; /* blink green/amber = reserved */ case INDICATOR_ALT_BLINK: selector = HUB_LED_GREEN; mode = INDICATOR_ALT_BLINK_OFF; break; case INDICATOR_ALT_BLINK_OFF: selector = HUB_LED_AMBER; mode = INDICATOR_ALT_BLINK; break; default: continue; } if (selector != HUB_LED_AUTO) changed = 1; set_port_led(hub, i + 1, selector); hub->indicator[i] = mode; } if (!changed && blinkenlights) { cursor++; cursor %= hdev->maxchild; set_port_led(hub, cursor + 1, HUB_LED_GREEN); hub->indicator[cursor] = INDICATOR_CYCLE; changed++; } if (changed) queue_delayed_work(system_power_efficient_wq, &hub->leds, LED_CYCLE_PERIOD); } /* use a short timeout for hub/port status fetches */ #define USB_STS_TIMEOUT 1000 #define USB_STS_RETRIES 5 /* * USB 2.0 spec Section 11.24.2.6 */ static int get_hub_status(struct usb_device *hdev, struct usb_hub_status *data) { int i, status = -ETIMEDOUT; for (i = 0; i < USB_STS_RETRIES && (status == -ETIMEDOUT || status == -EPIPE); i++) { status = usb_control_msg(hdev, usb_rcvctrlpipe(hdev, 0), USB_REQ_GET_STATUS, USB_DIR_IN | USB_RT_HUB, 0, 0, data, sizeof(*data), USB_STS_TIMEOUT); } return status; } /* * USB 2.0 spec Section 11.24.2.7 * USB 3.1 takes into use the wValue and wLength fields, spec Section 10.16.2.6 */ static int get_port_status(struct usb_device *hdev, int port1, void *data, u16 value, u16 length) { int i, status = -ETIMEDOUT; for (i = 0; i < USB_STS_RETRIES && (status == -ETIMEDOUT || status == -EPIPE); i++) { status = usb_control_msg(hdev, usb_rcvctrlpipe(hdev, 0), USB_REQ_GET_STATUS, USB_DIR_IN | USB_RT_PORT, value, port1, data, length, USB_STS_TIMEOUT); } return status; } static int hub_ext_port_status(struct usb_hub *hub, int port1, int type, u16 *status, u16 *change, u32 *ext_status) { int ret; int len = 4; if (type != HUB_PORT_STATUS) len = 8; mutex_lock(&hub->status_mutex); ret = get_port_status(hub->hdev, port1, &hub->status->port, type, len); if (ret < len) { if (ret != -ENODEV) dev_err(hub->intfdev, "%s failed (err = %d)\n", __func__, ret); if (ret >= 0) ret = -EIO; } else { *status = le16_to_cpu(hub->status->port.wPortStatus); *change = le16_to_cpu(hub->status->port.wPortChange); if (type != HUB_PORT_STATUS && ext_status) *ext_status = le32_to_cpu( hub->status->port.dwExtPortStatus); ret = 0; } mutex_unlock(&hub->status_mutex); /* * There is no need to lock status_mutex here, because status_mutex * protects hub->status, and the phy driver only checks the port * status without changing the status. */ if (!ret) { struct usb_device *hdev = hub->hdev; /* * Only roothub will be notified of connection changes, * since the USB PHY only cares about changes at the next * level. */ if (is_root_hub(hdev)) { struct usb_hcd *hcd = bus_to_hcd(hdev->bus); bool connect; bool connect_change; connect_change = *change & USB_PORT_STAT_C_CONNECTION; connect = *status & USB_PORT_STAT_CONNECTION; if (connect_change && connect) usb_phy_roothub_notify_connect(hcd->phy_roothub, port1 - 1); else if (connect_change) usb_phy_roothub_notify_disconnect(hcd->phy_roothub, port1 - 1); } } return ret; } int usb_hub_port_status(struct usb_hub *hub, int port1, u16 *status, u16 *change) { return hub_ext_port_status(hub, port1, HUB_PORT_STATUS, status, change, NULL); } static void hub_resubmit_irq_urb(struct usb_hub *hub) { unsigned long flags; int status; spin_lock_irqsave(&hub->irq_urb_lock, flags); if (hub->quiescing) { spin_unlock_irqrestore(&hub->irq_urb_lock, flags); return; } status = usb_submit_urb(hub->urb, GFP_ATOMIC); if (status && status != -ENODEV && status != -EPERM && status != -ESHUTDOWN) { dev_err(hub->intfdev, "resubmit --> %d\n", status); mod_timer(&hub->irq_urb_retry, jiffies + HZ); } spin_unlock_irqrestore(&hub->irq_urb_lock, flags); } static void hub_retry_irq_urb(struct timer_list *t) { struct usb_hub *hub = timer_container_of(hub, t, irq_urb_retry); hub_resubmit_irq_urb(hub); } static void kick_hub_wq(struct usb_hub *hub) { struct usb_interface *intf; if (hub->disconnected || work_pending(&hub->events)) return; /* * Suppress autosuspend until the event is proceed. * * Be careful and make sure that the symmetric operation is * always called. We are here only when there is no pending * work for this hub. Therefore put the interface either when * the new work is called or when it is canceled. */ intf = to_usb_interface(hub->intfdev); usb_autopm_get_interface_no_resume(intf); hub_get(hub); if (queue_work(hub_wq, &hub->events)) return; /* the work has already been scheduled */ usb_autopm_put_interface_async(intf); hub_put(hub); } void usb_kick_hub_wq(struct usb_device *hdev) { struct usb_hub *hub = usb_hub_to_struct_hub(hdev); if (hub) kick_hub_wq(hub); } /* * Let the USB core know that a USB 3.0 device has sent a Function Wake Device * Notification, which indicates it had initiated remote wakeup. * * USB 3.0 hubs do not report the port link state change from U3 to U0 when the * device initiates resume, so the USB core will not receive notice of the * resume through the normal hub interrupt URB. */ void usb_wakeup_notification(struct usb_device *hdev, unsigned int portnum) { struct usb_hub *hub; struct usb_port *port_dev; if (!hdev) return; hub = usb_hub_to_struct_hub(hdev); if (hub) { port_dev = hub->ports[portnum - 1]; if (port_dev && port_dev->child) pm_wakeup_event(&port_dev->child->dev, 0); set_bit(portnum, hub->wakeup_bits); kick_hub_wq(hub); } } EXPORT_SYMBOL_GPL(usb_wakeup_notification); /* completion function, fires on port status changes and various faults */ static void hub_irq(struct urb *urb) { struct usb_hub *hub = urb->context; int status = urb->status; unsigned i; unsigned long bits; switch (status) { case -ENOENT: /* synchronous unlink */ case -ECONNRESET: /* async unlink */ case -ESHUTDOWN: /* hardware going away */ return; default: /* presumably an error */ /* Cause a hub reset after 10 consecutive errors */ dev_dbg(hub->intfdev, "transfer --> %d\n", status); if ((++hub->nerrors < 10) || hub->error) goto resubmit; hub->error = status; fallthrough; /* let hub_wq handle things */ case 0: /* we got data: port status changed */ bits = 0; for (i = 0; i < urb->actual_length; ++i) bits |= ((unsigned long) ((*hub->buffer)[i])) << (i*8); hub->event_bits[0] = bits; break; } hub->nerrors = 0; /* Something happened, let hub_wq figure it out */ kick_hub_wq(hub); resubmit: hub_resubmit_irq_urb(hub); } /* USB 2.0 spec Section 11.24.2.3 */ static inline int hub_clear_tt_buffer(struct usb_device *hdev, u16 devinfo, u16 tt) { /* Need to clear both directions for control ep */ if (((devinfo >> 11) & USB_ENDPOINT_XFERTYPE_MASK) == USB_ENDPOINT_XFER_CONTROL) { int status = usb_control_msg(hdev, usb_sndctrlpipe(hdev, 0), HUB_CLEAR_TT_BUFFER, USB_RT_PORT, devinfo ^ 0x8000, tt, NULL, 0, 1000); if (status) return status; } return usb_control_msg(hdev, usb_sndctrlpipe(hdev, 0), HUB_CLEAR_TT_BUFFER, USB_RT_PORT, devinfo, tt, NULL, 0, 1000); } /* * enumeration blocks hub_wq for a long time. we use keventd instead, since * long blocking there is the exception, not the rule. accordingly, HCDs * talking to TTs must queue control transfers (not just bulk and iso), so * both can talk to the same hub concurrently. */ static void hub_tt_work(struct work_struct *work) { struct usb_hub *hub = container_of(work, struct usb_hub, tt.clear_work); unsigned long flags; spin_lock_irqsave(&hub->tt.lock, flags); while (!list_empty(&hub->tt.clear_list)) { struct list_head *next; struct usb_tt_clear *clear; struct usb_device *hdev = hub->hdev; const struct hc_driver *drv; int status; next = hub->tt.clear_list.next; clear = list_entry(next, struct usb_tt_clear, clear_list); list_del(&clear->clear_list); /* drop lock so HCD can concurrently report other TT errors */ spin_unlock_irqrestore(&hub->tt.lock, flags); status = hub_clear_tt_buffer(hdev, clear->devinfo, clear->tt); if (status && status != -ENODEV) dev_err(&hdev->dev, "clear tt %d (%04x) error %d\n", clear->tt, clear->devinfo, status); /* Tell the HCD, even if the operation failed */ drv = clear->hcd->driver; if (drv->clear_tt_buffer_complete) (drv->clear_tt_buffer_complete)(clear->hcd, clear->ep); kfree(clear); spin_lock_irqsave(&hub->tt.lock, flags); } spin_unlock_irqrestore(&hub->tt.lock, flags); } /** * usb_hub_set_port_power - control hub port's power state * @hdev: USB device belonging to the usb hub * @hub: target hub * @port1: port index * @set: expected status * * call this function to control port's power via setting or * clearing the port's PORT_POWER feature. * * Return: 0 if successful. A negative error code otherwise. */ int usb_hub_set_port_power(struct usb_device *hdev, struct usb_hub *hub, int port1, bool set) { int ret; if (set) ret = set_port_feature(hdev, port1, USB_PORT_FEAT_POWER); else ret = usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_POWER); if (ret) return ret; if (set) set_bit(port1, hub->power_bits); else clear_bit(port1, hub->power_bits); return 0; } /** * usb_hub_clear_tt_buffer - clear control/bulk TT state in high speed hub * @urb: an URB associated with the failed or incomplete split transaction * * High speed HCDs use this to tell the hub driver that some split control or * bulk transaction failed in a way that requires clearing internal state of * a transaction translator. This is normally detected (and reported) from * interrupt context. * * It may not be possible for that hub to handle additional full (or low) * speed transactions until that state is fully cleared out. * * Return: 0 if successful. A negative error code otherwise. */ int usb_hub_clear_tt_buffer(struct urb *urb) { struct usb_device *udev = urb->dev; int pipe = urb->pipe; struct usb_tt *tt = udev->tt; unsigned long flags; struct usb_tt_clear *clear; /* we've got to cope with an arbitrary number of pending TT clears, * since each TT has "at least two" buffers that can need it (and * there can be many TTs per hub). even if they're uncommon. */ clear = kmalloc(sizeof *clear, GFP_ATOMIC); if (clear == NULL) { dev_err(&udev->dev, "can't save CLEAR_TT_BUFFER state\n"); /* FIXME recover somehow ... RESET_TT? */ return -ENOMEM; } /* info that CLEAR_TT_BUFFER needs */ clear->tt = tt->multi ? udev->ttport : 1; clear->devinfo = usb_pipeendpoint (pipe); clear->devinfo |= ((u16)udev->devaddr) << 4; clear->devinfo |= usb_pipecontrol(pipe) ? (USB_ENDPOINT_XFER_CONTROL << 11) : (USB_ENDPOINT_XFER_BULK << 11); if (usb_pipein(pipe)) clear->devinfo |= 1 << 15; /* info for completion callback */ clear->hcd = bus_to_hcd(udev->bus); clear->ep = urb->ep; /* tell keventd to clear state for this TT */ spin_lock_irqsave(&tt->lock, flags); list_add_tail(&clear->clear_list, &tt->clear_list); schedule_work(&tt->clear_work); spin_unlock_irqrestore(&tt->lock, flags); return 0; } EXPORT_SYMBOL_GPL(usb_hub_clear_tt_buffer); static void hub_power_on(struct usb_hub *hub, bool do_delay) { int port1; /* Enable power on each port. Some hubs have reserved values * of LPSM (> 2) in their descriptors, even though they are * USB 2.0 hubs. Some hubs do not implement port-power switching * but only emulate it. In all cases, the ports won't work * unless we send these messages to the hub. */ if (hub_is_port_power_switchable(hub)) dev_dbg(hub->intfdev, "enabling power on all ports\n"); else dev_dbg(hub->intfdev, "trying to enable port power on " "non-switchable hub\n"); for (port1 = 1; port1 <= hub->hdev->maxchild; port1++) if (test_bit(port1, hub->power_bits)) set_port_feature(hub->hdev, port1, USB_PORT_FEAT_POWER); else usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_POWER); if (do_delay) msleep(hub_power_on_good_delay(hub)); } static int hub_hub_status(struct usb_hub *hub, u16 *status, u16 *change) { int ret; mutex_lock(&hub->status_mutex); ret = get_hub_status(hub->hdev, &hub->status->hub); if (ret < 0) { if (ret != -ENODEV) dev_err(hub->intfdev, "%s failed (err = %d)\n", __func__, ret); } else { *status = le16_to_cpu(hub->status->hub.wHubStatus); *change = le16_to_cpu(hub->status->hub.wHubChange); ret = 0; } mutex_unlock(&hub->status_mutex); return ret; } static int hub_set_port_link_state(struct usb_hub *hub, int port1, unsigned int link_status) { return set_port_feature(hub->hdev, port1 | (link_status << 3), USB_PORT_FEAT_LINK_STATE); } /* * Disable a port and mark a logical connect-change event, so that some * time later hub_wq will disconnect() any existing usb_device on the port * and will re-enumerate if there actually is a device attached. */ static void hub_port_logical_disconnect(struct usb_hub *hub, int port1) { dev_dbg(&hub->ports[port1 - 1]->dev, "logical disconnect\n"); hub_port_disable(hub, port1, 1); /* FIXME let caller ask to power down the port: * - some devices won't enumerate without a VBUS power cycle * - SRP saves power that way * - ... new call, TBD ... * That's easy if this hub can switch power per-port, and * hub_wq reactivates the port later (timer, SRP, etc). * Powerdown must be optional, because of reset/DFU. */ set_bit(port1, hub->change_bits); kick_hub_wq(hub); } /** * usb_remove_device - disable a device's port on its parent hub * @udev: device to be disabled and removed * Context: @udev locked, must be able to sleep. * * After @udev's port has been disabled, hub_wq is notified and it will * see that the device has been disconnected. When the device is * physically unplugged and something is plugged in, the events will * be received and processed normally. * * Return: 0 if successful. A negative error code otherwise. */ int usb_remove_device(struct usb_device *udev) { struct usb_hub *hub; struct usb_interface *intf; int ret; if (!udev->parent) /* Can't remove a root hub */ return -EINVAL; hub = usb_hub_to_struct_hub(udev->parent); intf = to_usb_interface(hub->intfdev); ret = usb_autopm_get_interface(intf); if (ret < 0) return ret; set_bit(udev->portnum, hub->removed_bits); hub_port_logical_disconnect(hub, udev->portnum); usb_autopm_put_interface(intf); return 0; } enum hub_activation_type { HUB_INIT, HUB_INIT2, HUB_INIT3, /* INITs must come first */ HUB_POST_RESET, HUB_RESUME, HUB_RESET_RESUME, }; static void hub_init_func2(struct work_struct *ws); static void hub_init_func3(struct work_struct *ws); static void hub_activate(struct usb_hub *hub, enum hub_activation_type type) { struct usb_device *hdev = hub->hdev; struct usb_hcd *hcd; int ret; int port1; int status; bool need_debounce_delay = false; unsigned delay; /* Continue a partial initialization */ if (type == HUB_INIT2 || type == HUB_INIT3) { device_lock(&hdev->dev); /* Was the hub disconnected while we were waiting? */ if (hub->disconnected) goto disconnected; if (type == HUB_INIT2) goto init2; goto init3; } hub_get(hub); /* The superspeed hub except for root hub has to use Hub Depth * value as an offset into the route string to locate the bits * it uses to determine the downstream port number. So hub driver * should send a set hub depth request to superspeed hub after * the superspeed hub is set configuration in initialization or * reset procedure. * * After a resume, port power should still be on. * For any other type of activation, turn it on. */ if (type != HUB_RESUME) { if (hdev->parent && hub_is_superspeed(hdev)) { ret = usb_control_msg(hdev, usb_sndctrlpipe(hdev, 0), HUB_SET_DEPTH, USB_RT_HUB, hdev->level - 1, 0, NULL, 0, USB_CTRL_SET_TIMEOUT); if (ret < 0) dev_err(hub->intfdev, "set hub depth failed\n"); } /* Speed up system boot by using a delayed_work for the * hub's initial power-up delays. This is pretty awkward * and the implementation looks like a home-brewed sort of * setjmp/longjmp, but it saves at least 100 ms for each * root hub (assuming usbcore is compiled into the kernel * rather than as a module). It adds up. * * This can't be done for HUB_RESUME or HUB_RESET_RESUME * because for those activation types the ports have to be * operational when we return. In theory this could be done * for HUB_POST_RESET, but it's easier not to. */ if (type == HUB_INIT) { delay = hub_power_on_good_delay(hub); hub_power_on(hub, false); INIT_DELAYED_WORK(&hub->init_work, hub_init_func2); queue_delayed_work(system_power_efficient_wq, &hub->init_work, msecs_to_jiffies(delay)); /* Suppress autosuspend until init is done */ usb_autopm_get_interface_no_resume( to_usb_interface(hub->intfdev)); return; /* Continues at init2: below */ } else if (type == HUB_RESET_RESUME) { /* The internal host controller state for the hub device * may be gone after a host power loss on system resume. * Update the device's info so the HW knows it's a hub. */ hcd = bus_to_hcd(hdev->bus); if (hcd->driver->update_hub_device) { ret = hcd->driver->update_hub_device(hcd, hdev, &hub->tt, GFP_NOIO); if (ret < 0) { dev_err(hub->intfdev, "Host not accepting hub info update\n"); dev_err(hub->intfdev, "LS/FS devices and hubs may not work under this hub\n"); } } hub_power_on(hub, true); } else { hub_power_on(hub, true); } /* Give some time on remote wakeup to let links to transit to U0 */ } else if (hub_is_superspeed(hub->hdev)) msleep(20); init2: /* * Check each port and set hub->change_bits to let hub_wq know * which ports need attention. */ for (port1 = 1; port1 <= hdev->maxchild; ++port1) { struct usb_port *port_dev = hub->ports[port1 - 1]; struct usb_device *udev = port_dev->child; u16 portstatus, portchange; portstatus = portchange = 0; status = usb_hub_port_status(hub, port1, &portstatus, &portchange); if (status) goto abort; if (udev || (portstatus & USB_PORT_STAT_CONNECTION)) dev_dbg(&port_dev->dev, "status %04x change %04x\n", portstatus, portchange); /* * After anything other than HUB_RESUME (i.e., initialization * or any sort of reset), every port should be disabled. * Unconnected ports should likewise be disabled (paranoia), * and so should ports for which we have no usb_device. */ if ((portstatus & USB_PORT_STAT_ENABLE) && ( type != HUB_RESUME || !(portstatus & USB_PORT_STAT_CONNECTION) || !udev || udev->state == USB_STATE_NOTATTACHED)) { /* * USB3 protocol ports will automatically transition * to Enabled state when detect an USB3.0 device attach. * Do not disable USB3 protocol ports, just pretend * power was lost */ portstatus &= ~USB_PORT_STAT_ENABLE; if (!hub_is_superspeed(hdev)) usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_ENABLE); } /* Make sure a warm-reset request is handled by port_event */ if (type == HUB_RESUME && hub_port_warm_reset_required(hub, port1, portstatus)) set_bit(port1, hub->event_bits); /* * Add debounce if USB3 link is in polling/link training state. * Link will automatically transition to Enabled state after * link training completes. */ if (hub_is_superspeed(hdev) && ((portstatus & USB_PORT_STAT_LINK_STATE) == USB_SS_PORT_LS_POLLING)) need_debounce_delay = true; /* Clear status-change flags; we'll debounce later */ if (portchange & USB_PORT_STAT_C_CONNECTION) { need_debounce_delay = true; usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_C_CONNECTION); } if (portchange & USB_PORT_STAT_C_ENABLE) { need_debounce_delay = true; usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_C_ENABLE); } if (portchange & USB_PORT_STAT_C_RESET) { need_debounce_delay = true; usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_C_RESET); } if ((portchange & USB_PORT_STAT_C_BH_RESET) && hub_is_superspeed(hub->hdev)) { need_debounce_delay = true; usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_C_BH_PORT_RESET); } /* We can forget about a "removed" device when there's a * physical disconnect or the connect status changes. */ if (!(portstatus & USB_PORT_STAT_CONNECTION) || (portchange & USB_PORT_STAT_C_CONNECTION)) clear_bit(port1, hub->removed_bits); if (!udev || udev->state == USB_STATE_NOTATTACHED) { /* Tell hub_wq to disconnect the device or * check for a new connection or over current condition. * Based on USB2.0 Spec Section 11.12.5, * C_PORT_OVER_CURRENT could be set while * PORT_OVER_CURRENT is not. So check for any of them. */ if (udev || (portstatus & USB_PORT_STAT_CONNECTION) || (portchange & USB_PORT_STAT_C_CONNECTION) || (portstatus & USB_PORT_STAT_OVERCURRENT) || (portchange & USB_PORT_STAT_C_OVERCURRENT)) set_bit(port1, hub->change_bits); } else if (portstatus & USB_PORT_STAT_ENABLE) { bool port_resumed = (portstatus & USB_PORT_STAT_LINK_STATE) == USB_SS_PORT_LS_U0; /* The power session apparently survived the resume. * If there was an overcurrent or suspend change * (i.e., remote wakeup request), have hub_wq * take care of it. Look at the port link state * for USB 3.0 hubs, since they don't have a suspend * change bit, and they don't set the port link change * bit on device-initiated resume. */ if (portchange || (hub_is_superspeed(hub->hdev) && port_resumed)) set_bit(port1, hub->event_bits); } else if (udev->persist_enabled) { #ifdef CONFIG_PM udev->reset_resume = 1; #endif /* Don't set the change_bits when the device * was powered off. */ if (test_bit(port1, hub->power_bits)) set_bit(port1, hub->change_bits); } else { /* The power session is gone; tell hub_wq */ usb_set_device_state(udev, USB_STATE_NOTATTACHED); set_bit(port1, hub->change_bits); } } /* If no port-status-change flags were set, we don't need any * debouncing. If flags were set we can try to debounce the * ports all at once right now, instead of letting hub_wq do them * one at a time later on. * * If any port-status changes do occur during this delay, hub_wq * will see them later and handle them normally. */ if (need_debounce_delay) { delay = HUB_DEBOUNCE_STABLE; /* Don't do a long sleep inside a workqueue routine */ if (type == HUB_INIT2) { INIT_DELAYED_WORK(&hub->init_work, hub_init_func3); queue_delayed_work(system_power_efficient_wq, &hub->init_work, msecs_to_jiffies(delay)); device_unlock(&hdev->dev); return; /* Continues at init3: below */ } else { msleep(delay); } } init3: hub->quiescing = 0; status = usb_submit_urb(hub->urb, GFP_NOIO); if (status < 0) dev_err(hub->intfdev, "activate --> %d\n", status); if (hub->has_indicators && blinkenlights) queue_delayed_work(system_power_efficient_wq, &hub->leds, LED_CYCLE_PERIOD); /* Scan all ports that need attention */ kick_hub_wq(hub); abort: if (type == HUB_INIT2 || type == HUB_INIT3) { /* Allow autosuspend if it was suppressed */ disconnected: usb_autopm_put_interface_async(to_usb_interface(hub->intfdev)); device_unlock(&hdev->dev); } if (type == HUB_RESUME && hub_is_superspeed(hub->hdev)) { /* give usb3 downstream links training time after hub resume */ usb_autopm_get_interface_no_resume( to_usb_interface(hub->intfdev)); queue_delayed_work(system_power_efficient_wq, &hub->post_resume_work, msecs_to_jiffies(USB_SS_PORT_U0_WAKE_TIME)); return; } hub_put(hub); } /* Implement the continuations for the delays above */ static void hub_init_func2(struct work_struct *ws) { struct usb_hub *hub = container_of(ws, struct usb_hub, init_work.work); hub_activate(hub, HUB_INIT2); } static void hub_init_func3(struct work_struct *ws) { struct usb_hub *hub = container_of(ws, struct usb_hub, init_work.work); hub_activate(hub, HUB_INIT3); } static void hub_post_resume(struct work_struct *ws) { struct usb_hub *hub = container_of(ws, struct usb_hub, post_resume_work.work); usb_autopm_put_interface_async(to_usb_interface(hub->intfdev)); hub_put(hub); } enum hub_quiescing_type { HUB_DISCONNECT, HUB_PRE_RESET, HUB_SUSPEND }; static void hub_quiesce(struct usb_hub *hub, enum hub_quiescing_type type) { struct usb_device *hdev = hub->hdev; unsigned long flags; int i; /* hub_wq and related activity won't re-trigger */ spin_lock_irqsave(&hub->irq_urb_lock, flags); hub->quiescing = 1; spin_unlock_irqrestore(&hub->irq_urb_lock, flags); if (type != HUB_SUSPEND) { /* Disconnect all the children */ for (i = 0; i < hdev->maxchild; ++i) { if (hub->ports[i]->child) usb_disconnect(&hub->ports[i]->child); } } /* Stop hub_wq and related activity */ timer_delete_sync(&hub->irq_urb_retry); flush_delayed_work(&hub->post_resume_work); usb_kill_urb(hub->urb); if (hub->has_indicators) cancel_delayed_work_sync(&hub->leds); if (hub->tt.hub) flush_work(&hub->tt.clear_work); } static void hub_pm_barrier_for_all_ports(struct usb_hub *hub) { int i; for (i = 0; i < hub->hdev->maxchild; ++i) pm_runtime_barrier(&hub->ports[i]->dev); } /* caller has locked the hub device */ static int hub_pre_reset(struct usb_interface *intf) { struct usb_hub *hub = usb_get_intfdata(intf); hub_quiesce(hub, HUB_PRE_RESET); hub->in_reset = 1; hub_pm_barrier_for_all_ports(hub); return 0; } /* caller has locked the hub device */ static int hub_post_reset(struct usb_interface *intf) { struct usb_hub *hub = usb_get_intfdata(intf); hub->in_reset = 0; hub_pm_barrier_for_all_ports(hub); hub_activate(hub, HUB_POST_RESET); return 0; } static int hub_configure(struct usb_hub *hub, struct usb_endpoint_descriptor *endpoint) { struct usb_hcd *hcd; struct usb_device *hdev = hub->hdev; struct device *hub_dev = hub->intfdev; u16 hubstatus, hubchange; u16 wHubCharacteristics; unsigned int pipe; int maxp, ret, i; char *message = "out of memory"; unsigned unit_load; unsigned full_load; unsigned maxchild; hub->buffer = kmalloc(sizeof(*hub->buffer), GFP_KERNEL); if (!hub->buffer) { ret = -ENOMEM; goto fail; } hub->status = kmalloc(sizeof(*hub->status), GFP_KERNEL); if (!hub->status) { ret = -ENOMEM; goto fail; } mutex_init(&hub->status_mutex); hub->descriptor = kzalloc(sizeof(*hub->descriptor), GFP_KERNEL); if (!hub->descriptor) { ret = -ENOMEM; goto fail; } /* Request the entire hub descriptor. * hub->descriptor can handle USB_MAXCHILDREN ports, * but a (non-SS) hub can/will return fewer bytes here. */ ret = get_hub_descriptor(hdev, hub->descriptor); if (ret < 0) { message = "can't read hub descriptor"; goto fail; } maxchild = USB_MAXCHILDREN; if (hub_is_superspeed(hdev)) maxchild = min_t(unsigned, maxchild, USB_SS_MAXPORTS); if (hub->descriptor->bNbrPorts > maxchild) { message = "hub has too many ports!"; ret = -ENODEV; goto fail; } else if (hub->descriptor->bNbrPorts == 0) { message = "hub doesn't have any ports!"; ret = -ENODEV; goto fail; } /* * Accumulate wHubDelay + 40ns for every hub in the tree of devices. * The resulting value will be used for SetIsochDelay() request. */ if (hub_is_superspeed(hdev) || hub_is_superspeedplus(hdev)) { u32 delay = __le16_to_cpu(hub->descriptor->u.ss.wHubDelay); if (hdev->parent) delay += hdev->parent->hub_delay; delay += USB_TP_TRANSMISSION_DELAY; hdev->hub_delay = min_t(u32, delay, USB_TP_TRANSMISSION_DELAY_MAX); } maxchild = hub->descriptor->bNbrPorts; dev_info(hub_dev, "%d port%s detected\n", maxchild, str_plural(maxchild)); hub->ports = kcalloc(maxchild, sizeof(struct usb_port *), GFP_KERNEL); if (!hub->ports) { ret = -ENOMEM; goto fail; } wHubCharacteristics = le16_to_cpu(hub->descriptor->wHubCharacteristics); if (hub_is_superspeed(hdev)) { unit_load = 150; full_load = 900; } else { unit_load = 100; full_load = 500; } /* FIXME for USB 3.0, skip for now */ if ((wHubCharacteristics & HUB_CHAR_COMPOUND) && !(hub_is_superspeed(hdev))) { char portstr[USB_MAXCHILDREN + 1]; for (i = 0; i < maxchild; i++) portstr[i] = hub->descriptor->u.hs.DeviceRemovable [((i + 1) / 8)] & (1 << ((i + 1) % 8)) ? 'F' : 'R'; portstr[maxchild] = 0; dev_dbg(hub_dev, "compound device; port removable status: %s\n", portstr); } else dev_dbg(hub_dev, "standalone hub\n"); switch (wHubCharacteristics & HUB_CHAR_LPSM) { case HUB_CHAR_COMMON_LPSM: dev_dbg(hub_dev, "ganged power switching\n"); break; case HUB_CHAR_INDV_PORT_LPSM: dev_dbg(hub_dev, "individual port power switching\n"); break; case HUB_CHAR_NO_LPSM: case HUB_CHAR_LPSM: dev_dbg(hub_dev, "no power switching (usb 1.0)\n"); break; } switch (wHubCharacteristics & HUB_CHAR_OCPM) { case HUB_CHAR_COMMON_OCPM: dev_dbg(hub_dev, "global over-current protection\n"); break; case HUB_CHAR_INDV_PORT_OCPM: dev_dbg(hub_dev, "individual port over-current protection\n"); break; case HUB_CHAR_NO_OCPM: case HUB_CHAR_OCPM: dev_dbg(hub_dev, "no over-current protection\n"); break; } spin_lock_init(&hub->tt.lock); INIT_LIST_HEAD(&hub->tt.clear_list); INIT_WORK(&hub->tt.clear_work, hub_tt_work); switch (hdev->descriptor.bDeviceProtocol) { case USB_HUB_PR_FS: break; case USB_HUB_PR_HS_SINGLE_TT: dev_dbg(hub_dev, "Single TT\n"); hub->tt.hub = hdev; break; case USB_HUB_PR_HS_MULTI_TT: ret = usb_set_interface(hdev, 0, 1); if (ret == 0) { dev_dbg(hub_dev, "TT per port\n"); hub->tt.multi = 1; } else dev_err(hub_dev, "Using single TT (err %d)\n", ret); hub->tt.hub = hdev; break; case USB_HUB_PR_SS: /* USB 3.0 hubs don't have a TT */ break; default: dev_dbg(hub_dev, "Unrecognized hub protocol %d\n", hdev->descriptor.bDeviceProtocol); break; } /* Note 8 FS bit times == (8 bits / 12000000 bps) ~= 666ns */ switch (wHubCharacteristics & HUB_CHAR_TTTT) { case HUB_TTTT_8_BITS: if (hdev->descriptor.bDeviceProtocol != 0) { hub->tt.think_time = 666; dev_dbg(hub_dev, "TT requires at most %d " "FS bit times (%d ns)\n", 8, hub->tt.think_time); } break; case HUB_TTTT_16_BITS: hub->tt.think_time = 666 * 2; dev_dbg(hub_dev, "TT requires at most %d " "FS bit times (%d ns)\n", 16, hub->tt.think_time); break; case HUB_TTTT_24_BITS: hub->tt.think_time = 666 * 3; dev_dbg(hub_dev, "TT requires at most %d " "FS bit times (%d ns)\n", 24, hub->tt.think_time); break; case HUB_TTTT_32_BITS: hub->tt.think_time = 666 * 4; dev_dbg(hub_dev, "TT requires at most %d " "FS bit times (%d ns)\n", 32, hub->tt.think_time); break; } /* probe() zeroes hub->indicator[] */ if (wHubCharacteristics & HUB_CHAR_PORTIND) { hub->has_indicators = 1; dev_dbg(hub_dev, "Port indicators are supported\n"); } dev_dbg(hub_dev, "power on to power good time: %dms\n", hub->descriptor->bPwrOn2PwrGood * 2); /* power budgeting mostly matters with bus-powered hubs, * and battery-powered root hubs (may provide just 8 mA). */ ret = usb_get_std_status(hdev, USB_RECIP_DEVICE, 0, &hubstatus); if (ret) { message = "can't get hub status"; goto fail; } hcd = bus_to_hcd(hdev->bus); if (hdev == hdev->bus->root_hub) { if (hcd->power_budget > 0) hdev->bus_mA = hcd->power_budget; else hdev->bus_mA = full_load * maxchild; if (hdev->bus_mA >= full_load) hub->mA_per_port = full_load; else { hub->mA_per_port = hdev->bus_mA; hub->limited_power = 1; } } else if ((hubstatus & (1 << USB_DEVICE_SELF_POWERED)) == 0) { int remaining = hdev->bus_mA - hub->descriptor->bHubContrCurrent; dev_dbg(hub_dev, "hub controller current requirement: %dmA\n", hub->descriptor->bHubContrCurrent); hub->limited_power = 1; if (remaining < maxchild * unit_load) dev_warn(hub_dev, "insufficient power available " "to use all downstream ports\n"); hub->mA_per_port = unit_load; /* 7.2.1 */ } else { /* Self-powered external hub */ /* FIXME: What about battery-powered external hubs that * provide less current per port? */ hub->mA_per_port = full_load; } if (hub->mA_per_port < full_load) dev_dbg(hub_dev, "%umA bus power budget for each child\n", hub->mA_per_port); ret = hub_hub_status(hub, &hubstatus, &hubchange); if (ret < 0) { message = "can't get hub status"; goto fail; } /* local power status reports aren't always correct */ if (hdev->actconfig->desc.bmAttributes & USB_CONFIG_ATT_SELFPOWER) dev_dbg(hub_dev, "local power source is %s\n", (hubstatus & HUB_STATUS_LOCAL_POWER) ? "lost (inactive)" : "good"); if ((wHubCharacteristics & HUB_CHAR_OCPM) == 0) dev_dbg(hub_dev, "%sover-current condition exists\n", (hubstatus & HUB_STATUS_OVERCURRENT) ? "" : "no "); /* set up the interrupt endpoint * We use the EP's maxpacket size instead of (PORTS+1+7)/8 * bytes as USB2.0[11.12.3] says because some hubs are known * to send more data (and thus cause overflow). For root hubs, * maxpktsize is defined in hcd.c's fake endpoint descriptors * to be big enough for at least USB_MAXCHILDREN ports. */ pipe = usb_rcvintpipe(hdev, endpoint->bEndpointAddress); maxp = usb_maxpacket(hdev, pipe); if (maxp > sizeof(*hub->buffer)) maxp = sizeof(*hub->buffer); hub->urb = usb_alloc_urb(0, GFP_KERNEL); if (!hub->urb) { ret = -ENOMEM; goto fail; } usb_fill_int_urb(hub->urb, hdev, pipe, *hub->buffer, maxp, hub_irq, hub, endpoint->bInterval); /* maybe cycle the hub leds */ if (hub->has_indicators && blinkenlights) hub->indicator[0] = INDICATOR_CYCLE; mutex_lock(&usb_port_peer_mutex); for (i = 0; i < maxchild; i++) { ret = usb_hub_create_port_device(hub, i + 1); if (ret < 0) { dev_err(hub->intfdev, "couldn't create port%d device.\n", i + 1); break; } } hdev->maxchild = i; for (i = 0; i < hdev->maxchild; i++) { struct usb_port *port_dev = hub->ports[i]; pm_runtime_put(&port_dev->dev); } mutex_unlock(&usb_port_peer_mutex); if (ret < 0) goto fail; /* Update the HCD's internal representation of this hub before hub_wq * starts getting port status changes for devices under the hub. */ if (hcd->driver->update_hub_device) { ret = hcd->driver->update_hub_device(hcd, hdev, &hub->tt, GFP_KERNEL); if (ret < 0) { message = "can't update HCD hub info"; goto fail; } } usb_hub_adjust_deviceremovable(hdev, hub->descriptor); hub_activate(hub, HUB_INIT); return 0; fail: dev_err(hub_dev, "config failed, %s (err %d)\n", message, ret); /* hub_disconnect() frees urb and descriptor */ return ret; } static void hub_release(struct kref *kref) { struct usb_hub *hub = container_of(kref, struct usb_hub, kref); usb_put_dev(hub->hdev); usb_put_intf(to_usb_interface(hub->intfdev)); kfree(hub); } void hub_get(struct usb_hub *hub) { kref_get(&hub->kref); } void hub_put(struct usb_hub *hub) { kref_put(&hub->kref, hub_release); } static unsigned highspeed_hubs; static void hub_disconnect(struct usb_interface *intf) { struct usb_hub *hub = usb_get_intfdata(intf); struct usb_device *hdev = interface_to_usbdev(intf); int port1; /* * Stop adding new hub events. We do not want to block here and thus * will not try to remove any pending work item. */ hub->disconnected = 1; /* Disconnect all children and quiesce the hub */ hub->error = 0; hub_quiesce(hub, HUB_DISCONNECT); mutex_lock(&usb_port_peer_mutex); /* Avoid races with recursively_mark_NOTATTACHED() */ spin_lock_irq(&device_state_lock); port1 = hdev->maxchild; hdev->maxchild = 0; usb_set_intfdata(intf, NULL); spin_unlock_irq(&device_state_lock); for (; port1 > 0; --port1) usb_hub_remove_port_device(hub, port1); mutex_unlock(&usb_port_peer_mutex); if (hub->hdev->speed == USB_SPEED_HIGH) highspeed_hubs--; usb_free_urb(hub->urb); kfree(hub->ports); kfree(hub->descriptor); kfree(hub->status); kfree(hub->buffer); pm_suspend_ignore_children(&intf->dev, false); if (hub->quirk_disable_autosuspend) usb_autopm_put_interface(intf); onboard_dev_destroy_pdevs(&hub->onboard_devs); hub_put(hub); } static bool hub_descriptor_is_sane(struct usb_host_interface *desc) { /* Some hubs have a subclass of 1, which AFAICT according to the */ /* specs is not defined, but it works */ if (desc->desc.bInterfaceSubClass != 0 && desc->desc.bInterfaceSubClass != 1) return false; /* Multiple endpoints? What kind of mutant ninja-hub is this? */ if (desc->desc.bNumEndpoints != 1) return false; /* If the first endpoint is not interrupt IN, we'd better punt! */ if (!usb_endpoint_is_int_in(&desc->endpoint[0].desc)) return false; return true; } static int hub_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_host_interface *desc; struct usb_device *hdev; struct usb_hub *hub; desc = intf->cur_altsetting; hdev = interface_to_usbdev(intf); /* * The USB 2.0 spec prohibits hubs from having more than one * configuration or interface, and we rely on this prohibition. * Refuse to accept a device that violates it. */ if (hdev->descriptor.bNumConfigurations > 1 || hdev->actconfig->desc.bNumInterfaces > 1) { dev_err(&intf->dev, "Invalid hub with more than one config or interface\n"); return -EINVAL; } /* * Set default autosuspend delay as 0 to speedup bus suspend, * based on the below considerations: * * - Unlike other drivers, the hub driver does not rely on the * autosuspend delay to provide enough time to handle a wakeup * event, and the submitted status URB is just to check future * change on hub downstream ports, so it is safe to do it. * * - The patch might cause one or more auto supend/resume for * below very rare devices when they are plugged into hub * first time: * * devices having trouble initializing, and disconnect * themselves from the bus and then reconnect a second * or so later * * devices just for downloading firmware, and disconnects * themselves after completing it * * For these quite rare devices, their drivers may change the * autosuspend delay of their parent hub in the probe() to one * appropriate value to avoid the subtle problem if someone * does care it. * * - The patch may cause one or more auto suspend/resume on * hub during running 'lsusb', but it is probably too * infrequent to worry about. * * - Change autosuspend delay of hub can avoid unnecessary auto * suspend timer for hub, also may decrease power consumption * of USB bus. * * - If user has indicated to prevent autosuspend by passing * usbcore.autosuspend = -1 then keep autosuspend disabled. */ #ifdef CONFIG_PM if (hdev->dev.power.autosuspend_delay >= 0) pm_runtime_set_autosuspend_delay(&hdev->dev, 0); #endif /* * Hubs have proper suspend/resume support, except for root hubs * where the controller driver doesn't have bus_suspend and * bus_resume methods. */ if (hdev->parent) { /* normal device */ usb_enable_autosuspend(hdev); } else { /* root hub */ const struct hc_driver *drv = bus_to_hcd(hdev->bus)->driver; if (drv->bus_suspend && drv->bus_resume) usb_enable_autosuspend(hdev); } if (hdev->level == MAX_TOPO_LEVEL) { dev_err(&intf->dev, "Unsupported bus topology: hub nested too deep\n"); return -E2BIG; } #ifdef CONFIG_USB_OTG_DISABLE_EXTERNAL_HUB if (hdev->parent) { dev_warn(&intf->dev, "ignoring external hub\n"); return -ENODEV; } #endif if (!hub_descriptor_is_sane(desc)) { dev_err(&intf->dev, "bad descriptor, ignoring hub\n"); return -EIO; } /* We found a hub */ dev_info(&intf->dev, "USB hub found\n"); hub = kzalloc(sizeof(*hub), GFP_KERNEL); if (!hub) return -ENOMEM; kref_init(&hub->kref); hub->intfdev = &intf->dev; hub->hdev = hdev; INIT_DELAYED_WORK(&hub->leds, led_work); INIT_DELAYED_WORK(&hub->init_work, NULL); INIT_DELAYED_WORK(&hub->post_resume_work, hub_post_resume); INIT_WORK(&hub->events, hub_event); INIT_LIST_HEAD(&hub->onboard_devs); spin_lock_init(&hub->irq_urb_lock); timer_setup(&hub->irq_urb_retry, hub_retry_irq_urb, 0); usb_get_intf(intf); usb_get_dev(hdev); usb_set_intfdata(intf, hub); intf->needs_remote_wakeup = 1; pm_suspend_ignore_children(&intf->dev, true); if (hdev->speed == USB_SPEED_HIGH) highspeed_hubs++; if (id->driver_info & HUB_QUIRK_CHECK_PORT_AUTOSUSPEND) hub->quirk_check_port_auto_suspend = 1; if (id->driver_info & HUB_QUIRK_DISABLE_AUTOSUSPEND) { hub->quirk_disable_autosuspend = 1; usb_autopm_get_interface_no_resume(intf); } if ((id->driver_info & HUB_QUIRK_REDUCE_FRAME_INTR_BINTERVAL) && desc->endpoint[0].desc.bInterval > USB_REDUCE_FRAME_INTR_BINTERVAL) { desc->endpoint[0].desc.bInterval = USB_REDUCE_FRAME_INTR_BINTERVAL; /* Tell the HCD about the interrupt ep's new bInterval */ usb_set_interface(hdev, 0, 0); } if (hub_configure(hub, &desc->endpoint[0].desc) >= 0) { onboard_dev_create_pdevs(hdev, &hub->onboard_devs); return 0; } hub_disconnect(intf); return -ENODEV; } static int hub_ioctl(struct usb_interface *intf, unsigned int code, void *user_data) { struct usb_device *hdev = interface_to_usbdev(intf); struct usb_hub *hub = usb_hub_to_struct_hub(hdev); /* assert ifno == 0 (part of hub spec) */ switch (code) { case USBDEVFS_HUB_PORTINFO: { struct usbdevfs_hub_portinfo *info = user_data; int i; spin_lock_irq(&device_state_lock); if (hdev->devnum <= 0) info->nports = 0; else { info->nports = hdev->maxchild; for (i = 0; i < info->nports; i++) { if (hub->ports[i]->child == NULL) info->port[i] = 0; else info->port[i] = hub->ports[i]->child->devnum; } } spin_unlock_irq(&device_state_lock); return info->nports + 1; } default: return -ENOSYS; } } /* * Allow user programs to claim ports on a hub. When a device is attached * to one of these "claimed" ports, the program will "own" the device. */ static int find_port_owner(struct usb_device *hdev, unsigned port1, struct usb_dev_state ***ppowner) { struct usb_hub *hub = usb_hub_to_struct_hub(hdev); if (hdev->state == USB_STATE_NOTATTACHED) return -ENODEV; if (port1 == 0 || port1 > hdev->maxchild) return -EINVAL; /* Devices not managed by the hub driver * will always have maxchild equal to 0. */ *ppowner = &(hub->ports[port1 - 1]->port_owner); return 0; } /* In the following three functions, the caller must hold hdev's lock */ int usb_hub_claim_port(struct usb_device *hdev, unsigned port1, struct usb_dev_state *owner) { int rc; struct usb_dev_state **powner; rc = find_port_owner(hdev, port1, &powner); if (rc) return rc; if (*powner) return -EBUSY; *powner = owner; return rc; } EXPORT_SYMBOL_GPL(usb_hub_claim_port); int usb_hub_release_port(struct usb_device *hdev, unsigned port1, struct usb_dev_state *owner) { int rc; struct usb_dev_state **powner; rc = find_port_owner(hdev, port1, &powner); if (rc) return rc; if (*powner != owner) return -ENOENT; *powner = NULL; return rc; } EXPORT_SYMBOL_GPL(usb_hub_release_port); void usb_hub_release_all_ports(struct usb_device *hdev, struct usb_dev_state *owner) { struct usb_hub *hub = usb_hub_to_struct_hub(hdev); int n; for (n = 0; n < hdev->maxchild; n++) { if (hub->ports[n]->port_owner == owner) hub->ports[n]->port_owner = NULL; } } /* The caller must hold udev's lock */ bool usb_device_is_owned(struct usb_device *udev) { struct usb_hub *hub; if (udev->state == USB_STATE_NOTATTACHED || !udev->parent) return false; hub = usb_hub_to_struct_hub(udev->parent); return !!hub->ports[udev->portnum - 1]->port_owner; } static void update_port_device_state(struct usb_device *udev) { struct usb_hub *hub; struct usb_port *port_dev; if (udev->parent) { hub = usb_hub_to_struct_hub(udev->parent); /* * The Link Layer Validation System Driver (lvstest) * has a test step to unbind the hub before running the * rest of the procedure. This triggers hub_disconnect * which will set the hub's maxchild to 0, further * resulting in usb_hub_to_struct_hub returning NULL. */ if (hub) { port_dev = hub->ports[udev->portnum - 1]; WRITE_ONCE(port_dev->state, udev->state); sysfs_notify_dirent(port_dev->state_kn); } } } static void update_usb_device_state(struct usb_device *udev, enum usb_device_state new_state) { if (udev->state == USB_STATE_SUSPENDED && new_state != USB_STATE_SUSPENDED) udev->active_duration -= jiffies; else if (new_state == USB_STATE_SUSPENDED && udev->state != USB_STATE_SUSPENDED) udev->active_duration += jiffies; udev->state = new_state; update_port_device_state(udev); trace_usb_set_device_state(udev); } static void recursively_mark_NOTATTACHED(struct usb_device *udev) { struct usb_hub *hub = usb_hub_to_struct_hub(udev); int i; for (i = 0; i < udev->maxchild; ++i) { if (hub->ports[i]->child) recursively_mark_NOTATTACHED(hub->ports[i]->child); } update_usb_device_state(udev, USB_STATE_NOTATTACHED); } /** * usb_set_device_state - change a device's current state (usbcore, hcds) * @udev: pointer to device whose state should be changed * @new_state: new state value to be stored * * udev->state is _not_ fully protected by the device lock. Although * most transitions are made only while holding the lock, the state can * can change to USB_STATE_NOTATTACHED at almost any time. This * is so that devices can be marked as disconnected as soon as possible, * without having to wait for any semaphores to be released. As a result, * all changes to any device's state must be protected by the * device_state_lock spinlock. * * Once a device has been added to the device tree, all changes to its state * should be made using this routine. The state should _not_ be set directly. * * If udev->state is already USB_STATE_NOTATTACHED then no change is made. * Otherwise udev->state is set to new_state, and if new_state is * USB_STATE_NOTATTACHED then all of udev's descendants' states are also set * to USB_STATE_NOTATTACHED. */ void usb_set_device_state(struct usb_device *udev, enum usb_device_state new_state) { unsigned long flags; int wakeup = -1; spin_lock_irqsave(&device_state_lock, flags); if (udev->state == USB_STATE_NOTATTACHED) ; /* do nothing */ else if (new_state != USB_STATE_NOTATTACHED) { /* root hub wakeup capabilities are managed out-of-band * and may involve silicon errata ... ignore them here. */ if (udev->parent) { if (udev->state == USB_STATE_SUSPENDED || new_state == USB_STATE_SUSPENDED) ; /* No change to wakeup settings */ else if (new_state == USB_STATE_CONFIGURED) wakeup = (udev->quirks & USB_QUIRK_IGNORE_REMOTE_WAKEUP) ? 0 : udev->actconfig->desc.bmAttributes & USB_CONFIG_ATT_WAKEUP; else wakeup = 0; } update_usb_device_state(udev, new_state); } else recursively_mark_NOTATTACHED(udev); spin_unlock_irqrestore(&device_state_lock, flags); if (wakeup >= 0) device_set_wakeup_capable(&udev->dev, wakeup); } EXPORT_SYMBOL_GPL(usb_set_device_state); /* * Choose a device number. * * Device numbers are used as filenames in usbfs. On USB-1.1 and * USB-2.0 buses they are also used as device addresses, however on * USB-3.0 buses the address is assigned by the controller hardware * and it usually is not the same as the device number. * * Devices connected under xHCI are not as simple. The host controller * supports virtualization, so the hardware assigns device addresses and * the HCD must setup data structures before issuing a set address * command to the hardware. */ static void choose_devnum(struct usb_device *udev) { int devnum; struct usb_bus *bus = udev->bus; /* be safe when more hub events are proceed in parallel */ mutex_lock(&bus->devnum_next_mutex); /* Try to allocate the next devnum beginning at bus->devnum_next. */ devnum = find_next_zero_bit(bus->devmap, 128, bus->devnum_next); if (devnum >= 128) devnum = find_next_zero_bit(bus->devmap, 128, 1); bus->devnum_next = (devnum >= 127 ? 1 : devnum + 1); if (devnum < 128) { set_bit(devnum, bus->devmap); udev->devnum = devnum; } mutex_unlock(&bus->devnum_next_mutex); } static void release_devnum(struct usb_device *udev) { if (udev->devnum > 0) { clear_bit(udev->devnum, udev->bus->devmap); udev->devnum = -1; } } static void update_devnum(struct usb_device *udev, int devnum) { udev->devnum = devnum; if (!udev->devaddr) udev->devaddr = (u8)devnum; } static void hub_free_dev(struct usb_device *udev) { struct usb_hcd *hcd = bus_to_hcd(udev->bus); /* Root hubs aren't real devices, so don't free HCD resources */ if (hcd->driver->free_dev && udev->parent) hcd->driver->free_dev(hcd, udev); } static void hub_disconnect_children(struct usb_device *udev) { struct usb_hub *hub = usb_hub_to_struct_hub(udev); int i; /* Free up all the children before we remove this device */ for (i = 0; i < udev->maxchild; i++) { if (hub->ports[i]->child) usb_disconnect(&hub->ports[i]->child); } } /** * usb_disconnect - disconnect a device (usbcore-internal) * @pdev: pointer to device being disconnected * * Context: task context, might sleep * * Something got disconnected. Get rid of it and all of its children. * * If *pdev is a normal device then the parent hub must already be locked. * If *pdev is a root hub then the caller must hold the usb_bus_idr_lock, * which protects the set of root hubs as well as the list of buses. * * Only hub drivers (including virtual root hub drivers for host * controllers) should ever call this. * * This call is synchronous, and may not be used in an interrupt context. */ void usb_disconnect(struct usb_device **pdev) { struct usb_port *port_dev = NULL; struct usb_device *udev = *pdev; struct usb_hub *hub = NULL; int port1 = 1; /* mark the device as inactive, so any further urb submissions for * this device (and any of its children) will fail immediately. * this quiesces everything except pending urbs. */ usb_set_device_state(udev, USB_STATE_NOTATTACHED); dev_info(&udev->dev, "USB disconnect, device number %d\n", udev->devnum); /* * Ensure that the pm runtime code knows that the USB device * is in the process of being disconnected. */ pm_runtime_barrier(&udev->dev); usb_lock_device(udev); hub_disconnect_children(udev); /* deallocate hcd/hardware state ... nuking all pending urbs and * cleaning up all state associated with the current configuration * so that the hardware is now fully quiesced. */ dev_dbg(&udev->dev, "unregistering device\n"); usb_disable_device(udev, 0); usb_hcd_synchronize_unlinks(udev); if (udev->parent) { port1 = udev->portnum; hub = usb_hub_to_struct_hub(udev->parent); port_dev = hub->ports[port1 - 1]; sysfs_remove_link(&udev->dev.kobj, "port"); sysfs_remove_link(&port_dev->dev.kobj, "device"); /* * As usb_port_runtime_resume() de-references udev, make * sure no resumes occur during removal */ if (!test_and_set_bit(port1, hub->child_usage_bits)) pm_runtime_get_sync(&port_dev->dev); typec_deattach(port_dev->connector, &udev->dev); } usb_remove_ep_devs(&udev->ep0); usb_unlock_device(udev); if (udev->usb4_link) device_link_del(udev->usb4_link); /* Unregister the device. The device driver is responsible * for de-configuring the device and invoking the remove-device * notifier chain (used by usbfs and possibly others). */ device_del(&udev->dev); /* Free the device number and delete the parent's children[] * (or root_hub) pointer. */ release_devnum(udev); /* Avoid races with recursively_mark_NOTATTACHED() */ spin_lock_irq(&device_state_lock); *pdev = NULL; spin_unlock_irq(&device_state_lock); if (port_dev && test_and_clear_bit(port1, hub->child_usage_bits)) pm_runtime_put(&port_dev->dev); hub_free_dev(udev); put_device(&udev->dev); } #ifdef CONFIG_USB_ANNOUNCE_NEW_DEVICES static void show_string(struct usb_device *udev, char *id, char *string) { if (!string) return; dev_info(&udev->dev, "%s: %s\n", id, string); } static void announce_device(struct usb_device *udev) { u16 bcdDevice = le16_to_cpu(udev->descriptor.bcdDevice); dev_info(&udev->dev, "New USB device found, idVendor=%04x, idProduct=%04x, bcdDevice=%2x.%02x\n", le16_to_cpu(udev->descriptor.idVendor), le16_to_cpu(udev->descriptor.idProduct), bcdDevice >> 8, bcdDevice & 0xff); dev_info(&udev->dev, "New USB device strings: Mfr=%d, Product=%d, SerialNumber=%d\n", udev->descriptor.iManufacturer, udev->descriptor.iProduct, udev->descriptor.iSerialNumber); show_string(udev, "Product", udev->product); show_string(udev, "Manufacturer", udev->manufacturer); show_string(udev, "SerialNumber", udev->serial); } #else static inline void announce_device(struct usb_device *udev) { } #endif /** * usb_enumerate_device_otg - FIXME (usbcore-internal) * @udev: newly addressed device (in ADDRESS state) * * Finish enumeration for On-The-Go devices * * Return: 0 if successful. A negative error code otherwise. */ static int usb_enumerate_device_otg(struct usb_device *udev) { int err = 0; #ifdef CONFIG_USB_OTG /* * OTG-aware devices on OTG-capable root hubs may be able to use SRP, * to wake us after we've powered off VBUS; and HNP, switching roles * "host" to "peripheral". The OTG descriptor helps figure this out. */ if (!udev->bus->is_b_host && udev->config && udev->parent == udev->bus->root_hub) { struct usb_otg_descriptor *desc = NULL; struct usb_bus *bus = udev->bus; unsigned port1 = udev->portnum; /* descriptor may appear anywhere in config */ err = __usb_get_extra_descriptor(udev->rawdescriptors[0], le16_to_cpu(udev->config[0].desc.wTotalLength), USB_DT_OTG, (void **) &desc, sizeof(*desc)); if (err || !(desc->bmAttributes & USB_OTG_HNP)) return 0; dev_info(&udev->dev, "Dual-Role OTG device on %sHNP port\n", (port1 == bus->otg_port) ? "" : "non-"); /* enable HNP before suspend, it's simpler */ if (port1 == bus->otg_port) { bus->b_hnp_enable = 1; err = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), USB_REQ_SET_FEATURE, 0, USB_DEVICE_B_HNP_ENABLE, 0, NULL, 0, USB_CTRL_SET_TIMEOUT); if (err < 0) { /* * OTG MESSAGE: report errors here, * customize to match your product. */ dev_err(&udev->dev, "can't set HNP mode: %d\n", err); bus->b_hnp_enable = 0; } } else if (desc->bLength == sizeof (struct usb_otg_descriptor)) { /* * We are operating on a legacy OTP device * These should be told that they are operating * on the wrong port if we have another port that does * support HNP */ if (bus->otg_port != 0) { /* Set a_alt_hnp_support for legacy otg device */ err = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), USB_REQ_SET_FEATURE, 0, USB_DEVICE_A_ALT_HNP_SUPPORT, 0, NULL, 0, USB_CTRL_SET_TIMEOUT); if (err < 0) dev_err(&udev->dev, "set a_alt_hnp_support failed: %d\n", err); } } } #endif return err; } /** * usb_enumerate_device - Read device configs/intfs/otg (usbcore-internal) * @udev: newly addressed device (in ADDRESS state) * * This is only called by usb_new_device() -- all comments that apply there * apply here wrt to environment. * * If the device is WUSB and not authorized, we don't attempt to read * the string descriptors, as they will be errored out by the device * until it has been authorized. * * Return: 0 if successful. A negative error code otherwise. */ static int usb_enumerate_device(struct usb_device *udev) { int err; struct usb_hcd *hcd = bus_to_hcd(udev->bus); if (udev->config == NULL) { err = usb_get_configuration(udev); if (err < 0) { if (err != -ENODEV) dev_err(&udev->dev, "can't read configurations, error %d\n", err); return err; } } /* read the standard strings and cache them if present */ udev->product = usb_cache_string(udev, udev->descriptor.iProduct); udev->manufacturer = usb_cache_string(udev, udev->descriptor.iManufacturer); udev->serial = usb_cache_string(udev, udev->descriptor.iSerialNumber); err = usb_enumerate_device_otg(udev); if (err < 0) return err; if (IS_ENABLED(CONFIG_USB_OTG_PRODUCTLIST) && hcd->tpl_support && !is_targeted(udev)) { /* Maybe it can talk to us, though we can't talk to it. * (Includes HNP test device.) */ if (IS_ENABLED(CONFIG_USB_OTG) && (udev->bus->b_hnp_enable || udev->bus->is_b_host)) { err = usb_port_suspend(udev, PMSG_AUTO_SUSPEND); if (err < 0) dev_dbg(&udev->dev, "HNP fail, %d\n", err); } return -ENOTSUPP; } usb_detect_interface_quirks(udev); return 0; } static void set_usb_port_removable(struct usb_device *udev) { struct usb_device *hdev = udev->parent; struct usb_hub *hub; u8 port = udev->portnum; u16 wHubCharacteristics; bool removable = true; dev_set_removable(&udev->dev, DEVICE_REMOVABLE_UNKNOWN); if (!hdev) return; hub = usb_hub_to_struct_hub(udev->parent); /* * If the platform firmware has provided information about a port, * use that to determine whether it's removable. */ switch (hub->ports[udev->portnum - 1]->connect_type) { case USB_PORT_CONNECT_TYPE_HOT_PLUG: dev_set_removable(&udev->dev, DEVICE_REMOVABLE); return; case USB_PORT_CONNECT_TYPE_HARD_WIRED: case USB_PORT_NOT_USED: dev_set_removable(&udev->dev, DEVICE_FIXED); return; default: break; } /* * Otherwise, check whether the hub knows whether a port is removable * or not */ wHubCharacteristics = le16_to_cpu(hub->descriptor->wHubCharacteristics); if (!(wHubCharacteristics & HUB_CHAR_COMPOUND)) return; if (hub_is_superspeed(hdev)) { if (le16_to_cpu(hub->descriptor->u.ss.DeviceRemovable) & (1 << port)) removable = false; } else { if (hub->descriptor->u.hs.DeviceRemovable[port / 8] & (1 << (port % 8))) removable = false; } if (removable) dev_set_removable(&udev->dev, DEVICE_REMOVABLE); else dev_set_removable(&udev->dev, DEVICE_FIXED); } /** * usb_new_device - perform initial device setup (usbcore-internal) * @udev: newly addressed device (in ADDRESS state) * * This is called with devices which have been detected but not fully * enumerated. The device descriptor is available, but not descriptors * for any device configuration. The caller must have locked either * the parent hub (if udev is a normal device) or else the * usb_bus_idr_lock (if udev is a root hub). The parent's pointer to * udev has already been installed, but udev is not yet visible through * sysfs or other filesystem code. * * This call is synchronous, and may not be used in an interrupt context. * * Only the hub driver or root-hub registrar should ever call this. * * Return: Whether the device is configured properly or not. Zero if the * interface was registered with the driver core; else a negative errno * value. * */ int usb_new_device(struct usb_device *udev) { int err; if (udev->parent) { /* Initialize non-root-hub device wakeup to disabled; * device (un)configuration controls wakeup capable * sysfs power/wakeup controls wakeup enabled/disabled */ device_init_wakeup(&udev->dev, 0); } /* Tell the runtime-PM framework the device is active */ pm_runtime_set_active(&udev->dev); pm_runtime_get_noresume(&udev->dev); pm_runtime_use_autosuspend(&udev->dev); pm_runtime_enable(&udev->dev); /* By default, forbid autosuspend for all devices. It will be * allowed for hubs during binding. */ usb_disable_autosuspend(udev); err = usb_enumerate_device(udev); /* Read descriptors */ if (err < 0) goto fail; dev_dbg(&udev->dev, "udev %d, busnum %d, minor = %d\n", udev->devnum, udev->bus->busnum, (((udev->bus->busnum-1) * 128) + (udev->devnum-1))); /* export the usbdev device-node for libusb */ udev->dev.devt = MKDEV(USB_DEVICE_MAJOR, (((udev->bus->busnum-1) * 128) + (udev->devnum-1))); /* Tell the world! */ announce_device(udev); if (udev->serial) add_device_randomness(udev->serial, strlen(udev->serial)); if (udev->product) add_device_randomness(udev->product, strlen(udev->product)); if (udev->manufacturer) add_device_randomness(udev->manufacturer, strlen(udev->manufacturer)); device_enable_async_suspend(&udev->dev); /* check whether the hub or firmware marks this port as non-removable */ set_usb_port_removable(udev); /* Register the device. The device driver is responsible * for configuring the device and invoking the add-device * notifier chain (used by usbfs and possibly others). */ err = device_add(&udev->dev); if (err) { dev_err(&udev->dev, "can't device_add, error %d\n", err); goto fail; } /* Create link files between child device and usb port device. */ if (udev->parent) { struct usb_hub *hub = usb_hub_to_struct_hub(udev->parent); int port1 = udev->portnum; struct usb_port *port_dev = hub->ports[port1 - 1]; err = sysfs_create_link(&udev->dev.kobj, &port_dev->dev.kobj, "port"); if (err) goto out_del_dev; err = sysfs_create_link(&port_dev->dev.kobj, &udev->dev.kobj, "device"); if (err) { sysfs_remove_link(&udev->dev.kobj, "port"); goto out_del_dev; } if (!test_and_set_bit(port1, hub->child_usage_bits)) pm_runtime_get_sync(&port_dev->dev); typec_attach(port_dev->connector, &udev->dev); } (void) usb_create_ep_devs(&udev->dev, &udev->ep0, udev); usb_mark_last_busy(udev); pm_runtime_put_sync_autosuspend(&udev->dev); return err; out_del_dev: device_del(&udev->dev); fail: usb_set_device_state(udev, USB_STATE_NOTATTACHED); pm_runtime_disable(&udev->dev); pm_runtime_set_suspended(&udev->dev); return err; } /** * usb_deauthorize_device - deauthorize a device (usbcore-internal) * @usb_dev: USB device * * Move the USB device to a very basic state where interfaces are disabled * and the device is in fact unconfigured and unusable. * * We share a lock (that we have) with device_del(), so we need to * defer its call. * * Return: 0. */ int usb_deauthorize_device(struct usb_device *usb_dev) { usb_lock_device(usb_dev); if (usb_dev->authorized == 0) goto out_unauthorized; usb_dev->authorized = 0; usb_set_configuration(usb_dev, -1); out_unauthorized: usb_unlock_device(usb_dev); return 0; } int usb_authorize_device(struct usb_device *usb_dev) { int result = 0, c; usb_lock_device(usb_dev); if (usb_dev->authorized == 1) goto out_authorized; result = usb_autoresume_device(usb_dev); if (result < 0) { dev_err(&usb_dev->dev, "can't autoresume for authorization: %d\n", result); goto error_autoresume; } usb_dev->authorized = 1; /* Choose and set the configuration. This registers the interfaces * with the driver core and lets interface drivers bind to them. */ c = usb_choose_configuration(usb_dev); if (c >= 0) { result = usb_set_configuration(usb_dev, c); if (result) { dev_err(&usb_dev->dev, "can't set config #%d, error %d\n", c, result); /* This need not be fatal. The user can try to * set other configurations. */ } } dev_info(&usb_dev->dev, "authorized to connect\n"); usb_autosuspend_device(usb_dev); error_autoresume: out_authorized: usb_unlock_device(usb_dev); /* complements locktree */ return result; } /** * get_port_ssp_rate - Match the extended port status to SSP rate * @hdev: The hub device * @ext_portstatus: extended port status * * Match the extended port status speed id to the SuperSpeed Plus sublink speed * capability attributes. Base on the number of connected lanes and speed, * return the corresponding enum usb_ssp_rate. */ static enum usb_ssp_rate get_port_ssp_rate(struct usb_device *hdev, u32 ext_portstatus) { struct usb_ssp_cap_descriptor *ssp_cap; u32 attr; u8 speed_id; u8 ssac; u8 lanes; int i; if (!hdev->bos) goto out; ssp_cap = hdev->bos->ssp_cap; if (!ssp_cap) goto out; speed_id = ext_portstatus & USB_EXT_PORT_STAT_RX_SPEED_ID; lanes = USB_EXT_PORT_RX_LANES(ext_portstatus) + 1; ssac = le32_to_cpu(ssp_cap->bmAttributes) & USB_SSP_SUBLINK_SPEED_ATTRIBS; for (i = 0; i <= ssac; i++) { u8 ssid; attr = le32_to_cpu(ssp_cap->bmSublinkSpeedAttr[i]); ssid = FIELD_GET(USB_SSP_SUBLINK_SPEED_SSID, attr); if (speed_id == ssid) { u16 mantissa; u8 lse; u8 type; /* * Note: currently asymmetric lane types are only * applicable for SSIC operate in SuperSpeed protocol */ type = FIELD_GET(USB_SSP_SUBLINK_SPEED_ST, attr); if (type == USB_SSP_SUBLINK_SPEED_ST_ASYM_RX || type == USB_SSP_SUBLINK_SPEED_ST_ASYM_TX) goto out; if (FIELD_GET(USB_SSP_SUBLINK_SPEED_LP, attr) != USB_SSP_SUBLINK_SPEED_LP_SSP) goto out; lse = FIELD_GET(USB_SSP_SUBLINK_SPEED_LSE, attr); mantissa = FIELD_GET(USB_SSP_SUBLINK_SPEED_LSM, attr); /* Convert to Gbps */ for (; lse < USB_SSP_SUBLINK_SPEED_LSE_GBPS; lse++) mantissa /= 1000; if (mantissa >= 10 && lanes == 1) return USB_SSP_GEN_2x1; if (mantissa >= 10 && lanes == 2) return USB_SSP_GEN_2x2; if (mantissa >= 5 && lanes == 2) return USB_SSP_GEN_1x2; goto out; } } out: return USB_SSP_GEN_UNKNOWN; } #ifdef CONFIG_USB_FEW_INIT_RETRIES #define PORT_RESET_TRIES 2 #define SET_ADDRESS_TRIES 1 #define GET_DESCRIPTOR_TRIES 1 #define GET_MAXPACKET0_TRIES 1 #define PORT_INIT_TRIES 4 #else #define PORT_RESET_TRIES 5 #define SET_ADDRESS_TRIES 2 #define GET_DESCRIPTOR_TRIES 2 #define GET_MAXPACKET0_TRIES 3 #define PORT_INIT_TRIES 4 #endif /* CONFIG_USB_FEW_INIT_RETRIES */ #define DETECT_DISCONNECT_TRIES 5 #define HUB_ROOT_RESET_TIME 60 /* times are in msec */ #define HUB_SHORT_RESET_TIME 10 #define HUB_BH_RESET_TIME 50 #define HUB_LONG_RESET_TIME 200 #define HUB_RESET_TIMEOUT 800 static bool use_new_scheme(struct usb_device *udev, int retry, struct usb_port *port_dev) { int old_scheme_first_port = (port_dev->quirks & USB_PORT_QUIRK_OLD_SCHEME) || old_scheme_first; /* * "New scheme" enumeration causes an extra state transition to be * exposed to an xhci host and causes USB3 devices to receive control * commands in the default state. This has been seen to cause * enumeration failures, so disable this enumeration scheme for USB3 * devices. */ if (udev->speed >= USB_SPEED_SUPER) return false; /* * If use_both_schemes is set, use the first scheme (whichever * it is) for the larger half of the retries, then use the other * scheme. Otherwise, use the first scheme for all the retries. */ if (use_both_schemes && retry >= (PORT_INIT_TRIES + 1) / 2) return old_scheme_first_port; /* Second half */ return !old_scheme_first_port; /* First half or all */ } /* Is a USB 3.0 port in the Inactive or Compliance Mode state? * Port warm reset is required to recover */ static bool hub_port_warm_reset_required(struct usb_hub *hub, int port1, u16 portstatus) { u16 link_state; if (!hub_is_superspeed(hub->hdev)) return false; if (test_bit(port1, hub->warm_reset_bits)) return true; link_state = portstatus & USB_PORT_STAT_LINK_STATE; return link_state == USB_SS_PORT_LS_SS_INACTIVE || link_state == USB_SS_PORT_LS_COMP_MOD; } static int hub_port_wait_reset(struct usb_hub *hub, int port1, struct usb_device *udev, unsigned int delay, bool warm) { int delay_time, ret; u16 portstatus; u16 portchange; u32 ext_portstatus = 0; for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT; delay_time += delay) { /* wait to give the device a chance to reset */ msleep(delay); /* read and decode port status */ if (hub_is_superspeedplus(hub->hdev)) ret = hub_ext_port_status(hub, port1, HUB_EXT_PORT_STATUS, &portstatus, &portchange, &ext_portstatus); else ret = usb_hub_port_status(hub, port1, &portstatus, &portchange); if (ret < 0) return ret; /* * The port state is unknown until the reset completes. * * On top of that, some chips may require additional time * to re-establish a connection after the reset is complete, * so also wait for the connection to be re-established. */ if (!(portstatus & USB_PORT_STAT_RESET) && (portstatus & USB_PORT_STAT_CONNECTION)) break; /* switch to the long delay after two short delay failures */ if (delay_time >= 2 * HUB_SHORT_RESET_TIME) delay = HUB_LONG_RESET_TIME; dev_dbg(&hub->ports[port1 - 1]->dev, "not %sreset yet, waiting %dms\n", warm ? "warm " : "", delay); } if ((portstatus & USB_PORT_STAT_RESET)) return -EBUSY; if (hub_port_warm_reset_required(hub, port1, portstatus)) return -ENOTCONN; /* Device went away? */ if (!(portstatus & USB_PORT_STAT_CONNECTION)) return -ENOTCONN; /* Retry if connect change is set but status is still connected. * A USB 3.0 connection may bounce if multiple warm resets were issued, * but the device may have successfully re-connected. Ignore it. */ if (!hub_is_superspeed(hub->hdev) && (portchange & USB_PORT_STAT_C_CONNECTION)) { usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_C_CONNECTION); return -EAGAIN; } if (!(portstatus & USB_PORT_STAT_ENABLE)) return -EBUSY; if (!udev) return 0; if (hub_is_superspeedplus(hub->hdev)) { /* extended portstatus Rx and Tx lane count are zero based */ udev->rx_lanes = USB_EXT_PORT_RX_LANES(ext_portstatus) + 1; udev->tx_lanes = USB_EXT_PORT_TX_LANES(ext_portstatus) + 1; udev->ssp_rate = get_port_ssp_rate(hub->hdev, ext_portstatus); } else { udev->rx_lanes = 1; udev->tx_lanes = 1; udev->ssp_rate = USB_SSP_GEN_UNKNOWN; } if (udev->ssp_rate != USB_SSP_GEN_UNKNOWN) udev->speed = USB_SPEED_SUPER_PLUS; else if (hub_is_superspeed(hub->hdev)) udev->speed = USB_SPEED_SUPER; else if (portstatus & USB_PORT_STAT_HIGH_SPEED) udev->speed = USB_SPEED_HIGH; else if (portstatus & USB_PORT_STAT_LOW_SPEED) udev->speed = USB_SPEED_LOW; else udev->speed = USB_SPEED_FULL; return 0; } /* Handle port reset and port warm(BH) reset (for USB3 protocol ports) */ static int hub_port_reset(struct usb_hub *hub, int port1, struct usb_device *udev, unsigned int delay, bool warm) { int i, status; u16 portchange, portstatus; struct usb_port *port_dev = hub->ports[port1 - 1]; int reset_recovery_time; if (!hub_is_superspeed(hub->hdev)) { if (warm) { dev_err(hub->intfdev, "only USB3 hub support " "warm reset\n"); return -EINVAL; } /* Block EHCI CF initialization during the port reset. * Some companion controllers don't like it when they mix. */ down_read(&ehci_cf_port_reset_rwsem); } else if (!warm) { /* * If the caller hasn't explicitly requested a warm reset, * double check and see if one is needed. */ if (usb_hub_port_status(hub, port1, &portstatus, &portchange) == 0) if (hub_port_warm_reset_required(hub, port1, portstatus)) warm = true; } clear_bit(port1, hub->warm_reset_bits); /* Reset the port */ for (i = 0; i < PORT_RESET_TRIES; i++) { status = set_port_feature(hub->hdev, port1, (warm ? USB_PORT_FEAT_BH_PORT_RESET : USB_PORT_FEAT_RESET)); if (status == -ENODEV) { ; /* The hub is gone */ } else if (status) { dev_err(&port_dev->dev, "cannot %sreset (err = %d)\n", warm ? "warm " : "", status); } else { status = hub_port_wait_reset(hub, port1, udev, delay, warm); if (status && status != -ENOTCONN && status != -ENODEV) dev_dbg(hub->intfdev, "port_wait_reset: err = %d\n", status); } /* * Check for disconnect or reset, and bail out after several * reset attempts to avoid warm reset loop. */ if (status == 0 || status == -ENOTCONN || status == -ENODEV || (status == -EBUSY && i == PORT_RESET_TRIES - 1)) { usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_C_RESET); if (!hub_is_superspeed(hub->hdev)) goto done; usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_C_BH_PORT_RESET); usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_C_PORT_LINK_STATE); if (udev) usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_C_CONNECTION); /* * If a USB 3.0 device migrates from reset to an error * state, re-issue the warm reset. */ if (usb_hub_port_status(hub, port1, &portstatus, &portchange) < 0) goto done; if (!hub_port_warm_reset_required(hub, port1, portstatus)) goto done; /* * If the port is in SS.Inactive or Compliance Mode, the * hot or warm reset failed. Try another warm reset. */ if (!warm) { dev_dbg(&port_dev->dev, "hot reset failed, warm reset\n"); warm = true; } } dev_dbg(&port_dev->dev, "not enabled, trying %sreset again...\n", warm ? "warm " : ""); delay = HUB_LONG_RESET_TIME; } dev_err(&port_dev->dev, "Cannot enable. Maybe the USB cable is bad?\n"); done: if (status == 0) { if (port_dev->quirks & USB_PORT_QUIRK_FAST_ENUM) usleep_range(10000, 12000); else { /* TRSTRCY = 10 ms; plus some extra */ reset_recovery_time = 10 + 40; /* Hub needs extra delay after resetting its port. */ if (hub->hdev->quirks & USB_QUIRK_HUB_SLOW_RESET) reset_recovery_time += 100; msleep(reset_recovery_time); } if (udev) { struct usb_hcd *hcd = bus_to_hcd(udev->bus); update_devnum(udev, 0); /* The xHC may think the device is already reset, * so ignore the status. */ if (hcd->driver->reset_device) hcd->driver->reset_device(hcd, udev); usb_set_device_state(udev, USB_STATE_DEFAULT); } } else { if (udev) usb_set_device_state(udev, USB_STATE_NOTATTACHED); } if (!hub_is_superspeed(hub->hdev)) up_read(&ehci_cf_port_reset_rwsem); return status; } /* * hub_port_stop_enumerate - stop USB enumeration or ignore port events * @hub: target hub * @port1: port num of the port * @retries: port retries number of hub_port_init() * * Return: * true: ignore port actions/events or give up connection attempts. * false: keep original behavior. * * This function will be based on retries to check whether the port which is * marked with early_stop attribute would stop enumeration or ignore events. * * Note: * This function didn't change anything if early_stop is not set, and it will * prevent all connection attempts when early_stop is set and the attempts of * the port are more than 1. */ static bool hub_port_stop_enumerate(struct usb_hub *hub, int port1, int retries) { struct usb_port *port_dev = hub->ports[port1 - 1]; if (port_dev->early_stop) { if (port_dev->ignore_event) return true; /* * We want unsuccessful attempts to fail quickly. * Since some devices may need one failure during * port initialization, we allow two tries but no * more. */ if (retries < 2) return false; port_dev->ignore_event = 1; } else port_dev->ignore_event = 0; return port_dev->ignore_event; } /* Check if a port is power on */ int usb_port_is_power_on(struct usb_hub *hub, unsigned int portstatus) { int ret = 0; if (hub_is_superspeed(hub->hdev)) { if (portstatus & USB_SS_PORT_STAT_POWER) ret = 1; } else { if (portstatus & USB_PORT_STAT_POWER) ret = 1; } return ret; } static void usb_lock_port(struct usb_port *port_dev) __acquires(&port_dev->status_lock) { mutex_lock(&port_dev->status_lock); __acquire(&port_dev->status_lock); } static void usb_unlock_port(struct usb_port *port_dev) __releases(&port_dev->status_lock) { mutex_unlock(&port_dev->status_lock); __release(&port_dev->status_lock); } #ifdef CONFIG_PM /* Check if a port is suspended(USB2.0 port) or in U3 state(USB3.0 port) */ static int port_is_suspended(struct usb_hub *hub, unsigned portstatus) { int ret = 0; if (hub_is_superspeed(hub->hdev)) { if ((portstatus & USB_PORT_STAT_LINK_STATE) == USB_SS_PORT_LS_U3) ret = 1; } else { if (portstatus & USB_PORT_STAT_SUSPEND) ret = 1; } return ret; } /* Determine whether the device on a port is ready for a normal resume, * is ready for a reset-resume, or should be disconnected. */ static int check_port_resume_type(struct usb_device *udev, struct usb_hub *hub, int port1, int status, u16 portchange, u16 portstatus) { struct usb_port *port_dev = hub->ports[port1 - 1]; int retries = 3; retry: /* Is a warm reset needed to recover the connection? */ if (status == 0 && udev->reset_resume && hub_port_warm_reset_required(hub, port1, portstatus)) { /* pass */; } /* Is the device still present? */ else if (status || port_is_suspended(hub, portstatus) || !usb_port_is_power_on(hub, portstatus)) { if (status >= 0) status = -ENODEV; } else if (!(portstatus & USB_PORT_STAT_CONNECTION)) { if (retries--) { usleep_range(200, 300); status = usb_hub_port_status(hub, port1, &portstatus, &portchange); goto retry; } status = -ENODEV; } /* Can't do a normal resume if the port isn't enabled, * so try a reset-resume instead. */ else if (!(portstatus & USB_PORT_STAT_ENABLE) && !udev->reset_resume) { if (udev->persist_enabled) udev->reset_resume = 1; else status = -ENODEV; } if (status) { dev_dbg(&port_dev->dev, "status %04x.%04x after resume, %d\n", portchange, portstatus, status); } else if (udev->reset_resume) { /* Late port handoff can set status-change bits */ if (portchange & USB_PORT_STAT_C_CONNECTION) usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_C_CONNECTION); if (portchange & USB_PORT_STAT_C_ENABLE) usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_C_ENABLE); /* * Whatever made this reset-resume necessary may have * turned on the port1 bit in hub->change_bits. But after * a successful reset-resume we want the bit to be clear; * if it was on it would indicate that something happened * following the reset-resume. */ clear_bit(port1, hub->change_bits); } return status; } int usb_disable_ltm(struct usb_device *udev) { struct usb_hcd *hcd = bus_to_hcd(udev->bus); /* Check if the roothub and device supports LTM. */ if (!usb_device_supports_ltm(hcd->self.root_hub) || !usb_device_supports_ltm(udev)) return 0; /* Clear Feature LTM Enable can only be sent if the device is * configured. */ if (!udev->actconfig) return 0; return usb_control_msg(udev, usb_sndctrlpipe(udev, 0), USB_REQ_CLEAR_FEATURE, USB_RECIP_DEVICE, USB_DEVICE_LTM_ENABLE, 0, NULL, 0, USB_CTRL_SET_TIMEOUT); } EXPORT_SYMBOL_GPL(usb_disable_ltm); void usb_enable_ltm(struct usb_device *udev) { struct usb_hcd *hcd = bus_to_hcd(udev->bus); /* Check if the roothub and device supports LTM. */ if (!usb_device_supports_ltm(hcd->self.root_hub) || !usb_device_supports_ltm(udev)) return; /* Set Feature LTM Enable can only be sent if the device is * configured. */ if (!udev->actconfig) return; usb_control_msg(udev, usb_sndctrlpipe(udev, 0), USB_REQ_SET_FEATURE, USB_RECIP_DEVICE, USB_DEVICE_LTM_ENABLE, 0, NULL, 0, USB_CTRL_SET_TIMEOUT); } EXPORT_SYMBOL_GPL(usb_enable_ltm); /* * usb_enable_remote_wakeup - enable remote wakeup for a device * @udev: target device * * For USB-2 devices: Set the device's remote wakeup feature. * * For USB-3 devices: Assume there's only one function on the device and * enable remote wake for the first interface. FIXME if the interface * association descriptor shows there's more than one function. */ static int usb_enable_remote_wakeup(struct usb_device *udev) { if (udev->speed < USB_SPEED_SUPER) return usb_control_msg(udev, usb_sndctrlpipe(udev, 0), USB_REQ_SET_FEATURE, USB_RECIP_DEVICE, USB_DEVICE_REMOTE_WAKEUP, 0, NULL, 0, USB_CTRL_SET_TIMEOUT); else return usb_control_msg(udev, usb_sndctrlpipe(udev, 0), USB_REQ_SET_FEATURE, USB_RECIP_INTERFACE, USB_INTRF_FUNC_SUSPEND, USB_INTRF_FUNC_SUSPEND_RW | USB_INTRF_FUNC_SUSPEND_LP, NULL, 0, USB_CTRL_SET_TIMEOUT); } /* * usb_disable_remote_wakeup - disable remote wakeup for a device * @udev: target device * * For USB-2 devices: Clear the device's remote wakeup feature. * * For USB-3 devices: Assume there's only one function on the device and * disable remote wake for the first interface. FIXME if the interface * association descriptor shows there's more than one function. */ static int usb_disable_remote_wakeup(struct usb_device *udev) { if (udev->speed < USB_SPEED_SUPER) return usb_control_msg(udev, usb_sndctrlpipe(udev, 0), USB_REQ_CLEAR_FEATURE, USB_RECIP_DEVICE, USB_DEVICE_REMOTE_WAKEUP, 0, NULL, 0, USB_CTRL_SET_TIMEOUT); else return usb_control_msg(udev, usb_sndctrlpipe(udev, 0), USB_REQ_SET_FEATURE, USB_RECIP_INTERFACE, USB_INTRF_FUNC_SUSPEND, 0, NULL, 0, USB_CTRL_SET_TIMEOUT); } /* Count of wakeup-enabled devices at or below udev */ unsigned usb_wakeup_enabled_descendants(struct usb_device *udev) { struct usb_hub *hub = usb_hub_to_struct_hub(udev); return udev->do_remote_wakeup + (hub ? hub->wakeup_enabled_descendants : 0); } EXPORT_SYMBOL_GPL(usb_wakeup_enabled_descendants); /* * usb_port_suspend - suspend a usb device's upstream port * @udev: device that's no longer in active use, not a root hub * Context: must be able to sleep; device not locked; pm locks held * * Suspends a USB device that isn't in active use, conserving power. * Devices may wake out of a suspend, if anything important happens, * using the remote wakeup mechanism. They may also be taken out of * suspend by the host, using usb_port_resume(). It's also routine * to disconnect devices while they are suspended. * * This only affects the USB hardware for a device; its interfaces * (and, for hubs, child devices) must already have been suspended. * * Selective port suspend reduces power; most suspended devices draw * less than 500 uA. It's also used in OTG, along with remote wakeup. * All devices below the suspended port are also suspended. * * Devices leave suspend state when the host wakes them up. Some devices * also support "remote wakeup", where the device can activate the USB * tree above them to deliver data, such as a keypress or packet. In * some cases, this wakes the USB host. * * Suspending OTG devices may trigger HNP, if that's been enabled * between a pair of dual-role devices. That will change roles, such * as from A-Host to A-Peripheral or from B-Host back to B-Peripheral. * * Devices on USB hub ports have only one "suspend" state, corresponding * to ACPI D2, "may cause the device to lose some context". * State transitions include: * * - suspend, resume ... when the VBUS power link stays live * - suspend, disconnect ... VBUS lost * * Once VBUS drop breaks the circuit, the port it's using has to go through * normal re-enumeration procedures, starting with enabling VBUS power. * Other than re-initializing the hub (plug/unplug, except for root hubs), * Linux (2.6) currently has NO mechanisms to initiate that: no hub_wq * timer, no SRP, no requests through sysfs. * * If Runtime PM isn't enabled or used, non-SuperSpeed devices may not get * suspended until their bus goes into global suspend (i.e., the root * hub is suspended). Nevertheless, we change @udev->state to * USB_STATE_SUSPENDED as this is the device's "logical" state. The actual * upstream port setting is stored in @udev->port_is_suspended. * * Returns 0 on success, else negative errno. */ int usb_port_suspend(struct usb_device *udev, pm_message_t msg) { struct usb_hub *hub = usb_hub_to_struct_hub(udev->parent); struct usb_port *port_dev = hub->ports[udev->portnum - 1]; int port1 = udev->portnum; int status; bool really_suspend = true; usb_lock_port(port_dev); /* enable remote wakeup when appropriate; this lets the device * wake up the upstream hub (including maybe the root hub). * * NOTE: OTG devices may issue remote wakeup (or SRP) even when * we don't explicitly enable it here. */ if (udev->do_remote_wakeup) { status = usb_enable_remote_wakeup(udev); if (status) { dev_dbg(&udev->dev, "won't remote wakeup, status %d\n", status); /* bail if autosuspend is requested */ if (PMSG_IS_AUTO(msg)) goto err_wakeup; } } /* disable USB2 hardware LPM */ usb_disable_usb2_hardware_lpm(udev); if (usb_disable_ltm(udev)) { dev_err(&udev->dev, "Failed to disable LTM before suspend\n"); status = -ENOMEM; if (PMSG_IS_AUTO(msg)) goto err_ltm; } /* see 7.1.7.6 */ if (hub_is_superspeed(hub->hdev)) status = hub_set_port_link_state(hub, port1, USB_SS_PORT_LS_U3); /* * For system suspend, we do not need to enable the suspend feature * on individual USB-2 ports. The devices will automatically go * into suspend a few ms after the root hub stops sending packets. * The USB 2.0 spec calls this "global suspend". * * However, many USB hubs have a bug: They don't relay wakeup requests * from a downstream port if the port's suspend feature isn't on. * Therefore we will turn on the suspend feature if udev or any of its * descendants is enabled for remote wakeup. */ else if (PMSG_IS_AUTO(msg) || usb_wakeup_enabled_descendants(udev) > 0) status = set_port_feature(hub->hdev, port1, USB_PORT_FEAT_SUSPEND); else { really_suspend = false; status = 0; } if (status) { /* Check if the port has been suspended for the timeout case * to prevent the suspended port from incorrect handling. */ if (status == -ETIMEDOUT) { int ret; u16 portstatus, portchange; portstatus = portchange = 0; ret = usb_hub_port_status(hub, port1, &portstatus, &portchange); dev_dbg(&port_dev->dev, "suspend timeout, status %04x\n", portstatus); if (ret == 0 && port_is_suspended(hub, portstatus)) { status = 0; goto suspend_done; } } dev_dbg(&port_dev->dev, "can't suspend, status %d\n", status); /* Try to enable USB3 LTM again */ usb_enable_ltm(udev); err_ltm: /* Try to enable USB2 hardware LPM again */ usb_enable_usb2_hardware_lpm(udev); if (udev->do_remote_wakeup) (void) usb_disable_remote_wakeup(udev); err_wakeup: /* System sleep transitions should never fail */ if (!PMSG_IS_AUTO(msg)) status = 0; } else { suspend_done: dev_dbg(&udev->dev, "usb %ssuspend, wakeup %d\n", (PMSG_IS_AUTO(msg) ? "auto-" : ""), udev->do_remote_wakeup); if (really_suspend) { udev->port_is_suspended = 1; /* device has up to 10 msec to fully suspend */ msleep(10); } usb_set_device_state(udev, USB_STATE_SUSPENDED); } if (status == 0 && !udev->do_remote_wakeup && udev->persist_enabled && test_and_clear_bit(port1, hub->child_usage_bits)) pm_runtime_put_sync(&port_dev->dev); usb_mark_last_busy(hub->hdev); usb_unlock_port(port_dev); return status; } /* * If the USB "suspend" state is in use (rather than "global suspend"), * many devices will be individually taken out of suspend state using * special "resume" signaling. This routine kicks in shortly after * hardware resume signaling is finished, either because of selective * resume (by host) or remote wakeup (by device) ... now see what changed * in the tree that's rooted at this device. * * If @udev->reset_resume is set then the device is reset before the * status check is done. */ static int finish_port_resume(struct usb_device *udev) { int status = 0; u16 devstatus = 0; /* caller owns the udev device lock */ dev_dbg(&udev->dev, "%s\n", udev->reset_resume ? "finish reset-resume" : "finish resume"); /* usb ch9 identifies four variants of SUSPENDED, based on what * state the device resumes to. Linux currently won't see the * first two on the host side; they'd be inside hub_port_init() * during many timeouts, but hub_wq can't suspend until later. */ usb_set_device_state(udev, udev->actconfig ? USB_STATE_CONFIGURED : USB_STATE_ADDRESS); /* 10.5.4.5 says not to reset a suspended port if the attached * device is enabled for remote wakeup. Hence the reset * operation is carried out here, after the port has been * resumed. */ if (udev->reset_resume) { /* * If the device morphs or switches modes when it is reset, * we don't want to perform a reset-resume. We'll fail the * resume, which will cause a logical disconnect, and then * the device will be rediscovered. */ retry_reset_resume: if (udev->quirks & USB_QUIRK_RESET) status = -ENODEV; else status = usb_reset_and_verify_device(udev); } /* 10.5.4.5 says be sure devices in the tree are still there. * For now let's assume the device didn't go crazy on resume, * and device drivers will know about any resume quirks. */ if (status == 0) { devstatus = 0; status = usb_get_std_status(udev, USB_RECIP_DEVICE, 0, &devstatus); /* If a normal resume failed, try doing a reset-resume */ if (status && !udev->reset_resume && udev->persist_enabled) { dev_dbg(&udev->dev, "retry with reset-resume\n"); udev->reset_resume = 1; goto retry_reset_resume; } } if (status) { dev_dbg(&udev->dev, "gone after usb resume? status %d\n", status); /* * There are a few quirky devices which violate the standard * by claiming to have remote wakeup enabled after a reset, * which crash if the feature is cleared, hence check for * udev->reset_resume */ } else if (udev->actconfig && !udev->reset_resume) { if (udev->speed < USB_SPEED_SUPER) { if (devstatus & (1 << USB_DEVICE_REMOTE_WAKEUP)) status = usb_disable_remote_wakeup(udev); } else { status = usb_get_std_status(udev, USB_RECIP_INTERFACE, 0, &devstatus); if (!status && devstatus & (USB_INTRF_STAT_FUNC_RW_CAP | USB_INTRF_STAT_FUNC_RW)) status = usb_disable_remote_wakeup(udev); } if (status) dev_dbg(&udev->dev, "disable remote wakeup, status %d\n", status); status = 0; } return status; } /* * There are some SS USB devices which take longer time for link training. * XHCI specs 4.19.4 says that when Link training is successful, port * sets CCS bit to 1. So if SW reads port status before successful link * training, then it will not find device to be present. * USB Analyzer log with such buggy devices show that in some cases * device switch on the RX termination after long delay of host enabling * the VBUS. In few other cases it has been seen that device fails to * negotiate link training in first attempt. It has been * reported till now that few devices take as long as 2000 ms to train * the link after host enabling its VBUS and termination. Following * routine implements a 2000 ms timeout for link training. If in a case * link trains before timeout, loop will exit earlier. * * There are also some 2.0 hard drive based devices and 3.0 thumb * drives that, when plugged into a 2.0 only port, take a long * time to set CCS after VBUS enable. * * FIXME: If a device was connected before suspend, but was removed * while system was asleep, then the loop in the following routine will * only exit at timeout. * * This routine should only be called when persist is enabled. */ static int wait_for_connected(struct usb_device *udev, struct usb_hub *hub, int port1, u16 *portchange, u16 *portstatus) { int status = 0, delay_ms = 0; while (delay_ms < 2000) { if (status || *portstatus & USB_PORT_STAT_CONNECTION) break; if (!usb_port_is_power_on(hub, *portstatus)) { status = -ENODEV; break; } msleep(20); delay_ms += 20; status = usb_hub_port_status(hub, port1, portstatus, portchange); } dev_dbg(&udev->dev, "Waited %dms for CONNECT\n", delay_ms); return status; } /* * usb_port_resume - re-activate a suspended usb device's upstream port * @udev: device to re-activate, not a root hub * Context: must be able to sleep; device not locked; pm locks held * * This will re-activate the suspended device, increasing power usage * while letting drivers communicate again with its endpoints. * USB resume explicitly guarantees that the power session between * the host and the device is the same as it was when the device * suspended. * * If @udev->reset_resume is set then this routine won't check that the * port is still enabled. Furthermore, finish_port_resume() above will * reset @udev. The end result is that a broken power session can be * recovered and @udev will appear to persist across a loss of VBUS power. * * For example, if a host controller doesn't maintain VBUS suspend current * during a system sleep or is reset when the system wakes up, all the USB * power sessions below it will be broken. This is especially troublesome * for mass-storage devices containing mounted filesystems, since the * device will appear to have disconnected and all the memory mappings * to it will be lost. Using the USB_PERSIST facility, the device can be * made to appear as if it had not disconnected. * * This facility can be dangerous. Although usb_reset_and_verify_device() makes * every effort to insure that the same device is present after the * reset as before, it cannot provide a 100% guarantee. Furthermore it's * quite possible for a device to remain unaltered but its media to be * changed. If the user replaces a flash memory card while the system is * asleep, he will have only himself to blame when the filesystem on the * new card is corrupted and the system crashes. * * Returns 0 on success, else negative errno. */ int usb_port_resume(struct usb_device *udev, pm_message_t msg) { struct usb_hub *hub = usb_hub_to_struct_hub(udev->parent); struct usb_port *port_dev = hub->ports[udev->portnum - 1]; int port1 = udev->portnum; int status; u16 portchange, portstatus; if (!test_and_set_bit(port1, hub->child_usage_bits)) { status = pm_runtime_resume_and_get(&port_dev->dev); if (status < 0) { dev_dbg(&udev->dev, "can't resume usb port, status %d\n", status); return status; } } usb_lock_port(port_dev); /* Skip the initial Clear-Suspend step for a remote wakeup */ status = usb_hub_port_status(hub, port1, &portstatus, &portchange); if (status == 0 && !port_is_suspended(hub, portstatus)) { if (portchange & USB_PORT_STAT_C_SUSPEND) pm_wakeup_event(&udev->dev, 0); goto SuspendCleared; } /* see 7.1.7.7; affects power usage, but not budgeting */ if (hub_is_superspeed(hub->hdev)) status = hub_set_port_link_state(hub, port1, USB_SS_PORT_LS_U0); else status = usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_SUSPEND); if (status) { dev_dbg(&port_dev->dev, "can't resume, status %d\n", status); } else { /* drive resume for USB_RESUME_TIMEOUT msec */ dev_dbg(&udev->dev, "usb %sresume\n", (PMSG_IS_AUTO(msg) ? "auto-" : "")); msleep(USB_RESUME_TIMEOUT); /* Virtual root hubs can trigger on GET_PORT_STATUS to * stop resume signaling. Then finish the resume * sequence. */ status = usb_hub_port_status(hub, port1, &portstatus, &portchange); } SuspendCleared: if (status == 0) { udev->port_is_suspended = 0; if (hub_is_superspeed(hub->hdev)) { if (portchange & USB_PORT_STAT_C_LINK_STATE) usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_C_PORT_LINK_STATE); } else { if (portchange & USB_PORT_STAT_C_SUSPEND) usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_C_SUSPEND); } /* TRSMRCY = 10 msec */ msleep(10); } if (udev->persist_enabled) status = wait_for_connected(udev, hub, port1, &portchange, &portstatus); status = check_port_resume_type(udev, hub, port1, status, portchange, portstatus); if (status == 0) status = finish_port_resume(udev); if (status < 0) { dev_dbg(&udev->dev, "can't resume, status %d\n", status); hub_port_logical_disconnect(hub, port1); } else { /* Try to enable USB2 hardware LPM */ usb_enable_usb2_hardware_lpm(udev); /* Try to enable USB3 LTM */ usb_enable_ltm(udev); } usb_unlock_port(port_dev); return status; } int usb_remote_wakeup(struct usb_device *udev) { int status = 0; usb_lock_device(udev); if (udev->state == USB_STATE_SUSPENDED) { dev_dbg(&udev->dev, "usb %sresume\n", "wakeup-"); status = usb_autoresume_device(udev); if (status == 0) { /* Let the drivers do their thing, then... */ usb_autosuspend_device(udev); } } usb_unlock_device(udev); return status; } /* Returns 1 if there was a remote wakeup and a connect status change. */ static int hub_handle_remote_wakeup(struct usb_hub *hub, unsigned int port, u16 portstatus, u16 portchange) __must_hold(&port_dev->status_lock) { struct usb_port *port_dev = hub->ports[port - 1]; struct usb_device *hdev; struct usb_device *udev; int connect_change = 0; u16 link_state; int ret; hdev = hub->hdev; udev = port_dev->child; if (!hub_is_superspeed(hdev)) { if (!(portchange & USB_PORT_STAT_C_SUSPEND)) return 0; usb_clear_port_feature(hdev, port, USB_PORT_FEAT_C_SUSPEND); } else { link_state = portstatus & USB_PORT_STAT_LINK_STATE; if (!udev || udev->state != USB_STATE_SUSPENDED || (link_state != USB_SS_PORT_LS_U0 && link_state != USB_SS_PORT_LS_U1 && link_state != USB_SS_PORT_LS_U2)) return 0; } if (udev) { /* TRSMRCY = 10 msec */ msleep(10); usb_unlock_port(port_dev); ret = usb_remote_wakeup(udev); usb_lock_port(port_dev); if (ret < 0) connect_change = 1; } else { ret = -ENODEV; hub_port_disable(hub, port, 1); } dev_dbg(&port_dev->dev, "resume, status %d\n", ret); return connect_change; } static int check_ports_changed(struct usb_hub *hub) { int port1; for (port1 = 1; port1 <= hub->hdev->maxchild; ++port1) { u16 portstatus, portchange; int status; status = usb_hub_port_status(hub, port1, &portstatus, &portchange); if (!status && portchange) return 1; } return 0; } static int hub_suspend(struct usb_interface *intf, pm_message_t msg) { struct usb_hub *hub = usb_get_intfdata(intf); struct usb_device *hdev = hub->hdev; unsigned port1; /* * Warn if children aren't already suspended. * Also, add up the number of wakeup-enabled descendants. */ hub->wakeup_enabled_descendants = 0; for (port1 = 1; port1 <= hdev->maxchild; port1++) { struct usb_port *port_dev = hub->ports[port1 - 1]; struct usb_device *udev = port_dev->child; if (udev && udev->can_submit) { dev_warn(&port_dev->dev, "device %s not suspended yet\n", dev_name(&udev->dev)); if (PMSG_IS_AUTO(msg)) return -EBUSY; } if (udev) hub->wakeup_enabled_descendants += usb_wakeup_enabled_descendants(udev); } if (hdev->do_remote_wakeup && hub->quirk_check_port_auto_suspend) { /* check if there are changes pending on hub ports */ if (check_ports_changed(hub)) { if (PMSG_IS_AUTO(msg)) return -EBUSY; pm_wakeup_event(&hdev->dev, 2000); } } if (hub_is_superspeed(hdev) && hdev->do_remote_wakeup) { /* Enable hub to send remote wakeup for all ports. */ for (port1 = 1; port1 <= hdev->maxchild; port1++) { set_port_feature(hdev, port1 | USB_PORT_FEAT_REMOTE_WAKE_CONNECT | USB_PORT_FEAT_REMOTE_WAKE_DISCONNECT | USB_PORT_FEAT_REMOTE_WAKE_OVER_CURRENT, USB_PORT_FEAT_REMOTE_WAKE_MASK); } } dev_dbg(&intf->dev, "%s\n", __func__); /* stop hub_wq and related activity */ hub_quiesce(hub, HUB_SUSPEND); return 0; } /* Report wakeup requests from the ports of a resuming root hub */ static void report_wakeup_requests(struct usb_hub *hub) { struct usb_device *hdev = hub->hdev; struct usb_device *udev; struct usb_hcd *hcd; unsigned long resuming_ports; int i; if (hdev->parent) return; /* Not a root hub */ hcd = bus_to_hcd(hdev->bus); if (hcd->driver->get_resuming_ports) { /* * The get_resuming_ports() method returns a bitmap (origin 0) * of ports which have started wakeup signaling but have not * yet finished resuming. During system resume we will * resume all the enabled ports, regardless of any wakeup * signals, which means the wakeup requests would be lost. * To prevent this, report them to the PM core here. */ resuming_ports = hcd->driver->get_resuming_ports(hcd); for (i = 0; i < hdev->maxchild; ++i) { if (test_bit(i, &resuming_ports)) { udev = hub->ports[i]->child; if (udev) pm_wakeup_event(&udev->dev, 0); } } } } static int hub_resume(struct usb_interface *intf) { struct usb_hub *hub = usb_get_intfdata(intf); dev_dbg(&intf->dev, "%s\n", __func__); hub_activate(hub, HUB_RESUME); /* * This should be called only for system resume, not runtime resume. * We can't tell the difference here, so some wakeup requests will be * reported at the wrong time or more than once. This shouldn't * matter much, so long as they do get reported. */ report_wakeup_requests(hub); return 0; } static int hub_reset_resume(struct usb_interface *intf) { struct usb_hub *hub = usb_get_intfdata(intf); dev_dbg(&intf->dev, "%s\n", __func__); hub_activate(hub, HUB_RESET_RESUME); return 0; } /** * usb_root_hub_lost_power - called by HCD if the root hub lost Vbus power * @rhdev: struct usb_device for the root hub * * The USB host controller driver calls this function when its root hub * is resumed and Vbus power has been interrupted or the controller * has been reset. The routine marks @rhdev as having lost power. * When the hub driver is resumed it will take notice and carry out * power-session recovery for all the "USB-PERSIST"-enabled child devices; * the others will be disconnected. */ void usb_root_hub_lost_power(struct usb_device *rhdev) { dev_notice(&rhdev->dev, "root hub lost power or was reset\n"); rhdev->reset_resume = 1; } EXPORT_SYMBOL_GPL(usb_root_hub_lost_power); static const char * const usb3_lpm_names[] = { "U0", "U1", "U2", "U3", }; /* * Send a Set SEL control transfer to the device, prior to enabling * device-initiated U1 or U2. This lets the device know the exit latencies from * the time the device initiates a U1 or U2 exit, to the time it will receive a * packet from the host. * * This function will fail if the SEL or PEL values for udev are greater than * the maximum allowed values for the link state to be enabled. */ static int usb_req_set_sel(struct usb_device *udev) { struct usb_set_sel_req *sel_values; unsigned long long u1_sel; unsigned long long u1_pel; unsigned long long u2_sel; unsigned long long u2_pel; int ret; if (!udev->parent || udev->speed < USB_SPEED_SUPER || !udev->lpm_capable) return 0; /* Convert SEL and PEL stored in ns to us */ u1_sel = DIV_ROUND_UP(udev->u1_params.sel, 1000); u1_pel = DIV_ROUND_UP(udev->u1_params.pel, 1000); u2_sel = DIV_ROUND_UP(udev->u2_params.sel, 1000); u2_pel = DIV_ROUND_UP(udev->u2_params.pel, 1000); /* * Make sure that the calculated SEL and PEL values for the link * state we're enabling aren't bigger than the max SEL/PEL * value that will fit in the SET SEL control transfer. * Otherwise the device would get an incorrect idea of the exit * latency for the link state, and could start a device-initiated * U1/U2 when the exit latencies are too high. */ if (u1_sel > USB3_LPM_MAX_U1_SEL_PEL || u1_pel > USB3_LPM_MAX_U1_SEL_PEL || u2_sel > USB3_LPM_MAX_U2_SEL_PEL || u2_pel > USB3_LPM_MAX_U2_SEL_PEL) { dev_dbg(&udev->dev, "Device-initiated U1/U2 disabled due to long SEL or PEL\n"); return -EINVAL; } /* * usb_enable_lpm() can be called as part of a failed device reset, * which may be initiated by an error path of a mass storage driver. * Therefore, use GFP_NOIO. */ sel_values = kmalloc(sizeof *(sel_values), GFP_NOIO); if (!sel_values) return -ENOMEM; sel_values->u1_sel = u1_sel; sel_values->u1_pel = u1_pel; sel_values->u2_sel = cpu_to_le16(u2_sel); sel_values->u2_pel = cpu_to_le16(u2_pel); ret = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), USB_REQ_SET_SEL, USB_RECIP_DEVICE, 0, 0, sel_values, sizeof *(sel_values), USB_CTRL_SET_TIMEOUT); kfree(sel_values); if (ret > 0) udev->lpm_devinit_allow = 1; return ret; } /* * Enable or disable device-initiated U1 or U2 transitions. */ static int usb_set_device_initiated_lpm(struct usb_device *udev, enum usb3_link_state state, bool enable) { int ret; int feature; switch (state) { case USB3_LPM_U1: feature = USB_DEVICE_U1_ENABLE; break; case USB3_LPM_U2: feature = USB_DEVICE_U2_ENABLE; break; default: dev_warn(&udev->dev, "%s: Can't %s non-U1 or U2 state.\n", __func__, str_enable_disable(enable)); return -EINVAL; } if (udev->state != USB_STATE_CONFIGURED) { dev_dbg(&udev->dev, "%s: Can't %s %s state " "for unconfigured device.\n", __func__, str_enable_disable(enable), usb3_lpm_names[state]); return -EINVAL; } if (enable) { /* * Now send the control transfer to enable device-initiated LPM * for either U1 or U2. */ ret = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), USB_REQ_SET_FEATURE, USB_RECIP_DEVICE, feature, 0, NULL, 0, USB_CTRL_SET_TIMEOUT); } else { ret = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), USB_REQ_CLEAR_FEATURE, USB_RECIP_DEVICE, feature, 0, NULL, 0, USB_CTRL_SET_TIMEOUT); } if (ret < 0) { dev_warn(&udev->dev, "%s of device-initiated %s failed.\n", str_enable_disable(enable), usb3_lpm_names[state]); return -EBUSY; } return 0; } static int usb_set_lpm_timeout(struct usb_device *udev, enum usb3_link_state state, int timeout) { int ret; int feature; switch (state) { case USB3_LPM_U1: feature = USB_PORT_FEAT_U1_TIMEOUT; break; case USB3_LPM_U2: feature = USB_PORT_FEAT_U2_TIMEOUT; break; default: dev_warn(&udev->dev, "%s: Can't set timeout for non-U1 or U2 state.\n", __func__); return -EINVAL; } if (state == USB3_LPM_U1 && timeout > USB3_LPM_U1_MAX_TIMEOUT && timeout != USB3_LPM_DEVICE_INITIATED) { dev_warn(&udev->dev, "Failed to set %s timeout to 0x%x, " "which is a reserved value.\n", usb3_lpm_names[state], timeout); return -EINVAL; } ret = set_port_feature(udev->parent, USB_PORT_LPM_TIMEOUT(timeout) | udev->portnum, feature); if (ret < 0) { dev_warn(&udev->dev, "Failed to set %s timeout to 0x%x," "error code %i\n", usb3_lpm_names[state], timeout, ret); return -EBUSY; } if (state == USB3_LPM_U1) udev->u1_params.timeout = timeout; else udev->u2_params.timeout = timeout; return 0; } /* * Don't allow device intiated U1/U2 if device isn't in the configured state, * or the system exit latency + one bus interval is greater than the minimum * service interval of any active periodic endpoint. See USB 3.2 section 9.4.9 */ static bool usb_device_may_initiate_lpm(struct usb_device *udev, enum usb3_link_state state) { unsigned int sel; /* us */ int i, j; if (!udev->lpm_devinit_allow || !udev->actconfig) return false; if (state == USB3_LPM_U1) sel = DIV_ROUND_UP(udev->u1_params.sel, 1000); else if (state == USB3_LPM_U2) sel = DIV_ROUND_UP(udev->u2_params.sel, 1000); else return false; for (i = 0; i < udev->actconfig->desc.bNumInterfaces; i++) { struct usb_interface *intf; struct usb_endpoint_descriptor *desc; unsigned int interval; intf = udev->actconfig->interface[i]; if (!intf) continue; for (j = 0; j < intf->cur_altsetting->desc.bNumEndpoints; j++) { desc = &intf->cur_altsetting->endpoint[j].desc; if (usb_endpoint_xfer_int(desc) || usb_endpoint_xfer_isoc(desc)) { interval = (1 << (desc->bInterval - 1)) * 125; if (sel + 125 > interval) return false; } } } return true; } /* * Enable the hub-initiated U1/U2 idle timeouts, and enable device-initiated * U1/U2 entry. * * We will attempt to enable U1 or U2, but there are no guarantees that the * control transfers to set the hub timeout or enable device-initiated U1/U2 * will be successful. * * If the control transfer to enable device-initiated U1/U2 entry fails, then * hub-initiated U1/U2 will be disabled. * * If we cannot set the parent hub U1/U2 timeout, we attempt to let the xHCI * driver know about it. If that call fails, it should be harmless, and just * take up more slightly more bus bandwidth for unnecessary U1/U2 exit latency. */ static int usb_enable_link_state(struct usb_hcd *hcd, struct usb_device *udev, enum usb3_link_state state) { int timeout; __u8 u1_mel; __le16 u2_mel; /* Skip if the device BOS descriptor couldn't be read */ if (!udev->bos) return -EINVAL; u1_mel = udev->bos->ss_cap->bU1devExitLat; u2_mel = udev->bos->ss_cap->bU2DevExitLat; /* If the device says it doesn't have *any* exit latency to come out of * U1 or U2, it's probably lying. Assume it doesn't implement that link * state. */ if ((state == USB3_LPM_U1 && u1_mel == 0) || (state == USB3_LPM_U2 && u2_mel == 0)) return -EINVAL; /* We allow the host controller to set the U1/U2 timeout internally * first, so that it can change its schedule to account for the * additional latency to send data to a device in a lower power * link state. */ timeout = hcd->driver->enable_usb3_lpm_timeout(hcd, udev, state); /* xHCI host controller doesn't want to enable this LPM state. */ if (timeout == 0) return -EINVAL; if (timeout < 0) { dev_warn(&udev->dev, "Could not enable %s link state, " "xHCI error %i.\n", usb3_lpm_names[state], timeout); return timeout; } if (usb_set_lpm_timeout(udev, state, timeout)) { /* If we can't set the parent hub U1/U2 timeout, * device-initiated LPM won't be allowed either, so let the xHCI * host know that this link state won't be enabled. */ hcd->driver->disable_usb3_lpm_timeout(hcd, udev, state); return -EBUSY; } if (state == USB3_LPM_U1) udev->usb3_lpm_u1_enabled = 1; else if (state == USB3_LPM_U2) udev->usb3_lpm_u2_enabled = 1; return 0; } /* * Disable the hub-initiated U1/U2 idle timeouts, and disable device-initiated * U1/U2 entry. * * If this function returns -EBUSY, the parent hub will still allow U1/U2 entry. * If zero is returned, the parent will not allow the link to go into U1/U2. * * If zero is returned, device-initiated U1/U2 entry may still be enabled, but * it won't have an effect on the bus link state because the parent hub will * still disallow device-initiated U1/U2 entry. * * If zero is returned, the xHCI host controller may still think U1/U2 entry is * possible. The result will be slightly more bus bandwidth will be taken up * (to account for U1/U2 exit latency), but it should be harmless. */ static int usb_disable_link_state(struct usb_hcd *hcd, struct usb_device *udev, enum usb3_link_state state) { switch (state) { case USB3_LPM_U1: case USB3_LPM_U2: break; default: dev_warn(&udev->dev, "%s: Can't disable non-U1 or U2 state.\n", __func__); return -EINVAL; } if (usb_set_lpm_timeout(udev, state, 0)) return -EBUSY; if (hcd->driver->disable_usb3_lpm_timeout(hcd, udev, state)) dev_warn(&udev->dev, "Could not disable xHCI %s timeout, " "bus schedule bandwidth may be impacted.\n", usb3_lpm_names[state]); /* As soon as usb_set_lpm_timeout(0) return 0, hub initiated LPM * is disabled. Hub will disallows link to enter U1/U2 as well, * even device is initiating LPM. Hence LPM is disabled if hub LPM * timeout set to 0, no matter device-initiated LPM is disabled or * not. */ if (state == USB3_LPM_U1) udev->usb3_lpm_u1_enabled = 0; else if (state == USB3_LPM_U2) udev->usb3_lpm_u2_enabled = 0; return 0; } /* * Disable hub-initiated and device-initiated U1 and U2 entry. * Caller must own the bandwidth_mutex. * * This will call usb_enable_lpm() on failure, which will decrement * lpm_disable_count, and will re-enable LPM if lpm_disable_count reaches zero. */ int usb_disable_lpm(struct usb_device *udev) { struct usb_hcd *hcd; int err; if (!udev || !udev->parent || udev->speed < USB_SPEED_SUPER || !udev->lpm_capable || udev->state < USB_STATE_CONFIGURED) return 0; hcd = bus_to_hcd(udev->bus); if (!hcd || !hcd->driver->disable_usb3_lpm_timeout) return 0; udev->lpm_disable_count++; if ((udev->u1_params.timeout == 0 && udev->u2_params.timeout == 0)) return 0; /* If LPM is enabled, attempt to disable it. */ if (usb_disable_link_state(hcd, udev, USB3_LPM_U1)) goto disable_failed; if (usb_disable_link_state(hcd, udev, USB3_LPM_U2)) goto disable_failed; err = usb_set_device_initiated_lpm(udev, USB3_LPM_U1, false); if (!err) usb_set_device_initiated_lpm(udev, USB3_LPM_U2, false); return 0; disable_failed: udev->lpm_disable_count--; return -EBUSY; } EXPORT_SYMBOL_GPL(usb_disable_lpm); /* Grab the bandwidth_mutex before calling usb_disable_lpm() */ int usb_unlocked_disable_lpm(struct usb_device *udev) { struct usb_hcd *hcd = bus_to_hcd(udev->bus); int ret; if (!hcd) return -EINVAL; mutex_lock(hcd->bandwidth_mutex); ret = usb_disable_lpm(udev); mutex_unlock(hcd->bandwidth_mutex); return ret; } EXPORT_SYMBOL_GPL(usb_unlocked_disable_lpm); /* * Attempt to enable device-initiated and hub-initiated U1 and U2 entry. The * xHCI host policy may prevent U1 or U2 from being enabled. * * Other callers may have disabled link PM, so U1 and U2 entry will be disabled * until the lpm_disable_count drops to zero. Caller must own the * bandwidth_mutex. */ void usb_enable_lpm(struct usb_device *udev) { struct usb_hcd *hcd; struct usb_hub *hub; struct usb_port *port_dev; if (!udev || !udev->parent || udev->speed < USB_SPEED_SUPER || !udev->lpm_capable || udev->state < USB_STATE_CONFIGURED) return; udev->lpm_disable_count--; hcd = bus_to_hcd(udev->bus); /* Double check that we can both enable and disable LPM. * Device must be configured to accept set feature U1/U2 timeout. */ if (!hcd || !hcd->driver->enable_usb3_lpm_timeout || !hcd->driver->disable_usb3_lpm_timeout) return; if (udev->lpm_disable_count > 0) return; hub = usb_hub_to_struct_hub(udev->parent); if (!hub) return; port_dev = hub->ports[udev->portnum - 1]; if (port_dev->usb3_lpm_u1_permit) if (usb_enable_link_state(hcd, udev, USB3_LPM_U1)) return; if (port_dev->usb3_lpm_u2_permit) if (usb_enable_link_state(hcd, udev, USB3_LPM_U2)) return; /* * Enable device initiated U1/U2 with a SetFeature(U1/U2_ENABLE) request * if system exit latency is short enough and device is configured */ if (usb_device_may_initiate_lpm(udev, USB3_LPM_U1)) { if (usb_set_device_initiated_lpm(udev, USB3_LPM_U1, true)) return; if (usb_device_may_initiate_lpm(udev, USB3_LPM_U2)) usb_set_device_initiated_lpm(udev, USB3_LPM_U2, true); } } EXPORT_SYMBOL_GPL(usb_enable_lpm); /* Grab the bandwidth_mutex before calling usb_enable_lpm() */ void usb_unlocked_enable_lpm(struct usb_device *udev) { struct usb_hcd *hcd = bus_to_hcd(udev->bus); if (!hcd) return; mutex_lock(hcd->bandwidth_mutex); usb_enable_lpm(udev); mutex_unlock(hcd->bandwidth_mutex); } EXPORT_SYMBOL_GPL(usb_unlocked_enable_lpm); /* usb3 devices use U3 for disabled, make sure remote wakeup is disabled */ static void hub_usb3_port_prepare_disable(struct usb_hub *hub, struct usb_port *port_dev) { struct usb_device *udev = port_dev->child; int ret; if (udev && udev->port_is_suspended && udev->do_remote_wakeup) { ret = hub_set_port_link_state(hub, port_dev->portnum, USB_SS_PORT_LS_U0); if (!ret) { msleep(USB_RESUME_TIMEOUT); ret = usb_disable_remote_wakeup(udev); } if (ret) dev_warn(&udev->dev, "Port disable: can't disable remote wake\n"); udev->do_remote_wakeup = 0; } } #else /* CONFIG_PM */ #define hub_suspend NULL #define hub_resume NULL #define hub_reset_resume NULL static inline void hub_usb3_port_prepare_disable(struct usb_hub *hub, struct usb_port *port_dev) { } int usb_disable_lpm(struct usb_device *udev) { return 0; } EXPORT_SYMBOL_GPL(usb_disable_lpm); void usb_enable_lpm(struct usb_device *udev) { } EXPORT_SYMBOL_GPL(usb_enable_lpm); int usb_unlocked_disable_lpm(struct usb_device *udev) { return 0; } EXPORT_SYMBOL_GPL(usb_unlocked_disable_lpm); void usb_unlocked_enable_lpm(struct usb_device *udev) { } EXPORT_SYMBOL_GPL(usb_unlocked_enable_lpm); int usb_disable_ltm(struct usb_device *udev) { return 0; } EXPORT_SYMBOL_GPL(usb_disable_ltm); void usb_enable_ltm(struct usb_device *udev) { } EXPORT_SYMBOL_GPL(usb_enable_ltm); static int hub_handle_remote_wakeup(struct usb_hub *hub, unsigned int port, u16 portstatus, u16 portchange) { return 0; } static int usb_req_set_sel(struct usb_device *udev) { return 0; } #endif /* CONFIG_PM */ /* * USB-3 does not have a similar link state as USB-2 that will avoid negotiating * a connection with a plugged-in cable but will signal the host when the cable * is unplugged. Disable remote wake and set link state to U3 for USB-3 devices */ static int hub_port_disable(struct usb_hub *hub, int port1, int set_state) { struct usb_port *port_dev = hub->ports[port1 - 1]; struct usb_device *hdev = hub->hdev; int ret = 0; if (!hub->error) { if (hub_is_superspeed(hub->hdev)) { hub_usb3_port_prepare_disable(hub, port_dev); ret = hub_set_port_link_state(hub, port_dev->portnum, USB_SS_PORT_LS_U3); } else { ret = usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_ENABLE); } } if (port_dev->child && set_state) usb_set_device_state(port_dev->child, USB_STATE_NOTATTACHED); if (ret && ret != -ENODEV) dev_err(&port_dev->dev, "cannot disable (err = %d)\n", ret); return ret; } /* * usb_port_disable - disable a usb device's upstream port * @udev: device to disable * Context: @udev locked, must be able to sleep. * * Disables a USB device that isn't in active use. */ int usb_port_disable(struct usb_device *udev) { struct usb_hub *hub = usb_hub_to_struct_hub(udev->parent); return hub_port_disable(hub, udev->portnum, 0); } /* USB 2.0 spec, 7.1.7.3 / fig 7-29: * * Between connect detection and reset signaling there must be a delay * of 100ms at least for debounce and power-settling. The corresponding * timer shall restart whenever the downstream port detects a disconnect. * * Apparently there are some bluetooth and irda-dongles and a number of * low-speed devices for which this debounce period may last over a second. * Not covered by the spec - but easy to deal with. * * This implementation uses a 1500ms total debounce timeout; if the * connection isn't stable by then it returns -ETIMEDOUT. It checks * every 25ms for transient disconnects. When the port status has been * unchanged for 100ms it returns the port status. */ int hub_port_debounce(struct usb_hub *hub, int port1, bool must_be_connected) { int ret; u16 portchange, portstatus; unsigned connection = 0xffff; int total_time, stable_time = 0; struct usb_port *port_dev = hub->ports[port1 - 1]; for (total_time = 0; ; total_time += HUB_DEBOUNCE_STEP) { ret = usb_hub_port_status(hub, port1, &portstatus, &portchange); if (ret < 0) return ret; if (!(portchange & USB_PORT_STAT_C_CONNECTION) && (portstatus & USB_PORT_STAT_CONNECTION) == connection) { if (!must_be_connected || (connection == USB_PORT_STAT_CONNECTION)) stable_time += HUB_DEBOUNCE_STEP; if (stable_time >= HUB_DEBOUNCE_STABLE) break; } else { stable_time = 0; connection = portstatus & USB_PORT_STAT_CONNECTION; } if (portchange & USB_PORT_STAT_C_CONNECTION) { usb_clear_port_feature(hub->hdev, port1, USB_PORT_FEAT_C_CONNECTION); } if (total_time >= HUB_DEBOUNCE_TIMEOUT) break; msleep(HUB_DEBOUNCE_STEP); } dev_dbg(&port_dev->dev, "debounce total %dms stable %dms status 0x%x\n", total_time, stable_time, portstatus); if (stable_time < HUB_DEBOUNCE_STABLE) return -ETIMEDOUT; return portstatus; } void usb_ep0_reinit(struct usb_device *udev) { usb_disable_endpoint(udev, 0 + USB_DIR_IN, true); usb_disable_endpoint(udev, 0 + USB_DIR_OUT, true); usb_enable_endpoint(udev, &udev->ep0, true); } EXPORT_SYMBOL_GPL(usb_ep0_reinit); static int hub_set_address(struct usb_device *udev, int devnum) { int retval; unsigned int timeout_ms = USB_CTRL_SET_TIMEOUT; struct usb_hcd *hcd = bus_to_hcd(udev->bus); struct usb_hub *hub = usb_hub_to_struct_hub(udev->parent); if (hub->hdev->quirks & USB_QUIRK_SHORT_SET_ADDRESS_REQ_TIMEOUT) timeout_ms = USB_SHORT_SET_ADDRESS_REQ_TIMEOUT; /* * The host controller will choose the device address, * instead of the core having chosen it earlier */ if (!hcd->driver->address_device && devnum <= 1) return -EINVAL; if (udev->state == USB_STATE_ADDRESS) return 0; if (udev->state != USB_STATE_DEFAULT) return -EINVAL; if (hcd->driver->address_device) retval = hcd->driver->address_device(hcd, udev, timeout_ms); else retval = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), USB_REQ_SET_ADDRESS, 0, devnum, 0, NULL, 0, timeout_ms); if (retval == 0) { update_devnum(udev, devnum); /* Device now using proper address. */ usb_set_device_state(udev, USB_STATE_ADDRESS); usb_ep0_reinit(udev); } return retval; } /* * There are reports of USB 3.0 devices that say they support USB 2.0 Link PM * when they're plugged into a USB 2.0 port, but they don't work when LPM is * enabled. * * Only enable USB 2.0 Link PM if the port is internal (hardwired), or the * device says it supports the new USB 2.0 Link PM errata by setting the BESL * support bit in the BOS descriptor. */ static void hub_set_initial_usb2_lpm_policy(struct usb_device *udev) { struct usb_hub *hub = usb_hub_to_struct_hub(udev->parent); int connect_type = USB_PORT_CONNECT_TYPE_UNKNOWN; if (!udev->usb2_hw_lpm_capable || !udev->bos) return; if (hub) connect_type = hub->ports[udev->portnum - 1]->connect_type; if ((udev->bos->ext_cap->bmAttributes & cpu_to_le32(USB_BESL_SUPPORT)) || connect_type == USB_PORT_CONNECT_TYPE_HARD_WIRED) { udev->usb2_hw_lpm_allowed = 1; usb_enable_usb2_hardware_lpm(udev); } } static int hub_enable_device(struct usb_device *udev) { struct usb_hcd *hcd = bus_to_hcd(udev->bus); if (!hcd->driver->enable_device) return 0; if (udev->state == USB_STATE_ADDRESS) return 0; if (udev->state != USB_STATE_DEFAULT) return -EINVAL; return hcd->driver->enable_device(hcd, udev); } /* * Get the bMaxPacketSize0 value during initialization by reading the * device's device descriptor. Since we don't already know this value, * the transfer is unsafe and it ignores I/O errors, only testing for * reasonable received values. * * For "old scheme" initialization, size will be 8 so we read just the * start of the device descriptor, which should work okay regardless of * the actual bMaxPacketSize0 value. For "new scheme" initialization, * size will be 64 (and buf will point to a sufficiently large buffer), * which might not be kosher according to the USB spec but it's what * Windows does and what many devices expect. * * Returns: bMaxPacketSize0 or a negative error code. */ static int get_bMaxPacketSize0(struct usb_device *udev, struct usb_device_descriptor *buf, int size, bool first_time) { int i, rc; /* * Retry on all errors; some devices are flakey. * 255 is for WUSB devices, we actually need to use * 512 (WUSB1.0[4.8.1]). */ for (i = 0; i < GET_MAXPACKET0_TRIES; ++i) { /* Start with invalid values in case the transfer fails */ buf->bDescriptorType = buf->bMaxPacketSize0 = 0; rc = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), USB_REQ_GET_DESCRIPTOR, USB_DIR_IN, USB_DT_DEVICE << 8, 0, buf, size, initial_descriptor_timeout); switch (buf->bMaxPacketSize0) { case 8: case 16: case 32: case 64: case 9: if (buf->bDescriptorType == USB_DT_DEVICE) { rc = buf->bMaxPacketSize0; break; } fallthrough; default: if (rc >= 0) rc = -EPROTO; break; } /* * Some devices time out if they are powered on * when already connected. They need a second * reset, so return early. But only on the first * attempt, lest we get into a time-out/reset loop. */ if (rc > 0 || (rc == -ETIMEDOUT && first_time && udev->speed > USB_SPEED_FULL)) break; } return rc; } #define GET_DESCRIPTOR_BUFSIZE 64 /* Reset device, (re)assign address, get device descriptor. * Device connection must be stable, no more debouncing needed. * Returns device in USB_STATE_ADDRESS, except on error. * * If this is called for an already-existing device (as part of * usb_reset_and_verify_device), the caller must own the device lock and * the port lock. For a newly detected device that is not accessible * through any global pointers, it's not necessary to lock the device, * but it is still necessary to lock the port. * * For a newly detected device, @dev_descr must be NULL. The device * descriptor retrieved from the device will then be stored in * @udev->descriptor. For an already existing device, @dev_descr * must be non-NULL. The device descriptor will be stored there, * not in @udev->descriptor, because descriptors for registered * devices are meant to be immutable. */ static int hub_port_init(struct usb_hub *hub, struct usb_device *udev, int port1, int retry_counter, struct usb_device_descriptor *dev_descr) { struct usb_device *hdev = hub->hdev; struct usb_hcd *hcd = bus_to_hcd(hdev->bus); struct usb_port *port_dev = hub->ports[port1 - 1]; int retries, operations, retval, i; unsigned delay = HUB_SHORT_RESET_TIME; enum usb_device_speed oldspeed = udev->speed; const char *speed; int devnum = udev->devnum; const char *driver_name; bool do_new_scheme; const bool initial = !dev_descr; int maxp0; struct usb_device_descriptor *buf, *descr; buf = kmalloc(GET_DESCRIPTOR_BUFSIZE, GFP_NOIO); if (!buf) return -ENOMEM; /* root hub ports have a slightly longer reset period * (from USB 2.0 spec, section 7.1.7.5) */ if (!hdev->parent) { delay = HUB_ROOT_RESET_TIME; if (port1 == hdev->bus->otg_port) hdev->bus->b_hnp_enable = 0; } /* Some low speed devices have problems with the quick delay, so */ /* be a bit pessimistic with those devices. RHbug #23670 */ if (oldspeed == USB_SPEED_LOW) delay = HUB_LONG_RESET_TIME; /* Reset the device; full speed may morph to high speed */ /* FIXME a USB 2.0 device may morph into SuperSpeed on reset. */ retval = hub_port_reset(hub, port1, udev, delay, false); if (retval < 0) /* error or disconnect */ goto fail; /* success, speed is known */ retval = -ENODEV; /* Don't allow speed changes at reset, except usb 3.0 to faster */ if (oldspeed != USB_SPEED_UNKNOWN && oldspeed != udev->speed && !(oldspeed == USB_SPEED_SUPER && udev->speed > oldspeed)) { dev_dbg(&udev->dev, "device reset changed speed!\n"); goto fail; } oldspeed = udev->speed; if (initial) { /* USB 2.0 section 5.5.3 talks about ep0 maxpacket ... * it's fixed size except for full speed devices. */ switch (udev->speed) { case USB_SPEED_SUPER_PLUS: case USB_SPEED_SUPER: udev->ep0.desc.wMaxPacketSize = cpu_to_le16(512); break; case USB_SPEED_HIGH: /* fixed at 64 */ udev->ep0.desc.wMaxPacketSize = cpu_to_le16(64); break; case USB_SPEED_FULL: /* 8, 16, 32, or 64 */ /* to determine the ep0 maxpacket size, try to read * the device descriptor to get bMaxPacketSize0 and * then correct our initial guess. */ udev->ep0.desc.wMaxPacketSize = cpu_to_le16(64); break; case USB_SPEED_LOW: /* fixed at 8 */ udev->ep0.desc.wMaxPacketSize = cpu_to_le16(8); break; default: goto fail; } } speed = usb_speed_string(udev->speed); /* * The controller driver may be NULL if the controller device * is the middle device between platform device and roothub. * This middle device may not need a device driver due to * all hardware control can be at platform device driver, this * platform device is usually a dual-role USB controller device. */ if (udev->bus->controller->driver) driver_name = udev->bus->controller->driver->name; else driver_name = udev->bus->sysdev->driver->name; if (udev->speed < USB_SPEED_SUPER) dev_info(&udev->dev, "%s %s USB device number %d using %s\n", (initial ? "new" : "reset"), speed, devnum, driver_name); if (initial) { /* Set up TT records, if needed */ if (hdev->tt) { udev->tt = hdev->tt; udev->ttport = hdev->ttport; } else if (udev->speed != USB_SPEED_HIGH && hdev->speed == USB_SPEED_HIGH) { if (!hub->tt.hub) { dev_err(&udev->dev, "parent hub has no TT\n"); retval = -EINVAL; goto fail; } udev->tt = &hub->tt; udev->ttport = port1; } } /* Why interleave GET_DESCRIPTOR and SET_ADDRESS this way? * Because device hardware and firmware is sometimes buggy in * this area, and this is how Linux has done it for ages. * Change it cautiously. * * NOTE: If use_new_scheme() is true we will start by issuing * a 64-byte GET_DESCRIPTOR request. This is what Windows does, * so it may help with some non-standards-compliant devices. * Otherwise we start with SET_ADDRESS and then try to read the * first 8 bytes of the device descriptor to get the ep0 maxpacket * value. */ do_new_scheme = use_new_scheme(udev, retry_counter, port_dev); for (retries = 0; retries < GET_DESCRIPTOR_TRIES; (++retries, msleep(100))) { if (hub_port_stop_enumerate(hub, port1, retries)) { retval = -ENODEV; break; } if (do_new_scheme) { retval = hub_enable_device(udev); if (retval < 0) { dev_err(&udev->dev, "hub failed to enable device, error %d\n", retval); goto fail; } maxp0 = get_bMaxPacketSize0(udev, buf, GET_DESCRIPTOR_BUFSIZE, retries == 0); if (maxp0 > 0 && !initial && maxp0 != udev->descriptor.bMaxPacketSize0) { dev_err(&udev->dev, "device reset changed ep0 maxpacket size!\n"); retval = -ENODEV; goto fail; } retval = hub_port_reset(hub, port1, udev, delay, false); if (retval < 0) /* error or disconnect */ goto fail; if (oldspeed != udev->speed) { dev_dbg(&udev->dev, "device reset changed speed!\n"); retval = -ENODEV; goto fail; } if (maxp0 < 0) { if (maxp0 != -ENODEV) dev_err(&udev->dev, "device descriptor read/64, error %d\n", maxp0); retval = maxp0; continue; } } for (operations = 0; operations < SET_ADDRESS_TRIES; ++operations) { retval = hub_set_address(udev, devnum); if (retval >= 0) break; msleep(200); } if (retval < 0) { if (retval != -ENODEV) dev_err(&udev->dev, "device not accepting address %d, error %d\n", devnum, retval); goto fail; } if (udev->speed >= USB_SPEED_SUPER) { devnum = udev->devnum; dev_info(&udev->dev, "%s SuperSpeed%s%s USB device number %d using %s\n", (udev->config) ? "reset" : "new", (udev->speed == USB_SPEED_SUPER_PLUS) ? " Plus" : "", (udev->ssp_rate == USB_SSP_GEN_2x2) ? " Gen 2x2" : (udev->ssp_rate == USB_SSP_GEN_2x1) ? " Gen 2x1" : (udev->ssp_rate == USB_SSP_GEN_1x2) ? " Gen 1x2" : "", devnum, driver_name); } /* * cope with hardware quirkiness: * - let SET_ADDRESS settle, some device hardware wants it * - read ep0 maxpacket even for high and low speed, */ msleep(10); if (do_new_scheme) break; maxp0 = get_bMaxPacketSize0(udev, buf, 8, retries == 0); if (maxp0 < 0) { retval = maxp0; if (retval != -ENODEV) dev_err(&udev->dev, "device descriptor read/8, error %d\n", retval); } else { u32 delay; if (!initial && maxp0 != udev->descriptor.bMaxPacketSize0) { dev_err(&udev->dev, "device reset changed ep0 maxpacket size!\n"); retval = -ENODEV; goto fail; } delay = udev->parent->hub_delay; udev->hub_delay = min_t(u32, delay, USB_TP_TRANSMISSION_DELAY_MAX); retval = usb_set_isoch_delay(udev); if (retval) { dev_dbg(&udev->dev, "Failed set isoch delay, error %d\n", retval); retval = 0; } break; } } if (retval) goto fail; /* * Check the ep0 maxpacket guess and correct it if necessary. * maxp0 is the value stored in the device descriptor; * i is the value it encodes (logarithmic for SuperSpeed or greater). */ i = maxp0; if (udev->speed >= USB_SPEED_SUPER) { if (maxp0 <= 16) i = 1 << maxp0; else i = 0; /* Invalid */ } if (usb_endpoint_maxp(&udev->ep0.desc) == i) { ; /* Initial ep0 maxpacket guess is right */ } else if (((udev->speed == USB_SPEED_FULL || udev->speed == USB_SPEED_HIGH) && (i == 8 || i == 16 || i == 32 || i == 64)) || (udev->speed >= USB_SPEED_SUPER && i > 0)) { /* Initial guess is wrong; use the descriptor's value */ if (udev->speed == USB_SPEED_FULL) dev_dbg(&udev->dev, "ep0 maxpacket = %d\n", i); else dev_warn(&udev->dev, "Using ep0 maxpacket: %d\n", i); udev->ep0.desc.wMaxPacketSize = cpu_to_le16(i); usb_ep0_reinit(udev); } else { /* Initial guess is wrong and descriptor's value is invalid */ dev_err(&udev->dev, "Invalid ep0 maxpacket: %d\n", maxp0); retval = -EMSGSIZE; goto fail; } descr = usb_get_device_descriptor(udev); if (IS_ERR(descr)) { retval = PTR_ERR(descr); if (retval != -ENODEV) dev_err(&udev->dev, "device descriptor read/all, error %d\n", retval); goto fail; } if (initial) udev->descriptor = *descr; else *dev_descr = *descr; kfree(descr); /* * Some superspeed devices have finished the link training process * and attached to a superspeed hub port, but the device descriptor * got from those devices show they aren't superspeed devices. Warm * reset the port attached by the devices can fix them. */ if ((udev->speed >= USB_SPEED_SUPER) && (le16_to_cpu(udev->descriptor.bcdUSB) < 0x0300)) { dev_err(&udev->dev, "got a wrong device descriptor, warm reset device\n"); hub_port_reset(hub, port1, udev, HUB_BH_RESET_TIME, true); retval = -EINVAL; goto fail; } usb_detect_quirks(udev); if (le16_to_cpu(udev->descriptor.bcdUSB) >= 0x0201) { retval = usb_get_bos_descriptor(udev); if (!retval) { udev->lpm_capable = usb_device_supports_lpm(udev); udev->lpm_disable_count = 1; usb_set_lpm_parameters(udev); usb_req_set_sel(udev); } } retval = 0; /* notify HCD that we have a device connected and addressed */ if (hcd->driver->update_device) hcd->driver->update_device(hcd, udev); hub_set_initial_usb2_lpm_policy(udev); fail: if (retval) { hub_port_disable(hub, port1, 0); update_devnum(udev, devnum); /* for disconnect processing */ } kfree(buf); return retval; } static void check_highspeed(struct usb_hub *hub, struct usb_device *udev, int port1) { struct usb_qualifier_descriptor *qual; int status; if (udev->quirks & USB_QUIRK_DEVICE_QUALIFIER) return; qual = kmalloc(sizeof *qual, GFP_KERNEL); if (qual == NULL) return; status = usb_get_descriptor(udev, USB_DT_DEVICE_QUALIFIER, 0, qual, sizeof *qual); if (status == sizeof *qual) { dev_info(&udev->dev, "not running at top speed; " "connect to a high speed hub\n"); /* hub LEDs are probably harder to miss than syslog */ if (hub->has_indicators) { hub->indicator[port1-1] = INDICATOR_GREEN_BLINK; queue_delayed_work(system_power_efficient_wq, &hub->leds, 0); } } kfree(qual); } static unsigned hub_power_remaining(struct usb_hub *hub) { struct usb_device *hdev = hub->hdev; int remaining; int port1; if (!hub->limited_power) return 0; remaining = hdev->bus_mA - hub->descriptor->bHubContrCurrent; for (port1 = 1; port1 <= hdev->maxchild; ++port1) { struct usb_port *port_dev = hub->ports[port1 - 1]; struct usb_device *udev = port_dev->child; unsigned unit_load; int delta; if (!udev) continue; if (hub_is_superspeed(udev)) unit_load = 150; else unit_load = 100; /* * Unconfigured devices may not use more than one unit load, * or 8mA for OTG ports */ if (udev->actconfig) delta = usb_get_max_power(udev, udev->actconfig); else if (port1 != udev->bus->otg_port || hdev->parent) delta = unit_load; else delta = 8; if (delta > hub->mA_per_port) dev_warn(&port_dev->dev, "%dmA is over %umA budget!\n", delta, hub->mA_per_port); remaining -= delta; } if (remaining < 0) { dev_warn(hub->intfdev, "%dmA over power budget!\n", -remaining); remaining = 0; } return remaining; } static int descriptors_changed(struct usb_device *udev, struct usb_device_descriptor *new_device_descriptor, struct usb_host_bos *old_bos) { int changed = 0; unsigned index; unsigned serial_len = 0; unsigned len; unsigned old_length; int length; char *buf; if (memcmp(&udev->descriptor, new_device_descriptor, sizeof(*new_device_descriptor)) != 0) return 1; if ((old_bos && !udev->bos) || (!old_bos && udev->bos)) return 1; if (udev->bos) { len = le16_to_cpu(udev->bos->desc->wTotalLength); if (len != le16_to_cpu(old_bos->desc->wTotalLength)) return 1; if (memcmp(udev->bos->desc, old_bos->desc, len)) return 1; } /* Since the idVendor, idProduct, and bcdDevice values in the * device descriptor haven't changed, we will assume the * Manufacturer and Product strings haven't changed either. * But the SerialNumber string could be different (e.g., a * different flash card of the same brand). */ if (udev->serial) serial_len = strlen(udev->serial) + 1; len = serial_len; for (index = 0; index < udev->descriptor.bNumConfigurations; index++) { old_length = le16_to_cpu(udev->config[index].desc.wTotalLength); len = max(len, old_length); } buf = kmalloc(len, GFP_NOIO); if (!buf) /* assume the worst */ return 1; for (index = 0; index < udev->descriptor.bNumConfigurations; index++) { old_length = le16_to_cpu(udev->config[index].desc.wTotalLength); length = usb_get_descriptor(udev, USB_DT_CONFIG, index, buf, old_length); if (length != old_length) { dev_dbg(&udev->dev, "config index %d, error %d\n", index, length); changed = 1; break; } if (memcmp(buf, udev->rawdescriptors[index], old_length) != 0) { dev_dbg(&udev->dev, "config index %d changed (#%d)\n", index, ((struct usb_config_descriptor *) buf)-> bConfigurationValue); changed = 1; break; } } if (!changed && serial_len) { length = usb_string(udev, udev->descriptor.iSerialNumber, buf, serial_len); if (length + 1 != serial_len) { dev_dbg(&udev->dev, "serial string error %d\n", length); changed = 1; } else if (memcmp(buf, udev->serial, length) != 0) { dev_dbg(&udev->dev, "serial string changed\n"); changed = 1; } } kfree(buf); return changed; } static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, u16 portchange) { int status = -ENODEV; int i; unsigned unit_load; struct usb_device *hdev = hub->hdev; struct usb_hcd *hcd = bus_to_hcd(hdev->bus); struct usb_port *port_dev = hub->ports[port1 - 1]; struct usb_device *udev = port_dev->child; static int unreliable_port = -1; bool retry_locked; /* Disconnect any existing devices under this port */ if (udev) { if (hcd->usb_phy && !hdev->parent) usb_phy_notify_disconnect(hcd->usb_phy, udev->speed); usb_disconnect(&port_dev->child); } /* We can forget about a "removed" device when there's a physical * disconnect or the connect status changes. */ if (!(portstatus & USB_PORT_STAT_CONNECTION) || (portchange & USB_PORT_STAT_C_CONNECTION)) clear_bit(port1, hub->removed_bits); if (portchange & (USB_PORT_STAT_C_CONNECTION | USB_PORT_STAT_C_ENABLE)) { status = hub_port_debounce_be_stable(hub, port1); if (status < 0) { if (status != -ENODEV && port1 != unreliable_port && printk_ratelimit()) dev_err(&port_dev->dev, "connect-debounce failed\n"); portstatus &= ~USB_PORT_STAT_CONNECTION; unreliable_port = port1; } else { portstatus = status; } } /* Return now if debouncing failed or nothing is connected or * the device was "removed". */ if (!(portstatus & USB_PORT_STAT_CONNECTION) || test_bit(port1, hub->removed_bits)) { /* * maybe switch power back on (e.g. root hub was reset) * but only if the port isn't owned by someone else. */ if (hub_is_port_power_switchable(hub) && !usb_port_is_power_on(hub, portstatus) && !port_dev->port_owner) set_port_feature(hdev, port1, USB_PORT_FEAT_POWER); if (portstatus & USB_PORT_STAT_ENABLE) goto done; return; } if (hub_is_superspeed(hub->hdev)) unit_load = 150; else unit_load = 100; status = 0; for (i = 0; i < PORT_INIT_TRIES; i++) { if (hub_port_stop_enumerate(hub, port1, i)) { status = -ENODEV; break; } usb_lock_port(port_dev); mutex_lock(hcd->address0_mutex); retry_locked = true; /* reallocate for each attempt, since references * to the previous one can escape in various ways */ udev = usb_alloc_dev(hdev, hdev->bus, port1); if (!udev) { dev_err(&port_dev->dev, "couldn't allocate usb_device\n"); mutex_unlock(hcd->address0_mutex); usb_unlock_port(port_dev); goto done; } usb_set_device_state(udev, USB_STATE_POWERED); udev->bus_mA = hub->mA_per_port; udev->level = hdev->level + 1; /* Devices connected to SuperSpeed hubs are USB 3.0 or later */ if (hub_is_superspeed(hub->hdev)) udev->speed = USB_SPEED_SUPER; else udev->speed = USB_SPEED_UNKNOWN; choose_devnum(udev); if (udev->devnum <= 0) { status = -ENOTCONN; /* Don't retry */ goto loop; } /* reset (non-USB 3.0 devices) and get descriptor */ status = hub_port_init(hub, udev, port1, i, NULL); if (status < 0) goto loop; mutex_unlock(hcd->address0_mutex); usb_unlock_port(port_dev); retry_locked = false; if (udev->quirks & USB_QUIRK_DELAY_INIT) msleep(2000); /* consecutive bus-powered hubs aren't reliable; they can * violate the voltage drop budget. if the new child has * a "powered" LED, users should notice we didn't enable it * (without reading syslog), even without per-port LEDs * on the parent. */ if (udev->descriptor.bDeviceClass == USB_CLASS_HUB && udev->bus_mA <= unit_load) { u16 devstat; status = usb_get_std_status(udev, USB_RECIP_DEVICE, 0, &devstat); if (status) { dev_dbg(&udev->dev, "get status %d ?\n", status); goto loop_disable; } if ((devstat & (1 << USB_DEVICE_SELF_POWERED)) == 0) { dev_err(&udev->dev, "can't connect bus-powered hub " "to this port\n"); if (hub->has_indicators) { hub->indicator[port1-1] = INDICATOR_AMBER_BLINK; queue_delayed_work( system_power_efficient_wq, &hub->leds, 0); } status = -ENOTCONN; /* Don't retry */ goto loop_disable; } } /* check for devices running slower than they could */ if (le16_to_cpu(udev->descriptor.bcdUSB) >= 0x0200 && udev->speed == USB_SPEED_FULL && highspeed_hubs != 0) check_highspeed(hub, udev, port1); /* Store the parent's children[] pointer. At this point * udev becomes globally accessible, although presumably * no one will look at it until hdev is unlocked. */ status = 0; mutex_lock(&usb_port_peer_mutex); /* We mustn't add new devices if the parent hub has * been disconnected; we would race with the * recursively_mark_NOTATTACHED() routine. */ spin_lock_irq(&device_state_lock); if (hdev->state == USB_STATE_NOTATTACHED) status = -ENOTCONN; else port_dev->child = udev; spin_unlock_irq(&device_state_lock); mutex_unlock(&usb_port_peer_mutex); /* Run it through the hoops (find a driver, etc) */ if (!status) { status = usb_new_device(udev); if (status) { mutex_lock(&usb_port_peer_mutex); spin_lock_irq(&device_state_lock); port_dev->child = NULL; spin_unlock_irq(&device_state_lock); mutex_unlock(&usb_port_peer_mutex); } else { if (hcd->usb_phy && !hdev->parent) usb_phy_notify_connect(hcd->usb_phy, udev->speed); } } if (status) goto loop_disable; status = hub_power_remaining(hub); if (status) dev_dbg(hub->intfdev, "%dmA power budget left\n", status); return; loop_disable: hub_port_disable(hub, port1, 1); loop: usb_ep0_reinit(udev); release_devnum(udev); hub_free_dev(udev); if (retry_locked) { mutex_unlock(hcd->address0_mutex); usb_unlock_port(port_dev); } usb_put_dev(udev); if ((status == -ENOTCONN) || (status == -ENOTSUPP)) break; /* When halfway through our retry count, power-cycle the port */ if (i == (PORT_INIT_TRIES - 1) / 2) { dev_info(&port_dev->dev, "attempt power cycle\n"); usb_hub_set_port_power(hdev, hub, port1, false); msleep(2 * hub_power_on_good_delay(hub)); usb_hub_set_port_power(hdev, hub, port1, true); msleep(hub_power_on_good_delay(hub)); } } if (hub->hdev->parent || !hcd->driver->port_handed_over || !(hcd->driver->port_handed_over)(hcd, port1)) { if (status != -ENOTCONN && status != -ENODEV) dev_err(&port_dev->dev, "unable to enumerate USB device\n"); } done: hub_port_disable(hub, port1, 1); if (hcd->driver->relinquish_port && !hub->hdev->parent) { if (status != -ENOTCONN && status != -ENODEV) hcd->driver->relinquish_port(hcd, port1); } } /* Handle physical or logical connection change events. * This routine is called when: * a port connection-change occurs; * a port enable-change occurs (often caused by EMI); * usb_reset_and_verify_device() encounters changed descriptors (as from * a firmware download) * caller already locked the hub */ static void hub_port_connect_change(struct usb_hub *hub, int port1, u16 portstatus, u16 portchange) __must_hold(&port_dev->status_lock) { struct usb_port *port_dev = hub->ports[port1 - 1]; struct usb_device *udev = port_dev->child; struct usb_device_descriptor *descr; int status = -ENODEV; dev_dbg(&port_dev->dev, "status %04x, change %04x, %s\n", portstatus, portchange, portspeed(hub, portstatus)); if (hub->has_indicators) { set_port_led(hub, port1, HUB_LED_AUTO); hub->indicator[port1-1] = INDICATOR_AUTO; } #ifdef CONFIG_USB_OTG /* during HNP, don't repeat the debounce */ if (hub->hdev->bus->is_b_host) portchange &= ~(USB_PORT_STAT_C_CONNECTION | USB_PORT_STAT_C_ENABLE); #endif /* Try to resuscitate an existing device */ if ((portstatus & USB_PORT_STAT_CONNECTION) && udev && udev->state != USB_STATE_NOTATTACHED) { if (portstatus & USB_PORT_STAT_ENABLE) { /* * USB-3 connections are initialized automatically by * the hostcontroller hardware. Therefore check for * changed device descriptors before resuscitating the * device. */ descr = usb_get_device_descriptor(udev); if (IS_ERR(descr)) { dev_dbg(&udev->dev, "can't read device descriptor %ld\n", PTR_ERR(descr)); } else { if (descriptors_changed(udev, descr, udev->bos)) { dev_dbg(&udev->dev, "device descriptor has changed\n"); } else { status = 0; /* Nothing to do */ } kfree(descr); } #ifdef CONFIG_PM } else if (udev->state == USB_STATE_SUSPENDED && udev->persist_enabled) { /* For a suspended device, treat this as a * remote wakeup event. */ usb_unlock_port(port_dev); status = usb_remote_wakeup(udev); usb_lock_port(port_dev); #endif } else { /* Don't resuscitate */; } } clear_bit(port1, hub->change_bits); /* successfully revalidated the connection */ if (status == 0) return; usb_unlock_port(port_dev); hub_port_connect(hub, port1, portstatus, portchange); usb_lock_port(port_dev); } /* Handle notifying userspace about hub over-current events */ static void port_over_current_notify(struct usb_port *port_dev) { char *envp[3] = { NULL, NULL, NULL }; struct device *hub_dev; char *port_dev_path; sysfs_notify(&port_dev->dev.kobj, NULL, "over_current_count"); hub_dev = port_dev->dev.parent; if (!hub_dev) return; port_dev_path = kobject_get_path(&port_dev->dev.kobj, GFP_KERNEL); if (!port_dev_path) return; envp[0] = kasprintf(GFP_KERNEL, "OVER_CURRENT_PORT=%s", port_dev_path); if (!envp[0]) goto exit; envp[1] = kasprintf(GFP_KERNEL, "OVER_CURRENT_COUNT=%u", port_dev->over_current_count); if (!envp[1]) goto exit; kobject_uevent_env(&hub_dev->kobj, KOBJ_CHANGE, envp); exit: kfree(envp[1]); kfree(envp[0]); kfree(port_dev_path); } static void port_event(struct usb_hub *hub, int port1) __must_hold(&port_dev->status_lock) { int connect_change; struct usb_port *port_dev = hub->ports[port1 - 1]; struct usb_device *udev = port_dev->child; struct usb_device *hdev = hub->hdev; u16 portstatus, portchange; int i = 0; int err; connect_change = test_bit(port1, hub->change_bits); clear_bit(port1, hub->event_bits); clear_bit(port1, hub->wakeup_bits); if (usb_hub_port_status(hub, port1, &portstatus, &portchange) < 0) return; if (portchange & USB_PORT_STAT_C_CONNECTION) { usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_C_CONNECTION); connect_change = 1; } if (portchange & USB_PORT_STAT_C_ENABLE) { if (!connect_change) dev_dbg(&port_dev->dev, "enable change, status %08x\n", portstatus); usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_C_ENABLE); /* * EM interference sometimes causes badly shielded USB devices * to be shutdown by the hub, this hack enables them again. * Works at least with mouse driver. */ if (!(portstatus & USB_PORT_STAT_ENABLE) && !connect_change && udev) { dev_err(&port_dev->dev, "disabled by hub (EMI?), re-enabling...\n"); connect_change = 1; } } if (portchange & USB_PORT_STAT_C_OVERCURRENT) { u16 status = 0, unused; port_dev->over_current_count++; port_over_current_notify(port_dev); dev_dbg(&port_dev->dev, "over-current change #%u\n", port_dev->over_current_count); usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_C_OVER_CURRENT); msleep(100); /* Cool down */ hub_power_on(hub, true); usb_hub_port_status(hub, port1, &status, &unused); if (status & USB_PORT_STAT_OVERCURRENT) dev_err(&port_dev->dev, "over-current condition\n"); } if (portchange & USB_PORT_STAT_C_RESET) { dev_dbg(&port_dev->dev, "reset change\n"); usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_C_RESET); } if ((portchange & USB_PORT_STAT_C_BH_RESET) && hub_is_superspeed(hdev)) { dev_dbg(&port_dev->dev, "warm reset change\n"); usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_C_BH_PORT_RESET); } if (portchange & USB_PORT_STAT_C_LINK_STATE) { dev_dbg(&port_dev->dev, "link state change\n"); usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_C_PORT_LINK_STATE); } if (portchange & USB_PORT_STAT_C_CONFIG_ERROR) { dev_warn(&port_dev->dev, "config error\n"); usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_C_PORT_CONFIG_ERROR); } /* skip port actions that require the port to be powered on */ if (!pm_runtime_active(&port_dev->dev)) return; /* skip port actions if ignore_event and early_stop are true */ if (port_dev->ignore_event && port_dev->early_stop) return; if (hub_handle_remote_wakeup(hub, port1, portstatus, portchange)) connect_change = 1; /* * Avoid trying to recover a USB3 SS.Inactive port with a warm reset if * the device was disconnected. A 12ms disconnect detect timer in * SS.Inactive state transitions the port to RxDetect automatically. * SS.Inactive link error state is common during device disconnect. */ while (hub_port_warm_reset_required(hub, port1, portstatus)) { if ((i++ < DETECT_DISCONNECT_TRIES) && udev) { u16 unused; msleep(20); usb_hub_port_status(hub, port1, &portstatus, &unused); dev_dbg(&port_dev->dev, "Wait for inactive link disconnect detect\n"); continue; } else if (!udev || !(portstatus & USB_PORT_STAT_CONNECTION) || udev->state == USB_STATE_NOTATTACHED) { dev_dbg(&port_dev->dev, "do warm reset, port only\n"); err = hub_port_reset(hub, port1, NULL, HUB_BH_RESET_TIME, true); if (!udev && err == -ENOTCONN) connect_change = 0; else if (err < 0) hub_port_disable(hub, port1, 1); } else { dev_dbg(&port_dev->dev, "do warm reset, full device\n"); usb_unlock_port(port_dev); usb_lock_device(udev); usb_reset_device(udev); usb_unlock_device(udev); usb_lock_port(port_dev); connect_change = 0; } break; } if (connect_change) hub_port_connect_change(hub, port1, portstatus, portchange); } static void hub_event(struct work_struct *work) { struct usb_device *hdev; struct usb_interface *intf; struct usb_hub *hub; struct device *hub_dev; u16 hubstatus; u16 hubchange; int i, ret; hub = container_of(work, struct usb_hub, events); hdev = hub->hdev; hub_dev = hub->intfdev; intf = to_usb_interface(hub_dev); kcov_remote_start_usb((u64)hdev->bus->busnum); dev_dbg(hub_dev, "state %d ports %d chg %04x evt %04x\n", hdev->state, hdev->maxchild, /* NOTE: expects max 15 ports... */ (u16) hub->change_bits[0], (u16) hub->event_bits[0]); /* Lock the device, then check to see if we were * disconnected while waiting for the lock to succeed. */ usb_lock_device(hdev); if (unlikely(hub->disconnected)) goto out_hdev_lock; /* If the hub has died, clean up after it */ if (hdev->state == USB_STATE_NOTATTACHED) { hub->error = -ENODEV; hub_quiesce(hub, HUB_DISCONNECT); goto out_hdev_lock; } /* Autoresume */ ret = usb_autopm_get_interface(intf); if (ret) { dev_dbg(hub_dev, "Can't autoresume: %d\n", ret); goto out_hdev_lock; } /* If this is an inactive hub, do nothing */ if (hub->quiescing) goto out_autopm; if (hub->error) { dev_dbg(hub_dev, "resetting for error %d\n", hub->error); ret = usb_reset_device(hdev); if (ret) { dev_dbg(hub_dev, "error resetting hub: %d\n", ret); goto out_autopm; } hub->nerrors = 0; hub->error = 0; } /* deal with port status changes */ for (i = 1; i <= hdev->maxchild; i++) { struct usb_port *port_dev = hub->ports[i - 1]; if (test_bit(i, hub->event_bits) || test_bit(i, hub->change_bits) || test_bit(i, hub->wakeup_bits)) { /* * The get_noresume and barrier ensure that if * the port was in the process of resuming, we * flush that work and keep the port active for * the duration of the port_event(). However, * if the port is runtime pm suspended * (powered-off), we leave it in that state, run * an abbreviated port_event(), and move on. */ pm_runtime_get_noresume(&port_dev->dev); pm_runtime_barrier(&port_dev->dev); usb_lock_port(port_dev); port_event(hub, i); usb_unlock_port(port_dev); pm_runtime_put_sync(&port_dev->dev); } } /* deal with hub status changes */ if (test_and_clear_bit(0, hub->event_bits) == 0) ; /* do nothing */ else if (hub_hub_status(hub, &hubstatus, &hubchange) < 0) dev_err(hub_dev, "get_hub_status failed\n"); else { if (hubchange & HUB_CHANGE_LOCAL_POWER) { dev_dbg(hub_dev, "power change\n"); clear_hub_feature(hdev, C_HUB_LOCAL_POWER); if (hubstatus & HUB_STATUS_LOCAL_POWER) /* FIXME: Is this always true? */ hub->limited_power = 1; else hub->limited_power = 0; } if (hubchange & HUB_CHANGE_OVERCURRENT) { u16 status = 0; u16 unused; dev_dbg(hub_dev, "over-current change\n"); clear_hub_feature(hdev, C_HUB_OVER_CURRENT); msleep(500); /* Cool down */ hub_power_on(hub, true); hub_hub_status(hub, &status, &unused); if (status & HUB_STATUS_OVERCURRENT) dev_err(hub_dev, "over-current condition\n"); } } out_autopm: /* Balance the usb_autopm_get_interface() above */ usb_autopm_put_interface_no_suspend(intf); out_hdev_lock: usb_unlock_device(hdev); /* Balance the stuff in kick_hub_wq() and allow autosuspend */ usb_autopm_put_interface(intf); hub_put(hub); kcov_remote_stop(); } static const struct usb_device_id hub_id_table[] = { { .match_flags = USB_DEVICE_ID_MATCH_VENDOR | USB_DEVICE_ID_MATCH_PRODUCT | USB_DEVICE_ID_MATCH_INT_CLASS, .idVendor = USB_VENDOR_SMSC, .idProduct = USB_PRODUCT_USB5534B, .bInterfaceClass = USB_CLASS_HUB, .driver_info = HUB_QUIRK_DISABLE_AUTOSUSPEND}, { .match_flags = USB_DEVICE_ID_MATCH_VENDOR | USB_DEVICE_ID_MATCH_PRODUCT, .idVendor = USB_VENDOR_CYPRESS, .idProduct = USB_PRODUCT_CY7C65632, .driver_info = HUB_QUIRK_DISABLE_AUTOSUSPEND}, { .match_flags = USB_DEVICE_ID_MATCH_VENDOR | USB_DEVICE_ID_MATCH_INT_CLASS, .idVendor = USB_VENDOR_GENESYS_LOGIC, .bInterfaceClass = USB_CLASS_HUB, .driver_info = HUB_QUIRK_CHECK_PORT_AUTOSUSPEND}, { .match_flags = USB_DEVICE_ID_MATCH_VENDOR | USB_DEVICE_ID_MATCH_PRODUCT, .idVendor = USB_VENDOR_TEXAS_INSTRUMENTS, .idProduct = USB_PRODUCT_TUSB8041_USB2, .driver_info = HUB_QUIRK_DISABLE_AUTOSUSPEND}, { .match_flags = USB_DEVICE_ID_MATCH_VENDOR | USB_DEVICE_ID_MATCH_PRODUCT, .idVendor = USB_VENDOR_TEXAS_INSTRUMENTS, .idProduct = USB_PRODUCT_TUSB8041_USB3, .driver_info = HUB_QUIRK_DISABLE_AUTOSUSPEND}, { .match_flags = USB_DEVICE_ID_MATCH_VENDOR | USB_DEVICE_ID_MATCH_PRODUCT, .idVendor = USB_VENDOR_MICROCHIP, .idProduct = USB_PRODUCT_USB4913, .driver_info = HUB_QUIRK_REDUCE_FRAME_INTR_BINTERVAL}, { .match_flags = USB_DEVICE_ID_MATCH_VENDOR | USB_DEVICE_ID_MATCH_PRODUCT, .idVendor = USB_VENDOR_MICROCHIP, .idProduct = USB_PRODUCT_USB4914, .driver_info = HUB_QUIRK_REDUCE_FRAME_INTR_BINTERVAL}, { .match_flags = USB_DEVICE_ID_MATCH_VENDOR | USB_DEVICE_ID_MATCH_PRODUCT, .idVendor = USB_VENDOR_MICROCHIP, .idProduct = USB_PRODUCT_USB4915, .driver_info = HUB_QUIRK_REDUCE_FRAME_INTR_BINTERVAL}, { .match_flags = USB_DEVICE_ID_MATCH_DEV_CLASS, .bDeviceClass = USB_CLASS_HUB}, { .match_flags = USB_DEVICE_ID_MATCH_INT_CLASS, .bInterfaceClass = USB_CLASS_HUB}, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, hub_id_table); static struct usb_driver hub_driver = { .name = "hub", .probe = hub_probe, .disconnect = hub_disconnect, .suspend = hub_suspend, .resume = hub_resume, .reset_resume = hub_reset_resume, .pre_reset = hub_pre_reset, .post_reset = hub_post_reset, .unlocked_ioctl = hub_ioctl, .id_table = hub_id_table, .supports_autosuspend = 1, }; int usb_hub_init(void) { if (usb_register(&hub_driver) < 0) { printk(KERN_ERR "%s: can't register hub driver\n", usbcore_name); return -1; } /* * The workqueue needs to be freezable to avoid interfering with * USB-PERSIST port handover. Otherwise it might see that a full-speed * device was gone before the EHCI controller had handed its port * over to the companion full-speed controller. */ hub_wq = alloc_workqueue("usb_hub_wq", WQ_FREEZABLE | WQ_PERCPU, 0); if (hub_wq) return 0; /* Fall through if kernel_thread failed */ usb_deregister(&hub_driver); pr_err("%s: can't allocate workqueue for usb hub\n", usbcore_name); return -1; } void usb_hub_cleanup(void) { destroy_workqueue(hub_wq); /* * Hub resources are freed for us by usb_deregister. It calls * usb_driver_purge on every device which in turn calls that * devices disconnect function if it is using this driver. * The hub_disconnect function takes care of releasing the * individual hub resources. -greg */ usb_deregister(&hub_driver); } /* usb_hub_cleanup() */ /** * hub_hc_release_resources - clear resources used by host controller * @udev: pointer to device being released * * Context: task context, might sleep * * Function releases the host controller resources in correct order before * making any operation on resuming usb device. The host controller resources * allocated for devices in tree should be released starting from the last * usb device in tree toward the root hub. This function is used only during * resuming device when usb device require reinitialization – that is, when * flag udev->reset_resume is set. * * This call is synchronous, and may not be used in an interrupt context. */ static void hub_hc_release_resources(struct usb_device *udev) { struct usb_hub *hub = usb_hub_to_struct_hub(udev); struct usb_hcd *hcd = bus_to_hcd(udev->bus); int i; /* Release up resources for all children before this device */ for (i = 0; i < udev->maxchild; i++) if (hub->ports[i]->child) hub_hc_release_resources(hub->ports[i]->child); if (hcd->driver->reset_device) hcd->driver->reset_device(hcd, udev); } /** * usb_reset_and_verify_device - perform a USB port reset to reinitialize a device * @udev: device to reset (not in SUSPENDED or NOTATTACHED state) * * WARNING - don't use this routine to reset a composite device * (one with multiple interfaces owned by separate drivers)! * Use usb_reset_device() instead. * * Do a port reset, reassign the device's address, and establish its * former operating configuration. If the reset fails, or the device's * descriptors change from their values before the reset, or the original * configuration and altsettings cannot be restored, a flag will be set * telling hub_wq to pretend the device has been disconnected and then * re-connected. All drivers will be unbound, and the device will be * re-enumerated and probed all over again. * * Return: 0 if the reset succeeded, -ENODEV if the device has been * flagged for logical disconnection, or some other negative error code * if the reset wasn't even attempted. * * Note: * The caller must own the device lock and the port lock, the latter is * taken by usb_reset_device(). For example, it's safe to use * usb_reset_device() from a driver probe() routine after downloading * new firmware. For calls that might not occur during probe(), drivers * should lock the device using usb_lock_device_for_reset(). * * Locking exception: This routine may also be called from within an * autoresume handler. Such usage won't conflict with other tasks * holding the device lock because these tasks should always call * usb_autopm_resume_device(), thereby preventing any unwanted * autoresume. The autoresume handler is expected to have already * acquired the port lock before calling this routine. */ static int usb_reset_and_verify_device(struct usb_device *udev) { struct usb_device *parent_hdev = udev->parent; struct usb_hub *parent_hub; struct usb_hcd *hcd = bus_to_hcd(udev->bus); struct usb_device_descriptor descriptor; struct usb_interface *intf; struct usb_host_bos *bos; int i, j, ret = 0; int port1 = udev->portnum; if (udev->state == USB_STATE_NOTATTACHED || udev->state == USB_STATE_SUSPENDED) { dev_dbg(&udev->dev, "device reset not allowed in state %d\n", udev->state); return -EINVAL; } if (!parent_hdev) return -EISDIR; parent_hub = usb_hub_to_struct_hub(parent_hdev); /* Disable USB2 hardware LPM. * It will be re-enabled by the enumeration process. */ usb_disable_usb2_hardware_lpm(udev); bos = udev->bos; udev->bos = NULL; if (udev->reset_resume) hub_hc_release_resources(udev); mutex_lock(hcd->address0_mutex); for (i = 0; i < PORT_INIT_TRIES; ++i) { if (hub_port_stop_enumerate(parent_hub, port1, i)) { ret = -ENODEV; break; } /* ep0 maxpacket size may change; let the HCD know about it. * Other endpoints will be handled by re-enumeration. */ usb_ep0_reinit(udev); ret = hub_port_init(parent_hub, udev, port1, i, &descriptor); if (ret >= 0 || ret == -ENOTCONN || ret == -ENODEV) break; } mutex_unlock(hcd->address0_mutex); if (ret < 0) goto re_enumerate; /* Device might have changed firmware (DFU or similar) */ if (descriptors_changed(udev, &descriptor, bos)) { dev_info(&udev->dev, "device firmware changed\n"); goto re_enumerate; } /* Restore the device's previous configuration */ if (!udev->actconfig) goto done; /* * Some devices can't handle setting default altsetting 0 with a * Set-Interface request. Disable host-side endpoints of those * interfaces here. Enable and reset them back after host has set * its internal endpoint structures during usb_hcd_alloc_bandwith() */ for (i = 0; i < udev->actconfig->desc.bNumInterfaces; i++) { intf = udev->actconfig->interface[i]; if (intf->cur_altsetting->desc.bAlternateSetting == 0) usb_disable_interface(udev, intf, true); } mutex_lock(hcd->bandwidth_mutex); ret = usb_hcd_alloc_bandwidth(udev, udev->actconfig, NULL, NULL); if (ret < 0) { dev_warn(&udev->dev, "Busted HC? Not enough HCD resources for " "old configuration.\n"); mutex_unlock(hcd->bandwidth_mutex); goto re_enumerate; } ret = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), USB_REQ_SET_CONFIGURATION, 0, udev->actconfig->desc.bConfigurationValue, 0, NULL, 0, USB_CTRL_SET_TIMEOUT); if (ret < 0) { dev_err(&udev->dev, "can't restore configuration #%d (error=%d)\n", udev->actconfig->desc.bConfigurationValue, ret); mutex_unlock(hcd->bandwidth_mutex); goto re_enumerate; } mutex_unlock(hcd->bandwidth_mutex); usb_set_device_state(udev, USB_STATE_CONFIGURED); /* Put interfaces back into the same altsettings as before. * Don't bother to send the Set-Interface request for interfaces * that were already in altsetting 0; besides being unnecessary, * many devices can't handle it. Instead just reset the host-side * endpoint state. */ for (i = 0; i < udev->actconfig->desc.bNumInterfaces; i++) { struct usb_host_config *config = udev->actconfig; struct usb_interface_descriptor *desc; intf = config->interface[i]; desc = &intf->cur_altsetting->desc; if (desc->bAlternateSetting == 0) { usb_enable_interface(udev, intf, true); ret = 0; } else { /* Let the bandwidth allocation function know that this * device has been reset, and it will have to use * alternate setting 0 as the current alternate setting. */ intf->resetting_device = 1; ret = usb_set_interface(udev, desc->bInterfaceNumber, desc->bAlternateSetting); intf->resetting_device = 0; } if (ret < 0) { dev_err(&udev->dev, "failed to restore interface %d " "altsetting %d (error=%d)\n", desc->bInterfaceNumber, desc->bAlternateSetting, ret); goto re_enumerate; } /* Resetting also frees any allocated streams */ for (j = 0; j < intf->cur_altsetting->desc.bNumEndpoints; j++) intf->cur_altsetting->endpoint[j].streams = 0; } done: /* Now that the alt settings are re-installed, enable LTM and LPM. */ usb_enable_usb2_hardware_lpm(udev); usb_unlocked_enable_lpm(udev); usb_enable_ltm(udev); usb_release_bos_descriptor(udev); udev->bos = bos; return 0; re_enumerate: usb_release_bos_descriptor(udev); udev->bos = bos; hub_port_logical_disconnect(parent_hub, port1); return -ENODEV; } /** * usb_reset_device - warn interface drivers and perform a USB port reset * @udev: device to reset (not in NOTATTACHED state) * * Warns all drivers bound to registered interfaces (using their pre_reset * method), performs the port reset, and then lets the drivers know that * the reset is over (using their post_reset method). * * Return: The same as for usb_reset_and_verify_device(). * However, if a reset is already in progress (for instance, if a * driver doesn't have pre_reset() or post_reset() callbacks, and while * being unbound or re-bound during the ongoing reset its disconnect() * or probe() routine tries to perform a second, nested reset), the * routine returns -EINPROGRESS. * * Note: * The caller must own the device lock. For example, it's safe to use * this from a driver probe() routine after downloading new firmware. * For calls that might not occur during probe(), drivers should lock * the device using usb_lock_device_for_reset(). * * If an interface is currently being probed or disconnected, we assume * its driver knows how to handle resets. For all other interfaces, * if the driver doesn't have pre_reset and post_reset methods then * we attempt to unbind it and rebind afterward. */ int usb_reset_device(struct usb_device *udev) { int ret; int i; unsigned int noio_flag; struct usb_port *port_dev; struct usb_host_config *config = udev->actconfig; struct usb_hub *hub = usb_hub_to_struct_hub(udev->parent); if (udev->state == USB_STATE_NOTATTACHED) { dev_dbg(&udev->dev, "device reset not allowed in state %d\n", udev->state); return -EINVAL; } if (!udev->parent) { /* this requires hcd-specific logic; see ohci_restart() */ dev_dbg(&udev->dev, "%s for root hub!\n", __func__); return -EISDIR; } if (udev->reset_in_progress) return -EINPROGRESS; udev->reset_in_progress = 1; port_dev = hub->ports[udev->portnum - 1]; /* * Don't allocate memory with GFP_KERNEL in current * context to avoid possible deadlock if usb mass * storage interface or usbnet interface(iSCSI case) * is included in current configuration. The easist * approach is to do it for every device reset, * because the device 'memalloc_noio' flag may have * not been set before reseting the usb device. */ noio_flag = memalloc_noio_save(); /* Prevent autosuspend during the reset */ usb_autoresume_device(udev); if (config) { for (i = 0; i < config->desc.bNumInterfaces; ++i) { struct usb_interface *cintf = config->interface[i]; struct usb_driver *drv; int unbind = 0; if (cintf->dev.driver) { drv = to_usb_driver(cintf->dev.driver); if (drv->pre_reset && drv->post_reset) unbind = (drv->pre_reset)(cintf); else if (cintf->condition == USB_INTERFACE_BOUND) unbind = 1; if (unbind) usb_forced_unbind_intf(cintf); } } } usb_lock_port(port_dev); ret = usb_reset_and_verify_device(udev); usb_unlock_port(port_dev); if (config) { for (i = config->desc.bNumInterfaces - 1; i >= 0; --i) { struct usb_interface *cintf = config->interface[i]; struct usb_driver *drv; int rebind = cintf->needs_binding; if (!rebind && cintf->dev.driver) { drv = to_usb_driver(cintf->dev.driver); if (drv->post_reset) rebind = (drv->post_reset)(cintf); else if (cintf->condition == USB_INTERFACE_BOUND) rebind = 1; if (rebind) cintf->needs_binding = 1; } } /* If the reset failed, hub_wq will unbind drivers later */ if (ret == 0) usb_unbind_and_rebind_marked_interfaces(udev); } usb_autosuspend_device(udev); memalloc_noio_restore(noio_flag); udev->reset_in_progress = 0; return ret; } EXPORT_SYMBOL_GPL(usb_reset_device); /** * usb_queue_reset_device - Reset a USB device from an atomic context * @iface: USB interface belonging to the device to reset * * This function can be used to reset a USB device from an atomic * context, where usb_reset_device() won't work (as it blocks). * * Doing a reset via this method is functionally equivalent to calling * usb_reset_device(), except for the fact that it is delayed to a * workqueue. This means that any drivers bound to other interfaces * might be unbound, as well as users from usbfs in user space. * * Corner cases: * * - Scheduling two resets at the same time from two different drivers * attached to two different interfaces of the same device is * possible; depending on how the driver attached to each interface * handles ->pre_reset(), the second reset might happen or not. * * - If the reset is delayed so long that the interface is unbound from * its driver, the reset will be skipped. * * - This function can be called during .probe(). It can also be called * during .disconnect(), but doing so is pointless because the reset * will not occur. If you really want to reset the device during * .disconnect(), call usb_reset_device() directly -- but watch out * for nested unbinding issues! */ void usb_queue_reset_device(struct usb_interface *iface) { if (schedule_work(&iface->reset_ws)) usb_get_intf(iface); } EXPORT_SYMBOL_GPL(usb_queue_reset_device); /** * usb_hub_find_child - Get the pointer of child device * attached to the port which is specified by @port1. * @hdev: USB device belonging to the usb hub * @port1: port num to indicate which port the child device * is attached to. * * USB drivers call this function to get hub's child device * pointer. * * Return: %NULL if input param is invalid and * child's usb_device pointer if non-NULL. */ struct usb_device *usb_hub_find_child(struct usb_device *hdev, int port1) { struct usb_hub *hub = usb_hub_to_struct_hub(hdev); if (port1 < 1 || port1 > hdev->maxchild) return NULL; return hub->ports[port1 - 1]->child; } EXPORT_SYMBOL_GPL(usb_hub_find_child); void usb_hub_adjust_deviceremovable(struct usb_device *hdev, struct usb_hub_descriptor *desc) { struct usb_hub *hub = usb_hub_to_struct_hub(hdev); enum usb_port_connect_type connect_type; int i; if (!hub) return; if (!hub_is_superspeed(hdev)) { for (i = 1; i <= hdev->maxchild; i++) { struct usb_port *port_dev = hub->ports[i - 1]; connect_type = port_dev->connect_type; if (connect_type == USB_PORT_CONNECT_TYPE_HARD_WIRED) { u8 mask = 1 << (i%8); if (!(desc->u.hs.DeviceRemovable[i/8] & mask)) { dev_dbg(&port_dev->dev, "DeviceRemovable is changed to 1 according to platform information.\n"); desc->u.hs.DeviceRemovable[i/8] |= mask; } } } } else { u16 port_removable = le16_to_cpu(desc->u.ss.DeviceRemovable); for (i = 1; i <= hdev->maxchild; i++) { struct usb_port *port_dev = hub->ports[i - 1]; connect_type = port_dev->connect_type; if (connect_type == USB_PORT_CONNECT_TYPE_HARD_WIRED) { u16 mask = 1 << i; if (!(port_removable & mask)) { dev_dbg(&port_dev->dev, "DeviceRemovable is changed to 1 according to platform information.\n"); port_removable |= mask; } } } desc->u.ss.DeviceRemovable = cpu_to_le16(port_removable); } } #ifdef CONFIG_ACPI /** * usb_get_hub_port_acpi_handle - Get the usb port's acpi handle * @hdev: USB device belonging to the usb hub * @port1: port num of the port * * Return: Port's acpi handle if successful, %NULL if params are * invalid. */ acpi_handle usb_get_hub_port_acpi_handle(struct usb_device *hdev, int port1) { struct usb_hub *hub = usb_hub_to_struct_hub(hdev); if (!hub) return NULL; return ACPI_HANDLE(&hub->ports[port1 - 1]->dev); } #endif
13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 // SPDX-License-Identifier: GPL-2.0-only /* x_tables module for setting the IPv4/IPv6 DSCP field, Version 1.8 * * (C) 2002 by Harald Welte <laforge@netfilter.org> * based on ipt_FTOS.c (C) 2000 by Matthew G. Marsh <mgm@paktronix.com> * * See RFC2474 for a description of the DSCP field within the IP Header. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <net/dsfield.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_DSCP.h> MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("Xtables: DSCP/TOS field modification"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_DSCP"); MODULE_ALIAS("ip6t_DSCP"); MODULE_ALIAS("ipt_TOS"); MODULE_ALIAS("ip6t_TOS"); #define XT_DSCP_ECN_MASK 3u static unsigned int dscp_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_DSCP_info *dinfo = par->targinfo; u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT; if (dscp != dinfo->dscp) { if (skb_ensure_writable(skb, sizeof(struct iphdr))) return NF_DROP; ipv4_change_dsfield(ip_hdr(skb), XT_DSCP_ECN_MASK, dinfo->dscp << XT_DSCP_SHIFT); } return XT_CONTINUE; } static unsigned int dscp_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_DSCP_info *dinfo = par->targinfo; u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT; if (dscp != dinfo->dscp) { if (skb_ensure_writable(skb, sizeof(struct ipv6hdr))) return NF_DROP; ipv6_change_dsfield(ipv6_hdr(skb), XT_DSCP_ECN_MASK, dinfo->dscp << XT_DSCP_SHIFT); } return XT_CONTINUE; } static int dscp_tg_check(const struct xt_tgchk_param *par) { const struct xt_DSCP_info *info = par->targinfo; if (info->dscp > XT_DSCP_MAX) return -EDOM; return 0; } static unsigned int tos_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tos_target_info *info = par->targinfo; struct iphdr *iph = ip_hdr(skb); u_int8_t orig, nv; orig = ipv4_get_dsfield(iph); nv = (orig & ~info->tos_mask) ^ info->tos_value; if (orig != nv) { if (skb_ensure_writable(skb, sizeof(struct iphdr))) return NF_DROP; iph = ip_hdr(skb); ipv4_change_dsfield(iph, 0, nv); } return XT_CONTINUE; } static unsigned int tos_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tos_target_info *info = par->targinfo; struct ipv6hdr *iph = ipv6_hdr(skb); u_int8_t orig, nv; orig = ipv6_get_dsfield(iph); nv = (orig & ~info->tos_mask) ^ info->tos_value; if (orig != nv) { if (skb_ensure_writable(skb, sizeof(struct iphdr))) return NF_DROP; iph = ipv6_hdr(skb); ipv6_change_dsfield(iph, 0, nv); } return XT_CONTINUE; } static struct xt_target dscp_tg_reg[] __read_mostly = { { .name = "DSCP", .family = NFPROTO_IPV4, .checkentry = dscp_tg_check, .target = dscp_tg, .targetsize = sizeof(struct xt_DSCP_info), .table = "mangle", .me = THIS_MODULE, }, { .name = "DSCP", .family = NFPROTO_IPV6, .checkentry = dscp_tg_check, .target = dscp_tg6, .targetsize = sizeof(struct xt_DSCP_info), .table = "mangle", .me = THIS_MODULE, }, { .name = "TOS", .revision = 1, .family = NFPROTO_IPV4, .table = "mangle", .target = tos_tg, .targetsize = sizeof(struct xt_tos_target_info), .me = THIS_MODULE, }, { .name = "TOS", .revision = 1, .family = NFPROTO_IPV6, .table = "mangle", .target = tos_tg6, .targetsize = sizeof(struct xt_tos_target_info), .me = THIS_MODULE, }, }; static int __init dscp_tg_init(void) { return xt_register_targets(dscp_tg_reg, ARRAY_SIZE(dscp_tg_reg)); } static void __exit dscp_tg_exit(void) { xt_unregister_targets(dscp_tg_reg, ARRAY_SIZE(dscp_tg_reg)); } module_init(dscp_tg_init); module_exit(dscp_tg_exit);
16 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 // SPDX-License-Identifier: GPL-2.0-or-later /* Socket buffer accounting * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/net.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include "ar-internal.h" #define select_skb_count(skb) (&rxrpc_n_rx_skbs) /* * Note the allocation or reception of a socket buffer. */ void rxrpc_new_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) { int n = atomic_inc_return(select_skb_count(skb)); trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why); } /* * Note the re-emergence of a socket buffer from a queue or buffer. */ void rxrpc_see_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) { if (skb) { int n = atomic_read(select_skb_count(skb)); trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why); } } /* * Note the addition of a ref on a socket buffer. */ void rxrpc_get_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) { int n = atomic_inc_return(select_skb_count(skb)); trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why); skb_get(skb); } /* * Note the dropping of a ref on a socket buffer by the core. */ void rxrpc_eaten_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) { int n = atomic_inc_return(&rxrpc_n_rx_skbs); trace_rxrpc_skb(skb, 0, n, why); } /* * Note the destruction of a socket buffer. */ void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) { if (skb) { int n = atomic_dec_return(select_skb_count(skb)); trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why); consume_skb(skb); } } /* * Clear a queue of socket buffers. */ void rxrpc_purge_queue(struct sk_buff_head *list) { struct sk_buff *skb; while ((skb = skb_dequeue((list))) != NULL) { int n = atomic_dec_return(select_skb_count(skb)); trace_rxrpc_skb(skb, refcount_read(&skb->users), n, rxrpc_skb_put_purge); consume_skb(skb); } }
20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * ALSA sequencer Memory Manager * Copyright (c) 1998 by Frank van de Pol <fvdpol@coil.demon.nl> */ #ifndef __SND_SEQ_MEMORYMGR_H #define __SND_SEQ_MEMORYMGR_H #include <sound/seq_kernel.h> #include <linux/poll.h> struct snd_info_buffer; /* aliasing for legacy and UMP event packet handling */ union __snd_seq_event { struct snd_seq_event legacy; #if IS_ENABLED(CONFIG_SND_SEQ_UMP) struct snd_seq_ump_event ump; #endif struct { struct snd_seq_event event; #if IS_ENABLED(CONFIG_SND_SEQ_UMP) u32 extra; #endif } __packed raw; }; /* container for sequencer event (internal use) */ struct snd_seq_event_cell { union { struct snd_seq_event event; union __snd_seq_event ump; }; struct snd_seq_pool *pool; /* used pool */ struct snd_seq_event_cell *next; /* next cell */ }; /* design note: the pool is a contiguous block of memory, if we dynamicly want to add additional cells to the pool be better store this in another pool as we need to know the base address of the pool when releasing memory. */ struct snd_seq_pool { struct snd_seq_event_cell *ptr; /* pointer to first event chunk */ struct snd_seq_event_cell *free; /* pointer to the head of the free list */ int total_elements; /* pool size actually allocated */ atomic_t counter; /* cells free */ int size; /* pool size to be allocated */ int room; /* watermark for sleep/wakeup */ int closing; /* statistics */ int max_used; int event_alloc_nopool; int event_alloc_failures; int event_alloc_success; /* Write locking */ wait_queue_head_t output_sleep; /* Pool lock */ spinlock_t lock; }; void snd_seq_cell_free(struct snd_seq_event_cell *cell); int snd_seq_event_dup(struct snd_seq_pool *pool, struct snd_seq_event *event, struct snd_seq_event_cell **cellp, int nonblock, struct file *file, struct mutex *mutexp); /* return number of unused (free) cells */ static inline int snd_seq_unused_cells(struct snd_seq_pool *pool) { return pool ? pool->total_elements - atomic_read(&pool->counter) : 0; } /* return total number of allocated cells */ static inline int snd_seq_total_cells(struct snd_seq_pool *pool) { return pool ? pool->total_elements : 0; } /* init pool - allocate events */ int snd_seq_pool_init(struct snd_seq_pool *pool); /* done pool - free events */ void snd_seq_pool_mark_closing(struct snd_seq_pool *pool); int snd_seq_pool_done(struct snd_seq_pool *pool); /* create pool */ struct snd_seq_pool *snd_seq_pool_new(int poolsize); /* remove pool */ int snd_seq_pool_delete(struct snd_seq_pool **pool); /* polling */ int snd_seq_pool_poll_wait(struct snd_seq_pool *pool, struct file *file, poll_table *wait); void snd_seq_info_pool(struct snd_info_buffer *buffer, struct snd_seq_pool *pool, char *space); #endif
12 1 12 12 12 12 18 3 7 6 1 4 4 3 1 1 3 4 2 2 3 5 5 4 1 5 5 5 20 23 22 1 3 1 1 1 5 9 6 5 12 14 14 3 1 2 2 2 1 3 3 6 1 3 3 3 5 6 1 5 3 1 25 3 1 1 1 8 6 4 1 3 1 1 1 1 4 3 11 1 2 1 2 3 3 3 1 4 7 1 1 2 3 3 2 1 3 2 1 3 3 19 13 4 1 3 12 11 1 12 12 5 4 1 19 19 5 5 12 11 2 3 10 3 1 4 12 29 15 29 1 2 1 8 2 4 16 19 7 7 7 6 6 6 6 6 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2010-2011 EIA Electronics, // Pieter Beyens <pieter.beyens@eia.be> // Copyright (c) 2010-2011 EIA Electronics, // Kurt Van Dijck <kurt.van.dijck@eia.be> // Copyright (c) 2018 Protonic, // Robin van der Gracht <robin@protonic.nl> // Copyright (c) 2017-2019 Pengutronix, // Marc Kleine-Budde <kernel@pengutronix.de> // Copyright (c) 2017-2019 Pengutronix, // Oleksij Rempel <kernel@pengutronix.de> #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/can/can-ml.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/errqueue.h> #include <linux/if_arp.h> #include "j1939-priv.h" #define J1939_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_addr.j1939) /* conversion function between struct sock::sk_priority from linux and * j1939 priority field */ static inline priority_t j1939_prio(u32 sk_priority) { sk_priority = min(sk_priority, 7U); return 7 - sk_priority; } static inline u32 j1939_to_sk_priority(priority_t prio) { return 7 - prio; } /* function to see if pgn is to be evaluated */ static inline bool j1939_pgn_is_valid(pgn_t pgn) { return pgn <= J1939_PGN_MAX; } /* test function to avoid non-zero DA placeholder for pdu1 pgn's */ static inline bool j1939_pgn_is_clean_pdu(pgn_t pgn) { if (j1939_pgn_is_pdu1(pgn)) return !(pgn & 0xff); else return true; } static inline void j1939_sock_pending_add(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); atomic_inc(&jsk->skb_pending); } static int j1939_sock_pending_get(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); return atomic_read(&jsk->skb_pending); } void j1939_sock_pending_del(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); /* atomic_dec_return returns the new value */ if (!atomic_dec_return(&jsk->skb_pending)) wake_up(&jsk->waitq); /* no pending SKB's */ } static void j1939_jsk_add(struct j1939_priv *priv, struct j1939_sock *jsk) { jsk->state |= J1939_SOCK_BOUND; j1939_priv_get(priv); write_lock_bh(&priv->j1939_socks_lock); list_add_tail(&jsk->list, &priv->j1939_socks); write_unlock_bh(&priv->j1939_socks_lock); } static void j1939_jsk_del(struct j1939_priv *priv, struct j1939_sock *jsk) { write_lock_bh(&priv->j1939_socks_lock); list_del_init(&jsk->list); write_unlock_bh(&priv->j1939_socks_lock); j1939_priv_put(priv); jsk->state &= ~J1939_SOCK_BOUND; } static bool j1939_sk_queue_session(struct j1939_session *session) { struct j1939_sock *jsk = j1939_sk(session->sk); bool empty; spin_lock_bh(&jsk->sk_session_queue_lock); empty = list_empty(&jsk->sk_session_queue); j1939_session_get(session); list_add_tail(&session->sk_session_queue_entry, &jsk->sk_session_queue); spin_unlock_bh(&jsk->sk_session_queue_lock); j1939_sock_pending_add(&jsk->sk); return empty; } static struct j1939_session *j1939_sk_get_incomplete_session(struct j1939_sock *jsk) { struct j1939_session *session = NULL; spin_lock_bh(&jsk->sk_session_queue_lock); if (!list_empty(&jsk->sk_session_queue)) { session = list_last_entry(&jsk->sk_session_queue, struct j1939_session, sk_session_queue_entry); if (session->total_queued_size == session->total_message_size) session = NULL; else j1939_session_get(session); } spin_unlock_bh(&jsk->sk_session_queue_lock); return session; } static void j1939_sk_queue_drop_all(struct j1939_priv *priv, struct j1939_sock *jsk, int err) { struct j1939_session *session, *tmp; netdev_dbg(priv->ndev, "%s: err: %i\n", __func__, err); spin_lock_bh(&jsk->sk_session_queue_lock); list_for_each_entry_safe(session, tmp, &jsk->sk_session_queue, sk_session_queue_entry) { list_del_init(&session->sk_session_queue_entry); session->err = err; j1939_session_put(session); } spin_unlock_bh(&jsk->sk_session_queue_lock); } static void j1939_sk_queue_activate_next_locked(struct j1939_session *session) { struct j1939_sock *jsk; struct j1939_session *first; int err; /* RX-Session don't have a socket (yet) */ if (!session->sk) return; jsk = j1939_sk(session->sk); lockdep_assert_held(&jsk->sk_session_queue_lock); err = session->err; first = list_first_entry_or_null(&jsk->sk_session_queue, struct j1939_session, sk_session_queue_entry); /* Some else has already activated the next session */ if (first != session) return; activate_next: list_del_init(&first->sk_session_queue_entry); j1939_session_put(first); first = list_first_entry_or_null(&jsk->sk_session_queue, struct j1939_session, sk_session_queue_entry); if (!first) return; if (j1939_session_activate(first)) { netdev_warn_once(first->priv->ndev, "%s: 0x%p: Identical session is already activated.\n", __func__, first); first->err = -EBUSY; goto activate_next; } else { /* Give receiver some time (arbitrary chosen) to recover */ int time_ms = 0; if (err) time_ms = 10 + get_random_u32_below(16); j1939_tp_schedule_txtimer(first, time_ms); } } void j1939_sk_queue_activate_next(struct j1939_session *session) { struct j1939_sock *jsk; if (!session->sk) return; jsk = j1939_sk(session->sk); spin_lock_bh(&jsk->sk_session_queue_lock); j1939_sk_queue_activate_next_locked(session); spin_unlock_bh(&jsk->sk_session_queue_lock); } static bool j1939_sk_match_dst(struct j1939_sock *jsk, const struct j1939_sk_buff_cb *skcb) { if ((jsk->state & J1939_SOCK_PROMISC)) return true; /* Destination address filter */ if (jsk->addr.src_name && skcb->addr.dst_name) { if (jsk->addr.src_name != skcb->addr.dst_name) return false; } else { /* receive (all sockets) if * - all packages that match our bind() address * - all broadcast on a socket if SO_BROADCAST * is set */ if (j1939_address_is_unicast(skcb->addr.da)) { if (jsk->addr.sa != skcb->addr.da) return false; } else if (!sock_flag(&jsk->sk, SOCK_BROADCAST)) { /* receiving broadcast without SO_BROADCAST * flag is not allowed */ return false; } } /* Source address filter */ if (jsk->state & J1939_SOCK_CONNECTED) { /* receive (all sockets) if * - all packages that match our connect() name or address */ if (jsk->addr.dst_name && skcb->addr.src_name) { if (jsk->addr.dst_name != skcb->addr.src_name) return false; } else { if (jsk->addr.da != skcb->addr.sa) return false; } } /* PGN filter */ if (j1939_pgn_is_valid(jsk->pgn_rx_filter) && jsk->pgn_rx_filter != skcb->addr.pgn) return false; return true; } /* matches skb control buffer (addr) with a j1939 filter */ static bool j1939_sk_match_filter(struct j1939_sock *jsk, const struct j1939_sk_buff_cb *skcb) { const struct j1939_filter *f; int nfilter; spin_lock_bh(&jsk->filters_lock); f = jsk->filters; nfilter = jsk->nfilters; if (!nfilter) /* receive all when no filters are assigned */ goto filter_match_found; for (; nfilter; ++f, --nfilter) { if ((skcb->addr.pgn & f->pgn_mask) != f->pgn) continue; if ((skcb->addr.sa & f->addr_mask) != f->addr) continue; if ((skcb->addr.src_name & f->name_mask) != f->name) continue; goto filter_match_found; } spin_unlock_bh(&jsk->filters_lock); return false; filter_match_found: spin_unlock_bh(&jsk->filters_lock); return true; } static bool j1939_sk_recv_match_one(struct j1939_sock *jsk, const struct j1939_sk_buff_cb *skcb) { if (!(jsk->state & J1939_SOCK_BOUND)) return false; if (!j1939_sk_match_dst(jsk, skcb)) return false; if (!j1939_sk_match_filter(jsk, skcb)) return false; return true; } static void j1939_sk_recv_one(struct j1939_sock *jsk, struct sk_buff *oskb) { const struct j1939_sk_buff_cb *oskcb = j1939_skb_to_cb(oskb); struct j1939_sk_buff_cb *skcb; enum skb_drop_reason reason; struct sk_buff *skb; if (oskb->sk == &jsk->sk) return; if (!j1939_sk_recv_match_one(jsk, oskcb)) return; skb = skb_clone(oskb, GFP_ATOMIC); if (!skb) { pr_warn("skb clone failed\n"); return; } can_skb_set_owner(skb, oskb->sk); skcb = j1939_skb_to_cb(skb); skcb->msg_flags &= ~(MSG_DONTROUTE); if (skb->sk) skcb->msg_flags |= MSG_DONTROUTE; if (sock_queue_rcv_skb_reason(&jsk->sk, skb, &reason) < 0) sk_skb_reason_drop(&jsk->sk, skb, reason); } bool j1939_sk_recv_match(struct j1939_priv *priv, struct j1939_sk_buff_cb *skcb) { struct j1939_sock *jsk; bool match = false; read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { match = j1939_sk_recv_match_one(jsk, skcb); if (match) break; } read_unlock_bh(&priv->j1939_socks_lock); return match; } void j1939_sk_recv(struct j1939_priv *priv, struct sk_buff *skb) { struct j1939_sock *jsk; read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { j1939_sk_recv_one(jsk, skb); } read_unlock_bh(&priv->j1939_socks_lock); } static void j1939_sk_sock_destruct(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); /* This function will be called by the generic networking code, when * the socket is ultimately closed (sk->sk_destruct). * * The race between * - processing a received CAN frame * (can_receive -> j1939_can_recv) * and accessing j1939_priv * ... and ... * - closing a socket * (j1939_can_rx_unregister -> can_rx_unregister) * and calling the final j1939_priv_put() * * is avoided by calling the final j1939_priv_put() from this * RCU deferred cleanup call. */ if (jsk->priv) { j1939_priv_put(jsk->priv); jsk->priv = NULL; } /* call generic CAN sock destruct */ can_sock_destruct(sk); } static int j1939_sk_init(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); /* Ensure that "sk" is first member in "struct j1939_sock", so that we * can skip it during memset(). */ BUILD_BUG_ON(offsetof(struct j1939_sock, sk) != 0); memset((void *)jsk + sizeof(jsk->sk), 0x0, sizeof(*jsk) - sizeof(jsk->sk)); INIT_LIST_HEAD(&jsk->list); init_waitqueue_head(&jsk->waitq); jsk->sk.sk_priority = j1939_to_sk_priority(6); jsk->sk.sk_reuse = 1; /* per default */ jsk->addr.sa = J1939_NO_ADDR; jsk->addr.da = J1939_NO_ADDR; jsk->addr.pgn = J1939_NO_PGN; jsk->pgn_rx_filter = J1939_NO_PGN; atomic_set(&jsk->skb_pending, 0); spin_lock_init(&jsk->sk_session_queue_lock); INIT_LIST_HEAD(&jsk->sk_session_queue); spin_lock_init(&jsk->filters_lock); /* j1939_sk_sock_destruct() depends on SOCK_RCU_FREE flag */ sock_set_flag(sk, SOCK_RCU_FREE); sk->sk_destruct = j1939_sk_sock_destruct; sk->sk_protocol = CAN_J1939; return 0; } static int j1939_sk_sanity_check(struct sockaddr_can *addr, int len) { if (!addr) return -EDESTADDRREQ; if (len < J1939_MIN_NAMELEN) return -EINVAL; if (addr->can_family != AF_CAN) return -EINVAL; if (!addr->can_ifindex) return -ENODEV; if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn) && !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn)) return -EINVAL; return 0; } static int j1939_sk_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int len) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct j1939_sock *jsk = j1939_sk(sock->sk); struct j1939_priv *priv; struct sock *sk; struct net *net; int ret = 0; ret = j1939_sk_sanity_check(addr, len); if (ret) return ret; lock_sock(sock->sk); priv = jsk->priv; sk = sock->sk; net = sock_net(sk); /* Already bound to an interface? */ if (jsk->state & J1939_SOCK_BOUND) { /* A re-bind() to a different interface is not * supported. */ if (jsk->ifindex != addr->can_ifindex) { ret = -EINVAL; goto out_release_sock; } /* drop old references */ j1939_jsk_del(priv, jsk); j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); } else { struct can_ml_priv *can_ml; struct net_device *ndev; ndev = dev_get_by_index(net, addr->can_ifindex); if (!ndev) { ret = -ENODEV; goto out_release_sock; } if (ndev->reg_state != NETREG_REGISTERED) { dev_put(ndev); ret = -ENODEV; goto out_release_sock; } can_ml = can_get_ml_priv(ndev); if (!can_ml) { dev_put(ndev); ret = -ENODEV; goto out_release_sock; } if (!(ndev->flags & IFF_UP)) { dev_put(ndev); ret = -ENETDOWN; goto out_release_sock; } priv = j1939_netdev_start(ndev); dev_put(ndev); if (IS_ERR(priv)) { ret = PTR_ERR(priv); goto out_release_sock; } jsk->ifindex = addr->can_ifindex; /* the corresponding j1939_priv_put() is called via * sk->sk_destruct, which points to j1939_sk_sock_destruct() */ j1939_priv_get(priv); jsk->priv = priv; } /* set default transmit pgn */ if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn)) jsk->pgn_rx_filter = addr->can_addr.j1939.pgn; jsk->addr.src_name = addr->can_addr.j1939.name; jsk->addr.sa = addr->can_addr.j1939.addr; /* get new references */ ret = j1939_local_ecu_get(priv, jsk->addr.src_name, jsk->addr.sa); if (ret) { j1939_netdev_stop(priv); jsk->priv = NULL; synchronize_rcu(); j1939_priv_put(priv); goto out_release_sock; } j1939_jsk_add(priv, jsk); out_release_sock: /* fall through */ release_sock(sock->sk); return ret; } static int j1939_sk_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int len, int flags) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct j1939_sock *jsk = j1939_sk(sock->sk); int ret = 0; ret = j1939_sk_sanity_check(addr, len); if (ret) return ret; lock_sock(sock->sk); /* bind() before connect() is mandatory */ if (!(jsk->state & J1939_SOCK_BOUND)) { ret = -EINVAL; goto out_release_sock; } /* A connect() to a different interface is not supported. */ if (jsk->ifindex != addr->can_ifindex) { ret = -EINVAL; goto out_release_sock; } if (!addr->can_addr.j1939.name && addr->can_addr.j1939.addr == J1939_NO_ADDR && !sock_flag(&jsk->sk, SOCK_BROADCAST)) { /* broadcast, but SO_BROADCAST not set */ ret = -EACCES; goto out_release_sock; } jsk->addr.dst_name = addr->can_addr.j1939.name; jsk->addr.da = addr->can_addr.j1939.addr; if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn)) jsk->addr.pgn = addr->can_addr.j1939.pgn; jsk->state |= J1939_SOCK_CONNECTED; out_release_sock: /* fall through */ release_sock(sock->sk); return ret; } static void j1939_sk_sock2sockaddr_can(struct sockaddr_can *addr, const struct j1939_sock *jsk, int peer) { /* There are two holes (2 bytes and 3 bytes) to clear to avoid * leaking kernel information to user space. */ memset(addr, 0, J1939_MIN_NAMELEN); addr->can_family = AF_CAN; addr->can_ifindex = jsk->ifindex; addr->can_addr.j1939.pgn = jsk->addr.pgn; if (peer) { addr->can_addr.j1939.name = jsk->addr.dst_name; addr->can_addr.j1939.addr = jsk->addr.da; } else { addr->can_addr.j1939.name = jsk->addr.src_name; addr->can_addr.j1939.addr = jsk->addr.sa; } } static int j1939_sk_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); int ret = 0; lock_sock(sk); if (peer && !(jsk->state & J1939_SOCK_CONNECTED)) { ret = -EADDRNOTAVAIL; goto failure; } j1939_sk_sock2sockaddr_can(addr, jsk, peer); ret = J1939_MIN_NAMELEN; failure: release_sock(sk); return ret; } static int j1939_sk_release(struct socket *sock) { struct sock *sk = sock->sk; struct j1939_sock *jsk; if (!sk) return 0; lock_sock(sk); jsk = j1939_sk(sk); if (jsk->state & J1939_SOCK_BOUND) { struct j1939_priv *priv = jsk->priv; if (wait_event_interruptible(jsk->waitq, !j1939_sock_pending_get(&jsk->sk))) { j1939_cancel_active_session(priv, sk); j1939_sk_queue_drop_all(priv, jsk, ESHUTDOWN); } j1939_jsk_del(priv, jsk); j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); j1939_netdev_stop(priv); } kfree(jsk->filters); sock_orphan(sk); sock->sk = NULL; release_sock(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); sock_put(sk); return 0; } static int j1939_sk_setsockopt_flag(struct j1939_sock *jsk, sockptr_t optval, unsigned int optlen, int flag) { int tmp; if (optlen != sizeof(tmp)) return -EINVAL; if (copy_from_sockptr(&tmp, optval, optlen)) return -EFAULT; lock_sock(&jsk->sk); if (tmp) jsk->state |= flag; else jsk->state &= ~flag; release_sock(&jsk->sk); return tmp; } static int j1939_sk_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); int tmp, count = 0, ret = 0; struct j1939_filter *filters = NULL, *ofilters; if (level != SOL_CAN_J1939) return -EINVAL; switch (optname) { case SO_J1939_FILTER: if (!sockptr_is_null(optval) && optlen != 0) { struct j1939_filter *f; int c; if (optlen % sizeof(*filters) != 0) return -EINVAL; if (optlen > J1939_FILTER_MAX * sizeof(struct j1939_filter)) return -EINVAL; count = optlen / sizeof(*filters); filters = memdup_sockptr(optval, optlen); if (IS_ERR(filters)) return PTR_ERR(filters); for (f = filters, c = count; c; f++, c--) { f->name &= f->name_mask; f->pgn &= f->pgn_mask; f->addr &= f->addr_mask; } } lock_sock(&jsk->sk); spin_lock_bh(&jsk->filters_lock); ofilters = jsk->filters; jsk->filters = filters; jsk->nfilters = count; spin_unlock_bh(&jsk->filters_lock); release_sock(&jsk->sk); kfree(ofilters); return 0; case SO_J1939_PROMISC: return j1939_sk_setsockopt_flag(jsk, optval, optlen, J1939_SOCK_PROMISC); case SO_J1939_ERRQUEUE: ret = j1939_sk_setsockopt_flag(jsk, optval, optlen, J1939_SOCK_ERRQUEUE); if (ret < 0) return ret; if (!(jsk->state & J1939_SOCK_ERRQUEUE)) skb_queue_purge(&sk->sk_error_queue); return ret; case SO_J1939_SEND_PRIO: if (optlen != sizeof(tmp)) return -EINVAL; if (copy_from_sockptr(&tmp, optval, optlen)) return -EFAULT; if (tmp < 0 || tmp > 7) return -EDOM; if (tmp < 2 && !capable(CAP_NET_ADMIN)) return -EPERM; lock_sock(&jsk->sk); jsk->sk.sk_priority = j1939_to_sk_priority(tmp); release_sock(&jsk->sk); return 0; default: return -ENOPROTOOPT; } } static int j1939_sk_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); int ret, ulen; /* set defaults for using 'int' properties */ int tmp = 0; int len = sizeof(tmp); void *val = &tmp; if (level != SOL_CAN_J1939) return -EINVAL; if (get_user(ulen, optlen)) return -EFAULT; if (ulen < 0) return -EINVAL; lock_sock(&jsk->sk); switch (optname) { case SO_J1939_PROMISC: tmp = (jsk->state & J1939_SOCK_PROMISC) ? 1 : 0; break; case SO_J1939_ERRQUEUE: tmp = (jsk->state & J1939_SOCK_ERRQUEUE) ? 1 : 0; break; case SO_J1939_SEND_PRIO: tmp = j1939_prio(jsk->sk.sk_priority); break; default: ret = -ENOPROTOOPT; goto no_copy; } /* copy to user, based on 'len' & 'val' * but most sockopt's are 'int' properties, and have 'len' & 'val' * left unchanged, but instead modified 'tmp' */ if (len > ulen) ret = -EFAULT; else if (put_user(len, optlen)) ret = -EFAULT; else if (copy_to_user(optval, val, len)) ret = -EFAULT; else ret = 0; no_copy: release_sock(&jsk->sk); return ret; } static int j1939_sk_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; struct j1939_sk_buff_cb *skcb; int ret = 0; if (flags & ~(MSG_DONTWAIT | MSG_ERRQUEUE | MSG_CMSG_COMPAT)) return -EINVAL; if (flags & MSG_ERRQUEUE) return sock_recv_errqueue(sock->sk, msg, size, SOL_CAN_J1939, SCM_J1939_ERRQUEUE); skb = skb_recv_datagram(sk, flags, &ret); if (!skb) return ret; if (size < skb->len) msg->msg_flags |= MSG_TRUNC; else size = skb->len; ret = memcpy_to_msg(msg, skb->data, size); if (ret < 0) { skb_free_datagram(sk, skb); return ret; } skcb = j1939_skb_to_cb(skb); if (j1939_address_is_valid(skcb->addr.da)) put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_DEST_ADDR, sizeof(skcb->addr.da), &skcb->addr.da); if (skcb->addr.dst_name) put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_DEST_NAME, sizeof(skcb->addr.dst_name), &skcb->addr.dst_name); put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_PRIO, sizeof(skcb->priority), &skcb->priority); if (msg->msg_name) { struct sockaddr_can *paddr = msg->msg_name; msg->msg_namelen = J1939_MIN_NAMELEN; memset(msg->msg_name, 0, msg->msg_namelen); paddr->can_family = AF_CAN; paddr->can_ifindex = skb->skb_iif; paddr->can_addr.j1939.name = skcb->addr.src_name; paddr->can_addr.j1939.addr = skcb->addr.sa; paddr->can_addr.j1939.pgn = skcb->addr.pgn; } sock_recv_cmsgs(msg, sk, skb); msg->msg_flags |= skcb->msg_flags; skb_free_datagram(sk, skb); return size; } static struct sk_buff *j1939_sk_alloc_skb(struct net_device *ndev, struct sock *sk, struct msghdr *msg, size_t size, int *errcode) { struct j1939_sock *jsk = j1939_sk(sk); struct j1939_sk_buff_cb *skcb; struct sk_buff *skb; int ret; skb = sock_alloc_send_skb(sk, size + sizeof(struct can_frame) - sizeof(((struct can_frame *)NULL)->data) + sizeof(struct can_skb_priv), msg->msg_flags & MSG_DONTWAIT, &ret); if (!skb) goto failure; can_skb_reserve(skb); can_skb_prv(skb)->ifindex = ndev->ifindex; can_skb_prv(skb)->skbcnt = 0; skb_reserve(skb, offsetof(struct can_frame, data)); ret = memcpy_from_msg(skb_put(skb, size), msg, size); if (ret < 0) goto free_skb; skb->dev = ndev; skcb = j1939_skb_to_cb(skb); memset(skcb, 0, sizeof(*skcb)); skcb->addr = jsk->addr; skcb->priority = j1939_prio(READ_ONCE(sk->sk_priority)); if (msg->msg_name) { struct sockaddr_can *addr = msg->msg_name; if (addr->can_addr.j1939.name || addr->can_addr.j1939.addr != J1939_NO_ADDR) { skcb->addr.dst_name = addr->can_addr.j1939.name; skcb->addr.da = addr->can_addr.j1939.addr; } if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn)) skcb->addr.pgn = addr->can_addr.j1939.pgn; } *errcode = ret; return skb; free_skb: kfree_skb(skb); failure: *errcode = ret; return NULL; } static size_t j1939_sk_opt_stats_get_size(enum j1939_sk_errqueue_type type) { switch (type) { case J1939_ERRQUEUE_RX_RTS: return nla_total_size(sizeof(u32)) + /* J1939_NLA_TOTAL_SIZE */ nla_total_size(sizeof(u32)) + /* J1939_NLA_PGN */ nla_total_size(sizeof(u64)) + /* J1939_NLA_SRC_NAME */ nla_total_size(sizeof(u64)) + /* J1939_NLA_DEST_NAME */ nla_total_size(sizeof(u8)) + /* J1939_NLA_SRC_ADDR */ nla_total_size(sizeof(u8)) + /* J1939_NLA_DEST_ADDR */ 0; default: return nla_total_size(sizeof(u32)) + /* J1939_NLA_BYTES_ACKED */ 0; } } static struct sk_buff * j1939_sk_get_timestamping_opt_stats(struct j1939_session *session, enum j1939_sk_errqueue_type type) { struct sk_buff *stats; u32 size; stats = alloc_skb(j1939_sk_opt_stats_get_size(type), GFP_ATOMIC); if (!stats) return NULL; if (session->skcb.addr.type == J1939_SIMPLE) size = session->total_message_size; else size = min(session->pkt.tx_acked * 7, session->total_message_size); switch (type) { case J1939_ERRQUEUE_RX_RTS: nla_put_u32(stats, J1939_NLA_TOTAL_SIZE, session->total_message_size); nla_put_u32(stats, J1939_NLA_PGN, session->skcb.addr.pgn); nla_put_u64_64bit(stats, J1939_NLA_SRC_NAME, session->skcb.addr.src_name, J1939_NLA_PAD); nla_put_u64_64bit(stats, J1939_NLA_DEST_NAME, session->skcb.addr.dst_name, J1939_NLA_PAD); nla_put_u8(stats, J1939_NLA_SRC_ADDR, session->skcb.addr.sa); nla_put_u8(stats, J1939_NLA_DEST_ADDR, session->skcb.addr.da); break; default: nla_put_u32(stats, J1939_NLA_BYTES_ACKED, size); } return stats; } static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk, enum j1939_sk_errqueue_type type) { struct j1939_priv *priv = session->priv; struct j1939_sock *jsk; struct sock_exterr_skb *serr; struct sk_buff *skb; char *state = "UNK"; u32 tsflags; int err; jsk = j1939_sk(sk); if (!(jsk->state & J1939_SOCK_ERRQUEUE)) return; tsflags = READ_ONCE(sk->sk_tsflags); switch (type) { case J1939_ERRQUEUE_TX_ACK: if (!(tsflags & SOF_TIMESTAMPING_TX_ACK)) return; break; case J1939_ERRQUEUE_TX_SCHED: if (!(tsflags & SOF_TIMESTAMPING_TX_SCHED)) return; break; case J1939_ERRQUEUE_TX_ABORT: break; case J1939_ERRQUEUE_RX_RTS: fallthrough; case J1939_ERRQUEUE_RX_DPO: fallthrough; case J1939_ERRQUEUE_RX_ABORT: if (!(tsflags & SOF_TIMESTAMPING_RX_SOFTWARE)) return; break; default: netdev_err(priv->ndev, "Unknown errqueue type %i\n", type); } skb = j1939_sk_get_timestamping_opt_stats(session, type); if (!skb) return; skb->tstamp = ktime_get_real(); BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); switch (type) { case J1939_ERRQUEUE_TX_ACK: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = SCM_TSTAMP_ACK; state = "TX ACK"; break; case J1939_ERRQUEUE_TX_SCHED: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = SCM_TSTAMP_SCHED; state = "TX SCH"; break; case J1939_ERRQUEUE_TX_ABORT: serr->ee.ee_errno = session->err; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_TX_ABORT; state = "TX ABT"; break; case J1939_ERRQUEUE_RX_RTS: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_RX_RTS; state = "RX RTS"; break; case J1939_ERRQUEUE_RX_DPO: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_RX_DPO; state = "RX DPO"; break; case J1939_ERRQUEUE_RX_ABORT: serr->ee.ee_errno = session->err; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_RX_ABORT; state = "RX ABT"; break; } serr->opt_stats = true; if (tsflags & SOF_TIMESTAMPING_OPT_ID) serr->ee.ee_data = session->tskey; netdev_dbg(session->priv->ndev, "%s: 0x%p tskey: %i, state: %s\n", __func__, session, session->tskey, state); err = sock_queue_err_skb(sk, skb); if (err) kfree_skb(skb); }; void j1939_sk_errqueue(struct j1939_session *session, enum j1939_sk_errqueue_type type) { struct j1939_priv *priv = session->priv; struct j1939_sock *jsk; if (session->sk) { /* send TX notifications to the socket of origin */ __j1939_sk_errqueue(session, session->sk, type); return; } /* spread RX notifications to all sockets subscribed to this session */ read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { if (j1939_sk_recv_match_one(jsk, &session->skcb)) __j1939_sk_errqueue(session, &jsk->sk, type); } read_unlock_bh(&priv->j1939_socks_lock); }; void j1939_sk_send_loop_abort(struct sock *sk, int err) { struct j1939_sock *jsk = j1939_sk(sk); if (jsk->state & J1939_SOCK_ERRQUEUE) return; sk->sk_err = err; sk_error_report(sk); } static int j1939_sk_send_loop(struct j1939_priv *priv, struct sock *sk, struct msghdr *msg, size_t size) { struct j1939_sock *jsk = j1939_sk(sk); struct j1939_session *session = j1939_sk_get_incomplete_session(jsk); struct sk_buff *skb; size_t segment_size, todo_size; int ret = 0; if (session && session->total_message_size != session->total_queued_size + size) { j1939_session_put(session); return -EIO; } todo_size = size; do { struct j1939_sk_buff_cb *skcb; segment_size = min_t(size_t, J1939_MAX_TP_PACKET_SIZE, todo_size); /* Allocate skb for one segment */ skb = j1939_sk_alloc_skb(priv->ndev, sk, msg, segment_size, &ret); if (ret) break; skcb = j1939_skb_to_cb(skb); if (!session) { /* at this point the size should be full size * of the session */ skcb->offset = 0; session = j1939_tp_send(priv, skb, size); if (IS_ERR(session)) { ret = PTR_ERR(session); goto kfree_skb; } if (j1939_sk_queue_session(session)) { /* try to activate session if we a * fist in the queue */ if (!j1939_session_activate(session)) { j1939_tp_schedule_txtimer(session, 0); } else { ret = -EBUSY; session->err = ret; j1939_sk_queue_drop_all(priv, jsk, EBUSY); break; } } } else { skcb->offset = session->total_queued_size; j1939_session_skb_queue(session, skb); } todo_size -= segment_size; session->total_queued_size += segment_size; } while (todo_size); switch (ret) { case 0: /* OK */ if (todo_size) netdev_warn(priv->ndev, "no error found and not completely queued?! %zu\n", todo_size); ret = size; break; case -ERESTARTSYS: ret = -EINTR; fallthrough; case -EAGAIN: /* OK */ if (todo_size != size) ret = size - todo_size; break; default: /* ERROR */ break; } if (session) j1939_session_put(session); return ret; kfree_skb: kfree_skb(skb); return ret; } static int j1939_sk_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); struct j1939_priv *priv; int ifindex; int ret; lock_sock(sock->sk); /* various socket state tests */ if (!(jsk->state & J1939_SOCK_BOUND)) { ret = -EBADFD; goto sendmsg_done; } priv = jsk->priv; ifindex = jsk->ifindex; if (!jsk->addr.src_name && jsk->addr.sa == J1939_NO_ADDR) { /* no source address assigned yet */ ret = -EBADFD; goto sendmsg_done; } /* deal with provided destination address info */ if (msg->msg_name) { struct sockaddr_can *addr = msg->msg_name; if (msg->msg_namelen < J1939_MIN_NAMELEN) { ret = -EINVAL; goto sendmsg_done; } if (addr->can_family != AF_CAN) { ret = -EINVAL; goto sendmsg_done; } if (addr->can_ifindex && addr->can_ifindex != ifindex) { ret = -EBADFD; goto sendmsg_done; } if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn) && !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn)) { ret = -EINVAL; goto sendmsg_done; } if (!addr->can_addr.j1939.name && addr->can_addr.j1939.addr == J1939_NO_ADDR && !sock_flag(sk, SOCK_BROADCAST)) { /* broadcast, but SO_BROADCAST not set */ ret = -EACCES; goto sendmsg_done; } } else { if (!jsk->addr.dst_name && jsk->addr.da == J1939_NO_ADDR && !sock_flag(sk, SOCK_BROADCAST)) { /* broadcast, but SO_BROADCAST not set */ ret = -EACCES; goto sendmsg_done; } } ret = j1939_sk_send_loop(priv, sk, msg, size); sendmsg_done: release_sock(sock->sk); return ret; } void j1939_sk_netdev_event_netdown(struct j1939_priv *priv) { struct j1939_sock *jsk; int error_code = ENETDOWN; read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { jsk->sk.sk_err = error_code; if (!sock_flag(&jsk->sk, SOCK_DEAD)) sk_error_report(&jsk->sk); j1939_sk_queue_drop_all(priv, jsk, error_code); } read_unlock_bh(&priv->j1939_socks_lock); } void j1939_sk_netdev_event_unregister(struct j1939_priv *priv) { struct sock *sk; struct j1939_sock *jsk; bool wait_rcu = false; rescan: /* The caller is holding a ref on this "priv" via j1939_priv_get_by_ndev(). */ read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { /* Skip if j1939_jsk_add() is not called on this socket. */ if (!(jsk->state & J1939_SOCK_BOUND)) continue; sk = &jsk->sk; sock_hold(sk); read_unlock_bh(&priv->j1939_socks_lock); /* Check if j1939_jsk_del() is not yet called on this socket after holding * socket's lock, for both j1939_sk_bind() and j1939_sk_release() call * j1939_jsk_del() with socket's lock held. */ lock_sock(sk); if (jsk->state & J1939_SOCK_BOUND) { /* Neither j1939_sk_bind() nor j1939_sk_release() called j1939_jsk_del(). * Make this socket no longer bound, by pretending as if j1939_sk_bind() * dropped old references but did not get new references. */ j1939_jsk_del(priv, jsk); j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); j1939_netdev_stop(priv); /* Call j1939_priv_put() now and prevent j1939_sk_sock_destruct() from * calling the corresponding j1939_priv_put(). * * j1939_sk_sock_destruct() is supposed to call j1939_priv_put() after * an RCU grace period. But since the caller is holding a ref on this * "priv", we can defer synchronize_rcu() until immediately before * the caller calls j1939_priv_put(). */ j1939_priv_put(priv); jsk->priv = NULL; wait_rcu = true; } release_sock(sk); sock_put(sk); goto rescan; } read_unlock_bh(&priv->j1939_socks_lock); if (wait_rcu) synchronize_rcu(); } static int j1939_sk_no_ioctlcmd(struct socket *sock, unsigned int cmd, unsigned long arg) { /* no ioctls for socket layer -> hand it down to NIC layer */ return -ENOIOCTLCMD; } static const struct proto_ops j1939_ops = { .family = PF_CAN, .release = j1939_sk_release, .bind = j1939_sk_bind, .connect = j1939_sk_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = j1939_sk_getname, .poll = datagram_poll, .ioctl = j1939_sk_no_ioctlcmd, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = j1939_sk_setsockopt, .getsockopt = j1939_sk_getsockopt, .sendmsg = j1939_sk_sendmsg, .recvmsg = j1939_sk_recvmsg, .mmap = sock_no_mmap, }; static struct proto j1939_proto __read_mostly = { .name = "CAN_J1939", .owner = THIS_MODULE, .obj_size = sizeof(struct j1939_sock), .init = j1939_sk_init, }; const struct can_proto j1939_can_proto = { .type = SOCK_DGRAM, .protocol = CAN_J1939, .ops = &j1939_ops, .prot = &j1939_proto, };
709 272 579 181 308 710 1 66 375 361 412 75 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HIGHMEM_H #define _LINUX_HIGHMEM_H #include <linux/fs.h> #include <linux/kernel.h> #include <linux/bug.h> #include <linux/cacheflush.h> #include <linux/kmsan.h> #include <linux/mm.h> #include <linux/uaccess.h> #include <linux/hardirq.h> #include "highmem-internal.h" /** * kmap - Map a page for long term usage * @page: Pointer to the page to be mapped * * Returns: The virtual address of the mapping * * Can only be invoked from preemptible task context because on 32bit * systems with CONFIG_HIGHMEM enabled this function might sleep. * * For systems with CONFIG_HIGHMEM=n and for pages in the low memory area * this returns the virtual address of the direct kernel mapping. * * The returned virtual address is globally visible and valid up to the * point where it is unmapped via kunmap(). The pointer can be handed to * other contexts. * * For highmem pages on 32bit systems this can be slow as the mapping space * is limited and protected by a global lock. In case that there is no * mapping slot available the function blocks until a slot is released via * kunmap(). */ static inline void *kmap(struct page *page); /** * kunmap - Unmap the virtual address mapped by kmap() * @page: Pointer to the page which was mapped by kmap() * * Counterpart to kmap(). A NOOP for CONFIG_HIGHMEM=n and for mappings of * pages in the low memory area. */ static inline void kunmap(const struct page *page); /** * kmap_to_page - Get the page for a kmap'ed address * @addr: The address to look up * * Returns: The page which is mapped to @addr. */ static inline struct page *kmap_to_page(void *addr); /** * kmap_flush_unused - Flush all unused kmap mappings in order to * remove stray mappings */ static inline void kmap_flush_unused(void); /** * kmap_local_page - Map a page for temporary usage * @page: Pointer to the page to be mapped * * Returns: The virtual address of the mapping * * Can be invoked from any context, including interrupts. * * Requires careful handling when nesting multiple mappings because the map * management is stack based. The unmap has to be in the reverse order of * the map operation: * * addr1 = kmap_local_page(page1); * addr2 = kmap_local_page(page2); * ... * kunmap_local(addr2); * kunmap_local(addr1); * * Unmapping addr1 before addr2 is invalid and causes malfunction. * * Contrary to kmap() mappings the mapping is only valid in the context of * the caller and cannot be handed to other contexts. * * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the * virtual address of the direct mapping. Only real highmem pages are * temporarily mapped. * * While kmap_local_page() is significantly faster than kmap() for the highmem * case it comes with restrictions about the pointer validity. * * On HIGHMEM enabled systems mapping a highmem page has the side effect of * disabling migration in order to keep the virtual address stable across * preemption. No caller of kmap_local_page() can rely on this side effect. */ static inline void *kmap_local_page(const struct page *page); /** * kmap_local_folio - Map a page in this folio for temporary usage * @folio: The folio containing the page. * @offset: The byte offset within the folio which identifies the page. * * Requires careful handling when nesting multiple mappings because the map * management is stack based. The unmap has to be in the reverse order of * the map operation:: * * addr1 = kmap_local_folio(folio1, offset1); * addr2 = kmap_local_folio(folio2, offset2); * ... * kunmap_local(addr2); * kunmap_local(addr1); * * Unmapping addr1 before addr2 is invalid and causes malfunction. * * Contrary to kmap() mappings the mapping is only valid in the context of * the caller and cannot be handed to other contexts. * * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the * virtual address of the direct mapping. Only real highmem pages are * temporarily mapped. * * While it is significantly faster than kmap() for the highmem case it * comes with restrictions about the pointer validity. * * On HIGHMEM enabled systems mapping a highmem page has the side effect of * disabling migration in order to keep the virtual address stable across * preemption. No caller of kmap_local_folio() can rely on this side effect. * * Context: Can be invoked from any context. * Return: The virtual address of @offset. */ static inline void *kmap_local_folio(const struct folio *folio, size_t offset); /** * kmap_atomic - Atomically map a page for temporary usage - Deprecated! * @page: Pointer to the page to be mapped * * Returns: The virtual address of the mapping * * In fact a wrapper around kmap_local_page() which also disables pagefaults * and, depending on PREEMPT_RT configuration, also CPU migration and * preemption. Therefore users should not count on the latter two side effects. * * Mappings should always be released by kunmap_atomic(). * * Do not use in new code. Use kmap_local_page() instead. * * It is used in atomic context when code wants to access the contents of a * page that might be allocated from high memory (see __GFP_HIGHMEM), for * example a page in the pagecache. The API has two functions, and they * can be used in a manner similar to the following:: * * // Find the page of interest. * struct page *page = find_get_page(mapping, offset); * * // Gain access to the contents of that page. * void *vaddr = kmap_atomic(page); * * // Do something to the contents of that page. * memset(vaddr, 0, PAGE_SIZE); * * // Unmap that page. * kunmap_atomic(vaddr); * * Note that the kunmap_atomic() call takes the result of the kmap_atomic() * call, not the argument. * * If you need to map two pages because you want to copy from one page to * another you need to keep the kmap_atomic calls strictly nested, like: * * vaddr1 = kmap_atomic(page1); * vaddr2 = kmap_atomic(page2); * * memcpy(vaddr1, vaddr2, PAGE_SIZE); * * kunmap_atomic(vaddr2); * kunmap_atomic(vaddr1); */ static inline void *kmap_atomic(const struct page *page); /* Highmem related interfaces for management code */ static inline unsigned long nr_free_highpages(void); static inline unsigned long totalhigh_pages(void); #ifndef ARCH_HAS_FLUSH_ANON_PAGE static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr) { } #endif #ifndef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE static inline void flush_kernel_vmap_range(void *vaddr, int size) { } static inline void invalidate_kernel_vmap_range(void *vaddr, int size) { } #endif /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ #ifndef clear_user_highpage static inline void clear_user_highpage(struct page *page, unsigned long vaddr) { void *addr = kmap_local_page(page); clear_user_page(addr, vaddr, page); kunmap_local(addr); } #endif #ifndef vma_alloc_zeroed_movable_folio /** * vma_alloc_zeroed_movable_folio - Allocate a zeroed page for a VMA. * @vma: The VMA the page is to be allocated for. * @vaddr: The virtual address the page will be inserted into. * * This function will allocate a page suitable for inserting into this * VMA at this virtual address. It may be allocated from highmem or * the movable zone. An architecture may provide its own implementation. * * Return: A folio containing one allocated and zeroed page or NULL if * we are out of memory. */ static inline struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr) { struct folio *folio; folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr); if (folio && user_alloc_needs_zeroing()) clear_user_highpage(&folio->page, vaddr); return folio; } #endif static inline void clear_highpage(struct page *page) { void *kaddr = kmap_local_page(page); clear_page(kaddr); kunmap_local(kaddr); } static inline void clear_highpage_kasan_tagged(struct page *page) { void *kaddr = kmap_local_page(page); clear_page(kasan_reset_tag(kaddr)); kunmap_local(kaddr); } #ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGES /* Return false to let people know we did not initialize the pages */ static inline bool tag_clear_highpages(struct page *page, int numpages) { return false; } #endif /* * If we pass in a base or tail page, we can zero up to PAGE_SIZE. * If we pass in a head page, we can zero up to the size of the compound page. */ #ifdef CONFIG_HIGHMEM void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned start2, unsigned end2); #else static inline void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned start2, unsigned end2) { void *kaddr = kmap_local_page(page); unsigned int i; BUG_ON(end1 > page_size(page) || end2 > page_size(page)); if (end1 > start1) memset(kaddr + start1, 0, end1 - start1); if (end2 > start2) memset(kaddr + start2, 0, end2 - start2); kunmap_local(kaddr); for (i = 0; i < compound_nr(page); i++) flush_dcache_page(page + i); } #endif static inline void zero_user_segment(struct page *page, unsigned start, unsigned end) { zero_user_segments(page, start, end, 0, 0); } #ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE static inline void copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); copy_user_page(vto, vfrom, vaddr, to); kmsan_unpoison_memory(page_address(to), PAGE_SIZE); kunmap_local(vto); kunmap_local(vfrom); } #endif #ifndef __HAVE_ARCH_COPY_HIGHPAGE static inline void copy_highpage(struct page *to, struct page *from) { char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); copy_page(vto, vfrom); kmsan_copy_page_meta(to, from); kunmap_local(vto); kunmap_local(vfrom); } #endif #ifdef copy_mc_to_kernel /* * If architecture supports machine check exception handling, define the * #MC versions of copy_user_highpage and copy_highpage. They copy a memory * page with #MC in source page (@from) handled, and return the number * of bytes not copied if there was a #MC, otherwise 0 for success. */ static inline int copy_mc_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { unsigned long ret; char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE); if (!ret) kmsan_unpoison_memory(page_address(to), PAGE_SIZE); kunmap_local(vto); kunmap_local(vfrom); if (ret) memory_failure_queue(page_to_pfn(from), 0); return ret; } static inline int copy_mc_highpage(struct page *to, struct page *from) { unsigned long ret; char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE); if (!ret) kmsan_copy_page_meta(to, from); kunmap_local(vto); kunmap_local(vfrom); if (ret) memory_failure_queue(page_to_pfn(from), 0); return ret; } #else static inline int copy_mc_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { copy_user_highpage(to, from, vaddr, vma); return 0; } static inline int copy_mc_highpage(struct page *to, struct page *from) { copy_highpage(to, from); return 0; } #endif static inline void memcpy_page(struct page *dst_page, size_t dst_off, struct page *src_page, size_t src_off, size_t len) { char *dst = kmap_local_page(dst_page); char *src = kmap_local_page(src_page); VM_BUG_ON(dst_off + len > PAGE_SIZE || src_off + len > PAGE_SIZE); memcpy(dst + dst_off, src + src_off, len); kunmap_local(src); kunmap_local(dst); } static inline void memcpy_folio(struct folio *dst_folio, size_t dst_off, struct folio *src_folio, size_t src_off, size_t len) { VM_BUG_ON(dst_off + len > folio_size(dst_folio)); VM_BUG_ON(src_off + len > folio_size(src_folio)); do { char *dst = kmap_local_folio(dst_folio, dst_off); const char *src = kmap_local_folio(src_folio, src_off); size_t chunk = len; if (folio_test_highmem(dst_folio) && chunk > PAGE_SIZE - offset_in_page(dst_off)) chunk = PAGE_SIZE - offset_in_page(dst_off); if (folio_test_highmem(src_folio) && chunk > PAGE_SIZE - offset_in_page(src_off)) chunk = PAGE_SIZE - offset_in_page(src_off); memcpy(dst, src, chunk); kunmap_local(src); kunmap_local(dst); dst_off += chunk; src_off += chunk; len -= chunk; } while (len > 0); } static inline void memset_page(struct page *page, size_t offset, int val, size_t len) { char *addr = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memset(addr + offset, val, len); kunmap_local(addr); } static inline void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len) { char *from = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memcpy(to, from + offset, len); kunmap_local(from); } static inline void memcpy_to_page(struct page *page, size_t offset, const char *from, size_t len) { char *to = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memcpy(to + offset, from, len); flush_dcache_page(page); kunmap_local(to); } static inline void memzero_page(struct page *page, size_t offset, size_t len) { char *addr = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memset(addr + offset, 0, len); flush_dcache_page(page); kunmap_local(addr); } /** * memcpy_from_folio - Copy a range of bytes from a folio. * @to: The memory to copy to. * @folio: The folio to read from. * @offset: The first byte in the folio to read. * @len: The number of bytes to copy. */ static inline void memcpy_from_folio(char *to, struct folio *folio, size_t offset, size_t len) { VM_BUG_ON(offset + len > folio_size(folio)); do { const char *from = kmap_local_folio(folio, offset); size_t chunk = len; if (folio_test_partial_kmap(folio) && chunk > PAGE_SIZE - offset_in_page(offset)) chunk = PAGE_SIZE - offset_in_page(offset); memcpy(to, from, chunk); kunmap_local(from); to += chunk; offset += chunk; len -= chunk; } while (len > 0); } /** * memcpy_to_folio - Copy a range of bytes to a folio. * @folio: The folio to write to. * @offset: The first byte in the folio to store to. * @from: The memory to copy from. * @len: The number of bytes to copy. */ static inline void memcpy_to_folio(struct folio *folio, size_t offset, const char *from, size_t len) { VM_BUG_ON(offset + len > folio_size(folio)); do { char *to = kmap_local_folio(folio, offset); size_t chunk = len; if (folio_test_partial_kmap(folio) && chunk > PAGE_SIZE - offset_in_page(offset)) chunk = PAGE_SIZE - offset_in_page(offset); memcpy(to, from, chunk); kunmap_local(to); from += chunk; offset += chunk; len -= chunk; } while (len > 0); flush_dcache_folio(folio); } /** * folio_zero_tail - Zero the tail of a folio. * @folio: The folio to zero. * @offset: The byte offset in the folio to start zeroing at. * @kaddr: The address the folio is currently mapped to. * * If you have already used kmap_local_folio() to map a folio, written * some data to it and now need to zero the end of the folio (and flush * the dcache), you can use this function. If you do not have the * folio kmapped (eg the folio has been partially populated by DMA), * use folio_zero_range() or folio_zero_segment() instead. * * Return: An address which can be passed to kunmap_local(). */ static inline __must_check void *folio_zero_tail(struct folio *folio, size_t offset, void *kaddr) { size_t len = folio_size(folio) - offset; if (folio_test_partial_kmap(folio)) { size_t max = PAGE_SIZE - offset_in_page(offset); while (len > max) { memset(kaddr, 0, max); kunmap_local(kaddr); len -= max; offset += max; max = PAGE_SIZE; kaddr = kmap_local_folio(folio, offset); } } memset(kaddr, 0, len); flush_dcache_folio(folio); return kaddr; } /** * folio_fill_tail - Copy some data to a folio and pad with zeroes. * @folio: The destination folio. * @offset: The offset into @folio at which to start copying. * @from: The data to copy. * @len: How many bytes of data to copy. * * This function is most useful for filesystems which support inline data. * When they want to copy data from the inode into the page cache, this * function does everything for them. It supports large folios even on * HIGHMEM configurations. */ static inline void folio_fill_tail(struct folio *folio, size_t offset, const char *from, size_t len) { char *to = kmap_local_folio(folio, offset); VM_BUG_ON(offset + len > folio_size(folio)); if (folio_test_partial_kmap(folio)) { size_t max = PAGE_SIZE - offset_in_page(offset); while (len > max) { memcpy(to, from, max); kunmap_local(to); len -= max; from += max; offset += max; max = PAGE_SIZE; to = kmap_local_folio(folio, offset); } } memcpy(to, from, len); to = folio_zero_tail(folio, offset + len, to + len); kunmap_local(to); } /** * memcpy_from_file_folio - Copy some bytes from a file folio. * @to: The destination buffer. * @folio: The folio to copy from. * @pos: The position in the file. * @len: The maximum number of bytes to copy. * * Copy up to @len bytes from this folio. This may be limited by PAGE_SIZE * if the folio comes from HIGHMEM, and by the size of the folio. * * Return: The number of bytes copied from the folio. */ static inline size_t memcpy_from_file_folio(char *to, struct folio *folio, loff_t pos, size_t len) { size_t offset = offset_in_folio(folio, pos); char *from = kmap_local_folio(folio, offset); if (folio_test_partial_kmap(folio)) { offset = offset_in_page(offset); len = min_t(size_t, len, PAGE_SIZE - offset); } else len = min(len, folio_size(folio) - offset); memcpy(to, from, len); kunmap_local(from); return len; } /** * folio_zero_segments() - Zero two byte ranges in a folio. * @folio: The folio to write to. * @start1: The first byte to zero. * @xend1: One more than the last byte in the first range. * @start2: The first byte to zero in the second range. * @xend2: One more than the last byte in the second range. */ static inline void folio_zero_segments(struct folio *folio, size_t start1, size_t xend1, size_t start2, size_t xend2) { zero_user_segments(&folio->page, start1, xend1, start2, xend2); } /** * folio_zero_segment() - Zero a byte range in a folio. * @folio: The folio to write to. * @start: The first byte to zero. * @xend: One more than the last byte to zero. */ static inline void folio_zero_segment(struct folio *folio, size_t start, size_t xend) { zero_user_segments(&folio->page, start, xend, 0, 0); } /** * folio_zero_range() - Zero a byte range in a folio. * @folio: The folio to write to. * @start: The first byte to zero. * @length: The number of bytes to zero. */ static inline void folio_zero_range(struct folio *folio, size_t start, size_t length) { zero_user_segments(&folio->page, start, start + length, 0, 0); } /** * folio_release_kmap - Unmap a folio and drop a refcount. * @folio: The folio to release. * @addr: The address previously returned by a call to kmap_local_folio(). * * It is common, eg in directory handling to kmap a folio. This function * unmaps the folio and drops the refcount that was being held to keep the * folio alive while we accessed it. */ static inline void folio_release_kmap(struct folio *folio, void *addr) { kunmap_local(addr); folio_put(folio); } #endif /* _LINUX_HIGHMEM_H */
25 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 /* SPDX-License-Identifier: GPL-2.0 */ /* * Tick related global functions */ #ifndef _LINUX_TICK_H #define _LINUX_TICK_H #include <linux/clockchips.h> #include <linux/irqflags.h> #include <linux/percpu.h> #include <linux/context_tracking_state.h> #include <linux/cpumask.h> #include <linux/sched.h> #include <linux/rcupdate.h> #include <linux/static_key.h> #ifdef CONFIG_GENERIC_CLOCKEVENTS extern void __init tick_init(void); /* Should be core only, but ARM BL switcher requires it */ extern void tick_suspend_local(void); /* Should be core only, but XEN resume magic and ARM BL switcher require it */ extern void tick_resume_local(void); #else /* CONFIG_GENERIC_CLOCKEVENTS */ static inline void tick_init(void) { } static inline void tick_suspend_local(void) { } static inline void tick_resume_local(void) { } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ #if defined(CONFIG_GENERIC_CLOCKEVENTS) && defined(CONFIG_HOTPLUG_CPU) extern int tick_cpu_dying(unsigned int cpu); extern void tick_assert_timekeeping_handover(void); #else #define tick_cpu_dying NULL static inline void tick_assert_timekeeping_handover(void) { } #endif #if defined(CONFIG_GENERIC_CLOCKEVENTS) && defined(CONFIG_SUSPEND) extern void tick_freeze(void); extern void tick_unfreeze(void); #else static inline void tick_freeze(void) { } static inline void tick_unfreeze(void) { } #endif #ifdef CONFIG_TICK_ONESHOT extern void tick_irq_enter(void); # ifndef arch_needs_cpu # define arch_needs_cpu() (0) # endif # else static inline void tick_irq_enter(void) { } #endif #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) extern void hotplug_cpu__broadcast_tick_pull(int dead_cpu); #else static inline void hotplug_cpu__broadcast_tick_pull(int dead_cpu) { } #endif enum tick_broadcast_mode { TICK_BROADCAST_OFF, TICK_BROADCAST_ON, TICK_BROADCAST_FORCE, }; enum tick_broadcast_state { TICK_BROADCAST_EXIT, TICK_BROADCAST_ENTER, }; extern struct static_key_false arch_needs_tick_broadcast; #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST extern void tick_broadcast_control(enum tick_broadcast_mode mode); #else static inline void tick_broadcast_control(enum tick_broadcast_mode mode) { } #endif /* BROADCAST */ #ifdef CONFIG_GENERIC_CLOCKEVENTS extern int tick_broadcast_oneshot_control(enum tick_broadcast_state state); #else static inline int tick_broadcast_oneshot_control(enum tick_broadcast_state state) { return 0; } #endif static inline void tick_broadcast_enable(void) { tick_broadcast_control(TICK_BROADCAST_ON); } static inline void tick_broadcast_disable(void) { tick_broadcast_control(TICK_BROADCAST_OFF); } static inline void tick_broadcast_force(void) { tick_broadcast_control(TICK_BROADCAST_FORCE); } static inline int tick_broadcast_enter(void) { return tick_broadcast_oneshot_control(TICK_BROADCAST_ENTER); } static inline void tick_broadcast_exit(void) { tick_broadcast_oneshot_control(TICK_BROADCAST_EXIT); } enum tick_dep_bits { TICK_DEP_BIT_POSIX_TIMER = 0, TICK_DEP_BIT_PERF_EVENTS = 1, TICK_DEP_BIT_SCHED = 2, TICK_DEP_BIT_CLOCK_UNSTABLE = 3, TICK_DEP_BIT_RCU = 4, TICK_DEP_BIT_RCU_EXP = 5 }; #define TICK_DEP_BIT_MAX TICK_DEP_BIT_RCU_EXP #define TICK_DEP_MASK_NONE 0 #define TICK_DEP_MASK_POSIX_TIMER (1 << TICK_DEP_BIT_POSIX_TIMER) #define TICK_DEP_MASK_PERF_EVENTS (1 << TICK_DEP_BIT_PERF_EVENTS) #define TICK_DEP_MASK_SCHED (1 << TICK_DEP_BIT_SCHED) #define TICK_DEP_MASK_CLOCK_UNSTABLE (1 << TICK_DEP_BIT_CLOCK_UNSTABLE) #define TICK_DEP_MASK_RCU (1 << TICK_DEP_BIT_RCU) #define TICK_DEP_MASK_RCU_EXP (1 << TICK_DEP_BIT_RCU_EXP) #ifdef CONFIG_NO_HZ_COMMON extern bool tick_nohz_enabled; extern bool tick_nohz_tick_stopped(void); extern bool tick_nohz_tick_stopped_cpu(int cpu); extern void tick_nohz_idle_stop_tick(void); extern void tick_nohz_idle_retain_tick(void); extern void tick_nohz_idle_restart_tick(void); extern void tick_nohz_idle_enter(void); extern void tick_nohz_idle_exit(void); extern void tick_nohz_irq_exit(void); extern bool tick_nohz_idle_got_tick(void); extern ktime_t tick_nohz_get_next_hrtimer(void); extern ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next); extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); #else /* !CONFIG_NO_HZ_COMMON */ #define tick_nohz_enabled (0) static inline int tick_nohz_tick_stopped(void) { return 0; } static inline int tick_nohz_tick_stopped_cpu(int cpu) { return 0; } static inline void tick_nohz_idle_stop_tick(void) { } static inline void tick_nohz_idle_retain_tick(void) { } static inline void tick_nohz_idle_restart_tick(void) { } static inline void tick_nohz_idle_enter(void) { } static inline void tick_nohz_idle_exit(void) { } static inline bool tick_nohz_idle_got_tick(void) { return false; } static inline ktime_t tick_nohz_get_next_hrtimer(void) { /* Next wake up is the tick period, assume it starts now */ return ktime_add(ktime_get(), TICK_NSEC); } static inline ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) { *delta_next = TICK_NSEC; return *delta_next; } static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } #endif /* !CONFIG_NO_HZ_COMMON */ /* * Mask of CPUs that are nohz_full. * * Users should be guarded by CONFIG_NO_HZ_FULL or a tick_nohz_full_cpu() * check. */ extern cpumask_var_t tick_nohz_full_mask; #ifdef CONFIG_NO_HZ_FULL extern bool tick_nohz_full_running; static inline bool tick_nohz_full_enabled(void) { if (!context_tracking_enabled()) return false; return tick_nohz_full_running; } /* * Check if a CPU is part of the nohz_full subset. Arrange for evaluating * the cpu expression (typically smp_processor_id()) _after_ the static * key. */ #define tick_nohz_full_cpu(_cpu) ({ \ bool __ret = false; \ if (tick_nohz_full_enabled()) \ __ret = cpumask_test_cpu((_cpu), tick_nohz_full_mask); \ __ret; \ }) extern void tick_nohz_dep_set(enum tick_dep_bits bit); extern void tick_nohz_dep_clear(enum tick_dep_bits bit); extern void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit); extern void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit); extern void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit); extern void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit); extern void tick_nohz_dep_set_signal(struct task_struct *tsk, enum tick_dep_bits bit); extern void tick_nohz_dep_clear_signal(struct signal_struct *signal, enum tick_dep_bits bit); extern bool tick_nohz_cpu_hotpluggable(unsigned int cpu); /* * The below are tick_nohz_[set,clear]_dep() wrappers that optimize off-cases * on top of static keys. */ static inline void tick_dep_set(enum tick_dep_bits bit) { if (tick_nohz_full_enabled()) tick_nohz_dep_set(bit); } static inline void tick_dep_clear(enum tick_dep_bits bit) { if (tick_nohz_full_enabled()) tick_nohz_dep_clear(bit); } static inline void tick_dep_set_cpu(int cpu, enum tick_dep_bits bit) { if (tick_nohz_full_cpu(cpu)) tick_nohz_dep_set_cpu(cpu, bit); } static inline void tick_dep_clear_cpu(int cpu, enum tick_dep_bits bit) { if (tick_nohz_full_cpu(cpu)) tick_nohz_dep_clear_cpu(cpu, bit); } static inline void tick_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) { if (tick_nohz_full_enabled()) tick_nohz_dep_set_task(tsk, bit); } static inline void tick_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit) { if (tick_nohz_full_enabled()) tick_nohz_dep_clear_task(tsk, bit); } static inline void tick_dep_init_task(struct task_struct *tsk) { atomic_set(&tsk->tick_dep_mask, 0); } static inline void tick_dep_set_signal(struct task_struct *tsk, enum tick_dep_bits bit) { if (tick_nohz_full_enabled()) tick_nohz_dep_set_signal(tsk, bit); } static inline void tick_dep_clear_signal(struct signal_struct *signal, enum tick_dep_bits bit) { if (tick_nohz_full_enabled()) tick_nohz_dep_clear_signal(signal, bit); } extern void tick_nohz_full_kick_cpu(int cpu); extern void __tick_nohz_task_switch(void); extern void __init tick_nohz_full_setup(cpumask_var_t cpumask); #else static inline bool tick_nohz_full_enabled(void) { return false; } static inline bool tick_nohz_full_cpu(int cpu) { return false; } static inline void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) { } static inline void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) { } static inline bool tick_nohz_cpu_hotpluggable(unsigned int cpu) { return true; } static inline void tick_dep_set(enum tick_dep_bits bit) { } static inline void tick_dep_clear(enum tick_dep_bits bit) { } static inline void tick_dep_set_cpu(int cpu, enum tick_dep_bits bit) { } static inline void tick_dep_clear_cpu(int cpu, enum tick_dep_bits bit) { } static inline void tick_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) { } static inline void tick_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit) { } static inline void tick_dep_init_task(struct task_struct *tsk) { } static inline void tick_dep_set_signal(struct task_struct *tsk, enum tick_dep_bits bit) { } static inline void tick_dep_clear_signal(struct signal_struct *signal, enum tick_dep_bits bit) { } static inline void tick_nohz_full_kick_cpu(int cpu) { } static inline void __tick_nohz_task_switch(void) { } static inline void tick_nohz_full_setup(cpumask_var_t cpumask) { } #endif static inline void tick_nohz_task_switch(void) { if (tick_nohz_full_enabled()) __tick_nohz_task_switch(); } static inline void tick_nohz_user_enter_prepare(void) { if (tick_nohz_full_cpu(smp_processor_id())) rcu_nocb_flush_deferred_wakeup(); } #endif
103 103 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 // SPDX-License-Identifier: GPL-2.0 #include <linux/slab.h> #include <linux/lockdep.h> #include <linux/sysfs.h> #include <linux/kobject.h> #include <linux/memory.h> #include <linux/memory-tiers.h> #include <linux/notifier.h> #include <linux/sched/sysctl.h> #include "internal.h" struct memory_tier { /* hierarchy of memory tiers */ struct list_head list; /* list of all memory types part of this tier */ struct list_head memory_types; /* * start value of abstract distance. memory tier maps * an abstract distance range, * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE */ int adistance_start; struct device dev; /* All the nodes that are part of all the lower memory tiers. */ nodemask_t lower_tier_mask; }; struct demotion_nodes { nodemask_t preferred; }; struct node_memory_type_map { struct memory_dev_type *memtype; int map_count; }; static DEFINE_MUTEX(memory_tier_lock); static LIST_HEAD(memory_tiers); /* * The list is used to store all memory types that are not created * by a device driver. */ static LIST_HEAD(default_memory_types); static struct node_memory_type_map node_memory_types[MAX_NUMNODES]; struct memory_dev_type *default_dram_type; nodemask_t default_dram_nodes __initdata = NODE_MASK_NONE; static const struct bus_type memory_tier_subsys = { .name = "memory_tiering", .dev_name = "memory_tier", }; #ifdef CONFIG_NUMA_BALANCING /** * folio_use_access_time - check if a folio reuses cpupid for page access time * @folio: folio to check * * folio's _last_cpupid field is repurposed by memory tiering. In memory * tiering mode, cpupid of slow memory folio (not toptier memory) is used to * record page access time. * * Return: the folio _last_cpupid is used to record page access time */ bool folio_use_access_time(struct folio *folio) { return (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && !node_is_toptier(folio_nid(folio)); } #endif #ifdef CONFIG_MIGRATION static int top_tier_adistance; /* * node_demotion[] examples: * * Example 1: * * Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes. * * node distances: * node 0 1 2 3 * 0 10 20 30 40 * 1 20 10 40 30 * 2 30 40 10 40 * 3 40 30 40 10 * * memory_tiers0 = 0-1 * memory_tiers1 = 2-3 * * node_demotion[0].preferred = 2 * node_demotion[1].preferred = 3 * node_demotion[2].preferred = <empty> * node_demotion[3].preferred = <empty> * * Example 2: * * Node 0 & 1 are CPU + DRAM nodes, node 2 is memory-only DRAM node. * * node distances: * node 0 1 2 * 0 10 20 30 * 1 20 10 30 * 2 30 30 10 * * memory_tiers0 = 0-2 * * node_demotion[0].preferred = <empty> * node_demotion[1].preferred = <empty> * node_demotion[2].preferred = <empty> * * Example 3: * * Node 0 is CPU + DRAM nodes, Node 1 is HBM node, node 2 is PMEM node. * * node distances: * node 0 1 2 * 0 10 20 30 * 1 20 10 40 * 2 30 40 10 * * memory_tiers0 = 1 * memory_tiers1 = 0 * memory_tiers2 = 2 * * node_demotion[0].preferred = 2 * node_demotion[1].preferred = 0 * node_demotion[2].preferred = <empty> * */ static struct demotion_nodes *node_demotion __read_mostly; #endif /* CONFIG_MIGRATION */ static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms); /* The lock is used to protect `default_dram_perf*` info and nid. */ static DEFINE_MUTEX(default_dram_perf_lock); static bool default_dram_perf_error; static struct access_coordinate default_dram_perf; static int default_dram_perf_ref_nid = NUMA_NO_NODE; static const char *default_dram_perf_ref_source; static inline struct memory_tier *to_memory_tier(struct device *device) { return container_of(device, struct memory_tier, dev); } static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier) { nodemask_t nodes = NODE_MASK_NONE; struct memory_dev_type *memtype; list_for_each_entry(memtype, &memtier->memory_types, tier_sibling) nodes_or(nodes, nodes, memtype->nodes); return nodes; } static void memory_tier_device_release(struct device *dev) { struct memory_tier *tier = to_memory_tier(dev); /* * synchronize_rcu in clear_node_memory_tier makes sure * we don't have rcu access to this memory tier. */ kfree(tier); } static ssize_t nodelist_show(struct device *dev, struct device_attribute *attr, char *buf) { int ret; nodemask_t nmask; mutex_lock(&memory_tier_lock); nmask = get_memtier_nodemask(to_memory_tier(dev)); ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask)); mutex_unlock(&memory_tier_lock); return ret; } static DEVICE_ATTR_RO(nodelist); static struct attribute *memtier_dev_attrs[] = { &dev_attr_nodelist.attr, NULL }; static const struct attribute_group memtier_dev_group = { .attrs = memtier_dev_attrs, }; static const struct attribute_group *memtier_dev_groups[] = { &memtier_dev_group, NULL }; static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype) { int ret; bool found_slot = false; struct memory_tier *memtier, *new_memtier; int adistance = memtype->adistance; unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE; lockdep_assert_held_once(&memory_tier_lock); adistance = round_down(adistance, memtier_adistance_chunk_size); /* * If the memtype is already part of a memory tier, * just return that. */ if (!list_empty(&memtype->tier_sibling)) { list_for_each_entry(memtier, &memory_tiers, list) { if (adistance == memtier->adistance_start) return memtier; } WARN_ON(1); return ERR_PTR(-EINVAL); } list_for_each_entry(memtier, &memory_tiers, list) { if (adistance == memtier->adistance_start) { goto link_memtype; } else if (adistance < memtier->adistance_start) { found_slot = true; break; } } new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL); if (!new_memtier) return ERR_PTR(-ENOMEM); new_memtier->adistance_start = adistance; INIT_LIST_HEAD(&new_memtier->list); INIT_LIST_HEAD(&new_memtier->memory_types); if (found_slot) list_add_tail(&new_memtier->list, &memtier->list); else list_add_tail(&new_memtier->list, &memory_tiers); new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS; new_memtier->dev.bus = &memory_tier_subsys; new_memtier->dev.release = memory_tier_device_release; new_memtier->dev.groups = memtier_dev_groups; ret = device_register(&new_memtier->dev); if (ret) { list_del(&new_memtier->list); put_device(&new_memtier->dev); return ERR_PTR(ret); } memtier = new_memtier; link_memtype: list_add(&memtype->tier_sibling, &memtier->memory_types); return memtier; } static struct memory_tier *__node_get_memory_tier(int node) { pg_data_t *pgdat; pgdat = NODE_DATA(node); if (!pgdat) return NULL; /* * Since we hold memory_tier_lock, we can avoid * RCU read locks when accessing the details. No * parallel updates are possible here. */ return rcu_dereference_check(pgdat->memtier, lockdep_is_held(&memory_tier_lock)); } #ifdef CONFIG_MIGRATION bool node_is_toptier(int node) { bool toptier; pg_data_t *pgdat; struct memory_tier *memtier; pgdat = NODE_DATA(node); if (!pgdat) return false; rcu_read_lock(); memtier = rcu_dereference(pgdat->memtier); if (!memtier) { toptier = true; goto out; } if (memtier->adistance_start <= top_tier_adistance) toptier = true; else toptier = false; out: rcu_read_unlock(); return toptier; } void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets) { struct memory_tier *memtier; /* * pg_data_t.memtier updates includes a synchronize_rcu() * which ensures that we either find NULL or a valid memtier * in NODE_DATA. protect the access via rcu_read_lock(); */ rcu_read_lock(); memtier = rcu_dereference(pgdat->memtier); if (memtier) *targets = memtier->lower_tier_mask; else *targets = NODE_MASK_NONE; rcu_read_unlock(); } /** * next_demotion_node() - Get the next node in the demotion path * @node: The starting node to lookup the next node * * Return: node id for next memory node in the demotion path hierarchy * from @node; NUMA_NO_NODE if @node is terminal. This does not keep * @node online or guarantee that it *continues* to be the next demotion * target. */ int next_demotion_node(int node) { struct demotion_nodes *nd; int target; if (!node_demotion) return NUMA_NO_NODE; nd = &node_demotion[node]; /* * node_demotion[] is updated without excluding this * function from running. * * Make sure to use RCU over entire code blocks if * node_demotion[] reads need to be consistent. */ rcu_read_lock(); /* * If there are multiple target nodes, just select one * target node randomly. * * In addition, we can also use round-robin to select * target node, but we should introduce another variable * for node_demotion[] to record last selected target node, * that may cause cache ping-pong due to the changing of * last target node. Or introducing per-cpu data to avoid * caching issue, which seems more complicated. So selecting * target node randomly seems better until now. */ target = node_random(&nd->preferred); rcu_read_unlock(); return target; } static void disable_all_demotion_targets(void) { struct memory_tier *memtier; int node; for_each_node_state(node, N_MEMORY) { node_demotion[node].preferred = NODE_MASK_NONE; /* * We are holding memory_tier_lock, it is safe * to access pgda->memtier. */ memtier = __node_get_memory_tier(node); if (memtier) memtier->lower_tier_mask = NODE_MASK_NONE; } /* * Ensure that the "disable" is visible across the system. * Readers will see either a combination of before+disable * state or disable+after. They will never see before and * after state together. */ synchronize_rcu(); } static void dump_demotion_targets(void) { int node; for_each_node_state(node, N_MEMORY) { struct memory_tier *memtier = __node_get_memory_tier(node); nodemask_t preferred = node_demotion[node].preferred; if (!memtier) continue; if (nodes_empty(preferred)) pr_info("Demotion targets for Node %d: null\n", node); else pr_info("Demotion targets for Node %d: preferred: %*pbl, fallback: %*pbl\n", node, nodemask_pr_args(&preferred), nodemask_pr_args(&memtier->lower_tier_mask)); } } /* * Find an automatic demotion target for all memory * nodes. Failing here is OK. It might just indicate * being at the end of a chain. */ static void establish_demotion_targets(void) { struct memory_tier *memtier; struct demotion_nodes *nd; int target = NUMA_NO_NODE, node; int distance, best_distance; nodemask_t tier_nodes, lower_tier; lockdep_assert_held_once(&memory_tier_lock); if (!node_demotion) return; disable_all_demotion_targets(); for_each_node_state(node, N_MEMORY) { best_distance = -1; nd = &node_demotion[node]; memtier = __node_get_memory_tier(node); if (!memtier || list_is_last(&memtier->list, &memory_tiers)) continue; /* * Get the lower memtier to find the demotion node list. */ memtier = list_next_entry(memtier, list); tier_nodes = get_memtier_nodemask(memtier); /* * find_next_best_node, use 'used' nodemask as a skip list. * Add all memory nodes except the selected memory tier * nodelist to skip list so that we find the best node from the * memtier nodelist. */ nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes); /* * Find all the nodes in the memory tier node list of same best distance. * add them to the preferred mask. We randomly select between nodes * in the preferred mask when allocating pages during demotion. */ do { target = find_next_best_node(node, &tier_nodes); if (target == NUMA_NO_NODE) break; distance = node_distance(node, target); if (distance == best_distance || best_distance == -1) { best_distance = distance; node_set(target, nd->preferred); } else { break; } } while (1); } /* * Promotion is allowed from a memory tier to higher * memory tier only if the memory tier doesn't include * compute. We want to skip promotion from a memory tier, * if any node that is part of the memory tier have CPUs. * Once we detect such a memory tier, we consider that tier * as top tiper from which promotion is not allowed. */ list_for_each_entry_reverse(memtier, &memory_tiers, list) { tier_nodes = get_memtier_nodemask(memtier); nodes_and(tier_nodes, node_states[N_CPU], tier_nodes); if (!nodes_empty(tier_nodes)) { /* * abstract distance below the max value of this memtier * is considered toptier. */ top_tier_adistance = memtier->adistance_start + MEMTIER_CHUNK_SIZE - 1; break; } } /* * Now build the lower_tier mask for each node collecting node mask from * all memory tier below it. This allows us to fallback demotion page * allocation to a set of nodes that is closer the above selected * preferred node. */ lower_tier = node_states[N_MEMORY]; list_for_each_entry(memtier, &memory_tiers, list) { /* * Keep removing current tier from lower_tier nodes, * This will remove all nodes in current and above * memory tier from the lower_tier mask. */ tier_nodes = get_memtier_nodemask(memtier); nodes_andnot(lower_tier, lower_tier, tier_nodes); memtier->lower_tier_mask = lower_tier; } dump_demotion_targets(); } #else static inline void establish_demotion_targets(void) {} #endif /* CONFIG_MIGRATION */ static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype) { if (!node_memory_types[node].memtype) node_memory_types[node].memtype = memtype; /* * for each device getting added in the same NUMA node * with this specific memtype, bump the map count. We * Only take memtype device reference once, so that * changing a node memtype can be done by dropping the * only reference count taken here. */ if (node_memory_types[node].memtype == memtype) { if (!node_memory_types[node].map_count++) kref_get(&memtype->kref); } } static struct memory_tier *set_node_memory_tier(int node) { struct memory_tier *memtier; struct memory_dev_type *memtype = default_dram_type; int adist = MEMTIER_ADISTANCE_DRAM; pg_data_t *pgdat = NODE_DATA(node); lockdep_assert_held_once(&memory_tier_lock); if (!node_state(node, N_MEMORY)) return ERR_PTR(-EINVAL); mt_calc_adistance(node, &adist); if (!node_memory_types[node].memtype) { memtype = mt_find_alloc_memory_type(adist, &default_memory_types); if (IS_ERR(memtype)) { memtype = default_dram_type; pr_info("Failed to allocate a memory type. Fall back.\n"); } } __init_node_memory_type(node, memtype); memtype = node_memory_types[node].memtype; node_set(node, memtype->nodes); memtier = find_create_memory_tier(memtype); if (!IS_ERR(memtier)) rcu_assign_pointer(pgdat->memtier, memtier); return memtier; } static void destroy_memory_tier(struct memory_tier *memtier) { list_del(&memtier->list); device_unregister(&memtier->dev); } static bool clear_node_memory_tier(int node) { bool cleared = false; pg_data_t *pgdat; struct memory_tier *memtier; pgdat = NODE_DATA(node); if (!pgdat) return false; /* * Make sure that anybody looking at NODE_DATA who finds * a valid memtier finds memory_dev_types with nodes still * linked to the memtier. We achieve this by waiting for * rcu read section to finish using synchronize_rcu. * This also enables us to free the destroyed memory tier * with kfree instead of kfree_rcu */ memtier = __node_get_memory_tier(node); if (memtier) { struct memory_dev_type *memtype; rcu_assign_pointer(pgdat->memtier, NULL); synchronize_rcu(); memtype = node_memory_types[node].memtype; node_clear(node, memtype->nodes); if (nodes_empty(memtype->nodes)) { list_del_init(&memtype->tier_sibling); if (list_empty(&memtier->memory_types)) destroy_memory_tier(memtier); } cleared = true; } return cleared; } static void release_memtype(struct kref *kref) { struct memory_dev_type *memtype; memtype = container_of(kref, struct memory_dev_type, kref); kfree(memtype); } struct memory_dev_type *alloc_memory_type(int adistance) { struct memory_dev_type *memtype; memtype = kmalloc(sizeof(*memtype), GFP_KERNEL); if (!memtype) return ERR_PTR(-ENOMEM); memtype->adistance = adistance; INIT_LIST_HEAD(&memtype->tier_sibling); memtype->nodes = NODE_MASK_NONE; kref_init(&memtype->kref); return memtype; } EXPORT_SYMBOL_GPL(alloc_memory_type); void put_memory_type(struct memory_dev_type *memtype) { kref_put(&memtype->kref, release_memtype); } EXPORT_SYMBOL_GPL(put_memory_type); void init_node_memory_type(int node, struct memory_dev_type *memtype) { mutex_lock(&memory_tier_lock); __init_node_memory_type(node, memtype); mutex_unlock(&memory_tier_lock); } EXPORT_SYMBOL_GPL(init_node_memory_type); void clear_node_memory_type(int node, struct memory_dev_type *memtype) { mutex_lock(&memory_tier_lock); if (node_memory_types[node].memtype == memtype || !memtype) node_memory_types[node].map_count--; /* * If we umapped all the attached devices to this node, * clear the node memory type. */ if (!node_memory_types[node].map_count) { memtype = node_memory_types[node].memtype; node_memory_types[node].memtype = NULL; put_memory_type(memtype); } mutex_unlock(&memory_tier_lock); } EXPORT_SYMBOL_GPL(clear_node_memory_type); struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct list_head *memory_types) { struct memory_dev_type *mtype; list_for_each_entry(mtype, memory_types, list) if (mtype->adistance == adist) return mtype; mtype = alloc_memory_type(adist); if (IS_ERR(mtype)) return mtype; list_add(&mtype->list, memory_types); return mtype; } EXPORT_SYMBOL_GPL(mt_find_alloc_memory_type); void mt_put_memory_types(struct list_head *memory_types) { struct memory_dev_type *mtype, *mtn; list_for_each_entry_safe(mtype, mtn, memory_types, list) { list_del(&mtype->list); put_memory_type(mtype); } } EXPORT_SYMBOL_GPL(mt_put_memory_types); /* * This is invoked via `late_initcall()` to initialize memory tiers for * memory nodes, both with and without CPUs. After the initialization of * firmware and devices, adistance algorithms are expected to be provided. */ static int __init memory_tier_late_init(void) { int nid; struct memory_tier *memtier; get_online_mems(); guard(mutex)(&memory_tier_lock); /* Assign each uninitialized N_MEMORY node to a memory tier. */ for_each_node_state(nid, N_MEMORY) { /* * Some device drivers may have initialized * memory tiers, potentially bringing memory nodes * online and configuring memory tiers. * Exclude them here. */ if (node_memory_types[nid].memtype) continue; memtier = set_node_memory_tier(nid); if (IS_ERR(memtier)) continue; } establish_demotion_targets(); put_online_mems(); return 0; } late_initcall(memory_tier_late_init); static void dump_hmem_attrs(struct access_coordinate *coord, const char *prefix) { pr_info( "%sread_latency: %u, write_latency: %u, read_bandwidth: %u, write_bandwidth: %u\n", prefix, coord->read_latency, coord->write_latency, coord->read_bandwidth, coord->write_bandwidth); } int mt_set_default_dram_perf(int nid, struct access_coordinate *perf, const char *source) { guard(mutex)(&default_dram_perf_lock); if (default_dram_perf_error) return -EIO; if (perf->read_latency + perf->write_latency == 0 || perf->read_bandwidth + perf->write_bandwidth == 0) return -EINVAL; if (default_dram_perf_ref_nid == NUMA_NO_NODE) { default_dram_perf = *perf; default_dram_perf_ref_nid = nid; default_dram_perf_ref_source = kstrdup(source, GFP_KERNEL); return 0; } /* * The performance of all default DRAM nodes is expected to be * same (that is, the variation is less than 10%). And it * will be used as base to calculate the abstract distance of * other memory nodes. */ if (abs(perf->read_latency - default_dram_perf.read_latency) * 10 > default_dram_perf.read_latency || abs(perf->write_latency - default_dram_perf.write_latency) * 10 > default_dram_perf.write_latency || abs(perf->read_bandwidth - default_dram_perf.read_bandwidth) * 10 > default_dram_perf.read_bandwidth || abs(perf->write_bandwidth - default_dram_perf.write_bandwidth) * 10 > default_dram_perf.write_bandwidth) { pr_info( "memory-tiers: the performance of DRAM node %d mismatches that of the reference\n" "DRAM node %d.\n", nid, default_dram_perf_ref_nid); pr_info(" performance of reference DRAM node %d from %s:\n", default_dram_perf_ref_nid, default_dram_perf_ref_source); dump_hmem_attrs(&default_dram_perf, " "); pr_info(" performance of DRAM node %d from %s:\n", nid, source); dump_hmem_attrs(perf, " "); pr_info( " disable default DRAM node performance based abstract distance algorithm.\n"); default_dram_perf_error = true; return -EINVAL; } return 0; } int mt_perf_to_adistance(struct access_coordinate *perf, int *adist) { guard(mutex)(&default_dram_perf_lock); if (default_dram_perf_error) return -EIO; if (perf->read_latency + perf->write_latency == 0 || perf->read_bandwidth + perf->write_bandwidth == 0) return -EINVAL; if (default_dram_perf_ref_nid == NUMA_NO_NODE) return -ENOENT; /* * The abstract distance of a memory node is in direct proportion to * its memory latency (read + write) and inversely proportional to its * memory bandwidth (read + write). The abstract distance, memory * latency, and memory bandwidth of the default DRAM nodes are used as * the base. */ *adist = MEMTIER_ADISTANCE_DRAM * (perf->read_latency + perf->write_latency) / (default_dram_perf.read_latency + default_dram_perf.write_latency) * (default_dram_perf.read_bandwidth + default_dram_perf.write_bandwidth) / (perf->read_bandwidth + perf->write_bandwidth); return 0; } EXPORT_SYMBOL_GPL(mt_perf_to_adistance); /** * register_mt_adistance_algorithm() - Register memory tiering abstract distance algorithm * @nb: The notifier block which describe the algorithm * * Return: 0 on success, errno on error. * * Every memory tiering abstract distance algorithm provider needs to * register the algorithm with register_mt_adistance_algorithm(). To * calculate the abstract distance for a specified memory node, the * notifier function will be called unless some high priority * algorithm has provided result. The prototype of the notifier * function is as follows, * * int (*algorithm_notifier)(struct notifier_block *nb, * unsigned long nid, void *data); * * Where "nid" specifies the memory node, "data" is the pointer to the * returned abstract distance (that is, "int *adist"). If the * algorithm provides the result, NOTIFY_STOP should be returned. * Otherwise, return_value & %NOTIFY_STOP_MASK == 0 to allow the next * algorithm in the chain to provide the result. */ int register_mt_adistance_algorithm(struct notifier_block *nb) { return blocking_notifier_chain_register(&mt_adistance_algorithms, nb); } EXPORT_SYMBOL_GPL(register_mt_adistance_algorithm); /** * unregister_mt_adistance_algorithm() - Unregister memory tiering abstract distance algorithm * @nb: the notifier block which describe the algorithm * * Return: 0 on success, errno on error. */ int unregister_mt_adistance_algorithm(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&mt_adistance_algorithms, nb); } EXPORT_SYMBOL_GPL(unregister_mt_adistance_algorithm); /** * mt_calc_adistance() - Calculate abstract distance with registered algorithms * @node: the node to calculate abstract distance for * @adist: the returned abstract distance * * Return: if return_value & %NOTIFY_STOP_MASK != 0, then some * abstract distance algorithm provides the result, and return it via * @adist. Otherwise, no algorithm can provide the result and @adist * will be kept as it is. */ int mt_calc_adistance(int node, int *adist) { return blocking_notifier_call_chain(&mt_adistance_algorithms, node, adist); } EXPORT_SYMBOL_GPL(mt_calc_adistance); static int __meminit memtier_hotplug_callback(struct notifier_block *self, unsigned long action, void *_arg) { struct memory_tier *memtier; struct node_notify *nn = _arg; switch (action) { case NODE_REMOVED_LAST_MEMORY: mutex_lock(&memory_tier_lock); if (clear_node_memory_tier(nn->nid)) establish_demotion_targets(); mutex_unlock(&memory_tier_lock); break; case NODE_ADDED_FIRST_MEMORY: mutex_lock(&memory_tier_lock); memtier = set_node_memory_tier(nn->nid); if (!IS_ERR(memtier)) establish_demotion_targets(); mutex_unlock(&memory_tier_lock); break; } return notifier_from_errno(0); } static int __init memory_tier_init(void) { int ret; ret = subsys_virtual_register(&memory_tier_subsys, NULL); if (ret) panic("%s() failed to register memory tier subsystem\n", __func__); #ifdef CONFIG_MIGRATION node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes), GFP_KERNEL); WARN_ON(!node_demotion); #endif mutex_lock(&memory_tier_lock); /* * For now we can have 4 faster memory tiers with smaller adistance * than default DRAM tier. */ default_dram_type = mt_find_alloc_memory_type(MEMTIER_ADISTANCE_DRAM, &default_memory_types); mutex_unlock(&memory_tier_lock); if (IS_ERR(default_dram_type)) panic("%s() failed to allocate default DRAM tier\n", __func__); /* Record nodes with memory and CPU to set default DRAM performance. */ nodes_and(default_dram_nodes, node_states[N_MEMORY], node_states[N_CPU]); hotplug_node_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI); return 0; } subsys_initcall(memory_tier_init); bool numa_demotion_enabled = false; #ifdef CONFIG_MIGRATION #ifdef CONFIG_SYSFS static ssize_t demotion_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", str_true_false(numa_demotion_enabled)); } static ssize_t demotion_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { ssize_t ret; bool before = numa_demotion_enabled; ret = kstrtobool(buf, &numa_demotion_enabled); if (ret) return ret; /* * Reset kswapd_failures statistics. They may no longer be * valid since the policy for kswapd has changed. */ if (before == false && numa_demotion_enabled == true) { struct pglist_data *pgdat; for_each_online_pgdat(pgdat) atomic_set(&pgdat->kswapd_failures, 0); } return count; } static struct kobj_attribute numa_demotion_enabled_attr = __ATTR_RW(demotion_enabled); static struct attribute *numa_attrs[] = { &numa_demotion_enabled_attr.attr, NULL, }; static const struct attribute_group numa_attr_group = { .attrs = numa_attrs, }; static int __init numa_init_sysfs(void) { int err; struct kobject *numa_kobj; numa_kobj = kobject_create_and_add("numa", mm_kobj); if (!numa_kobj) { pr_err("failed to create numa kobject\n"); return -ENOMEM; } err = sysfs_create_group(numa_kobj, &numa_attr_group); if (err) { pr_err("failed to register numa group\n"); goto delete_obj; } return 0; delete_obj: kobject_put(numa_kobj); return err; } subsys_initcall(numa_init_sysfs); #endif /* CONFIG_SYSFS */ #endif
23 23 1 3 18 18 1 7 40 16 29 5476 34 5 285 8410 68 8462 5 285 8458 8458 8487 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_H #define _LINUX_SCHED_H /* * Define 'struct task_struct' and provide the main scheduler * APIs (schedule(), wakeup variants, etc.) */ #include <uapi/linux/sched.h> #include <asm/current.h> #include <asm/processor.h> #include <linux/thread_info.h> #include <linux/preempt.h> #include <linux/cpumask_types.h> #include <linux/cache.h> #include <linux/irqflags_types.h> #include <linux/smp_types.h> #include <linux/pid_types.h> #include <linux/sem_types.h> #include <linux/shm.h> #include <linux/kmsan_types.h> #include <linux/mutex_types.h> #include <linux/plist_types.h> #include <linux/hrtimer_types.h> #include <linux/timer_types.h> #include <linux/seccomp_types.h> #include <linux/nodemask_types.h> #include <linux/refcount_types.h> #include <linux/resource.h> #include <linux/latencytop.h> #include <linux/sched/prio.h> #include <linux/sched/types.h> #include <linux/signal_types.h> #include <linux/spinlock.h> #include <linux/syscall_user_dispatch_types.h> #include <linux/mm_types_task.h> #include <linux/netdevice_xmit.h> #include <linux/task_io_accounting.h> #include <linux/posix-timers_types.h> #include <linux/restart_block.h> #include <linux/rseq_types.h> #include <linux/seqlock_types.h> #include <linux/kcsan.h> #include <linux/rv.h> #include <linux/uidgid_types.h> #include <linux/tracepoint-defs.h> #include <linux/unwind_deferred_types.h> #include <asm/kmap_size.h> #ifndef COMPILE_OFFSETS #include <generated/rq-offsets.h> #endif /* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; struct bio_list; struct blk_plug; struct bpf_local_storage; struct bpf_run_ctx; struct bpf_net_context; struct capture_control; struct cfs_rq; struct fs_struct; struct futex_pi_state; struct io_context; struct io_uring_task; struct mempolicy; struct nameidata; struct nsproxy; struct perf_event_context; struct perf_ctx_data; struct pid_namespace; struct pipe_inode_info; struct rcu_node; struct reclaim_state; struct robust_list_head; struct root_domain; struct rq; struct sched_attr; struct sched_dl_entity; struct seq_file; struct sighand_struct; struct signal_struct; struct task_delay_info; struct task_group; struct task_struct; struct user_event_mm; #include <linux/sched/ext.h> /* * Task state bitmask. NOTE! These bits are also * encoded in fs/proc/array.c: get_task_state(). * * We have two separate sets of flags: task->__state * is about runnability, while task->exit_state are * about the task exiting. Confusing, but this way * modifying one set can't modify the other one by * mistake. */ /* Used in tsk->__state: */ #define TASK_RUNNING 0x00000000 #define TASK_INTERRUPTIBLE 0x00000001 #define TASK_UNINTERRUPTIBLE 0x00000002 #define __TASK_STOPPED 0x00000004 #define __TASK_TRACED 0x00000008 /* Used in tsk->exit_state: */ #define EXIT_DEAD 0x00000010 #define EXIT_ZOMBIE 0x00000020 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) /* Used in tsk->__state again: */ #define TASK_PARKED 0x00000040 #define TASK_DEAD 0x00000080 #define TASK_WAKEKILL 0x00000100 #define TASK_WAKING 0x00000200 #define TASK_NOLOAD 0x00000400 #define TASK_NEW 0x00000800 #define TASK_RTLOCK_WAIT 0x00001000 #define TASK_FREEZABLE 0x00002000 #define __TASK_FREEZABLE_UNSAFE (0x00004000 * IS_ENABLED(CONFIG_LOCKDEP)) #define TASK_FROZEN 0x00008000 #define TASK_STATE_MAX 0x00010000 #define TASK_ANY (TASK_STATE_MAX-1) /* * DO NOT ADD ANY NEW USERS ! */ #define TASK_FREEZABLE_UNSAFE (TASK_FREEZABLE | __TASK_FREEZABLE_UNSAFE) /* Convenience macros for the sake of set_current_state: */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) #define TASK_TRACED __TASK_TRACED #define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD) /* Convenience macros for the sake of wake_up(): */ #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) /* get_task_state(): */ #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ TASK_PARKED) #define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING) #define task_is_traced(task) ((READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0) #define task_is_stopped(task) ((READ_ONCE(task->jobctl) & JOBCTL_STOPPED) != 0) #define task_is_stopped_or_traced(task) ((READ_ONCE(task->jobctl) & (JOBCTL_STOPPED | JOBCTL_TRACED)) != 0) /* * Special states are those that do not use the normal wait-loop pattern. See * the comment with set_special_state(). */ #define is_special_task_state(state) \ ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | \ TASK_DEAD | TASK_FROZEN)) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP # define debug_normal_state_change(state_value) \ do { \ WARN_ON_ONCE(is_special_task_state(state_value)); \ current->task_state_change = _THIS_IP_; \ } while (0) # define debug_special_state_change(state_value) \ do { \ WARN_ON_ONCE(!is_special_task_state(state_value)); \ current->task_state_change = _THIS_IP_; \ } while (0) # define debug_rtlock_wait_set_state() \ do { \ current->saved_state_change = current->task_state_change;\ current->task_state_change = _THIS_IP_; \ } while (0) # define debug_rtlock_wait_restore_state() \ do { \ current->task_state_change = current->saved_state_change;\ } while (0) #else # define debug_normal_state_change(cond) do { } while (0) # define debug_special_state_change(cond) do { } while (0) # define debug_rtlock_wait_set_state() do { } while (0) # define debug_rtlock_wait_restore_state() do { } while (0) #endif #define trace_set_current_state(state_value) \ do { \ if (tracepoint_enabled(sched_set_state_tp)) \ __trace_set_current_state(state_value); \ } while (0) /* * set_current_state() includes a barrier so that the write of current->__state * is correctly serialised wrt the caller's subsequent test of whether to * actually sleep: * * for (;;) { * set_current_state(TASK_UNINTERRUPTIBLE); * if (CONDITION) * break; * * schedule(); * } * __set_current_state(TASK_RUNNING); * * If the caller does not need such serialisation (because, for instance, the * CONDITION test and condition change and wakeup are under the same lock) then * use __set_current_state(). * * The above is typically ordered against the wakeup, which does: * * CONDITION = 1; * wake_up_state(p, TASK_UNINTERRUPTIBLE); * * where wake_up_state()/try_to_wake_up() executes a full memory barrier before * accessing p->__state. * * Wakeup will do: if (@state & p->__state) p->__state = TASK_RUNNING, that is, * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING). * * However, with slightly different timing the wakeup TASK_RUNNING store can * also collide with the TASK_UNINTERRUPTIBLE store. Losing that store is not * a problem either because that will result in one extra go around the loop * and our @cond test will save the day. * * Also see the comments of try_to_wake_up(). */ #define __set_current_state(state_value) \ do { \ debug_normal_state_change((state_value)); \ trace_set_current_state(state_value); \ WRITE_ONCE(current->__state, (state_value)); \ } while (0) #define set_current_state(state_value) \ do { \ debug_normal_state_change((state_value)); \ trace_set_current_state(state_value); \ smp_store_mb(current->__state, (state_value)); \ } while (0) /* * set_special_state() should be used for those states when the blocking task * can not use the regular condition based wait-loop. In that case we must * serialize against wakeups such that any possible in-flight TASK_RUNNING * stores will not collide with our state change. */ #define set_special_state(state_value) \ do { \ unsigned long flags; /* may shadow */ \ \ raw_spin_lock_irqsave(&current->pi_lock, flags); \ debug_special_state_change((state_value)); \ trace_set_current_state(state_value); \ WRITE_ONCE(current->__state, (state_value)); \ raw_spin_unlock_irqrestore(&current->pi_lock, flags); \ } while (0) /* * PREEMPT_RT specific variants for "sleeping" spin/rwlocks * * RT's spin/rwlock substitutions are state preserving. The state of the * task when blocking on the lock is saved in task_struct::saved_state and * restored after the lock has been acquired. These operations are * serialized by task_struct::pi_lock against try_to_wake_up(). Any non RT * lock related wakeups while the task is blocked on the lock are * redirected to operate on task_struct::saved_state to ensure that these * are not dropped. On restore task_struct::saved_state is set to * TASK_RUNNING so any wakeup attempt redirected to saved_state will fail. * * The lock operation looks like this: * * current_save_and_set_rtlock_wait_state(); * for (;;) { * if (try_lock()) * break; * raw_spin_unlock_irq(&lock->wait_lock); * schedule_rtlock(); * raw_spin_lock_irq(&lock->wait_lock); * set_current_state(TASK_RTLOCK_WAIT); * } * current_restore_rtlock_saved_state(); */ #define current_save_and_set_rtlock_wait_state() \ do { \ lockdep_assert_irqs_disabled(); \ raw_spin_lock(&current->pi_lock); \ current->saved_state = current->__state; \ debug_rtlock_wait_set_state(); \ trace_set_current_state(TASK_RTLOCK_WAIT); \ WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT); \ raw_spin_unlock(&current->pi_lock); \ } while (0); #define current_restore_rtlock_saved_state() \ do { \ lockdep_assert_irqs_disabled(); \ raw_spin_lock(&current->pi_lock); \ debug_rtlock_wait_restore_state(); \ trace_set_current_state(current->saved_state); \ WRITE_ONCE(current->__state, current->saved_state); \ current->saved_state = TASK_RUNNING; \ raw_spin_unlock(&current->pi_lock); \ } while (0); #define get_current_state() READ_ONCE(current->__state) /* * Define the task command name length as enum, then it can be visible to * BPF programs. */ enum { TASK_COMM_LEN = 16, }; extern void sched_tick(void); #define MAX_SCHEDULE_TIMEOUT LONG_MAX extern long schedule_timeout(long timeout); extern long schedule_timeout_interruptible(long timeout); extern long schedule_timeout_killable(long timeout); extern long schedule_timeout_uninterruptible(long timeout); extern long schedule_timeout_idle(long timeout); asmlinkage void schedule(void); extern void schedule_preempt_disabled(void); asmlinkage void preempt_schedule_irq(void); #ifdef CONFIG_PREEMPT_RT extern void schedule_rtlock(void); #endif extern int __must_check io_schedule_prepare(void); extern void io_schedule_finish(int token); extern long io_schedule_timeout(long timeout); extern void io_schedule(void); /* wrapper functions to trace from this header file */ DECLARE_TRACEPOINT(sched_set_state_tp); extern void __trace_set_current_state(int state_value); DECLARE_TRACEPOINT(sched_set_need_resched_tp); extern void __trace_set_need_resched(struct task_struct *curr, int tif); /** * struct prev_cputime - snapshot of system and user cputime * @utime: time spent in user mode * @stime: time spent in system mode * @lock: protects the above two fields * * Stores previous user/system time values such that we can guarantee * monotonicity. */ struct prev_cputime { #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE u64 utime; u64 stime; raw_spinlock_t lock; #endif }; enum vtime_state { /* Task is sleeping or running in a CPU with VTIME inactive: */ VTIME_INACTIVE = 0, /* Task is idle */ VTIME_IDLE, /* Task runs in kernelspace in a CPU with VTIME active: */ VTIME_SYS, /* Task runs in userspace in a CPU with VTIME active: */ VTIME_USER, /* Task runs as guests in a CPU with VTIME active: */ VTIME_GUEST, }; struct vtime { seqcount_t seqcount; unsigned long long starttime; enum vtime_state state; unsigned int cpu; u64 utime; u64 stime; u64 gtime; }; /* * Utilization clamp constraints. * @UCLAMP_MIN: Minimum utilization * @UCLAMP_MAX: Maximum utilization * @UCLAMP_CNT: Utilization clamp constraints count */ enum uclamp_id { UCLAMP_MIN = 0, UCLAMP_MAX, UCLAMP_CNT }; extern struct root_domain def_root_domain; extern struct mutex sched_domains_mutex; extern void sched_domains_mutex_lock(void); extern void sched_domains_mutex_unlock(void); struct sched_param { int sched_priority; }; struct sched_info { #ifdef CONFIG_SCHED_INFO /* Cumulative counters: */ /* # of times we have run on this CPU: */ unsigned long pcount; /* Time spent waiting on a runqueue: */ unsigned long long run_delay; /* Max time spent waiting on a runqueue: */ unsigned long long max_run_delay; /* Min time spent waiting on a runqueue: */ unsigned long long min_run_delay; /* Timestamps: */ /* When did we last run on a CPU? */ unsigned long long last_arrival; /* When were we last queued to run? */ unsigned long long last_queued; #endif /* CONFIG_SCHED_INFO */ }; /* * Integer metrics need fixed point arithmetic, e.g., sched/fair * has a few: load, load_avg, util_avg, freq, and capacity. * * We define a basic fixed point arithmetic range, and then formalize * all these metrics based on that basic range. */ # define SCHED_FIXEDPOINT_SHIFT 10 # define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT) /* Increase resolution of cpu_capacity calculations */ # define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT # define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) struct load_weight { unsigned long weight; u32 inv_weight; }; /* * The load/runnable/util_avg accumulates an infinite geometric series * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c). * * [load_avg definition] * * load_avg = runnable% * scale_load_down(load) * * [runnable_avg definition] * * runnable_avg = runnable% * SCHED_CAPACITY_SCALE * * [util_avg definition] * * util_avg = running% * SCHED_CAPACITY_SCALE * * where runnable% is the time ratio that a sched_entity is runnable and * running% the time ratio that a sched_entity is running. * * For cfs_rq, they are the aggregated values of all runnable and blocked * sched_entities. * * The load/runnable/util_avg doesn't directly factor frequency scaling and CPU * capacity scaling. The scaling is done through the rq_clock_pelt that is used * for computing those signals (see update_rq_clock_pelt()) * * N.B., the above ratios (runnable% and running%) themselves are in the * range of [0, 1]. To do fixed point arithmetics, we therefore scale them * to as large a range as necessary. This is for example reflected by * util_avg's SCHED_CAPACITY_SCALE. * * [Overflow issue] * * The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities * with the highest load (=88761), always runnable on a single cfs_rq, * and should not overflow as the number already hits PID_MAX_LIMIT. * * For all other cases (including 32-bit kernels), struct load_weight's * weight will overflow first before we do, because: * * Max(load_avg) <= Max(load.weight) * * Then it is the load_weight's responsibility to consider overflow * issues. */ struct sched_avg { u64 last_update_time; u64 load_sum; u64 runnable_sum; u32 util_sum; u32 period_contrib; unsigned long load_avg; unsigned long runnable_avg; unsigned long util_avg; unsigned int util_est; } ____cacheline_aligned; /* * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg * updates. When a task is dequeued, its util_est should not be updated if its * util_avg has not been updated in the meantime. * This information is mapped into the MSB bit of util_est at dequeue time. * Since max value of util_est for a task is 1024 (PELT util_avg for a task) * it is safe to use MSB. */ #define UTIL_EST_WEIGHT_SHIFT 2 #define UTIL_AVG_UNCHANGED 0x80000000 struct sched_statistics { #ifdef CONFIG_SCHEDSTATS u64 wait_start; u64 wait_max; u64 wait_count; u64 wait_sum; u64 iowait_count; u64 iowait_sum; u64 sleep_start; u64 sleep_max; s64 sum_sleep_runtime; u64 block_start; u64 block_max; s64 sum_block_runtime; s64 exec_max; u64 slice_max; u64 nr_migrations_cold; u64 nr_failed_migrations_affine; u64 nr_failed_migrations_running; u64 nr_failed_migrations_hot; u64 nr_forced_migrations; u64 nr_wakeups; u64 nr_wakeups_sync; u64 nr_wakeups_migrate; u64 nr_wakeups_local; u64 nr_wakeups_remote; u64 nr_wakeups_affine; u64 nr_wakeups_affine_attempts; u64 nr_wakeups_passive; u64 nr_wakeups_idle; #ifdef CONFIG_SCHED_CORE u64 core_forceidle_sum; #endif #endif /* CONFIG_SCHEDSTATS */ } ____cacheline_aligned; struct sched_entity { /* For load-balancing: */ struct load_weight load; struct rb_node run_node; u64 deadline; u64 min_vruntime; u64 min_slice; struct list_head group_node; unsigned char on_rq; unsigned char sched_delayed; unsigned char rel_deadline; unsigned char custom_slice; /* hole */ u64 exec_start; u64 sum_exec_runtime; u64 prev_sum_exec_runtime; u64 vruntime; union { /* * When !@on_rq this field is vlag. * When cfs_rq->curr == se (which implies @on_rq) * this field is vprot. See protect_slice(). */ s64 vlag; u64 vprot; }; u64 slice; u64 nr_migrations; #ifdef CONFIG_FAIR_GROUP_SCHED int depth; struct sched_entity *parent; /* rq on which this entity is (to be) queued: */ struct cfs_rq *cfs_rq; /* rq "owned" by this entity/group: */ struct cfs_rq *my_q; /* cached value of my_q->h_nr_running */ unsigned long runnable_weight; #endif /* * Per entity load average tracking. * * Put into separate cache line so it does not * collide with read-mostly values above. */ struct sched_avg avg; }; struct sched_rt_entity { struct list_head run_list; unsigned long timeout; unsigned long watchdog_stamp; unsigned int time_slice; unsigned short on_rq; unsigned short on_list; struct sched_rt_entity *back; #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity *parent; /* rq on which this entity is (to be) queued: */ struct rt_rq *rt_rq; /* rq "owned" by this entity/group: */ struct rt_rq *my_q; #endif } __randomize_layout; struct rq_flags; typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *, struct rq_flags *rf); struct sched_dl_entity { struct rb_node rb_node; /* * Original scheduling parameters. Copied here from sched_attr * during sched_setattr(), they will remain the same until * the next sched_setattr(). */ u64 dl_runtime; /* Maximum runtime for each instance */ u64 dl_deadline; /* Relative deadline of each instance */ u64 dl_period; /* Separation of two instances (period) */ u64 dl_bw; /* dl_runtime / dl_period */ u64 dl_density; /* dl_runtime / dl_deadline */ /* * Actual scheduling parameters. Initialized with the values above, * they are continuously updated during task execution. Note that * the remaining runtime could be < 0 in case we are in overrun. */ s64 runtime; /* Remaining runtime for this instance */ u64 deadline; /* Absolute deadline for this instance */ unsigned int flags; /* Specifying the scheduler behaviour */ /* * Some bool flags: * * @dl_throttled tells if we exhausted the runtime. If so, the * task has to wait for a replenishment to be performed at the * next firing of dl_timer. * * @dl_yielded tells if task gave up the CPU before consuming * all its available runtime during the last job. * * @dl_non_contending tells if the task is inactive while still * contributing to the active utilization. In other words, it * indicates if the inactive timer has been armed and its handler * has not been executed yet. This flag is useful to avoid race * conditions between the inactive timer handler and the wakeup * code. * * @dl_overrun tells if the task asked to be informed about runtime * overruns. * * @dl_server tells if this is a server entity. * * @dl_server_active tells if the dlserver is active(started). * dlserver is started on first cfs enqueue on an idle runqueue * and is stopped when a dequeue results in 0 cfs tasks on the * runqueue. In other words, dlserver is active only when cpu's * runqueue has atleast one cfs task. * * @dl_defer tells if this is a deferred or regular server. For * now only defer server exists. * * @dl_defer_armed tells if the deferrable server is waiting * for the replenishment timer to activate it. * * @dl_defer_running tells if the deferrable server is actually * running, skipping the defer phase. * * @dl_defer_idle tracks idle state */ unsigned int dl_throttled : 1; unsigned int dl_yielded : 1; unsigned int dl_non_contending : 1; unsigned int dl_overrun : 1; unsigned int dl_server : 1; unsigned int dl_server_active : 1; unsigned int dl_defer : 1; unsigned int dl_defer_armed : 1; unsigned int dl_defer_running : 1; unsigned int dl_defer_idle : 1; /* * Bandwidth enforcement timer. Each -deadline task has its * own bandwidth to be enforced, thus we need one timer per task. */ struct hrtimer dl_timer; /* * Inactive timer, responsible for decreasing the active utilization * at the "0-lag time". When a -deadline task blocks, it contributes * to GRUB's active utilization until the "0-lag time", hence a * timer is needed to decrease the active utilization at the correct * time. */ struct hrtimer inactive_timer; /* * Bits for DL-server functionality. Also see the comment near * dl_server_update(). * * @rq the runqueue this server is for */ struct rq *rq; dl_server_pick_f server_pick_task; #ifdef CONFIG_RT_MUTEXES /* * Priority Inheritance. When a DEADLINE scheduling entity is boosted * pi_se points to the donor, otherwise points to the dl_se it belongs * to (the original one/itself). */ struct sched_dl_entity *pi_se; #endif }; #ifdef CONFIG_UCLAMP_TASK /* Number of utilization clamp buckets (shorter alias) */ #define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT /* * Utilization clamp for a scheduling entity * @value: clamp value "assigned" to a se * @bucket_id: bucket index corresponding to the "assigned" value * @active: the se is currently refcounted in a rq's bucket * @user_defined: the requested clamp value comes from user-space * * The bucket_id is the index of the clamp bucket matching the clamp value * which is pre-computed and stored to avoid expensive integer divisions from * the fast path. * * The active bit is set whenever a task has got an "effective" value assigned, * which can be different from the clamp value "requested" from user-space. * This allows to know a task is refcounted in the rq's bucket corresponding * to the "effective" bucket_id. * * The user_defined bit is set whenever a task has got a task-specific clamp * value requested from userspace, i.e. the system defaults apply to this task * just as a restriction. This allows to relax default clamps when a less * restrictive task-specific value has been requested, thus allowing to * implement a "nice" semantic. For example, a task running with a 20% * default boost can still drop its own boosting to 0%. */ struct uclamp_se { unsigned int value : bits_per(SCHED_CAPACITY_SCALE); unsigned int bucket_id : bits_per(UCLAMP_BUCKETS); unsigned int active : 1; unsigned int user_defined : 1; }; #endif /* CONFIG_UCLAMP_TASK */ union rcu_special { struct { u8 blocked; u8 need_qs; u8 exp_hint; /* Hint for performance. */ u8 need_mb; /* Readers need smp_mb(). */ } b; /* Bits. */ u32 s; /* Set of bits. */ }; enum perf_event_task_context { perf_invalid_context = -1, perf_hw_context = 0, perf_sw_context, perf_nr_task_contexts, }; /* * Number of contexts where an event can trigger: * task, softirq, hardirq, nmi. */ #define PERF_NR_CONTEXTS 4 struct wake_q_node { struct wake_q_node *next; }; struct kmap_ctrl { #ifdef CONFIG_KMAP_LOCAL int idx; pte_t pteval[KM_MAX_IDX]; #endif }; struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* * For reasons of header soup (see current_thread_info()), this * must be the first element of task_struct. */ struct thread_info thread_info; #endif unsigned int __state; /* saved state for "spinlock sleepers" */ unsigned int saved_state; /* * This begins the randomizable portion of task_struct. Only * scheduling-critical items should be added above here. */ randomized_struct_fields_start void *stack; refcount_t usage; /* Per task flags (PF_*), defined further below: */ unsigned int flags; unsigned int ptrace; #ifdef CONFIG_MEM_ALLOC_PROFILING struct alloc_tag *alloc_tag; #endif int on_cpu; struct __call_single_node wake_entry; unsigned int wakee_flips; unsigned long wakee_flip_decay_ts; struct task_struct *last_wakee; /* * recent_used_cpu is initially set as the last CPU used by a task * that wakes affine another task. Waker/wakee relationships can * push tasks around a CPU where each wakeup moves to the next one. * Tracking a recently used CPU allows a quick search for a recently * used CPU that may be idle. */ int recent_used_cpu; int wake_cpu; int on_rq; int prio; int static_prio; int normal_prio; unsigned int rt_priority; struct sched_entity se; struct sched_rt_entity rt; struct sched_dl_entity dl; struct sched_dl_entity *dl_server; #ifdef CONFIG_SCHED_CLASS_EXT struct sched_ext_entity scx; #endif const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE struct rb_node core_node; unsigned long core_cookie; unsigned int core_occupation; #endif #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #ifdef CONFIG_CFS_BANDWIDTH struct callback_head sched_throttle_work; struct list_head throttle_node; bool throttled; #endif #endif #ifdef CONFIG_UCLAMP_TASK /* * Clamp values requested for a scheduling entity. * Must be updated with task_rq_lock() held. */ struct uclamp_se uclamp_req[UCLAMP_CNT]; /* * Effective clamp values used for a scheduling entity. * Must be updated with task_rq_lock() held. */ struct uclamp_se uclamp[UCLAMP_CNT]; #endif struct sched_statistics stats; #ifdef CONFIG_PREEMPT_NOTIFIERS /* List of struct preempt_notifier: */ struct hlist_head preempt_notifiers; #endif #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif unsigned int policy; unsigned long max_allowed_capacity; int nr_cpus_allowed; const cpumask_t *cpus_ptr; cpumask_t *user_cpus_ptr; cpumask_t cpus_mask; void *migration_pending; unsigned short migration_disabled; unsigned short migration_flags; #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; union rcu_special rcu_read_unlock_special; struct list_head rcu_node_entry; struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TASKS_RCU unsigned long rcu_tasks_nvcsw; u8 rcu_tasks_holdout; u8 rcu_tasks_idx; int rcu_tasks_idle_cpu; struct list_head rcu_tasks_holdout_list; int rcu_tasks_exit_cpu; struct list_head rcu_tasks_exit_list; #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU int trc_reader_nesting; int trc_ipi_to_cpu; union rcu_special trc_reader_special; struct list_head trc_holdout_list; struct list_head trc_blkd_node; int trc_blkd_cpu; #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ struct sched_info sched_info; struct list_head tasks; struct plist_node pushable_tasks; struct rb_node pushable_dl_tasks; struct mm_struct *mm; struct mm_struct *active_mm; struct address_space *faults_disabled_mapping; int exit_state; int exit_code; int exit_signal; /* The signal sent when the parent dies: */ int pdeath_signal; /* JOBCTL_*, siglock protected: */ unsigned long jobctl; /* Used for emulating ABI behavior of previous Linux versions: */ unsigned int personality; /* Scheduler bits, serialized by scheduler locks: */ unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; unsigned sched_task_hot:1; /* Force alignment to the next boundary: */ unsigned :0; /* Unserialized, strictly 'current' */ /* * This field must not be in the scheduler word above due to wakelist * queueing no longer being serialized by p->on_cpu. However: * * p->XXX = X; ttwu() * schedule() if (p->on_rq && ..) // false * smp_mb__after_spinlock(); if (smp_load_acquire(&p->on_cpu) && //true * deactivate_task() ttwu_queue_wakelist()) * p->on_rq = 0; p->sched_remote_wakeup = Y; * * guarantees all stores of 'current' are visible before * ->sched_remote_wakeup gets used, so it can be in this word. */ unsigned sched_remote_wakeup:1; #ifdef CONFIG_RT_MUTEXES unsigned sched_rt_mutex:1; #endif /* Bit to tell TOMOYO we're in execve(): */ unsigned in_execve:1; unsigned in_iowait:1; #ifndef TIF_RESTORE_SIGMASK unsigned restore_sigmask:1; #endif #ifdef CONFIG_MEMCG_V1 unsigned in_user_fault:1; #endif #ifdef CONFIG_LRU_GEN /* whether the LRU algorithm may apply to this access */ unsigned in_lru_fault:1; #endif #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif #ifdef CONFIG_CGROUPS /* disallow userland-initiated cgroup migration */ unsigned no_cgroup_migration:1; /* task is frozen/stopped (used by the cgroup freezer) */ unsigned frozen:1; #endif #ifdef CONFIG_BLK_CGROUP unsigned use_memdelay:1; #endif #ifdef CONFIG_PSI /* Stalled due to lack of memory */ unsigned in_memstall:1; #endif #ifdef CONFIG_PAGE_OWNER /* Used by page_owner=on to detect recursion in page tracking. */ unsigned in_page_owner:1; #endif #ifdef CONFIG_EVENTFD /* Recursion prevention for eventfd_signal() */ unsigned in_eventfd:1; #endif #ifdef CONFIG_ARCH_HAS_CPU_PASID unsigned pasid_activated:1; #endif #ifdef CONFIG_X86_BUS_LOCK_DETECT unsigned reported_split_lock:1; #endif #ifdef CONFIG_TASK_DELAY_ACCT /* delay due to memory thrashing */ unsigned in_thrashing:1; #endif unsigned in_nf_duplicate:1; #ifdef CONFIG_PREEMPT_RT struct netdev_xmit net_xmit; #endif unsigned long atomic_flags; /* Flags requiring atomic access. */ struct restart_block restart_block; pid_t pid; pid_t tgid; #ifdef CONFIG_STACKPROTECTOR /* Canary value for the -fstack-protector GCC feature: */ unsigned long stack_canary; #endif /* * Pointers to the (original) parent process, youngest child, younger sibling, * older sibling, respectively. (p->father can be replaced with * p->real_parent->pid) */ /* Real parent process: */ struct task_struct __rcu *real_parent; /* Recipient of SIGCHLD, wait4() reports: */ struct task_struct __rcu *parent; /* * Children/sibling form the list of natural children: */ struct list_head children; struct list_head sibling; struct task_struct *group_leader; /* * 'ptraced' is the list of tasks this task is using ptrace() on. * * This includes both natural children and PTRACE_ATTACH targets. * 'ptrace_entry' is this task's link on the p->parent->ptraced list. */ struct list_head ptraced; struct list_head ptrace_entry; /* PID/PID hash table linkage. */ struct pid *thread_pid; struct hlist_node pid_links[PIDTYPE_MAX]; struct list_head thread_node; struct completion *vfork_done; /* CLONE_CHILD_SETTID: */ int __user *set_child_tid; /* CLONE_CHILD_CLEARTID: */ int __user *clear_child_tid; /* PF_KTHREAD | PF_IO_WORKER */ void *worker_private; u64 utime; u64 stime; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME u64 utimescaled; u64 stimescaled; #endif u64 gtime; struct prev_cputime prev_cputime; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN struct vtime vtime; #endif #ifdef CONFIG_NO_HZ_FULL atomic_t tick_dep_mask; #endif /* Context switch counts: */ unsigned long nvcsw; unsigned long nivcsw; /* Monotonic time in nsecs: */ u64 start_time; /* Boot based time in nsecs: */ u64 start_boottime; /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */ unsigned long min_flt; unsigned long maj_flt; /* Empty if CONFIG_POSIX_CPUTIMERS=n */ struct posix_cputimers posix_cputimers; #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK struct posix_cputimers_work posix_cputimers_work; #endif /* Process credentials: */ /* Tracer's credentials at attach: */ const struct cred __rcu *ptracer_cred; /* Objective and real subjective task credentials (COW): */ const struct cred __rcu *real_cred; /* Effective (overridable) subjective task credentials (COW): */ const struct cred __rcu *cred; #ifdef CONFIG_KEYS /* Cached requested key. */ struct key *cached_requested_key; #endif /* * executable name, excluding path. * * - normally initialized begin_new_exec() * - set it with set_task_comm() * - strscpy_pad() to ensure it is always NUL-terminated and * zero-padded * - task_lock() to ensure the operation is atomic and the name is * fully updated. */ char comm[TASK_COMM_LEN]; struct nameidata *nameidata; #ifdef CONFIG_SYSVIPC struct sysv_sem sysvsem; struct sysv_shm sysvshm; #endif #ifdef CONFIG_DETECT_HUNG_TASK unsigned long last_switch_count; unsigned long last_switch_time; #endif /* Filesystem information: */ struct fs_struct *fs; /* Open file information: */ struct files_struct *files; #ifdef CONFIG_IO_URING struct io_uring_task *io_uring; #endif /* Namespaces: */ struct nsproxy *nsproxy; /* Signal handlers: */ struct signal_struct *signal; struct sighand_struct __rcu *sighand; sigset_t blocked; sigset_t real_blocked; /* Restored if set_restore_sigmask() was used: */ sigset_t saved_sigmask; struct sigpending pending; unsigned long sas_ss_sp; size_t sas_ss_size; unsigned int sas_ss_flags; struct callback_head *task_works; #ifdef CONFIG_AUDIT #ifdef CONFIG_AUDITSYSCALL struct audit_context *audit_context; #endif kuid_t loginuid; unsigned int sessionid; #endif struct seccomp seccomp; struct syscall_user_dispatch syscall_dispatch; /* Thread group tracking: */ u64 parent_exec_id; u64 self_exec_id; /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */ spinlock_t alloc_lock; /* Protection of the PI data structures: */ raw_spinlock_t pi_lock; struct wake_q_node wake_q; #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task: */ struct rb_root_cached pi_waiters; /* Updated under owner's pi_lock and rq lock */ struct task_struct *pi_top_task; /* Deadlock detection and priority inheritance handling: */ struct rt_mutex_waiter *pi_blocked_on; #endif struct mutex *blocked_on; /* lock we're blocked on */ #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER /* * Encoded lock address causing task block (lower 2 bits = type from * <linux/hung_task.h>). Accessed via hung_task_*() helpers. */ unsigned long blocker; #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP int non_block_count; #endif #ifdef CONFIG_TRACE_IRQFLAGS struct irqtrace_events irqtrace; unsigned int hardirq_threaded; u64 hardirq_chain_key; int softirqs_enabled; int softirq_context; int irq_config; #endif #ifdef CONFIG_PREEMPT_RT int softirq_disable_cnt; #endif #ifdef CONFIG_LOCKDEP # define MAX_LOCK_DEPTH 48UL u64 curr_chain_key; int lockdep_depth; unsigned int lockdep_recursion; struct held_lock held_locks[MAX_LOCK_DEPTH]; #endif #if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP) unsigned int in_ubsan; #endif /* Journalling filesystem info: */ void *journal_info; /* Stacked block device info: */ struct bio_list *bio_list; /* Stack plugging: */ struct blk_plug *plug; /* VM state: */ struct reclaim_state *reclaim_state; struct io_context *io_context; #ifdef CONFIG_COMPACTION struct capture_control *capture_control; #endif /* Ptrace state: */ unsigned long ptrace_message; kernel_siginfo_t *last_siginfo; struct task_io_accounting ioac; #ifdef CONFIG_PSI /* Pressure stall state */ unsigned int psi_flags; #endif #ifdef CONFIG_TASK_XACCT /* Accumulated RSS usage: */ u64 acct_rss_mem1; /* Accumulated virtual memory usage: */ u64 acct_vm_mem1; /* stime + utime since last update: */ u64 acct_timexpd; #endif #ifdef CONFIG_CPUSETS /* Protected by ->alloc_lock: */ nodemask_t mems_allowed; /* Sequence number to catch updates: */ seqcount_spinlock_t mems_allowed_seq; int cpuset_mem_spread_rotor; #endif #ifdef CONFIG_CGROUPS /* Control Group info protected by css_set_lock: */ struct css_set __rcu *cgroups; /* cg_list protected by css_set_lock and tsk->alloc_lock: */ struct list_head cg_list; #ifdef CONFIG_PREEMPT_RT struct llist_node cg_dead_lnode; #endif /* CONFIG_PREEMPT_RT */ #endif /* CONFIG_CGROUPS */ #ifdef CONFIG_X86_CPU_RESCTRL u32 closid; u32 rmid; #endif #ifdef CONFIG_FUTEX struct robust_list_head __user *robust_list; #ifdef CONFIG_COMPAT struct compat_robust_list_head __user *compat_robust_list; #endif struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; struct mutex futex_exit_mutex; unsigned int futex_state; #endif #ifdef CONFIG_PERF_EVENTS u8 perf_recursion[PERF_NR_CONTEXTS]; struct perf_event_context *perf_event_ctxp; struct mutex perf_event_mutex; struct list_head perf_event_list; struct perf_ctx_data __rcu *perf_ctx_data; #endif #ifdef CONFIG_DEBUG_PREEMPT unsigned long preempt_disable_ip; #endif #ifdef CONFIG_NUMA /* Protected by alloc_lock: */ struct mempolicy *mempolicy; short il_prev; u8 il_weight; short pref_node_fork; #endif #ifdef CONFIG_NUMA_BALANCING int numa_scan_seq; unsigned int numa_scan_period; unsigned int numa_scan_period_max; int numa_preferred_nid; unsigned long numa_migrate_retry; /* Migration stamp: */ u64 node_stamp; u64 last_task_numa_placement; u64 last_sum_exec_runtime; struct callback_head numa_work; /* * This pointer is only modified for current in syscall and * pagefault context (and for tasks being destroyed), so it can be read * from any of the following contexts: * - RCU read-side critical section * - current->numa_group from everywhere * - task's runqueue locked, task not running */ struct numa_group __rcu *numa_group; /* * numa_faults is an array split into four regions: * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer * in this precise order. * * faults_memory: Exponential decaying average of faults on a per-node * basis. Scheduling placement decisions are made based on these * counts. The values remain static for the duration of a PTE scan. * faults_cpu: Track the nodes the process was running on when a NUMA * hinting fault was incurred. * faults_memory_buffer and faults_cpu_buffer: Record faults per node * during the current scan window. When the scan completes, the counts * in faults_memory and faults_cpu decay and these values are copied. */ unsigned long *numa_faults; unsigned long total_numa_faults; /* * numa_faults_locality tracks if faults recorded during the last * scan window were remote/local or failed to migrate. The task scan * period is adapted based on the locality of the faults with different * weights depending on whether they were shared or private faults */ unsigned long numa_faults_locality[3]; unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ struct rseq_data rseq; struct sched_mm_cid mm_cid; struct tlbflush_unmap_batch tlb_ubc; /* Cache last used pipe for splice(): */ struct pipe_inode_info *splice_pipe; struct page_frag task_frag; #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info *delays; #endif #ifdef CONFIG_FAULT_INJECTION int make_it_fail; unsigned int fail_nth; #endif /* * When (nr_dirtied >= nr_dirtied_pause), it's time to call * balance_dirty_pages() for a dirty throttling pause: */ int nr_dirtied; int nr_dirtied_pause; /* Start of a write-and-pause period: */ unsigned long dirty_paused_when; #ifdef CONFIG_LATENCYTOP int latency_record_count; struct latency_record latency_record[LT_SAVECOUNT]; #endif /* * Time slack values; these are used to round up poll() and * select() etc timeout values. These are in nanoseconds. */ u64 timer_slack_ns; u64 default_timer_slack_ns; #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) unsigned int kasan_depth; #endif #ifdef CONFIG_KCSAN struct kcsan_ctx kcsan_ctx; #ifdef CONFIG_TRACE_IRQFLAGS struct irqtrace_events kcsan_save_irqtrace; #endif #ifdef CONFIG_KCSAN_WEAK_MEMORY int kcsan_stack_depth; #endif #endif #ifdef CONFIG_KMSAN struct kmsan_ctx kmsan_ctx; #endif #if IS_ENABLED(CONFIG_KUNIT) struct kunit *kunit_test; #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Index of current stored address in ret_stack: */ int curr_ret_stack; int curr_ret_depth; /* Stack of return addresses for return function tracing: */ unsigned long *ret_stack; /* Timestamp for last schedule: */ unsigned long long ftrace_timestamp; unsigned long long ftrace_sleeptime; /* * Number of functions that haven't been traced * because of depth overrun: */ atomic_t trace_overrun; /* Pause tracing: */ atomic_t tracing_graph_pause; #endif #ifdef CONFIG_TRACING /* Bitmask and counter of trace recursion: */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ #ifdef CONFIG_KCOV /* See kernel/kcov.c for more details. */ /* Coverage collection mode enabled for this task (0 if disabled): */ unsigned int kcov_mode; /* Size of the kcov_area: */ unsigned int kcov_size; /* Buffer for coverage collection: */ void *kcov_area; /* KCOV descriptor wired with this task or NULL: */ struct kcov *kcov; /* KCOV common handle for remote coverage collection: */ u64 kcov_handle; /* KCOV sequence number: */ int kcov_sequence; /* Collect coverage from softirq context: */ unsigned int kcov_softirq; #endif #ifdef CONFIG_MEMCG_V1 struct mem_cgroup *memcg_in_oom; #endif #ifdef CONFIG_MEMCG /* Number of pages to reclaim on returning to userland: */ unsigned int memcg_nr_pages_over_high; /* Used by memcontrol for targeted memcg charge: */ struct mem_cgroup *active_memcg; /* Cache for current->cgroups->memcg->objcg lookups: */ struct obj_cgroup *objcg; #endif #ifdef CONFIG_BLK_CGROUP struct gendisk *throttle_disk; #endif #ifdef CONFIG_UPROBES struct uprobe_task *utask; #endif #if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE) unsigned int sequential_io; unsigned int sequential_io_avg; #endif struct kmap_ctrl kmap_ctrl; #ifdef CONFIG_DEBUG_ATOMIC_SLEEP unsigned long task_state_change; # ifdef CONFIG_PREEMPT_RT unsigned long saved_state_change; # endif #endif struct rcu_head rcu; refcount_t rcu_users; int pagefault_disabled; #ifdef CONFIG_MMU struct task_struct *oom_reaper_list; struct timer_list oom_reaper_timer; #endif #ifdef CONFIG_VMAP_STACK struct vm_struct *stack_vm_area; #endif #ifdef CONFIG_THREAD_INFO_IN_TASK /* A live task holds one reference: */ refcount_t stack_refcount; #endif #ifdef CONFIG_LIVEPATCH int patch_state; #endif #ifdef CONFIG_SECURITY /* Used by LSM modules for access restriction: */ void *security; #endif #ifdef CONFIG_BPF_SYSCALL /* Used by BPF task local storage */ struct bpf_local_storage __rcu *bpf_storage; /* Used for BPF run context */ struct bpf_run_ctx *bpf_ctx; #endif /* Used by BPF for per-TASK xdp storage */ struct bpf_net_context *bpf_net_context; #ifdef CONFIG_KSTACK_ERASE unsigned long lowest_stack; #endif #ifdef CONFIG_KSTACK_ERASE_METRICS unsigned long prev_lowest_stack; #endif #ifdef CONFIG_X86_MCE void __user *mce_vaddr; __u64 mce_kflags; u64 mce_addr; __u64 mce_ripv : 1, mce_whole_page : 1, __mce_reserved : 62; struct callback_head mce_kill_me; int mce_count; #endif #ifdef CONFIG_KRETPROBES struct llist_head kretprobe_instances; #endif #ifdef CONFIG_RETHOOK struct llist_head rethooks; #endif #ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH /* * If L1D flush is supported on mm context switch * then we use this callback head to queue kill work * to kill tasks that are not running on SMT disabled * cores */ struct callback_head l1d_flush_kill; #endif #ifdef CONFIG_RV /* * Per-task RV monitor, fixed in CONFIG_RV_PER_TASK_MONITORS. * If memory becomes a concern, we can think about a dynamic method. */ union rv_task_monitor rv[CONFIG_RV_PER_TASK_MONITORS]; #endif #ifdef CONFIG_USER_EVENTS struct user_event_mm *user_event_mm; #endif #ifdef CONFIG_UNWIND_USER struct unwind_task_info unwind_info; #endif /* CPU-specific state of this task: */ struct thread_struct thread; /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. */ randomized_struct_fields_end } __attribute__ ((aligned (64))); #ifdef CONFIG_SCHED_PROXY_EXEC DECLARE_STATIC_KEY_TRUE(__sched_proxy_exec); static inline bool sched_proxy_exec(void) { return static_branch_likely(&__sched_proxy_exec); } #else static inline bool sched_proxy_exec(void) { return false; } #endif #define TASK_REPORT_IDLE (TASK_REPORT + 1) #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) static inline unsigned int __task_state_index(unsigned int tsk_state, unsigned int tsk_exit_state) { unsigned int state = (tsk_state | tsk_exit_state) & TASK_REPORT; BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); if ((tsk_state & TASK_IDLE) == TASK_IDLE) state = TASK_REPORT_IDLE; /* * We're lying here, but rather than expose a completely new task state * to userspace, we can make this appear as if the task has gone through * a regular rt_mutex_lock() call. * Report frozen tasks as uninterruptible. */ if ((tsk_state & TASK_RTLOCK_WAIT) || (tsk_state & TASK_FROZEN)) state = TASK_UNINTERRUPTIBLE; return fls(state); } static inline unsigned int task_state_index(struct task_struct *tsk) { return __task_state_index(READ_ONCE(tsk->__state), tsk->exit_state); } static inline char task_index_to_char(unsigned int state) { static const char state_char[] = "RSDTtXZPI"; BUILD_BUG_ON(TASK_REPORT_MAX * 2 != 1 << (sizeof(state_char) - 1)); return state_char[state]; } static inline char task_state_to_char(struct task_struct *tsk) { return task_index_to_char(task_state_index(tsk)); } extern struct pid *cad_pid; /* * Per process flags */ #define PF_VCPU 0x00000001 /* I'm a virtual CPU */ #define PF_IDLE 0x00000002 /* I am an IDLE thread */ #define PF_EXITING 0x00000004 /* Getting shut down */ #define PF_POSTCOREDUMP 0x00000008 /* Coredumps should ignore this task */ #define PF_IO_WORKER 0x00000010 /* Task is an IO worker */ #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ #define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */ #define PF_MCE_PROCESS 0x00000080 /* Process policy on mce errors */ #define PF_SUPERPRIV 0x00000100 /* Used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* Dumped core */ #define PF_SIGNALED 0x00000400 /* Killed by a signal */ #define PF_MEMALLOC 0x00000800 /* Allocating memory to free memory. See memalloc_noreclaim_save() */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ #define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF_KCOMPACTD 0x00010000 /* I am kcompactd */ #define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_MEMALLOC_NOFS 0x00040000 /* All allocations inherit GFP_NOFS. See memalloc_nfs_save() */ #define PF_MEMALLOC_NOIO 0x00080000 /* All allocations inherit GFP_NOIO. See memalloc_noio_save() */ #define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to, * I am cleaning dirty pages from some other bdi. */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ #define PF__HOLE__00800000 0x00800000 #define PF__HOLE__01000000 0x01000000 #define PF__HOLE__02000000 0x02000000 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_PIN 0x10000000 /* Allocations constrained to zones which allow long term pinning. * See memalloc_pin_save() */ #define PF_BLOCK_TS 0x20000000 /* plug has ts that needs updating */ #define PF__HOLE__40000000 0x40000000 #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ /* * Only the _current_ task can read/write to tsk->flags, but other * tasks can access tsk->flags in readonly mode for example * with tsk_used_math (like during threaded core dumping). * There is however an exception to this rule during ptrace * or during fork: the ptracer task is allowed to write to the * child->flags of its traced child (same goes for fork, the parent * can write to the child->flags), because we're guaranteed the * child is not running and in turn not changing child->flags * at the same time the parent does it. */ #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0) #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0) #define clear_used_math() clear_stopped_child_used_math(current) #define set_used_math() set_stopped_child_used_math(current) #define conditional_stopped_child_used_math(condition, child) \ do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0) #define conditional_used_math(condition) conditional_stopped_child_used_math(condition, current) #define copy_to_stopped_child_used_math(child) \ do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0) /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */ #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) #define used_math() tsk_used_math(current) static __always_inline bool is_percpu_thread(void) { return (current->flags & PF_NO_SETAFFINITY) && (current->nr_cpus_allowed == 1); } /* Per-process atomic flags. */ #define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */ #define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */ #define PFA_SPREAD_SLAB 2 /* Spread some slab caches over cpuset */ #define PFA_SPEC_SSB_DISABLE 3 /* Speculative Store Bypass disabled */ #define PFA_SPEC_SSB_FORCE_DISABLE 4 /* Speculative Store Bypass force disabled*/ #define PFA_SPEC_IB_DISABLE 5 /* Indirect branch speculation restricted */ #define PFA_SPEC_IB_FORCE_DISABLE 6 /* Indirect branch speculation permanently restricted */ #define PFA_SPEC_SSB_NOEXEC 7 /* Speculative Store Bypass clear on execve() */ #define TASK_PFA_TEST(name, func) \ static inline bool task_##func(struct task_struct *p) \ { return test_bit(PFA_##name, &p->atomic_flags); } #define TASK_PFA_SET(name, func) \ static inline void task_set_##func(struct task_struct *p) \ { set_bit(PFA_##name, &p->atomic_flags); } #define TASK_PFA_CLEAR(name, func) \ static inline void task_clear_##func(struct task_struct *p) \ { clear_bit(PFA_##name, &p->atomic_flags); } TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs) TASK_PFA_SET(NO_NEW_PRIVS, no_new_privs) TASK_PFA_TEST(SPREAD_PAGE, spread_page) TASK_PFA_SET(SPREAD_PAGE, spread_page) TASK_PFA_CLEAR(SPREAD_PAGE, spread_page) TASK_PFA_TEST(SPREAD_SLAB, spread_slab) TASK_PFA_SET(SPREAD_SLAB, spread_slab) TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab) TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_TEST(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_SET(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_CLEAR(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) static inline void current_restore_flags(unsigned long orig_flags, unsigned long flags) { current->flags &= ~flags; current->flags |= orig_flags & flags; } extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); extern int task_can_attach(struct task_struct *p); extern int dl_bw_alloc(int cpu, u64 dl_bw); extern void dl_bw_free(int cpu, u64 dl_bw); /* set_cpus_allowed_force() - consider using set_cpus_allowed_ptr() instead */ extern void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask); /** * set_cpus_allowed_ptr - set CPU affinity mask of a task * @p: the task * @new_mask: CPU affinity mask * * Return: zero if successful, or a negative error code */ extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node); extern void release_user_cpus_ptr(struct task_struct *p); extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask); extern void force_compatible_cpus_allowed_ptr(struct task_struct *p); extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p); extern int yield_to(struct task_struct *p, bool preempt); extern void set_user_nice(struct task_struct *p, long nice); extern int task_prio(const struct task_struct *p); /** * task_nice - return the nice value of a given task. * @p: the task in question. * * Return: The nice value [ -20 ... 0 ... 19 ]. */ static inline int task_nice(const struct task_struct *p) { return PRIO_TO_NICE((p)->static_prio); } extern int can_nice(const struct task_struct *p, const int nice); extern int task_curr(const struct task_struct *p); extern int idle_cpu(int cpu); extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *); extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); extern void sched_set_fifo(struct task_struct *p); extern void sched_set_fifo_low(struct task_struct *p); extern void sched_set_fifo_secondary(struct task_struct *p); extern void sched_set_normal(struct task_struct *p, int nice); extern int sched_setattr(struct task_struct *, const struct sched_attr *); extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *); extern struct task_struct *idle_task(int cpu); /** * is_idle_task - is the specified task an idle task? * @p: the task in question. * * Return: 1 if @p is an idle task. 0 otherwise. */ static __always_inline bool is_idle_task(const struct task_struct *p) { return !!(p->flags & PF_IDLE); } extern struct task_struct *curr_task(int cpu); extern void ia64_set_curr_task(int cpu, struct task_struct *p); void yield(void); union thread_union { struct task_struct task; #ifndef CONFIG_THREAD_INFO_IN_TASK struct thread_info thread_info; #endif unsigned long stack[THREAD_SIZE/sizeof(long)]; }; #ifndef CONFIG_THREAD_INFO_IN_TASK extern struct thread_info init_thread_info; #endif extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)]; #ifdef CONFIG_THREAD_INFO_IN_TASK # define task_thread_info(task) (&(task)->thread_info) #else # define task_thread_info(task) ((struct thread_info *)(task)->stack) #endif /* * find a task by one of its numerical ids * * find_task_by_pid_ns(): * finds a task by its pid in the specified namespace * find_task_by_vpid(): * finds a task by its virtual pid * * see also find_vpid() etc in include/linux/pid.h */ extern struct task_struct *find_task_by_vpid(pid_t nr); extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns); /* * find a task by its virtual pid and get the task struct */ extern struct task_struct *find_get_task_by_vpid(pid_t nr); extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); extern void wake_up_new_task(struct task_struct *tsk); extern void kick_process(struct task_struct *tsk); extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec); #define set_task_comm(tsk, from) ({ \ BUILD_BUG_ON(sizeof(from) != TASK_COMM_LEN); \ __set_task_comm(tsk, from, false); \ }) /* * - Why not use task_lock()? * User space can randomly change their names anyway, so locking for readers * doesn't make sense. For writers, locking is probably necessary, as a race * condition could lead to long-term mixed results. * The strscpy_pad() in __set_task_comm() can ensure that the task comm is * always NUL-terminated and zero-padded. Therefore the race condition between * reader and writer is not an issue. * * - BUILD_BUG_ON() can help prevent the buf from being truncated. * Since the callers don't perform any return value checks, this safeguard is * necessary. */ #define get_task_comm(buf, tsk) ({ \ BUILD_BUG_ON(sizeof(buf) < TASK_COMM_LEN); \ strscpy_pad(buf, (tsk)->comm); \ buf; \ }) static __always_inline void scheduler_ipi(void) { /* * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting * TIF_NEED_RESCHED remotely (for the first time) will also send * this IPI. */ preempt_fold_need_resched(); } extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state); /* * Set thread flags in other task's structures. * See asm/thread_info.h for TIF_xxxx flags available: */ static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag) { set_ti_thread_flag(task_thread_info(tsk), flag); } static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag) { clear_ti_thread_flag(task_thread_info(tsk), flag); } static inline void update_tsk_thread_flag(struct task_struct *tsk, int flag, bool value) { update_ti_thread_flag(task_thread_info(tsk), flag, value); } static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); } static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); } static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_ti_thread_flag(task_thread_info(tsk), flag); } static inline void set_tsk_need_resched(struct task_struct *tsk) { if (tracepoint_enabled(sched_set_need_resched_tp) && !test_tsk_thread_flag(tsk, TIF_NEED_RESCHED)) __trace_set_need_resched(tsk, TIF_NEED_RESCHED); set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); } static inline void clear_tsk_need_resched(struct task_struct *tsk) { atomic_long_andnot(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY, (atomic_long_t *)&task_thread_info(tsk)->flags); } static inline int test_tsk_need_resched(struct task_struct *tsk) { return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } static inline void set_need_resched_current(void) { lockdep_assert_irqs_disabled(); set_tsk_need_resched(current); set_preempt_need_resched(); } /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return * value indicates whether a reschedule was done in fact. * cond_resched_lock() will drop the spinlock before scheduling, */ #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) extern int __cond_resched(void); #if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) DECLARE_STATIC_CALL(cond_resched, __cond_resched); static __always_inline int _cond_resched(void) { return static_call_mod(cond_resched)(); } #elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) extern int dynamic_cond_resched(void); static __always_inline int _cond_resched(void) { return dynamic_cond_resched(); } #else /* !CONFIG_PREEMPTION */ static inline int _cond_resched(void) { return __cond_resched(); } #endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ #else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */ static inline int _cond_resched(void) { return 0; } #endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */ #define cond_resched() ({ \ __might_resched(__FILE__, __LINE__, 0); \ _cond_resched(); \ }) extern int __cond_resched_lock(spinlock_t *lock); extern int __cond_resched_rwlock_read(rwlock_t *lock); extern int __cond_resched_rwlock_write(rwlock_t *lock); #define MIGHT_RESCHED_RCU_SHIFT 8 #define MIGHT_RESCHED_PREEMPT_MASK ((1U << MIGHT_RESCHED_RCU_SHIFT) - 1) #ifndef CONFIG_PREEMPT_RT /* * Non RT kernels have an elevated preempt count due to the held lock, * but are not allowed to be inside a RCU read side critical section */ # define PREEMPT_LOCK_RESCHED_OFFSETS PREEMPT_LOCK_OFFSET #else /* * spin/rw_lock() on RT implies rcu_read_lock(). The might_sleep() check in * cond_resched*lock() has to take that into account because it checks for * preempt_count() and rcu_preempt_depth(). */ # define PREEMPT_LOCK_RESCHED_OFFSETS \ (PREEMPT_LOCK_OFFSET + (1U << MIGHT_RESCHED_RCU_SHIFT)) #endif #define cond_resched_lock(lock) ({ \ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ __cond_resched_lock(lock); \ }) #define cond_resched_rwlock_read(lock) ({ \ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ __cond_resched_rwlock_read(lock); \ }) #define cond_resched_rwlock_write(lock) ({ \ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ __cond_resched_rwlock_write(lock); \ }) #ifndef CONFIG_PREEMPT_RT static inline struct mutex *__get_task_blocked_on(struct task_struct *p) { struct mutex *m = p->blocked_on; if (m) lockdep_assert_held_once(&m->wait_lock); return m; } static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m) { struct mutex *blocked_on = READ_ONCE(p->blocked_on); WARN_ON_ONCE(!m); /* The task should only be setting itself as blocked */ WARN_ON_ONCE(p != current); /* Currently we serialize blocked_on under the mutex::wait_lock */ lockdep_assert_held_once(&m->wait_lock); /* * Check ensure we don't overwrite existing mutex value * with a different mutex. Note, setting it to the same * lock repeatedly is ok. */ WARN_ON_ONCE(blocked_on && blocked_on != m); WRITE_ONCE(p->blocked_on, m); } static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m) { guard(raw_spinlock_irqsave)(&m->wait_lock); __set_task_blocked_on(p, m); } static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m) { if (m) { struct mutex *blocked_on = READ_ONCE(p->blocked_on); /* Currently we serialize blocked_on under the mutex::wait_lock */ lockdep_assert_held_once(&m->wait_lock); /* * There may be cases where we re-clear already cleared * blocked_on relationships, but make sure we are not * clearing the relationship with a different lock. */ WARN_ON_ONCE(blocked_on && blocked_on != m); } WRITE_ONCE(p->blocked_on, NULL); } static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m) { guard(raw_spinlock_irqsave)(&m->wait_lock); __clear_task_blocked_on(p, m); } #else static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) { } static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) { } #endif /* !CONFIG_PREEMPT_RT */ static __always_inline bool need_resched(void) { return unlikely(tif_need_resched()); } /* * Wrappers for p->thread_info->cpu access. No-op on UP. */ #ifdef CONFIG_SMP static inline unsigned int task_cpu(const struct task_struct *p) { return READ_ONCE(task_thread_info(p)->cpu); } extern void set_task_cpu(struct task_struct *p, unsigned int cpu); #else static inline unsigned int task_cpu(const struct task_struct *p) { return 0; } static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) { } #endif /* CONFIG_SMP */ static inline bool task_is_runnable(struct task_struct *p) { return p->on_rq && !p->se.sched_delayed; } extern bool sched_task_on_rq(struct task_struct *p); extern unsigned long get_wchan(struct task_struct *p); extern struct task_struct *cpu_curr_snapshot(int cpu); /* * In order to reduce various lock holder preemption latencies provide an * interface to see if a vCPU is currently running or not. * * This allows us to terminate optimistic spin loops and block, analogous to * the native optimistic spin heuristic of testing if the lock owner task is * running or not. */ #ifndef vcpu_is_preempted static inline bool vcpu_is_preempted(int cpu) { return false; } #endif extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); extern long sched_getaffinity(pid_t pid, struct cpumask *mask); #ifndef TASK_SIZE_OF #define TASK_SIZE_OF(tsk) TASK_SIZE #endif static inline bool owner_on_cpu(struct task_struct *owner) { /* * As lock holder preemption issue, we both skip spinning if * task is not on cpu or its cpu is preempted */ return READ_ONCE(owner->on_cpu) && !vcpu_is_preempted(task_cpu(owner)); } /* Returns effective CPU energy utilization, as seen by the scheduler */ unsigned long sched_cpu_util(int cpu); #ifdef CONFIG_SCHED_CORE extern void sched_core_free(struct task_struct *tsk); extern void sched_core_fork(struct task_struct *p); extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, unsigned long uaddr); extern int sched_core_idle_cpu(int cpu); #else static inline void sched_core_free(struct task_struct *tsk) { } static inline void sched_core_fork(struct task_struct *p) { } static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } #endif extern void sched_set_stop_task(int cpu, struct task_struct *stop); #ifdef CONFIG_MEM_ALLOC_PROFILING static __always_inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) { swap(current->alloc_tag, tag); return tag; } static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old) { #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n"); #endif current->alloc_tag = old; } #else #define alloc_tag_save(_tag) NULL #define alloc_tag_restore(_tag, _old) do {} while (0) #endif /* Avoids recursive inclusion hell */ #ifdef CONFIG_SCHED_MM_CID void sched_mm_cid_before_execve(struct task_struct *t); void sched_mm_cid_after_execve(struct task_struct *t); void sched_mm_cid_fork(struct task_struct *t); void sched_mm_cid_exit(struct task_struct *t); static __always_inline int task_mm_cid(struct task_struct *t) { return t->mm_cid.cid & ~(MM_CID_ONCPU | MM_CID_TRANSIT); } #else static inline void sched_mm_cid_before_execve(struct task_struct *t) { } static inline void sched_mm_cid_after_execve(struct task_struct *t) { } static inline void sched_mm_cid_fork(struct task_struct *t) { } static inline void sched_mm_cid_exit(struct task_struct *t) { } static __always_inline int task_mm_cid(struct task_struct *t) { /* * Use the processor id as a fall-back when the mm cid feature is * disabled. This provides functional per-cpu data structure accesses * in user-space, althrough it won't provide the memory usage benefits. */ return task_cpu(t); } #endif #ifndef MODULE #ifndef COMPILE_OFFSETS extern void ___migrate_enable(void); struct rq; DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); /* * The "struct rq" is not available here, so we can't access the * "runqueues" with this_cpu_ptr(), as the compilation will fail in * this_cpu_ptr() -> raw_cpu_ptr() -> __verify_pcpu_ptr(): * typeof((ptr) + 0) * * So use arch_raw_cpu_ptr()/PERCPU_PTR() directly here. */ #ifdef CONFIG_SMP #define this_rq_raw() arch_raw_cpu_ptr(&runqueues) #else #define this_rq_raw() PERCPU_PTR(&runqueues) #endif #define this_rq_pinned() (*(unsigned int *)((void *)this_rq_raw() + RQ_nr_pinned)) static inline void __migrate_enable(void) { struct task_struct *p = current; #ifdef CONFIG_DEBUG_PREEMPT /* * Check both overflow from migrate_disable() and superfluous * migrate_enable(). */ if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) return; #endif if (p->migration_disabled > 1) { p->migration_disabled--; return; } /* * Ensure stop_task runs either before or after this, and that * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). */ guard(preempt)(); if (unlikely(p->cpus_ptr != &p->cpus_mask)) ___migrate_enable(); /* * Mustn't clear migration_disabled() until cpus_ptr points back at the * regular cpus_mask, otherwise things that race (eg. * select_fallback_rq) get confused. */ barrier(); p->migration_disabled = 0; this_rq_pinned()--; } static inline void __migrate_disable(void) { struct task_struct *p = current; if (p->migration_disabled) { #ifdef CONFIG_DEBUG_PREEMPT /* *Warn about overflow half-way through the range. */ WARN_ON_ONCE((s16)p->migration_disabled < 0); #endif p->migration_disabled++; return; } guard(preempt)(); this_rq_pinned()++; p->migration_disabled = 1; } #else /* !COMPILE_OFFSETS */ static inline void __migrate_disable(void) { } static inline void __migrate_enable(void) { } #endif /* !COMPILE_OFFSETS */ /* * So that it is possible to not export the runqueues variable, define and * export migrate_enable/migrate_disable in kernel/sched/core.c too, and use * them for the modules. The macro "INSTANTIATE_EXPORTED_MIGRATE_DISABLE" will * be defined in kernel/sched/core.c. */ #ifndef INSTANTIATE_EXPORTED_MIGRATE_DISABLE static __always_inline void migrate_disable(void) { __migrate_disable(); } static __always_inline void migrate_enable(void) { __migrate_enable(); } #else /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */ extern void migrate_disable(void); extern void migrate_enable(void); #endif /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */ #else /* MODULE */ extern void migrate_disable(void); extern void migrate_enable(void); #endif /* MODULE */ DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) #endif
38 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* AF_RXRPC internal definitions * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/atomic.h> #include <linux/seqlock.h> #include <linux/win_minmax.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include <keys/rxrpc-type.h> #include "protocol.h" #define FCRYPT_BSIZE 8 struct rxrpc_crypt { union { u8 x[FCRYPT_BSIZE]; __be32 n[2]; }; } __attribute__((aligned(8))); #define rxrpc_queue_work(WS) queue_work(rxrpc_workqueue, (WS)) #define rxrpc_queue_delayed_work(WS,D) \ queue_delayed_work(rxrpc_workqueue, (WS), (D)) struct key_preparsed_payload; struct rxrpc_connection; struct rxrpc_txbuf; struct rxrpc_txqueue; struct rxgk_context; /* * Mark applied to socket buffers in skb->mark. skb->priority is used * to pass supplementary information. */ enum rxrpc_skb_mark { RXRPC_SKB_MARK_PACKET, /* Received packet */ RXRPC_SKB_MARK_ERROR, /* Error notification */ RXRPC_SKB_MARK_CHALLENGE, /* Challenge notification */ RXRPC_SKB_MARK_SERVICE_CONN_SECURED, /* Service connection response has been verified */ RXRPC_SKB_MARK_REJECT_BUSY, /* Reject with BUSY */ RXRPC_SKB_MARK_REJECT_ABORT, /* Reject with ABORT (code in skb->priority) */ RXRPC_SKB_MARK_REJECT_CONN_ABORT, /* Reject with connection ABORT (code in skb->priority) */ }; /* * sk_state for RxRPC sockets */ enum { RXRPC_UNBOUND = 0, RXRPC_CLIENT_UNBOUND, /* Unbound socket used as client */ RXRPC_CLIENT_BOUND, /* client local address bound */ RXRPC_SERVER_BOUND, /* server local address bound */ RXRPC_SERVER_BOUND2, /* second server local address bound */ RXRPC_SERVER_LISTENING, /* server listening for connections */ RXRPC_SERVER_LISTEN_DISABLED, /* server listening disabled */ RXRPC_CLOSE, /* socket is being closed */ }; /* * Per-network namespace data. */ struct rxrpc_net { struct proc_dir_entry *proc_net; /* Subdir in /proc/net */ u32 epoch; /* Local epoch for detecting local-end reset */ struct list_head calls; /* List of calls active in this namespace */ spinlock_t call_lock; /* Lock for ->calls */ atomic_t nr_calls; /* Count of allocated calls */ atomic_t nr_conns; struct list_head bundle_proc_list; /* List of bundles for proc */ struct list_head conn_proc_list; /* List of conns in this namespace for proc */ struct list_head service_conns; /* Service conns in this namespace */ rwlock_t conn_lock; /* Lock for ->conn_proc_list, ->service_conns */ struct work_struct service_conn_reaper; struct timer_list service_conn_reap_timer; bool live; atomic_t nr_client_conns; struct hlist_head local_endpoints; struct mutex local_mutex; /* Lock for ->local_endpoints */ DECLARE_HASHTABLE (peer_hash, 10); spinlock_t peer_hash_lock; /* Lock for ->peer_hash */ #define RXRPC_KEEPALIVE_TIME 20 /* NAT keepalive time in seconds */ u8 peer_keepalive_cursor; time64_t peer_keepalive_base; struct list_head peer_keepalive[32]; struct list_head peer_keepalive_new; struct timer_list peer_keepalive_timer; struct work_struct peer_keepalive_work; atomic_t stat_tx_data; atomic_t stat_tx_data_retrans; atomic_t stat_tx_data_send; atomic_t stat_tx_data_send_frag; atomic_t stat_tx_data_send_fail; atomic_t stat_tx_data_send_msgsize; atomic_t stat_tx_data_underflow; atomic_t stat_tx_data_cwnd_reset; atomic_t stat_rx_data; atomic_t stat_rx_data_reqack; atomic_t stat_rx_data_jumbo; atomic_t stat_tx_ack_fill; atomic_t stat_tx_ack_send; atomic_t stat_tx_ack_skip; atomic_t stat_tx_acks[256]; atomic_t stat_rx_acks[256]; atomic_t stat_tx_jumbo[10]; atomic_t stat_rx_jumbo[10]; atomic_t stat_why_req_ack[8]; atomic_t stat_io_loop; }; /* * Service backlog preallocation. * * This contains circular buffers of preallocated peers, connections and calls * for incoming service calls and their head and tail pointers. This allows * calls to be set up in the data_ready handler, thereby avoiding the need to * shuffle packets around so much. */ struct rxrpc_backlog { unsigned short peer_backlog_head; unsigned short peer_backlog_tail; unsigned short conn_backlog_head; unsigned short conn_backlog_tail; unsigned short call_backlog_head; unsigned short call_backlog_tail; #define RXRPC_BACKLOG_MAX 32 struct rxrpc_peer *peer_backlog[RXRPC_BACKLOG_MAX]; struct rxrpc_connection *conn_backlog[RXRPC_BACKLOG_MAX]; struct rxrpc_call *call_backlog[RXRPC_BACKLOG_MAX]; }; /* * RxRPC socket definition */ struct rxrpc_sock { /* WARNING: sk has to be the first member */ struct sock sk; const struct rxrpc_kernel_ops *app_ops; /* Table of kernel app notification funcs */ struct rxrpc_local *local; /* local endpoint */ struct rxrpc_backlog *backlog; /* Preallocation for services */ struct sk_buff_head recvmsg_oobq; /* OOB messages for recvmsg to pick up */ struct rb_root pending_oobq; /* OOB messages awaiting userspace to respond to */ u64 oob_id_counter; /* OOB message ID counter */ spinlock_t incoming_lock; /* Incoming call vs service shutdown lock */ struct list_head sock_calls; /* List of calls owned by this socket */ struct list_head to_be_accepted; /* calls awaiting acceptance */ struct list_head recvmsg_q; /* Calls awaiting recvmsg's attention */ spinlock_t recvmsg_lock; /* Lock for recvmsg_q */ struct key *key; /* security for this socket */ struct key *securities; /* list of server security descriptors */ struct rb_root calls; /* User ID -> call mapping */ unsigned long flags; #define RXRPC_SOCK_CONNECTED 0 /* connect_srx is set */ #define RXRPC_SOCK_MANAGE_RESPONSE 1 /* User wants to manage RESPONSE packets */ rwlock_t call_lock; /* lock for calls */ u32 min_sec_level; /* minimum security level */ #define RXRPC_SECURITY_MAX RXRPC_SECURITY_ENCRYPT bool exclusive; /* Exclusive connection for a client socket */ u16 second_service; /* Additional service bound to the endpoint */ struct { /* Service upgrade information */ u16 from; /* Service ID to upgrade (if not 0) */ u16 to; /* service ID to upgrade to */ } service_upgrade; sa_family_t family; /* Protocol family created with */ struct sockaddr_rxrpc srx; /* Primary Service/local addresses */ struct sockaddr_rxrpc connect_srx; /* Default client address from connect() */ }; #define rxrpc_sk(__sk) container_of((__sk), struct rxrpc_sock, sk) /* * CPU-byteorder normalised Rx packet header. */ struct rxrpc_host_header { u32 epoch; /* client boot timestamp */ u32 cid; /* connection and channel ID */ u32 callNumber; /* call ID (0 for connection-level packets) */ u32 seq; /* sequence number of pkt in call stream */ u32 serial; /* serial number of pkt sent to network */ u8 type; /* packet type */ u8 flags; /* packet flags */ u8 userStatus; /* app-layer defined status */ u8 securityIndex; /* security protocol ID */ union { u16 _rsvd; /* reserved */ u16 cksum; /* kerberos security checksum */ }; u16 serviceId; /* service ID */ } __packed; /* * RxRPC socket buffer private variables * - max 48 bytes (struct sk_buff::cb) */ struct rxrpc_skb_priv { union { struct rxrpc_connection *poke_conn; /* Conn referred to (poke packet) */ struct { u16 offset; /* Offset of data */ u16 len; /* Length of data */ u8 flags; #define RXRPC_RX_VERIFIED 0x01 }; struct { rxrpc_seq_t first_ack; /* First packet in acks table */ rxrpc_seq_t prev_ack; /* Highest seq seen */ rxrpc_serial_t acked_serial; /* Packet in response to (or 0) */ u16 nr_acks; /* Number of acks+nacks */ u8 reason; /* Reason for ack */ } ack; struct { struct rxrpc_connection *conn; /* Connection referred to */ union { u32 rxkad_nonce; }; } chall; struct { rxrpc_serial_t challenge_serial; u32 kvno; u32 version; u16 len; u16 ticket_len; } resp; }; struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */ }; #define rxrpc_skb(__skb) ((struct rxrpc_skb_priv *) &(__skb)->cb) /* * RxRPC security module interface */ struct rxrpc_security { const char *name; /* name of this service */ u8 security_index; /* security type provided */ u32 no_key_abort; /* Abort code indicating no key */ /* Initialise a security service */ int (*init)(void); /* Clean up a security service */ void (*exit)(void); /* Parse the information from a server key */ int (*preparse_server_key)(struct key_preparsed_payload *); /* Clean up the preparse buffer after parsing a server key */ void (*free_preparse_server_key)(struct key_preparsed_payload *); /* Destroy the payload of a server key */ void (*destroy_server_key)(struct key *); /* Describe a server key */ void (*describe_server_key)(const struct key *, struct seq_file *); /* initialise a connection's security */ int (*init_connection_security)(struct rxrpc_connection *, struct rxrpc_key_token *); /* Work out how much data we can store in a packet, given an estimate * of the amount of data remaining and allocate a data buffer. */ struct rxrpc_txbuf *(*alloc_txbuf)(struct rxrpc_call *call, size_t remaining, gfp_t gfp); /* impose security on a packet */ int (*secure_packet)(struct rxrpc_call *, struct rxrpc_txbuf *); /* verify the security on a received packet */ int (*verify_packet)(struct rxrpc_call *, struct sk_buff *); /* Free crypto request on a call */ void (*free_call_crypto)(struct rxrpc_call *); /* issue a challenge */ int (*issue_challenge)(struct rxrpc_connection *); /* Validate a challenge packet */ bool (*validate_challenge)(struct rxrpc_connection *conn, struct sk_buff *skb); /* Fill out the cmsg for recvmsg() to pass on a challenge to userspace. * The security class gets to add additional information. */ int (*challenge_to_recvmsg)(struct rxrpc_connection *conn, struct sk_buff *challenge, struct msghdr *msg); /* Parse sendmsg() control message and respond to challenge. */ int (*sendmsg_respond_to_challenge)(struct sk_buff *challenge, struct msghdr *msg); /* respond to a challenge */ int (*respond_to_challenge)(struct rxrpc_connection *conn, struct sk_buff *challenge); /* verify a response */ int (*verify_response)(struct rxrpc_connection *, struct sk_buff *); /* clear connection security */ void (*clear)(struct rxrpc_connection *); /* Default ticket -> key decoder */ int (*default_decode_ticket)(struct rxrpc_connection *conn, struct sk_buff *skb, unsigned int ticket_offset, unsigned int ticket_len, struct key **_key); }; /* * RxRPC local transport endpoint description * - owned by a single AF_RXRPC socket * - pointed to by transport socket struct sk_user_data */ struct rxrpc_local { struct rcu_head rcu; atomic_t active_users; /* Number of users of the local endpoint */ refcount_t ref; /* Number of references to the structure */ struct net *net; /* The network namespace */ struct rxrpc_net *rxnet; /* Our bits in the network namespace */ struct hlist_node link; struct socket *socket; /* my UDP socket */ struct task_struct *io_thread; struct completion io_thread_ready; /* Indication that the I/O thread started */ struct page_frag_cache tx_alloc; /* Tx control packet allocation (I/O thread only) */ struct rxrpc_sock *service; /* Service(s) listening on this endpoint */ #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY struct sk_buff_head rx_delay_queue; /* Delay injection queue */ #endif struct sk_buff_head rx_queue; /* Received packets */ struct list_head conn_attend_q; /* Conns requiring immediate attention */ struct list_head call_attend_q; /* Calls requiring immediate attention */ struct rb_root client_bundles; /* Client connection bundles by socket params */ spinlock_t client_bundles_lock; /* Lock for client_bundles */ bool kill_all_client_conns; struct list_head idle_client_conns; struct timer_list client_conn_reap_timer; unsigned long client_conn_flags; #define RXRPC_CLIENT_CONN_REAP_TIMER 0 /* The client conn reap timer expired */ spinlock_t lock; /* access lock */ rwlock_t services_lock; /* lock for services list */ int debug_id; /* debug ID for printks */ bool dead; bool service_closed; /* Service socket closed */ struct idr conn_ids; /* List of connection IDs */ struct list_head new_client_calls; /* Newly created client calls need connection */ spinlock_t client_call_lock; /* Lock for ->new_client_calls */ struct sockaddr_rxrpc srx; /* local address */ union { /* Provide a kvec table sufficiently large to manage either a * DATA packet with a maximum set of jumbo subpackets or a PING * ACK padded out to 64K with zeropages for PMTUD. */ struct kvec kvec[1 + RXRPC_MAX_NR_JUMBO > 3 + 16 ? 1 + RXRPC_MAX_NR_JUMBO : 3 + 16]; struct bio_vec bvec[3 + 16]; }; }; /* * RxRPC remote transport endpoint definition * - matched by local endpoint, remote port, address and protocol type */ struct rxrpc_peer { struct rcu_head rcu; /* This must be first */ refcount_t ref; unsigned long hash_key; struct hlist_node hash_link; struct rxrpc_local *local; struct hlist_head error_targets; /* targets for net error distribution */ struct rb_root service_conns; /* Service connections */ struct list_head keepalive_link; /* Link in net->peer_keepalive[] */ unsigned long app_data; /* Application data (e.g. afs_server) */ unsigned int last_tx_at; /* Last time packet sent here (time64_t LSW) */ seqlock_t service_conn_lock; spinlock_t lock; /* access lock */ int debug_id; /* debug ID for printks */ struct sockaddr_rxrpc srx; /* remote address */ /* Path MTU discovery [RFC8899] */ unsigned int pmtud_trial; /* Current MTU probe size */ unsigned int pmtud_good; /* Largest working MTU probe we've tried */ unsigned int pmtud_bad; /* Smallest non-working MTU probe we've tried */ bool pmtud_lost; /* T if MTU probe was lost */ bool pmtud_probing; /* T if we have an active probe outstanding */ bool pmtud_pending; /* T if a call to this peer should send a probe */ u8 pmtud_jumbo; /* Max jumbo packets for the MTU */ bool ackr_adv_pmtud; /* T if the peer advertises path-MTU */ unsigned int ackr_max_data; /* Maximum data advertised by peer */ unsigned int if_mtu; /* Local interface MTU (- hdrsize) for this peer */ unsigned int max_data; /* Maximum packet data capacity for this peer */ unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */ unsigned short tx_seg_max; /* Maximum number of transmissable segments */ /* Calculated RTT cache */ unsigned int recent_srtt_us; unsigned int recent_rto_us; u8 cong_ssthresh; /* Congestion slow-start threshold */ }; /* * Keys for matching a connection. */ struct rxrpc_conn_proto { union { struct { u32 epoch; /* epoch of this connection */ u32 cid; /* connection ID */ }; u64 index_key; }; }; struct rxrpc_conn_parameters { struct rxrpc_local *local; /* Representation of local endpoint */ struct rxrpc_peer *peer; /* Representation of remote endpoint */ struct key *key; /* Security details */ bool exclusive; /* T if conn is exclusive */ bool upgrade; /* T if service ID can be upgraded */ u16 service_id; /* Service ID for this connection */ u32 security_level; /* Security level selected */ }; /* * Call completion condition (state == RXRPC_CALL_COMPLETE). */ enum rxrpc_call_completion { RXRPC_CALL_SUCCEEDED, /* - Normal termination */ RXRPC_CALL_REMOTELY_ABORTED, /* - call aborted by peer */ RXRPC_CALL_LOCALLY_ABORTED, /* - call aborted locally on error or close */ RXRPC_CALL_LOCAL_ERROR, /* - call failed due to local error */ RXRPC_CALL_NETWORK_ERROR, /* - call terminated by network error */ NR__RXRPC_CALL_COMPLETIONS }; /* * Bits in the connection flags. */ enum rxrpc_conn_flag { RXRPC_CONN_IN_SERVICE_CONNS, /* Conn is in peer->service_conns */ RXRPC_CONN_DONT_REUSE, /* Don't reuse this connection */ RXRPC_CONN_PROBING_FOR_UPGRADE, /* Probing for service upgrade */ RXRPC_CONN_FINAL_ACK_0, /* Need final ACK for channel 0 */ RXRPC_CONN_FINAL_ACK_1, /* Need final ACK for channel 1 */ RXRPC_CONN_FINAL_ACK_2, /* Need final ACK for channel 2 */ RXRPC_CONN_FINAL_ACK_3, /* Need final ACK for channel 3 */ }; #define RXRPC_CONN_FINAL_ACK_MASK ((1UL << RXRPC_CONN_FINAL_ACK_0) | \ (1UL << RXRPC_CONN_FINAL_ACK_1) | \ (1UL << RXRPC_CONN_FINAL_ACK_2) | \ (1UL << RXRPC_CONN_FINAL_ACK_3)) /* * Events that can be raised upon a connection. */ enum rxrpc_conn_event { RXRPC_CONN_EV_CHALLENGE, /* Send challenge packet */ RXRPC_CONN_EV_ABORT_CALLS, /* Abort attached calls */ }; /* * The connection protocol state. */ enum rxrpc_conn_proto_state { RXRPC_CONN_UNUSED, /* Connection not yet attempted */ RXRPC_CONN_CLIENT_UNSECURED, /* Client connection needs security init */ RXRPC_CONN_CLIENT, /* Client connection */ RXRPC_CONN_SERVICE_PREALLOC, /* Service connection preallocation */ RXRPC_CONN_SERVICE_UNSECURED, /* Service unsecured connection */ RXRPC_CONN_SERVICE_CHALLENGING, /* Service challenging for security */ RXRPC_CONN_SERVICE, /* Service secured connection */ RXRPC_CONN_ABORTED, /* Conn aborted */ RXRPC_CONN__NR_STATES }; /* * RxRPC client connection bundle. */ struct rxrpc_bundle { struct rxrpc_local *local; /* Representation of local endpoint */ struct rxrpc_peer *peer; /* Remote endpoint */ struct key *key; /* Security details */ struct list_head proc_link; /* Link in net->bundle_proc_list */ const struct rxrpc_security *security; /* applied security module */ refcount_t ref; atomic_t active; /* Number of active users */ unsigned int debug_id; u32 security_level; /* Security level selected */ u16 service_id; /* Service ID for this connection */ bool try_upgrade; /* True if the bundle is attempting upgrade */ bool exclusive; /* T if conn is exclusive */ bool upgrade; /* T if service ID can be upgraded */ unsigned short alloc_error; /* Error from last conn allocation */ struct rb_node local_node; /* Node in local->client_conns */ struct list_head waiting_calls; /* Calls waiting for channels */ unsigned long avail_chans; /* Mask of available channels */ unsigned int conn_ids[4]; /* Connection IDs. */ struct rxrpc_connection *conns[4]; /* The connections in the bundle (max 4) */ }; /* * RxRPC connection definition * - matched by { local, peer, epoch, conn_id, direction } * - each connection can only handle four simultaneous calls */ struct rxrpc_connection { struct rxrpc_conn_proto proto; struct rxrpc_local *local; /* Representation of local endpoint */ struct rxrpc_peer *peer; /* Remote endpoint */ struct rxrpc_net *rxnet; /* Network namespace to which call belongs */ struct key *key; /* Security details */ struct list_head attend_link; /* Link in local->conn_attend_q */ refcount_t ref; atomic_t active; /* Active count for service conns */ struct rcu_head rcu; struct list_head cache_link; unsigned char act_chans; /* Mask of active channels */ struct rxrpc_channel { unsigned long final_ack_at; /* Time at which to issue final ACK */ struct rxrpc_call *call; /* Active call */ unsigned int call_debug_id; /* call->debug_id */ u32 call_id; /* ID of current call */ u32 call_counter; /* Call ID counter */ u32 last_call; /* ID of last call */ u8 last_type; /* Type of last packet */ union { u32 last_seq; u32 last_abort; }; } channels[RXRPC_MAXCALLS]; struct timer_list timer; /* Conn event timer */ struct work_struct processor; /* connection event processor */ struct work_struct destructor; /* In-process-context destroyer */ struct rxrpc_bundle *bundle; /* Client connection bundle */ struct rb_node service_node; /* Node in peer->service_conns */ struct list_head proc_link; /* link in procfs list */ struct list_head link; /* link in master connection list */ struct sk_buff_head rx_queue; /* received conn-level packets */ struct page_frag_cache tx_data_alloc; /* Tx DATA packet allocation */ struct mutex tx_data_alloc_lock; struct mutex security_lock; /* Lock for security management */ const struct rxrpc_security *security; /* applied security module */ union { struct { struct crypto_sync_skcipher *cipher; /* encryption handle */ struct rxrpc_crypt csum_iv; /* packet checksum base */ u32 nonce; /* response re-use preventer */ } rxkad; struct { struct rxgk_context *keys[4]; /* (Re-)keying buffer */ u64 start_time; /* The start time for TK derivation */ u8 nonce[20]; /* Response re-use preventer */ u32 enctype; /* Kerberos 5 encoding type */ u32 key_number; /* Current key number */ } rxgk; }; rwlock_t security_use_lock; /* Security use/modification lock */ struct sk_buff *tx_response; /* Response packet to be transmitted */ unsigned long flags; unsigned long events; unsigned long idle_timestamp; /* Time at which last became idle */ spinlock_t state_lock; /* state-change lock */ enum rxrpc_conn_proto_state state; /* current state of connection */ enum rxrpc_call_completion completion; /* Completion condition */ s32 abort_code; /* Abort code of connection abort */ int debug_id; /* debug ID for printks */ rxrpc_serial_t tx_serial; /* Outgoing packet serial number counter */ unsigned int hi_serial; /* highest serial number received */ rxrpc_serial_t pmtud_probe; /* Serial of MTU probe (or 0) */ unsigned int pmtud_call; /* ID of call used for probe */ u32 service_id; /* Service ID, possibly upgraded */ u32 security_level; /* Security level selected */ u8 security_ix; /* security type */ u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */ u8 bundle_shift; /* Index into bundle->avail_chans */ bool exclusive; /* T if conn is exclusive */ bool upgrade; /* T if service ID can be upgraded */ u16 orig_service_id; /* Originally requested service ID */ short error; /* Local error code */ }; static inline bool rxrpc_to_server(const struct rxrpc_skb_priv *sp) { return sp->hdr.flags & RXRPC_CLIENT_INITIATED; } static inline bool rxrpc_to_client(const struct rxrpc_skb_priv *sp) { return !rxrpc_to_server(sp); } /* * Flags in call->flags. */ enum rxrpc_call_flag { RXRPC_CALL_RELEASED, /* call has been released - no more message to userspace */ RXRPC_CALL_HAS_USERID, /* has a user ID attached */ RXRPC_CALL_IS_SERVICE, /* Call is service call */ RXRPC_CALL_EXPOSED, /* The call was exposed to the world */ RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */ RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */ RXRPC_CALL_TX_ALL_ACKED, /* Last packet has been hard-acked */ RXRPC_CALL_TX_NO_MORE, /* No more data to transmit (MSG_MORE deasserted) */ RXRPC_CALL_SEND_PING, /* A ping will need to be sent */ RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */ RXRPC_CALL_BEGAN_RX_TIMER, /* We began the expect_rx_by timer */ RXRPC_CALL_RX_HEARD, /* The peer responded at least once to this call */ RXRPC_CALL_DISCONNECTED, /* The call has been disconnected */ RXRPC_CALL_KERNEL, /* The call was made by the kernel */ RXRPC_CALL_UPGRADE, /* Service upgrade was requested for the call */ RXRPC_CALL_EXCLUSIVE, /* The call uses a once-only connection */ RXRPC_CALL_RX_IS_IDLE, /* recvmsg() is idle - send an ACK */ RXRPC_CALL_RECVMSG_READ_ALL, /* recvmsg() read all of the received data */ RXRPC_CALL_CONN_CHALLENGING, /* The connection is being challenged */ }; /* * Events that can be raised on a call. */ enum rxrpc_call_event { RXRPC_CALL_EV_ACK_LOST, /* ACK may be lost, send ping */ RXRPC_CALL_EV_INITIAL_PING, /* Send initial ping for a new service call */ }; /* * The states that a call can be in. */ enum rxrpc_call_state { RXRPC_CALL_UNINITIALISED, RXRPC_CALL_CLIENT_AWAIT_CONN, /* - client waiting for connection to become available */ RXRPC_CALL_CLIENT_SEND_REQUEST, /* - client sending request phase */ RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */ RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */ RXRPC_CALL_SERVER_PREALLOC, /* - service preallocation */ RXRPC_CALL_SERVER_RECV_REQUEST, /* - server receiving request */ RXRPC_CALL_SERVER_ACK_REQUEST, /* - server pending ACK of request */ RXRPC_CALL_SERVER_SEND_REPLY, /* - server sending reply */ RXRPC_CALL_SERVER_AWAIT_ACK, /* - server awaiting final ACK */ RXRPC_CALL_COMPLETE, /* - call complete */ NR__RXRPC_CALL_STATES }; /* * Call Tx congestion management modes. */ enum rxrpc_ca_state { RXRPC_CA_SLOW_START, RXRPC_CA_CONGEST_AVOIDANCE, RXRPC_CA_PACKET_LOSS, RXRPC_CA_FAST_RETRANSMIT, NR__RXRPC_CA_STATES } __mode(byte); /* * Current purpose of call RACK timer. According to the RACK-TLP protocol * [RFC8985], the transmission timer (call->rack_timo_at) may only be used for * one of these at once. */ enum rxrpc_rack_timer_mode { RXRPC_CALL_RACKTIMER_OFF, /* Timer not running */ RXRPC_CALL_RACKTIMER_RACK_REORDER, /* RACK reordering timer */ RXRPC_CALL_RACKTIMER_TLP_PTO, /* TLP timeout */ RXRPC_CALL_RACKTIMER_RTO, /* Retransmission timeout */ } __mode(byte); /* * RxRPC call definition * - matched by { connection, call_id } */ struct rxrpc_call { struct rcu_head rcu; struct rxrpc_connection *conn; /* connection carrying call */ struct rxrpc_bundle *bundle; /* Connection bundle to use */ struct rxrpc_peer *peer; /* Peer record for remote address */ struct rxrpc_local *local; /* Representation of local endpoint */ struct rxrpc_sock __rcu *socket; /* socket responsible */ struct rxrpc_net *rxnet; /* Network namespace to which call belongs */ struct key *key; /* Security details */ const struct rxrpc_security *security; /* applied security module */ struct mutex user_mutex; /* User access mutex */ struct sockaddr_rxrpc dest_srx; /* Destination address */ ktime_t delay_ack_at; /* When DELAY ACK needs to happen */ ktime_t rack_timo_at; /* When ACK is figured as lost */ ktime_t ping_at; /* When next to send a ping */ ktime_t keepalive_at; /* When next to send a keepalive ping */ ktime_t expect_rx_by; /* When we expect to get a packet by */ ktime_t expect_req_by; /* When we expect to get a request DATA packet by */ ktime_t expect_term_by; /* When we expect call termination by */ u32 next_rx_timo; /* Timeout for next Rx packet (ms) */ u32 next_req_timo; /* Timeout for next Rx request packet (ms) */ u32 hard_timo; /* Maximum lifetime or 0 (s) */ struct timer_list timer; /* Combined event timer */ struct work_struct destroyer; /* In-process-context destroyer */ rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */ struct list_head link; /* link in master call list */ struct list_head wait_link; /* Link in local->new_client_calls */ struct hlist_node error_link; /* link in error distribution list */ struct list_head accept_link; /* Link in rx->acceptq */ struct list_head recvmsg_link; /* Link in rx->recvmsg_q */ struct list_head sock_link; /* Link in rx->sock_calls */ struct rb_node sock_node; /* Node in rx->calls */ struct list_head attend_link; /* Link in local->call_attend_q */ struct rxrpc_txbuf *tx_pending; /* Tx buffer being filled */ wait_queue_head_t waitq; /* Wait queue for channel or Tx */ s64 tx_total_len; /* Total length left to be transmitted (or -1) */ unsigned long user_call_ID; /* user-defined call ID */ unsigned long flags; unsigned long events; spinlock_t notify_lock; /* Kernel notification lock */ unsigned int send_abort_why; /* Why the abort [enum rxrpc_abort_reason] */ s32 send_abort; /* Abort code to be sent */ short send_abort_err; /* Error to be associated with the abort */ rxrpc_seq_t send_abort_seq; /* DATA packet that incurred the abort (or 0) */ s32 abort_code; /* Local/remote abort code */ int error; /* Local error incurred */ enum rxrpc_call_state _state; /* Current state of call (needs barrier) */ enum rxrpc_call_completion completion; /* Call completion condition */ refcount_t ref; u8 security_ix; /* Security type */ enum rxrpc_interruptibility interruptibility; /* At what point call may be interrupted */ u32 call_id; /* call ID on connection */ u32 cid; /* connection ID plus channel index */ u32 security_level; /* Security level selected */ u32 security_enctype; /* Security-specific encoding type (or 0) */ int debug_id; /* debug ID for printks */ unsigned short rx_pkt_offset; /* Current recvmsg packet offset */ unsigned short rx_pkt_len; /* Current recvmsg packet len */ /* Sendmsg data tracking. */ rxrpc_seq_t send_top; /* Highest Tx slot filled by sendmsg. */ struct rxrpc_txqueue *send_queue; /* Queue that sendmsg is writing into */ /* Transmitted data tracking. */ struct rxrpc_txqueue *tx_queue; /* Start of transmission buffers */ struct rxrpc_txqueue *tx_qtail; /* End of transmission buffers */ rxrpc_seq_t tx_qbase; /* First slot in tx_queue */ rxrpc_seq_t tx_bottom; /* First packet in buffer */ rxrpc_seq_t tx_transmitted; /* Highest packet transmitted */ rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */ rxrpc_serial_t tx_last_serial; /* Serial of last DATA transmitted */ u16 tx_backoff; /* Delay to insert due to Tx failure (ms) */ u16 tx_nr_sent; /* Number of packets sent, but unacked */ u16 tx_nr_lost; /* Number of packets marked lost */ u16 tx_nr_resent; /* Number of packets resent, but unacked */ u16 tx_winsize; /* Maximum size of Tx window */ #define RXRPC_TX_MAX_WINDOW 128 u8 tx_jumbo_max; /* Maximum subpkts peer will accept */ ktime_t tx_last_sent; /* Last time a transmission occurred */ /* Received data tracking */ struct sk_buff_head recvmsg_queue; /* Queue of packets ready for recvmsg() */ struct sk_buff_head rx_queue; /* Queue of packets for this call to receive */ struct sk_buff_head rx_oos_queue; /* Queue of out of sequence packets */ rxrpc_seq_t rx_highest_seq; /* Higest sequence number received */ rxrpc_seq_t rx_consumed; /* Highest packet consumed */ rxrpc_serial_t rx_serial; /* Highest serial received for this call */ u8 rx_winsize; /* Size of Rx window */ /* TCP-style slow-start congestion control [RFC5681]. Since the SMSS * is fixed, we keep these numbers in terms of segments (ie. DATA * packets) rather than bytes. */ #define RXRPC_TX_SMSS RXRPC_JUMBO_DATALEN #define RXRPC_MIN_CWND 4 enum rxrpc_ca_state cong_ca_state; /* Congestion control state */ u8 cong_extra; /* Extra to send for congestion management */ u16 cong_cwnd; /* Congestion window size */ u16 cong_ssthresh; /* Slow-start threshold */ u16 cong_dup_acks; /* Count of ACKs showing missing packets */ u16 cong_cumul_acks; /* Cumulative ACK count */ ktime_t cong_tstamp; /* Last time cwnd was changed */ /* RACK-TLP [RFC8985] state. */ ktime_t rack_xmit_ts; /* Latest transmission timestamp */ ktime_t rack_rtt; /* RTT of most recently ACK'd segment */ ktime_t rack_rtt_ts; /* Timestamp of rack_rtt */ ktime_t rack_reo_wnd; /* Reordering window */ unsigned int rack_reo_wnd_mult; /* Multiplier applied to rack_reo_wnd */ int rack_reo_wnd_persist; /* Num loss recoveries before reset reo_wnd */ rxrpc_seq_t rack_fack; /* Highest sequence so far ACK'd */ rxrpc_seq_t rack_end_seq; /* Highest sequence seen */ rxrpc_seq_t rack_dsack_round; /* DSACK opt recv'd in latest roundtrip */ bool rack_dsack_round_none; /* T if dsack_round is "None" */ bool rack_reordering_seen; /* T if detected reordering event */ enum rxrpc_rack_timer_mode rack_timer_mode; /* Current mode of RACK timer */ bool tlp_is_retrans; /* T if unacked TLP retransmission */ rxrpc_serial_t tlp_serial; /* Serial of TLP probe (or 0 if none in progress) */ rxrpc_seq_t tlp_seq; /* Sequence of TLP probe */ unsigned int tlp_rtt_taken; /* Last time RTT taken */ ktime_t tlp_max_ack_delay; /* Sender budget for max delayed ACK interval */ /* Receive-phase ACK management (ACKs we send). */ u8 ackr_reason; /* reason to ACK */ u16 ackr_sack_base; /* Starting slot in SACK table ring */ rxrpc_seq_t ackr_window; /* Base of SACK window */ rxrpc_seq_t ackr_wtop; /* Base of SACK window */ unsigned int ackr_nr_unacked; /* Number of unacked packets */ atomic_t ackr_nr_consumed; /* Number of packets needing hard ACK */ struct { #define RXRPC_SACK_SIZE 256 /* SACK table for soft-acked packets */ u8 ackr_sack_table[RXRPC_SACK_SIZE]; } __aligned(8); /* RTT management */ rxrpc_serial_t rtt_serial[4]; /* Serial number of DATA or PING sent */ ktime_t rtt_sent_at[4]; /* Time packet sent */ unsigned long rtt_avail; /* Mask of available slots in bits 0-3, * Mask of pending samples in 8-11 */ #define RXRPC_CALL_RTT_AVAIL_MASK 0xf #define RXRPC_CALL_RTT_PEND_SHIFT 8 /* Transmission-phase ACK management (ACKs we've received). */ ktime_t acks_latest_ts; /* Timestamp of latest ACK received */ rxrpc_seq_t acks_hard_ack; /* Highest sequence hard acked */ rxrpc_seq_t acks_prev_seq; /* Highest previousPacket received */ rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */ rxrpc_serial_t acks_highest_serial; /* Highest serial number ACK'd */ unsigned short acks_nr_sacks; /* Number of soft acks recorded */ unsigned short acks_nr_snacks; /* Number of soft nacks recorded */ /* Calculated RTT cache */ ktime_t rtt_last_req; /* Time of last RTT request */ unsigned int rtt_count; /* Number of samples we've got */ unsigned int rtt_taken; /* Number of samples taken (wrapping) */ struct minmax min_rtt; /* Estimated minimum RTT */ u32 srtt_us; /* smoothed round trip time << 3 in usecs */ u32 mdev_us; /* medium deviation */ u32 mdev_max_us; /* maximal mdev for the last rtt period */ u32 rttvar_us; /* smoothed mdev_max */ u32 rto_us; /* Retransmission timeout in usec */ u8 backoff; /* Backoff timeout (as shift) */ }; /* * Summary of a new ACK and the changes it made to the Tx buffer packet states. */ struct rxrpc_ack_summary { rxrpc_serial_t ack_serial; /* Serial number of ACK */ rxrpc_serial_t acked_serial; /* Serial number ACK'd */ u16 in_flight; /* Number of unreceived transmissions */ u16 nr_new_hacks; /* Number of rotated new ACKs */ u16 nr_new_sacks; /* Number of new soft ACKs in packet */ u16 nr_new_snacks; /* Number of new soft nacks in packet */ u8 ack_reason; bool new_low_snack:1; /* T if new low soft NACK found */ bool retrans_timeo:1; /* T if reTx due to timeout happened */ bool need_retransmit:1; /* T if we need transmission */ bool rtt_sample_avail:1; /* T if RTT sample available */ bool in_fast_or_rto_recovery:1; bool exiting_fast_or_rto_recovery:1; bool tlp_probe_acked:1; /* T if the TLP probe seq was acked */ u8 /*enum rxrpc_congest_change*/ change; }; /* * sendmsg() cmsg-specified parameters. */ enum rxrpc_command { RXRPC_CMD_SEND_DATA, /* send data message */ RXRPC_CMD_SEND_ABORT, /* request abort generation */ RXRPC_CMD_REJECT_BUSY, /* [server] reject a call as busy */ RXRPC_CMD_CHARGE_ACCEPT, /* [server] charge accept preallocation */ }; struct rxrpc_call_params { s64 tx_total_len; /* Total Tx data length (if send data) */ unsigned long user_call_ID; /* User's call ID */ struct { u32 hard; /* Maximum lifetime (sec) */ u32 idle; /* Max time since last data packet (msec) */ u32 normal; /* Max time since last call packet (msec) */ } timeouts; u8 nr_timeouts; /* Number of timeouts specified */ bool kernel; /* T if kernel is making the call */ enum rxrpc_interruptibility interruptibility; /* How is interruptible is the call? */ }; struct rxrpc_send_params { struct rxrpc_call_params call; u32 abort_code; /* Abort code to Tx (if abort) */ enum rxrpc_command command : 8; /* The command to implement */ bool exclusive; /* Shared or exclusive call */ bool upgrade; /* If the connection is upgradeable */ }; /* * Buffer of data to be output as a packet. */ struct rxrpc_txbuf { refcount_t ref; rxrpc_seq_t seq; /* Sequence number of this packet */ rxrpc_serial_t serial; /* Last serial number transmitted with */ unsigned int call_debug_id; unsigned int debug_id; unsigned short len; /* Amount of data in buffer */ unsigned short space; /* Remaining data space */ unsigned short offset; /* Offset of fill point */ unsigned short crypto_header; /* Size of crypto header */ unsigned short sec_header; /* Size of security header */ unsigned short pkt_len; /* Size of packet content */ unsigned short alloc_size; /* Amount of bufferage allocated */ unsigned int flags; #define RXRPC_TXBUF_WIRE_FLAGS 0xff /* The wire protocol flags */ #define RXRPC_TXBUF_RESENT 0x100 /* Set if has been resent */ __be16 cksum; /* Checksum to go in header */ bool jumboable; /* Can be non-terminal jumbo subpacket */ void *data; /* Data with preceding jumbo header */ }; static inline bool rxrpc_sending_to_server(const struct rxrpc_txbuf *txb) { return txb->flags & RXRPC_CLIENT_INITIATED; } static inline bool rxrpc_sending_to_client(const struct rxrpc_txbuf *txb) { return !rxrpc_sending_to_server(txb); } /* * Transmit queue element, including RACK [RFC8985] per-segment metadata. The * transmission timestamp is in usec from the base. */ struct rxrpc_txqueue { /* Start with the members we want to prefetch. */ struct rxrpc_txqueue *next; ktime_t xmit_ts_base; rxrpc_seq_t qbase; u8 nr_reported_acks; /* Number of segments explicitly acked/nacked */ unsigned long segment_acked; /* Bit-per-buf: Set if ACK'd */ unsigned long segment_lost; /* Bit-per-buf: Set if declared lost */ unsigned long segment_retransmitted; /* Bit-per-buf: Set if retransmitted */ unsigned long rtt_samples; /* Bit-per-buf: Set if available for RTT */ unsigned long ever_retransmitted; /* Bit-per-buf: Set if ever retransmitted */ /* The arrays we want to pack into as few cache lines as possible. */ struct { #define RXRPC_NR_TXQUEUE BITS_PER_LONG #define RXRPC_TXQ_MASK (RXRPC_NR_TXQUEUE - 1) struct rxrpc_txbuf *bufs[RXRPC_NR_TXQUEUE]; unsigned int segment_serial[RXRPC_NR_TXQUEUE]; unsigned int segment_xmit_ts[RXRPC_NR_TXQUEUE]; } ____cacheline_aligned; }; /* * Data transmission request. */ struct rxrpc_send_data_req { ktime_t now; /* Current time */ struct rxrpc_txqueue *tq; /* Tx queue segment holding first DATA */ rxrpc_seq_t seq; /* Sequence of first data */ int n; /* Number of DATA packets to glue into jumbo */ bool retrans; /* T if this is a retransmission */ bool did_send; /* T if did actually send */ bool tlp_probe; /* T if this is a TLP probe */ int /* enum rxrpc_txdata_trace */ trace; }; #include <trace/events/rxrpc.h> /* * Allocate the next serial number on a connection. 0 must be skipped. */ static inline rxrpc_serial_t rxrpc_get_next_serial(struct rxrpc_connection *conn) { rxrpc_serial_t serial; serial = conn->tx_serial; if (serial == 0) serial = 1; conn->tx_serial = serial + 1; return serial; } /* * Allocate the next serial n numbers on a connection. 0 must be skipped. */ static inline rxrpc_serial_t rxrpc_get_next_serials(struct rxrpc_connection *conn, unsigned int n) { rxrpc_serial_t serial; serial = conn->tx_serial; if (serial + n <= n) serial = 1; conn->tx_serial = serial + n; return serial; } /* * af_rxrpc.c */ extern atomic_t rxrpc_n_rx_skbs; extern struct workqueue_struct *rxrpc_workqueue; /* * call_accept.c */ int rxrpc_service_prealloc(struct rxrpc_sock *, gfp_t); void rxrpc_discard_prealloc(struct rxrpc_sock *); bool rxrpc_new_incoming_call(struct rxrpc_local *local, struct rxrpc_peer *peer, struct rxrpc_connection *conn, struct sockaddr_rxrpc *peer_srx, struct sk_buff *skb); int rxrpc_user_charge_accept(struct rxrpc_sock *, unsigned long); /* * call_event.c */ void rxrpc_propose_ping(struct rxrpc_call *call, u32 serial, enum rxrpc_propose_ack_trace why); void rxrpc_propose_delay_ACK(struct rxrpc_call *, rxrpc_serial_t, enum rxrpc_propose_ack_trace); void rxrpc_resend_tlp(struct rxrpc_call *call); void rxrpc_transmit_some_data(struct rxrpc_call *call, unsigned int limit, enum rxrpc_txdata_trace trace); bool rxrpc_input_call_event(struct rxrpc_call *call); /* * call_object.c */ extern const char *const rxrpc_call_states[]; extern const char *const rxrpc_call_completions[]; extern struct kmem_cache *rxrpc_call_jar; void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what); struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long); struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *, gfp_t, unsigned int); struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct rxrpc_conn_parameters *, struct rxrpc_call_params *, gfp_t, unsigned int) __releases(&rx->sk.sk_lock) __acquires(&call->user_mutex); void rxrpc_start_call_timer(struct rxrpc_call *call); void rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_call *, struct sk_buff *); void rxrpc_release_call(struct rxrpc_sock *, struct rxrpc_call *); void rxrpc_release_calls_on_socket(struct rxrpc_sock *); void rxrpc_see_call(struct rxrpc_call *, enum rxrpc_call_trace); struct rxrpc_call *rxrpc_try_get_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_get_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_put_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_cleanup_call(struct rxrpc_call *); void rxrpc_destroy_all_calls(struct rxrpc_net *); static inline bool rxrpc_is_service_call(const struct rxrpc_call *call) { return test_bit(RXRPC_CALL_IS_SERVICE, &call->flags); } static inline bool rxrpc_is_client_call(const struct rxrpc_call *call) { return !rxrpc_is_service_call(call); } /* * call_state.c */ bool rxrpc_set_call_completion(struct rxrpc_call *call, enum rxrpc_call_completion compl, u32 abort_code, int error); bool rxrpc_call_completed(struct rxrpc_call *call); bool rxrpc_abort_call(struct rxrpc_call *call, rxrpc_seq_t seq, u32 abort_code, int error, enum rxrpc_abort_reason why); void rxrpc_prefail_call(struct rxrpc_call *call, enum rxrpc_call_completion compl, int error); static inline void rxrpc_set_call_state(struct rxrpc_call *call, enum rxrpc_call_state state) { /* Order write of completion info before write of ->state. */ smp_store_release(&call->_state, state); wake_up(&call->waitq); } static inline enum rxrpc_call_state __rxrpc_call_state(const struct rxrpc_call *call) { return call->_state; /* Only inside I/O thread */ } static inline bool __rxrpc_call_is_complete(const struct rxrpc_call *call) { return __rxrpc_call_state(call) == RXRPC_CALL_COMPLETE; } static inline enum rxrpc_call_state rxrpc_call_state(const struct rxrpc_call *call) { /* Order read ->state before read of completion info. */ return smp_load_acquire(&call->_state); } static inline bool rxrpc_call_is_complete(const struct rxrpc_call *call) { return rxrpc_call_state(call) == RXRPC_CALL_COMPLETE; } static inline bool rxrpc_call_has_failed(const struct rxrpc_call *call) { return rxrpc_call_is_complete(call) && call->completion != RXRPC_CALL_SUCCEEDED; } /* * conn_client.c */ extern unsigned int rxrpc_reap_client_connections; extern unsigned long rxrpc_conn_idle_client_expiry; extern unsigned long rxrpc_conn_idle_client_fast_expiry; void rxrpc_purge_client_connections(struct rxrpc_local *local); struct rxrpc_bundle *rxrpc_get_bundle(struct rxrpc_bundle *, enum rxrpc_bundle_trace); void rxrpc_put_bundle(struct rxrpc_bundle *, enum rxrpc_bundle_trace); int rxrpc_look_up_bundle(struct rxrpc_call *call, gfp_t gfp); void rxrpc_connect_client_calls(struct rxrpc_local *local); void rxrpc_expose_client_call(struct rxrpc_call *); void rxrpc_disconnect_client_call(struct rxrpc_bundle *, struct rxrpc_call *); void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle); void rxrpc_discard_expired_client_conns(struct rxrpc_local *local); void rxrpc_clean_up_local_conns(struct rxrpc_local *); /* * conn_event.c */ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, struct sk_buff *skb, unsigned int channel); int rxrpc_abort_conn(struct rxrpc_connection *conn, struct sk_buff *skb, s32 abort_code, int err, enum rxrpc_abort_reason why); void rxrpc_process_connection(struct work_struct *); void rxrpc_process_delayed_final_acks(struct rxrpc_connection *, bool); bool rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb); void rxrpc_input_conn_event(struct rxrpc_connection *conn, struct sk_buff *skb); static inline bool rxrpc_is_conn_aborted(const struct rxrpc_connection *conn) { /* Order reading the abort info after the state check. */ return smp_load_acquire(&conn->state) == RXRPC_CONN_ABORTED; } /* * conn_object.c */ extern unsigned int rxrpc_connection_expiry; extern unsigned int rxrpc_closed_conn_expiry; void rxrpc_poke_conn(struct rxrpc_connection *conn, enum rxrpc_conn_trace why); struct rxrpc_connection *rxrpc_alloc_connection(struct rxrpc_net *, gfp_t); struct rxrpc_connection *rxrpc_find_client_connection_rcu(struct rxrpc_local *, struct sockaddr_rxrpc *, struct sk_buff *); void __rxrpc_disconnect_call(struct rxrpc_connection *, struct rxrpc_call *); void rxrpc_disconnect_call(struct rxrpc_call *); void rxrpc_kill_client_conn(struct rxrpc_connection *); void rxrpc_queue_conn(struct rxrpc_connection *, enum rxrpc_conn_trace); void rxrpc_see_connection(struct rxrpc_connection *, enum rxrpc_conn_trace); struct rxrpc_connection *rxrpc_get_connection(struct rxrpc_connection *, enum rxrpc_conn_trace); struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *, enum rxrpc_conn_trace); void rxrpc_put_connection(struct rxrpc_connection *, enum rxrpc_conn_trace); void rxrpc_service_connection_reaper(struct work_struct *); void rxrpc_destroy_all_connections(struct rxrpc_net *); static inline bool rxrpc_conn_is_client(const struct rxrpc_connection *conn) { return conn->out_clientflag; } static inline bool rxrpc_conn_is_service(const struct rxrpc_connection *conn) { return !rxrpc_conn_is_client(conn); } static inline void rxrpc_reduce_conn_timer(struct rxrpc_connection *conn, unsigned long expire_at) { timer_reduce(&conn->timer, expire_at); } /* * conn_service.c */ struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *, struct sk_buff *); struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *, gfp_t); void rxrpc_new_incoming_connection(struct rxrpc_sock *, struct rxrpc_connection *, const struct rxrpc_security *, struct sk_buff *); void rxrpc_unpublish_service_conn(struct rxrpc_connection *); /* * input.c */ void rxrpc_congestion_degrade(struct rxrpc_call *); void rxrpc_input_call_packet(struct rxrpc_call *, struct sk_buff *); void rxrpc_implicit_end_call(struct rxrpc_call *, struct sk_buff *); /* * input_rack.c */ void rxrpc_input_rack_one(struct rxrpc_call *call, struct rxrpc_ack_summary *summary, struct rxrpc_txqueue *tq, unsigned int ix); void rxrpc_input_rack(struct rxrpc_call *call, struct rxrpc_ack_summary *summary, struct rxrpc_txqueue *tq, unsigned long new_acks); void rxrpc_rack_detect_loss_and_arm_timer(struct rxrpc_call *call, struct rxrpc_ack_summary *summary); ktime_t rxrpc_tlp_calc_pto(struct rxrpc_call *call, ktime_t now); void rxrpc_tlp_send_probe(struct rxrpc_call *call); void rxrpc_tlp_process_ack(struct rxrpc_call *call, struct rxrpc_ack_summary *summary); void rxrpc_rack_timer_expired(struct rxrpc_call *call, ktime_t overran_by); /* Initialise TLP state [RFC8958 7.1]. */ static inline void rxrpc_tlp_init(struct rxrpc_call *call) { call->tlp_serial = 0; call->tlp_seq = call->acks_hard_ack; call->tlp_is_retrans = false; } /* * io_thread.c */ int rxrpc_encap_rcv(struct sock *, struct sk_buff *); void rxrpc_error_report(struct sock *); bool rxrpc_direct_abort(struct sk_buff *skb, enum rxrpc_abort_reason why, s32 abort_code, int err); bool rxrpc_direct_conn_abort(struct sk_buff *skb, enum rxrpc_abort_reason why, s32 abort_code, int err); int rxrpc_io_thread(void *data); void rxrpc_post_response(struct rxrpc_connection *conn, struct sk_buff *skb); static inline void rxrpc_wake_up_io_thread(struct rxrpc_local *local) { if (!local->io_thread) return; wake_up_process(READ_ONCE(local->io_thread)); } static inline bool rxrpc_protocol_error(struct sk_buff *skb, enum rxrpc_abort_reason why) { return rxrpc_direct_abort(skb, why, RX_PROTOCOL_ERROR, -EPROTO); } /* * insecure.c */ extern const struct rxrpc_security rxrpc_no_security; /* * key.c */ extern struct key_type key_type_rxrpc; int rxrpc_request_key(struct rxrpc_sock *, sockptr_t , int); int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time64_t, u32); /* * local_event.c */ void rxrpc_gen_version_string(void); void rxrpc_send_version_request(struct rxrpc_local *local, struct rxrpc_host_header *hdr, struct sk_buff *skb); /* * local_object.c */ void rxrpc_local_dont_fragment(const struct rxrpc_local *local, bool set); struct rxrpc_local *rxrpc_lookup_local(struct net *, const struct sockaddr_rxrpc *); struct rxrpc_local *rxrpc_get_local(struct rxrpc_local *, enum rxrpc_local_trace); struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *, enum rxrpc_local_trace); void rxrpc_put_local(struct rxrpc_local *, enum rxrpc_local_trace); struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *, enum rxrpc_local_trace); void rxrpc_unuse_local(struct rxrpc_local *, enum rxrpc_local_trace); void rxrpc_destroy_local(struct rxrpc_local *local); void rxrpc_destroy_all_locals(struct rxrpc_net *); static inline bool __rxrpc_use_local(struct rxrpc_local *local, enum rxrpc_local_trace why) { int r, u; r = refcount_read(&local->ref); u = atomic_fetch_add_unless(&local->active_users, 1, 0); trace_rxrpc_local(local->debug_id, why, r, u); return u != 0; } static inline void rxrpc_see_local(struct rxrpc_local *local, enum rxrpc_local_trace why) { int r, u; r = refcount_read(&local->ref); u = atomic_read(&local->active_users); trace_rxrpc_local(local->debug_id, why, r, u); } /* * misc.c */ extern unsigned int rxrpc_max_backlog __read_mostly; extern unsigned long rxrpc_soft_ack_delay; extern unsigned long rxrpc_idle_ack_delay; extern unsigned int rxrpc_rx_window_size; extern unsigned int rxrpc_rx_mtu; extern unsigned int rxrpc_rx_jumbo_max; #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY extern unsigned long rxrpc_inject_rx_delay; #endif /* * net_ns.c */ extern unsigned int rxrpc_net_id; extern struct pernet_operations rxrpc_net_ops; static inline struct rxrpc_net *rxrpc_net(struct net *net) { return net_generic(net, rxrpc_net_id); } /* * out_of_band.c */ void rxrpc_notify_socket_oob(struct rxrpc_call *call, struct sk_buff *skb); void rxrpc_add_pending_oob(struct rxrpc_sock *rx, struct sk_buff *skb); int rxrpc_sendmsg_oob(struct rxrpc_sock *rx, struct msghdr *msg, size_t len); /* * output.c */ ssize_t do_udp_sendmsg(struct socket *socket, struct msghdr *msg, size_t len); void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why); void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call); int rxrpc_send_abort_packet(struct rxrpc_call *); void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req *req); void rxrpc_send_conn_abort(struct rxrpc_connection *conn); void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb); void rxrpc_send_keepalive(struct rxrpc_peer *); void rxrpc_send_response(struct rxrpc_connection *conn, struct sk_buff *skb); /* * peer_event.c */ void rxrpc_input_error(struct rxrpc_local *, struct sk_buff *); void rxrpc_peer_keepalive_worker(struct work_struct *); void rxrpc_input_probe_for_pmtud(struct rxrpc_connection *conn, rxrpc_serial_t acked_serial, bool sendmsg_fail); /* Update the last transmission time on a peer for keepalive purposes. */ static inline void rxrpc_peer_mark_tx(struct rxrpc_peer *peer) { /* To avoid tearing on 32-bit systems, we only keep the LSW. */ WRITE_ONCE(peer->last_tx_at, ktime_get_seconds()); } /* * peer_object.c */ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *, const struct sockaddr_rxrpc *); struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, struct sockaddr_rxrpc *srx, gfp_t gfp); void rxrpc_assess_MTU_size(struct rxrpc_local *local, struct rxrpc_peer *peer); struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t, enum rxrpc_peer_trace); void rxrpc_new_incoming_peer(struct rxrpc_local *local, struct rxrpc_peer *peer); void rxrpc_destroy_all_peers(struct rxrpc_net *); struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *, enum rxrpc_peer_trace); struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *, enum rxrpc_peer_trace); void rxrpc_put_peer(struct rxrpc_peer *, enum rxrpc_peer_trace); /* * proc.c */ extern const struct seq_operations rxrpc_call_seq_ops; extern const struct seq_operations rxrpc_connection_seq_ops; extern const struct seq_operations rxrpc_bundle_seq_ops; extern const struct seq_operations rxrpc_peer_seq_ops; extern const struct seq_operations rxrpc_local_seq_ops; /* * recvmsg.c */ void rxrpc_notify_socket(struct rxrpc_call *); int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int); /* * Abort a call due to a protocol error. */ static inline int rxrpc_abort_eproto(struct rxrpc_call *call, struct sk_buff *skb, s32 abort_code, enum rxrpc_abort_reason why) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); rxrpc_abort_call(call, sp->hdr.seq, abort_code, -EPROTO, why); return -EPROTO; } /* * rtt.c */ void rxrpc_call_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, int rtt_slot, rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, ktime_t send_time, ktime_t resp_time); ktime_t rxrpc_get_rto_backoff(struct rxrpc_call *call, bool retrans); void rxrpc_call_init_rtt(struct rxrpc_call *call); /* * rxgk.c */ extern const struct rxrpc_security rxgk_yfs; /* * rxkad.c */ #ifdef CONFIG_RXKAD extern const struct rxrpc_security rxkad; #endif /* * security.c */ int __init rxrpc_init_security(void); const struct rxrpc_security *rxrpc_security_lookup(u8); void rxrpc_exit_security(void); int rxrpc_init_client_call_security(struct rxrpc_call *); int rxrpc_init_client_conn_security(struct rxrpc_connection *); const struct rxrpc_security *rxrpc_get_incoming_security(struct rxrpc_sock *, struct sk_buff *); struct key *rxrpc_look_up_server_security(struct rxrpc_connection *, struct sk_buff *, u32, u32); /* * sendmsg.c */ bool rxrpc_propose_abort(struct rxrpc_call *call, s32 abort_code, int error, enum rxrpc_abort_reason why); int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t); /* * server_key.c */ extern struct key_type key_type_rxrpc_s; int rxrpc_server_keyring(struct rxrpc_sock *, sockptr_t, int); /* * skbuff.c */ void rxrpc_kernel_data_consumed(struct rxrpc_call *, struct sk_buff *); void rxrpc_new_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_see_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_eaten_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_get_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_free_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_purge_queue(struct sk_buff_head *); /* * stats.c */ int rxrpc_stats_show(struct seq_file *seq, void *v); int rxrpc_stats_clear(struct file *file, char *buf, size_t size); #define rxrpc_inc_stat(rxnet, s) atomic_inc(&(rxnet)->s) #define rxrpc_dec_stat(rxnet, s) atomic_dec(&(rxnet)->s) /* * sysctl.c */ #ifdef CONFIG_SYSCTL extern int __init rxrpc_sysctl_init(void); extern void rxrpc_sysctl_exit(void); #else static inline int __init rxrpc_sysctl_init(void) { return 0; } static inline void rxrpc_sysctl_exit(void) {} #endif /* * txbuf.c */ extern atomic_t rxrpc_nr_txbuf; struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_size, size_t data_align, gfp_t gfp); void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); void rxrpc_put_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); /* * utils.c */ int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *, struct sk_buff *); static inline bool before(u32 seq1, u32 seq2) { return (s32)(seq1 - seq2) < 0; } static inline bool before_eq(u32 seq1, u32 seq2) { return (s32)(seq1 - seq2) <= 0; } static inline bool after(u32 seq1, u32 seq2) { return (s32)(seq1 - seq2) > 0; } static inline bool after_eq(u32 seq1, u32 seq2) { return (s32)(seq1 - seq2) >= 0; } static inline u32 earliest(u32 seq1, u32 seq2) { return before(seq1, seq2) ? seq1 : seq2; } static inline u32 latest(u32 seq1, u32 seq2) { return after(seq1, seq2) ? seq1 : seq2; } static inline bool rxrpc_seq_in_txq(const struct rxrpc_txqueue *tq, rxrpc_seq_t seq) { return (seq & (RXRPC_NR_TXQUEUE - 1)) == tq->qbase; } static inline void rxrpc_queue_rx_call_packet(struct rxrpc_call *call, struct sk_buff *skb) { rxrpc_get_skb(skb, rxrpc_skb_get_call_rx); __skb_queue_tail(&call->rx_queue, skb); rxrpc_poke_call(call, rxrpc_call_poke_rx_packet); } /* * Calculate how much space there is for transmitting more DATA packets. */ static inline unsigned int rxrpc_tx_window_space(const struct rxrpc_call *call) { int winsize = umin(call->tx_winsize, call->cong_cwnd + call->cong_extra); int transmitted = call->tx_top - call->tx_bottom; return max(winsize - transmitted, 0); } static inline unsigned int rxrpc_left_out(const struct rxrpc_call *call) { return call->acks_nr_sacks + call->tx_nr_lost; } /* * Calculate the number of transmitted DATA packets assumed to be in flight * [approx RFC6675]. */ static inline unsigned int rxrpc_tx_in_flight(const struct rxrpc_call *call) { return call->tx_nr_sent - rxrpc_left_out(call) + call->tx_nr_resent; } /* * debug tracing */ extern unsigned int rxrpc_debug; #define dbgprintk(FMT,...) \ printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__) #define kenter(FMT,...) dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__) #define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) #define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__) #if defined(__KDEBUG) #define _enter(FMT,...) kenter(FMT,##__VA_ARGS__) #define _leave(FMT,...) kleave(FMT,##__VA_ARGS__) #define _debug(FMT,...) kdebug(FMT,##__VA_ARGS__) #elif defined(CONFIG_AF_RXRPC_DEBUG) #define RXRPC_DEBUG_KENTER 0x01 #define RXRPC_DEBUG_KLEAVE 0x02 #define RXRPC_DEBUG_KDEBUG 0x04 #define _enter(FMT,...) \ do { \ if (unlikely(rxrpc_debug & RXRPC_DEBUG_KENTER)) \ kenter(FMT,##__VA_ARGS__); \ } while (0) #define _leave(FMT,...) \ do { \ if (unlikely(rxrpc_debug & RXRPC_DEBUG_KLEAVE)) \ kleave(FMT,##__VA_ARGS__); \ } while (0) #define _debug(FMT,...) \ do { \ if (unlikely(rxrpc_debug & RXRPC_DEBUG_KDEBUG)) \ kdebug(FMT,##__VA_ARGS__); \ } while (0) #else #define _enter(FMT,...) no_printk("==> %s("FMT")",__func__ ,##__VA_ARGS__) #define _leave(FMT,...) no_printk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) #define _debug(FMT,...) no_printk(" "FMT ,##__VA_ARGS__) #endif /* * debug assertion checking */ #if 1 // defined(__KDEBUGALL) #define ASSERT(X) \ do { \ if (unlikely(!(X))) { \ pr_err("Assertion failed\n"); \ BUG(); \ } \ } while (0) #define ASSERTCMP(X, OP, Y) \ do { \ __typeof__(X) _x = (X); \ __typeof__(Y) _y = (__typeof__(X))(Y); \ if (unlikely(!(_x OP _y))) { \ pr_err("Assertion failed - %lu(0x%lx) %s %lu(0x%lx) is false\n", \ (unsigned long)_x, (unsigned long)_x, #OP, \ (unsigned long)_y, (unsigned long)_y); \ BUG(); \ } \ } while (0) #define ASSERTIF(C, X) \ do { \ if (unlikely((C) && !(X))) { \ pr_err("Assertion failed\n"); \ BUG(); \ } \ } while (0) #define ASSERTIFCMP(C, X, OP, Y) \ do { \ __typeof__(X) _x = (X); \ __typeof__(Y) _y = (__typeof__(X))(Y); \ if (unlikely((C) && !(_x OP _y))) { \ pr_err("Assertion failed - %lu(0x%lx) %s %lu(0x%lx) is false\n", \ (unsigned long)_x, (unsigned long)_x, #OP, \ (unsigned long)_y, (unsigned long)_y); \ BUG(); \ } \ } while (0) #else #define ASSERT(X) \ do { \ } while (0) #define ASSERTCMP(X, OP, Y) \ do { \ } while (0) #define ASSERTIF(C, X) \ do { \ } while (0) #define ASSERTIFCMP(C, X, OP, Y) \ do { \ } while (0) #endif /* __KDEBUGALL */
119 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 // SPDX-License-Identifier: GPL-2.0-or-later /* * (C) 2010 Pablo Neira Ayuso <pablo@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/netfilter.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/moduleparam.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_timestamp.h> static bool nf_ct_tstamp __read_mostly; module_param_named(tstamp, nf_ct_tstamp, bool, 0644); MODULE_PARM_DESC(tstamp, "Enable connection tracking flow timestamping."); void nf_conntrack_tstamp_pernet_init(struct net *net) { net->ct.sysctl_tstamp = nf_ct_tstamp; }
298 298 299 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 // SPDX-License-Identifier: LGPL-2.0+ /* * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc. * This file is part of the GNU C Library. * Contributed by Paul Eggert (eggert@twinsun.com). * * The GNU C Library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * The GNU C Library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Library General Public License for more details. * * You should have received a copy of the GNU Library General Public * License along with the GNU C Library; see the file COPYING.LIB. If not, * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, * Boston, MA 02111-1307, USA. */ /* * Converts the calendar time to broken-down time representation * * 2009-7-14: * Moved from glibc-2.6 to kernel by Zhaolei<zhaolei@cn.fujitsu.com> * 2021-06-02: * Reimplemented by Cassio Neri <cassio.neri@gmail.com> */ #include <linux/time.h> #include <linux/module.h> #include <linux/kernel.h> #define SECS_PER_HOUR (60 * 60) #define SECS_PER_DAY (SECS_PER_HOUR * 24) /** * time64_to_tm - converts the calendar time to local broken-down time * * @totalsecs: the number of seconds elapsed since 00:00:00 on January 1, 1970, * Coordinated Universal Time (UTC). * @offset: offset seconds adding to totalsecs. * @result: pointer to struct tm variable to receive broken-down time */ void time64_to_tm(time64_t totalsecs, int offset, struct tm *result) { u32 u32tmp, day_of_century, year_of_century, day_of_year, month, day; u64 u64tmp, udays, century, year; bool is_Jan_or_Feb, is_leap_year; long days, rem; int remainder; days = div_s64_rem(totalsecs, SECS_PER_DAY, &remainder); rem = remainder; rem += offset; while (rem < 0) { rem += SECS_PER_DAY; --days; } while (rem >= SECS_PER_DAY) { rem -= SECS_PER_DAY; ++days; } result->tm_hour = rem / SECS_PER_HOUR; rem %= SECS_PER_HOUR; result->tm_min = rem / 60; result->tm_sec = rem % 60; /* January 1, 1970 was a Thursday. */ result->tm_wday = (4 + days) % 7; if (result->tm_wday < 0) result->tm_wday += 7; /* * The following algorithm is, basically, Proposition 6.3 of Neri * and Schneider [1]. In a few words: it works on the computational * (fictitious) calendar where the year starts in March, month = 2 * (*), and finishes in February, month = 13. This calendar is * mathematically convenient because the day of the year does not * depend on whether the year is leap or not. For instance: * * March 1st 0-th day of the year; * ... * April 1st 31-st day of the year; * ... * January 1st 306-th day of the year; (Important!) * ... * February 28th 364-th day of the year; * February 29th 365-th day of the year (if it exists). * * After having worked out the date in the computational calendar * (using just arithmetics) it's easy to convert it to the * corresponding date in the Gregorian calendar. * * [1] "Euclidean Affine Functions and Applications to Calendar * Algorithms". https://arxiv.org/abs/2102.06959 * * (*) The numbering of months follows tm more closely and thus, * is slightly different from [1]. */ udays = ((u64) days) + 2305843009213814918ULL; u64tmp = 4 * udays + 3; century = div64_u64_rem(u64tmp, 146097, &u64tmp); day_of_century = (u32) (u64tmp / 4); u32tmp = 4 * day_of_century + 3; u64tmp = 2939745ULL * u32tmp; year_of_century = upper_32_bits(u64tmp); day_of_year = lower_32_bits(u64tmp) / 2939745 / 4; year = 100 * century + year_of_century; is_leap_year = year_of_century ? !(year_of_century % 4) : !(century % 4); u32tmp = 2141 * day_of_year + 132377; month = u32tmp >> 16; day = ((u16) u32tmp) / 2141; /* * Recall that January 1st is the 306-th day of the year in the * computational (not Gregorian) calendar. */ is_Jan_or_Feb = day_of_year >= 306; /* Convert to the Gregorian calendar and adjust to Unix time. */ year = year + is_Jan_or_Feb - 6313183731940000ULL; month = is_Jan_or_Feb ? month - 12 : month; day = day + 1; day_of_year += is_Jan_or_Feb ? -306 : 31 + 28 + is_leap_year; /* Convert to tm's format. */ result->tm_year = (long) (year - 1900); result->tm_mon = (int) month; result->tm_mday = (int) day; result->tm_yday = (int) day_of_year; } EXPORT_SYMBOL(time64_to_tm);
28 8 3 4 5 17 2 15 20 931 893 36 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 // SPDX-License-Identifier: GPL-2.0-only #include <linux/netlink.h> #include <linux/nospec.h> #include <linux/rtnetlink.h> #include <linux/types.h> #include <net/ip.h> #include <net/net_namespace.h> #include <net/tcp.h> static int ip_metrics_convert(struct nlattr *fc_mx, int fc_mx_len, u32 *metrics, struct netlink_ext_ack *extack) { bool ecn_ca = false; struct nlattr *nla; int remaining; nla_for_each_attr(nla, fc_mx, fc_mx_len, remaining) { int type = nla_type(nla); u32 val; if (!type) continue; if (type > RTAX_MAX) { NL_SET_ERR_MSG(extack, "Invalid metric type"); return -EINVAL; } type = array_index_nospec(type, RTAX_MAX + 1); if (type == RTAX_CC_ALGO) { char tmp[TCP_CA_NAME_MAX]; nla_strscpy(tmp, nla, sizeof(tmp)); val = tcp_ca_get_key_by_name(tmp, &ecn_ca); if (val == TCP_CA_UNSPEC) { NL_SET_ERR_MSG(extack, "Unknown tcp congestion algorithm"); return -EINVAL; } } else { if (nla_len(nla) != sizeof(u32)) { NL_SET_ERR_MSG_ATTR(extack, nla, "Invalid attribute in metrics"); return -EINVAL; } val = nla_get_u32(nla); } if (type == RTAX_ADVMSS && val > 65535 - 40) val = 65535 - 40; if (type == RTAX_MTU && val > 65535 - 15) val = 65535 - 15; if (type == RTAX_HOPLIMIT && val > 255) val = 255; if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) { NL_SET_ERR_MSG(extack, "Unknown flag set in feature mask in metrics attribute"); return -EINVAL; } metrics[type - 1] = val; } if (ecn_ca) metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; return 0; } struct dst_metrics *ip_fib_metrics_init(struct nlattr *fc_mx, int fc_mx_len, struct netlink_ext_ack *extack) { struct dst_metrics *fib_metrics; int err; if (!fc_mx) return (struct dst_metrics *)&dst_default_metrics; fib_metrics = kzalloc(sizeof(*fib_metrics), GFP_KERNEL); if (unlikely(!fib_metrics)) return ERR_PTR(-ENOMEM); err = ip_metrics_convert(fc_mx, fc_mx_len, fib_metrics->metrics, extack); if (!err) { refcount_set(&fib_metrics->refcnt, 1); } else { kfree(fib_metrics); fib_metrics = ERR_PTR(err); } return fib_metrics; } EXPORT_SYMBOL_GPL(ip_fib_metrics_init);
6 3 5 7 2 7 3 2 21 12 9 2 2 4 3 5 5 3 5 9 9 7 2 9 9 9 5 5 5 8 5 10 23 2 2 2 5 1 11 1 9 3 13 4 4 13 9 11 10 5 5 2 1 8 13 7 7 7 7 6 3 5 2 7 9 1 28 1 8 28 9 9 9 8 4 7 1 10 1 18 21 21 20 10 8 18 5 22 6 2 13 24 24 1 4 22 16 16 24 1 1 2 4 6 10 2 7 3 5 2 3 14 13 8 1 4 3 3 1 1 4 5 5 3 1 1 6 3 2 1 23 16 1 1 1 1 1 3 4 1 3 2 1 3 3 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2015, Sony Mobile Communications Inc. * Copyright (c) 2013, The Linux Foundation. All rights reserved. */ #include <linux/module.h> #include <linux/netlink.h> #include <linux/qrtr.h> #include <linux/termios.h> /* For TIOCINQ/OUTQ */ #include <linux/spinlock.h> #include <linux/wait.h> #include <net/sock.h> #include "qrtr.h" #define QRTR_PROTO_VER_1 1 #define QRTR_PROTO_VER_2 3 /* auto-bind range */ #define QRTR_MIN_EPH_SOCKET 0x4000 #define QRTR_MAX_EPH_SOCKET 0x7fff #define QRTR_EPH_PORT_RANGE \ XA_LIMIT(QRTR_MIN_EPH_SOCKET, QRTR_MAX_EPH_SOCKET) #define QRTR_PORT_CTRL_LEGACY 0xffff /** * struct qrtr_hdr_v1 - (I|R)PCrouter packet header version 1 * @version: protocol version * @type: packet type; one of QRTR_TYPE_* * @src_node_id: source node * @src_port_id: source port * @confirm_rx: boolean; whether a resume-tx packet should be send in reply * @size: length of packet, excluding this header * @dst_node_id: destination node * @dst_port_id: destination port */ struct qrtr_hdr_v1 { __le32 version; __le32 type; __le32 src_node_id; __le32 src_port_id; __le32 confirm_rx; __le32 size; __le32 dst_node_id; __le32 dst_port_id; } __packed; /** * struct qrtr_hdr_v2 - (I|R)PCrouter packet header later versions * @version: protocol version * @type: packet type; one of QRTR_TYPE_* * @flags: bitmask of QRTR_FLAGS_* * @optlen: length of optional header data * @size: length of packet, excluding this header and optlen * @src_node_id: source node * @src_port_id: source port * @dst_node_id: destination node * @dst_port_id: destination port */ struct qrtr_hdr_v2 { u8 version; u8 type; u8 flags; u8 optlen; __le32 size; __le16 src_node_id; __le16 src_port_id; __le16 dst_node_id; __le16 dst_port_id; }; #define QRTR_FLAGS_CONFIRM_RX BIT(0) struct qrtr_cb { u32 src_node; u32 src_port; u32 dst_node; u32 dst_port; u8 type; u8 confirm_rx; }; #define QRTR_HDR_MAX_SIZE max_t(size_t, sizeof(struct qrtr_hdr_v1), \ sizeof(struct qrtr_hdr_v2)) struct qrtr_sock { /* WARNING: sk must be the first member */ struct sock sk; struct sockaddr_qrtr us; struct sockaddr_qrtr peer; }; static inline struct qrtr_sock *qrtr_sk(struct sock *sk) { BUILD_BUG_ON(offsetof(struct qrtr_sock, sk) != 0); return container_of(sk, struct qrtr_sock, sk); } static unsigned int qrtr_local_nid = 1; /* for node ids */ static RADIX_TREE(qrtr_nodes, GFP_ATOMIC); static DEFINE_SPINLOCK(qrtr_nodes_lock); /* broadcast list */ static LIST_HEAD(qrtr_all_nodes); /* lock for qrtr_all_nodes and node reference */ static DEFINE_MUTEX(qrtr_node_lock); /* local port allocation management */ static DEFINE_XARRAY_ALLOC(qrtr_ports); /** * struct qrtr_node - endpoint node * @ep_lock: lock for endpoint management and callbacks * @ep: endpoint * @ref: reference count for node * @nid: node id * @qrtr_tx_flow: tree of qrtr_tx_flow, keyed by node << 32 | port * @qrtr_tx_lock: lock for qrtr_tx_flow inserts * @rx_queue: receive queue * @item: list item for broadcast list */ struct qrtr_node { struct mutex ep_lock; struct qrtr_endpoint *ep; struct kref ref; unsigned int nid; struct radix_tree_root qrtr_tx_flow; struct mutex qrtr_tx_lock; /* for qrtr_tx_flow */ struct sk_buff_head rx_queue; struct list_head item; }; /** * struct qrtr_tx_flow - tx flow control * @resume_tx: waiters for a resume tx from the remote * @pending: number of waiting senders * @tx_failed: indicates that a message with confirm_rx flag was lost */ struct qrtr_tx_flow { struct wait_queue_head resume_tx; int pending; int tx_failed; }; #define QRTR_TX_FLOW_HIGH 10 #define QRTR_TX_FLOW_LOW 5 static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb, int type, struct sockaddr_qrtr *from, struct sockaddr_qrtr *to); static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb, int type, struct sockaddr_qrtr *from, struct sockaddr_qrtr *to); static struct qrtr_sock *qrtr_port_lookup(int port); static void qrtr_port_put(struct qrtr_sock *ipc); /* Release node resources and free the node. * * Do not call directly, use qrtr_node_release. To be used with * kref_put_mutex. As such, the node mutex is expected to be locked on call. */ static void __qrtr_node_release(struct kref *kref) { struct qrtr_node *node = container_of(kref, struct qrtr_node, ref); struct radix_tree_iter iter; struct qrtr_tx_flow *flow; unsigned long flags; void __rcu **slot; spin_lock_irqsave(&qrtr_nodes_lock, flags); /* If the node is a bridge for other nodes, there are possibly * multiple entries pointing to our released node, delete them all. */ radix_tree_for_each_slot(slot, &qrtr_nodes, &iter, 0) { if (*slot == node) radix_tree_iter_delete(&qrtr_nodes, &iter, slot); } spin_unlock_irqrestore(&qrtr_nodes_lock, flags); list_del(&node->item); mutex_unlock(&qrtr_node_lock); skb_queue_purge(&node->rx_queue); /* Free tx flow counters */ radix_tree_for_each_slot(slot, &node->qrtr_tx_flow, &iter, 0) { flow = *slot; radix_tree_iter_delete(&node->qrtr_tx_flow, &iter, slot); kfree(flow); } kfree(node); } /* Increment reference to node. */ static struct qrtr_node *qrtr_node_acquire(struct qrtr_node *node) { if (node) kref_get(&node->ref); return node; } /* Decrement reference to node and release as necessary. */ static void qrtr_node_release(struct qrtr_node *node) { if (!node) return; kref_put_mutex(&node->ref, __qrtr_node_release, &qrtr_node_lock); } /** * qrtr_tx_resume() - reset flow control counter * @node: qrtr_node that the QRTR_TYPE_RESUME_TX packet arrived on * @skb: resume_tx packet */ static void qrtr_tx_resume(struct qrtr_node *node, struct sk_buff *skb) { struct qrtr_ctrl_pkt *pkt = (struct qrtr_ctrl_pkt *)skb->data; u64 remote_node = le32_to_cpu(pkt->client.node); u32 remote_port = le32_to_cpu(pkt->client.port); struct qrtr_tx_flow *flow; unsigned long key; key = remote_node << 32 | remote_port; rcu_read_lock(); flow = radix_tree_lookup(&node->qrtr_tx_flow, key); rcu_read_unlock(); if (flow) { spin_lock(&flow->resume_tx.lock); flow->pending = 0; spin_unlock(&flow->resume_tx.lock); wake_up_interruptible_all(&flow->resume_tx); } consume_skb(skb); } /** * qrtr_tx_wait() - flow control for outgoing packets * @node: qrtr_node that the packet is to be send to * @dest_node: node id of the destination * @dest_port: port number of the destination * @type: type of message * * The flow control scheme is based around the low and high "watermarks". When * the low watermark is passed the confirm_rx flag is set on the outgoing * message, which will trigger the remote to send a control message of the type * QRTR_TYPE_RESUME_TX to reset the counter. If the high watermark is hit * further transmision should be paused. * * Return: 1 if confirm_rx should be set, 0 otherwise or errno failure */ static int qrtr_tx_wait(struct qrtr_node *node, int dest_node, int dest_port, int type) { unsigned long key = (u64)dest_node << 32 | dest_port; struct qrtr_tx_flow *flow; int confirm_rx = 0; int ret; /* Never set confirm_rx on non-data packets */ if (type != QRTR_TYPE_DATA) return 0; mutex_lock(&node->qrtr_tx_lock); flow = radix_tree_lookup(&node->qrtr_tx_flow, key); if (!flow) { flow = kzalloc(sizeof(*flow), GFP_KERNEL); if (flow) { init_waitqueue_head(&flow->resume_tx); if (radix_tree_insert(&node->qrtr_tx_flow, key, flow)) { kfree(flow); flow = NULL; } } } mutex_unlock(&node->qrtr_tx_lock); /* Set confirm_rx if we where unable to find and allocate a flow */ if (!flow) return 1; spin_lock_irq(&flow->resume_tx.lock); ret = wait_event_interruptible_locked_irq(flow->resume_tx, flow->pending < QRTR_TX_FLOW_HIGH || flow->tx_failed || !node->ep); if (ret < 0) { confirm_rx = ret; } else if (!node->ep) { confirm_rx = -EPIPE; } else if (flow->tx_failed) { flow->tx_failed = 0; confirm_rx = 1; } else { flow->pending++; confirm_rx = flow->pending == QRTR_TX_FLOW_LOW; } spin_unlock_irq(&flow->resume_tx.lock); return confirm_rx; } /** * qrtr_tx_flow_failed() - flag that tx of confirm_rx flagged messages failed * @node: qrtr_node that the packet is to be send to * @dest_node: node id of the destination * @dest_port: port number of the destination * * Signal that the transmission of a message with confirm_rx flag failed. The * flow's "pending" counter will keep incrementing towards QRTR_TX_FLOW_HIGH, * at which point transmission would stall forever waiting for the resume TX * message associated with the dropped confirm_rx message. * Work around this by marking the flow as having a failed transmission and * cause the next transmission attempt to be sent with the confirm_rx. */ static void qrtr_tx_flow_failed(struct qrtr_node *node, int dest_node, int dest_port) { unsigned long key = (u64)dest_node << 32 | dest_port; struct qrtr_tx_flow *flow; rcu_read_lock(); flow = radix_tree_lookup(&node->qrtr_tx_flow, key); rcu_read_unlock(); if (flow) { spin_lock_irq(&flow->resume_tx.lock); flow->tx_failed = 1; spin_unlock_irq(&flow->resume_tx.lock); } } /* Pass an outgoing packet socket buffer to the endpoint driver. */ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb, int type, struct sockaddr_qrtr *from, struct sockaddr_qrtr *to) { struct qrtr_hdr_v1 *hdr; size_t len = skb->len; int rc, confirm_rx; confirm_rx = qrtr_tx_wait(node, to->sq_node, to->sq_port, type); if (confirm_rx < 0) { kfree_skb(skb); return confirm_rx; } hdr = skb_push(skb, sizeof(*hdr)); hdr->version = cpu_to_le32(QRTR_PROTO_VER_1); hdr->type = cpu_to_le32(type); hdr->src_node_id = cpu_to_le32(from->sq_node); hdr->src_port_id = cpu_to_le32(from->sq_port); if (to->sq_port == QRTR_PORT_CTRL) { hdr->dst_node_id = cpu_to_le32(node->nid); hdr->dst_port_id = cpu_to_le32(QRTR_PORT_CTRL); } else { hdr->dst_node_id = cpu_to_le32(to->sq_node); hdr->dst_port_id = cpu_to_le32(to->sq_port); } hdr->size = cpu_to_le32(len); hdr->confirm_rx = !!confirm_rx; rc = skb_put_padto(skb, ALIGN(len, 4) + sizeof(*hdr)); if (!rc) { mutex_lock(&node->ep_lock); rc = -ENODEV; if (node->ep) rc = node->ep->xmit(node->ep, skb); else kfree_skb(skb); mutex_unlock(&node->ep_lock); } /* Need to ensure that a subsequent message carries the otherwise lost * confirm_rx flag if we dropped this one */ if (rc && confirm_rx) qrtr_tx_flow_failed(node, to->sq_node, to->sq_port); return rc; } /* Lookup node by id. * * callers must release with qrtr_node_release() */ static struct qrtr_node *qrtr_node_lookup(unsigned int nid) { struct qrtr_node *node; unsigned long flags; mutex_lock(&qrtr_node_lock); spin_lock_irqsave(&qrtr_nodes_lock, flags); node = radix_tree_lookup(&qrtr_nodes, nid); node = qrtr_node_acquire(node); spin_unlock_irqrestore(&qrtr_nodes_lock, flags); mutex_unlock(&qrtr_node_lock); return node; } /* Assign node id to node. * * This is mostly useful for automatic node id assignment, based on * the source id in the incoming packet. */ static void qrtr_node_assign(struct qrtr_node *node, unsigned int nid) { unsigned long flags; if (nid == QRTR_EP_NID_AUTO) return; spin_lock_irqsave(&qrtr_nodes_lock, flags); radix_tree_insert(&qrtr_nodes, nid, node); if (node->nid == QRTR_EP_NID_AUTO) node->nid = nid; spin_unlock_irqrestore(&qrtr_nodes_lock, flags); } /** * qrtr_endpoint_post() - post incoming data * @ep: endpoint handle * @data: data pointer * @len: size of data in bytes * * Return: 0 on success; negative error code on failure */ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) { struct qrtr_node *node = ep->node; const struct qrtr_hdr_v1 *v1; const struct qrtr_hdr_v2 *v2; struct qrtr_sock *ipc; struct sk_buff *skb; struct qrtr_cb *cb; size_t size; unsigned int ver; size_t hdrlen; if (len == 0 || len & 3) return -EINVAL; skb = __netdev_alloc_skb(NULL, len, GFP_ATOMIC | __GFP_NOWARN); if (!skb) return -ENOMEM; cb = (struct qrtr_cb *)skb->cb; /* Version field in v1 is little endian, so this works for both cases */ ver = *(u8*)data; switch (ver) { case QRTR_PROTO_VER_1: if (len < sizeof(*v1)) goto err; v1 = data; hdrlen = sizeof(*v1); cb->type = le32_to_cpu(v1->type); cb->src_node = le32_to_cpu(v1->src_node_id); cb->src_port = le32_to_cpu(v1->src_port_id); cb->confirm_rx = !!v1->confirm_rx; cb->dst_node = le32_to_cpu(v1->dst_node_id); cb->dst_port = le32_to_cpu(v1->dst_port_id); size = le32_to_cpu(v1->size); break; case QRTR_PROTO_VER_2: if (len < sizeof(*v2)) goto err; v2 = data; hdrlen = sizeof(*v2) + v2->optlen; cb->type = v2->type; cb->confirm_rx = !!(v2->flags & QRTR_FLAGS_CONFIRM_RX); cb->src_node = le16_to_cpu(v2->src_node_id); cb->src_port = le16_to_cpu(v2->src_port_id); cb->dst_node = le16_to_cpu(v2->dst_node_id); cb->dst_port = le16_to_cpu(v2->dst_port_id); if (cb->src_port == (u16)QRTR_PORT_CTRL) cb->src_port = QRTR_PORT_CTRL; if (cb->dst_port == (u16)QRTR_PORT_CTRL) cb->dst_port = QRTR_PORT_CTRL; size = le32_to_cpu(v2->size); break; default: pr_err("qrtr: Invalid version %d\n", ver); goto err; } if (cb->dst_port == QRTR_PORT_CTRL_LEGACY) cb->dst_port = QRTR_PORT_CTRL; if (!size || len != ALIGN(size, 4) + hdrlen) goto err; if ((cb->type == QRTR_TYPE_NEW_SERVER || cb->type == QRTR_TYPE_RESUME_TX) && size < sizeof(struct qrtr_ctrl_pkt)) goto err; if (cb->dst_port != QRTR_PORT_CTRL && cb->type != QRTR_TYPE_DATA && cb->type != QRTR_TYPE_RESUME_TX) goto err; skb_put_data(skb, data + hdrlen, size); qrtr_node_assign(node, cb->src_node); if (cb->type == QRTR_TYPE_NEW_SERVER) { /* Remote node endpoint can bridge other distant nodes */ const struct qrtr_ctrl_pkt *pkt; pkt = data + hdrlen; qrtr_node_assign(node, le32_to_cpu(pkt->server.node)); } if (cb->type == QRTR_TYPE_RESUME_TX) { qrtr_tx_resume(node, skb); } else { ipc = qrtr_port_lookup(cb->dst_port); if (!ipc) goto err; if (sock_queue_rcv_skb(&ipc->sk, skb)) { qrtr_port_put(ipc); goto err; } qrtr_port_put(ipc); } return 0; err: kfree_skb(skb); return -EINVAL; } EXPORT_SYMBOL_GPL(qrtr_endpoint_post); /** * qrtr_alloc_ctrl_packet() - allocate control packet skb * @pkt: reference to qrtr_ctrl_pkt pointer * @flags: the type of memory to allocate * * Returns newly allocated sk_buff, or NULL on failure * * This function allocates a sk_buff large enough to carry a qrtr_ctrl_pkt and * on success returns a reference to the control packet in @pkt. */ static struct sk_buff *qrtr_alloc_ctrl_packet(struct qrtr_ctrl_pkt **pkt, gfp_t flags) { const int pkt_len = sizeof(struct qrtr_ctrl_pkt); struct sk_buff *skb; skb = alloc_skb(QRTR_HDR_MAX_SIZE + pkt_len, flags); if (!skb) return NULL; skb_reserve(skb, QRTR_HDR_MAX_SIZE); *pkt = skb_put_zero(skb, pkt_len); return skb; } /** * qrtr_endpoint_register() - register a new endpoint * @ep: endpoint to register * @nid: desired node id; may be QRTR_EP_NID_AUTO for auto-assignment * Return: 0 on success; negative error code on failure * * The specified endpoint must have the xmit function pointer set on call. */ int qrtr_endpoint_register(struct qrtr_endpoint *ep, unsigned int nid) { struct qrtr_node *node; if (!ep || !ep->xmit) return -EINVAL; node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) return -ENOMEM; kref_init(&node->ref); mutex_init(&node->ep_lock); skb_queue_head_init(&node->rx_queue); node->nid = QRTR_EP_NID_AUTO; node->ep = ep; INIT_RADIX_TREE(&node->qrtr_tx_flow, GFP_KERNEL); mutex_init(&node->qrtr_tx_lock); qrtr_node_assign(node, nid); mutex_lock(&qrtr_node_lock); list_add(&node->item, &qrtr_all_nodes); mutex_unlock(&qrtr_node_lock); ep->node = node; return 0; } EXPORT_SYMBOL_GPL(qrtr_endpoint_register); /** * qrtr_endpoint_unregister - unregister endpoint * @ep: endpoint to unregister */ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep) { struct qrtr_node *node = ep->node; struct sockaddr_qrtr src = {AF_QIPCRTR, node->nid, QRTR_PORT_CTRL}; struct sockaddr_qrtr dst = {AF_QIPCRTR, qrtr_local_nid, QRTR_PORT_CTRL}; struct radix_tree_iter iter; struct qrtr_ctrl_pkt *pkt; struct qrtr_tx_flow *flow; struct sk_buff *skb; unsigned long flags; void __rcu **slot; mutex_lock(&node->ep_lock); node->ep = NULL; mutex_unlock(&node->ep_lock); /* Notify the local controller about the event */ spin_lock_irqsave(&qrtr_nodes_lock, flags); radix_tree_for_each_slot(slot, &qrtr_nodes, &iter, 0) { if (*slot != node) continue; src.sq_node = iter.index; skb = qrtr_alloc_ctrl_packet(&pkt, GFP_ATOMIC); if (skb) { pkt->cmd = cpu_to_le32(QRTR_TYPE_BYE); qrtr_local_enqueue(NULL, skb, QRTR_TYPE_BYE, &src, &dst); } } spin_unlock_irqrestore(&qrtr_nodes_lock, flags); /* Wake up any transmitters waiting for resume-tx from the node */ mutex_lock(&node->qrtr_tx_lock); radix_tree_for_each_slot(slot, &node->qrtr_tx_flow, &iter, 0) { flow = *slot; wake_up_interruptible_all(&flow->resume_tx); } mutex_unlock(&node->qrtr_tx_lock); qrtr_node_release(node); ep->node = NULL; } EXPORT_SYMBOL_GPL(qrtr_endpoint_unregister); /* Lookup socket by port. * * Callers must release with qrtr_port_put() */ static struct qrtr_sock *qrtr_port_lookup(int port) { struct qrtr_sock *ipc; if (port == QRTR_PORT_CTRL) port = 0; rcu_read_lock(); ipc = xa_load(&qrtr_ports, port); if (ipc) sock_hold(&ipc->sk); rcu_read_unlock(); return ipc; } /* Release acquired socket. */ static void qrtr_port_put(struct qrtr_sock *ipc) { sock_put(&ipc->sk); } /* Remove port assignment. */ static void qrtr_port_remove(struct qrtr_sock *ipc) { struct qrtr_ctrl_pkt *pkt; struct sk_buff *skb; int port = ipc->us.sq_port; struct sockaddr_qrtr to; to.sq_family = AF_QIPCRTR; to.sq_node = QRTR_NODE_BCAST; to.sq_port = QRTR_PORT_CTRL; skb = qrtr_alloc_ctrl_packet(&pkt, GFP_KERNEL); if (skb) { pkt->cmd = cpu_to_le32(QRTR_TYPE_DEL_CLIENT); pkt->client.node = cpu_to_le32(ipc->us.sq_node); pkt->client.port = cpu_to_le32(ipc->us.sq_port); skb_set_owner_w(skb, &ipc->sk); qrtr_bcast_enqueue(NULL, skb, QRTR_TYPE_DEL_CLIENT, &ipc->us, &to); } if (port == QRTR_PORT_CTRL) port = 0; __sock_put(&ipc->sk); xa_erase(&qrtr_ports, port); /* Ensure that if qrtr_port_lookup() did enter the RCU read section we * wait for it to up increment the refcount */ synchronize_rcu(); } /* Assign port number to socket. * * Specify port in the integer pointed to by port, and it will be adjusted * on return as necesssary. * * Port may be: * 0: Assign ephemeral port in [QRTR_MIN_EPH_SOCKET, QRTR_MAX_EPH_SOCKET] * <QRTR_MIN_EPH_SOCKET: Specified; requires CAP_NET_ADMIN * >QRTR_MIN_EPH_SOCKET: Specified; available to all */ static int qrtr_port_assign(struct qrtr_sock *ipc, int *port) { int rc; if (!*port) { rc = xa_alloc(&qrtr_ports, port, ipc, QRTR_EPH_PORT_RANGE, GFP_KERNEL); } else if (*port < QRTR_MIN_EPH_SOCKET && !capable(CAP_NET_ADMIN)) { rc = -EACCES; } else if (*port == QRTR_PORT_CTRL) { rc = xa_insert(&qrtr_ports, 0, ipc, GFP_KERNEL); } else { rc = xa_insert(&qrtr_ports, *port, ipc, GFP_KERNEL); } if (rc == -EBUSY) return -EADDRINUSE; else if (rc < 0) return rc; sock_hold(&ipc->sk); return 0; } /* Reset all non-control ports */ static void qrtr_reset_ports(void) { struct qrtr_sock *ipc; unsigned long index; rcu_read_lock(); xa_for_each_start(&qrtr_ports, index, ipc, 1) { sock_hold(&ipc->sk); ipc->sk.sk_err = ENETRESET; sk_error_report(&ipc->sk); sock_put(&ipc->sk); } rcu_read_unlock(); } /* Bind socket to address. * * Socket should be locked upon call. */ static int __qrtr_bind(struct socket *sock, const struct sockaddr_qrtr *addr, int zapped) { struct qrtr_sock *ipc = qrtr_sk(sock->sk); struct sock *sk = sock->sk; int port; int rc; /* rebinding ok */ if (!zapped && addr->sq_port == ipc->us.sq_port) return 0; port = addr->sq_port; rc = qrtr_port_assign(ipc, &port); if (rc) return rc; /* unbind previous, if any */ if (!zapped) qrtr_port_remove(ipc); ipc->us.sq_port = port; sock_reset_flag(sk, SOCK_ZAPPED); /* Notify all open ports about the new controller */ if (port == QRTR_PORT_CTRL) qrtr_reset_ports(); return 0; } /* Auto bind to an ephemeral port. */ static int qrtr_autobind(struct socket *sock) { struct sock *sk = sock->sk; struct sockaddr_qrtr addr; if (!sock_flag(sk, SOCK_ZAPPED)) return 0; addr.sq_family = AF_QIPCRTR; addr.sq_node = qrtr_local_nid; addr.sq_port = 0; return __qrtr_bind(sock, &addr, 1); } /* Bind socket to specified sockaddr. */ static int qrtr_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len) { DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, saddr); struct qrtr_sock *ipc = qrtr_sk(sock->sk); struct sock *sk = sock->sk; int rc; if (len < sizeof(*addr) || addr->sq_family != AF_QIPCRTR) return -EINVAL; if (addr->sq_node != ipc->us.sq_node) return -EINVAL; lock_sock(sk); rc = __qrtr_bind(sock, addr, sock_flag(sk, SOCK_ZAPPED)); release_sock(sk); return rc; } /* Queue packet to local peer socket. */ static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb, int type, struct sockaddr_qrtr *from, struct sockaddr_qrtr *to) { struct qrtr_sock *ipc; struct qrtr_cb *cb; ipc = qrtr_port_lookup(to->sq_port); if (!ipc || &ipc->sk == skb->sk) { /* do not send to self */ if (ipc) qrtr_port_put(ipc); kfree_skb(skb); return -ENODEV; } cb = (struct qrtr_cb *)skb->cb; cb->src_node = from->sq_node; cb->src_port = from->sq_port; if (sock_queue_rcv_skb(&ipc->sk, skb)) { qrtr_port_put(ipc); kfree_skb(skb); return -ENOSPC; } qrtr_port_put(ipc); return 0; } /* Queue packet for broadcast. */ static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb, int type, struct sockaddr_qrtr *from, struct sockaddr_qrtr *to) { struct sk_buff *skbn; mutex_lock(&qrtr_node_lock); list_for_each_entry(node, &qrtr_all_nodes, item) { skbn = pskb_copy(skb, GFP_KERNEL); if (!skbn) break; skb_set_owner_w(skbn, skb->sk); qrtr_node_enqueue(node, skbn, type, from, to); } mutex_unlock(&qrtr_node_lock); qrtr_local_enqueue(NULL, skb, type, from, to); return 0; } static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, msg->msg_name); int (*enqueue_fn)(struct qrtr_node *, struct sk_buff *, int, struct sockaddr_qrtr *, struct sockaddr_qrtr *); __le32 qrtr_type = cpu_to_le32(QRTR_TYPE_DATA); struct qrtr_sock *ipc = qrtr_sk(sock->sk); struct sock *sk = sock->sk; struct qrtr_node *node; struct sk_buff *skb; size_t plen; u32 type; int rc; if (msg->msg_flags & ~(MSG_DONTWAIT)) return -EINVAL; if (len > 65535) return -EMSGSIZE; lock_sock(sk); if (addr) { if (msg->msg_namelen < sizeof(*addr)) { release_sock(sk); return -EINVAL; } if (addr->sq_family != AF_QIPCRTR) { release_sock(sk); return -EINVAL; } rc = qrtr_autobind(sock); if (rc) { release_sock(sk); return rc; } } else if (sk->sk_state == TCP_ESTABLISHED) { addr = &ipc->peer; } else { release_sock(sk); return -ENOTCONN; } node = NULL; if (addr->sq_node == QRTR_NODE_BCAST) { if (addr->sq_port != QRTR_PORT_CTRL && qrtr_local_nid != QRTR_NODE_BCAST) { release_sock(sk); return -ENOTCONN; } enqueue_fn = qrtr_bcast_enqueue; } else if (addr->sq_node == ipc->us.sq_node) { enqueue_fn = qrtr_local_enqueue; } else { node = qrtr_node_lookup(addr->sq_node); if (!node) { release_sock(sk); return -ECONNRESET; } enqueue_fn = qrtr_node_enqueue; } plen = (len + 3) & ~3; skb = sock_alloc_send_skb(sk, plen + QRTR_HDR_MAX_SIZE, msg->msg_flags & MSG_DONTWAIT, &rc); if (!skb) { rc = -ENOMEM; goto out_node; } skb_reserve(skb, QRTR_HDR_MAX_SIZE); rc = memcpy_from_msg(skb_put(skb, len), msg, len); if (rc) { kfree_skb(skb); goto out_node; } if (ipc->us.sq_port == QRTR_PORT_CTRL) { if (len < 4) { rc = -EINVAL; kfree_skb(skb); goto out_node; } /* control messages already require the type as 'command' */ skb_copy_bits(skb, 0, &qrtr_type, 4); } type = le32_to_cpu(qrtr_type); rc = enqueue_fn(node, skb, type, &ipc->us, addr); if (rc >= 0) rc = len; out_node: qrtr_node_release(node); release_sock(sk); return rc; } static int qrtr_send_resume_tx(struct qrtr_cb *cb) { struct sockaddr_qrtr remote = { AF_QIPCRTR, cb->src_node, cb->src_port }; struct sockaddr_qrtr local = { AF_QIPCRTR, cb->dst_node, cb->dst_port }; struct qrtr_ctrl_pkt *pkt; struct qrtr_node *node; struct sk_buff *skb; int ret; node = qrtr_node_lookup(remote.sq_node); if (!node) return -EINVAL; skb = qrtr_alloc_ctrl_packet(&pkt, GFP_KERNEL); if (!skb) return -ENOMEM; pkt->cmd = cpu_to_le32(QRTR_TYPE_RESUME_TX); pkt->client.node = cpu_to_le32(cb->dst_node); pkt->client.port = cpu_to_le32(cb->dst_port); ret = qrtr_node_enqueue(node, skb, QRTR_TYPE_RESUME_TX, &local, &remote); qrtr_node_release(node); return ret; } static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, msg->msg_name); struct sock *sk = sock->sk; struct sk_buff *skb; struct qrtr_cb *cb; int copied, rc; lock_sock(sk); if (sock_flag(sk, SOCK_ZAPPED)) { release_sock(sk); return -EADDRNOTAVAIL; } skb = skb_recv_datagram(sk, flags, &rc); if (!skb) { release_sock(sk); return rc; } cb = (struct qrtr_cb *)skb->cb; copied = skb->len; if (copied > size) { copied = size; msg->msg_flags |= MSG_TRUNC; } rc = skb_copy_datagram_msg(skb, 0, msg, copied); if (rc < 0) goto out; rc = copied; if (addr) { /* There is an anonymous 2-byte hole after sq_family, * make sure to clear it. */ memset(addr, 0, sizeof(*addr)); addr->sq_family = AF_QIPCRTR; addr->sq_node = cb->src_node; addr->sq_port = cb->src_port; msg->msg_namelen = sizeof(*addr); } out: if (cb->confirm_rx) qrtr_send_resume_tx(cb); skb_free_datagram(sk, skb); release_sock(sk); return rc; } static int qrtr_connect(struct socket *sock, struct sockaddr_unsized *saddr, int len, int flags) { DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, saddr); struct qrtr_sock *ipc = qrtr_sk(sock->sk); struct sock *sk = sock->sk; int rc; if (len < sizeof(*addr) || addr->sq_family != AF_QIPCRTR) return -EINVAL; lock_sock(sk); sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; rc = qrtr_autobind(sock); if (rc) { release_sock(sk); return rc; } ipc->peer = *addr; sock->state = SS_CONNECTED; sk->sk_state = TCP_ESTABLISHED; release_sock(sk); return 0; } static int qrtr_getname(struct socket *sock, struct sockaddr *saddr, int peer) { struct qrtr_sock *ipc = qrtr_sk(sock->sk); struct sockaddr_qrtr qaddr; struct sock *sk = sock->sk; lock_sock(sk); if (peer) { if (sk->sk_state != TCP_ESTABLISHED) { release_sock(sk); return -ENOTCONN; } qaddr = ipc->peer; } else { qaddr = ipc->us; } release_sock(sk); qaddr.sq_family = AF_QIPCRTR; memcpy(saddr, &qaddr, sizeof(qaddr)); return sizeof(qaddr); } static int qrtr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; struct qrtr_sock *ipc = qrtr_sk(sock->sk); struct sock *sk = sock->sk; struct sockaddr_qrtr *sq; struct sk_buff *skb; struct ifreq ifr; long len = 0; int rc = 0; lock_sock(sk); switch (cmd) { case TIOCOUTQ: len = sk->sk_sndbuf - sk_wmem_alloc_get(sk); if (len < 0) len = 0; rc = put_user(len, (int __user *)argp); break; case TIOCINQ: skb = skb_peek(&sk->sk_receive_queue); if (skb) len = skb->len; rc = put_user(len, (int __user *)argp); break; case SIOCGIFADDR: if (get_user_ifreq(&ifr, NULL, argp)) { rc = -EFAULT; break; } sq = (struct sockaddr_qrtr *)&ifr.ifr_addr; *sq = ipc->us; if (put_user_ifreq(&ifr, argp)) { rc = -EFAULT; break; } break; case SIOCADDRT: case SIOCDELRT: case SIOCSIFADDR: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: rc = -EINVAL; break; default: rc = -ENOIOCTLCMD; break; } release_sock(sk); return rc; } static int qrtr_release(struct socket *sock) { struct sock *sk = sock->sk; struct qrtr_sock *ipc; if (!sk) return 0; lock_sock(sk); ipc = qrtr_sk(sk); sk->sk_shutdown = SHUTDOWN_MASK; if (!sock_flag(sk, SOCK_DEAD)) sk->sk_state_change(sk); sock_set_flag(sk, SOCK_DEAD); sock_orphan(sk); sock->sk = NULL; if (!sock_flag(sk, SOCK_ZAPPED)) qrtr_port_remove(ipc); skb_queue_purge(&sk->sk_receive_queue); release_sock(sk); sock_put(sk); return 0; } static const struct proto_ops qrtr_proto_ops = { .owner = THIS_MODULE, .family = AF_QIPCRTR, .bind = qrtr_bind, .connect = qrtr_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .listen = sock_no_listen, .sendmsg = qrtr_sendmsg, .recvmsg = qrtr_recvmsg, .getname = qrtr_getname, .ioctl = qrtr_ioctl, .gettstamp = sock_gettstamp, .poll = datagram_poll, .shutdown = sock_no_shutdown, .release = qrtr_release, .mmap = sock_no_mmap, }; static struct proto qrtr_proto = { .name = "QIPCRTR", .owner = THIS_MODULE, .obj_size = sizeof(struct qrtr_sock), }; static int qrtr_create(struct net *net, struct socket *sock, int protocol, int kern) { struct qrtr_sock *ipc; struct sock *sk; if (sock->type != SOCK_DGRAM) return -EPROTOTYPE; sk = sk_alloc(net, AF_QIPCRTR, GFP_KERNEL, &qrtr_proto, kern); if (!sk) return -ENOMEM; sock_set_flag(sk, SOCK_ZAPPED); sock_init_data(sock, sk); sock->ops = &qrtr_proto_ops; ipc = qrtr_sk(sk); ipc->us.sq_family = AF_QIPCRTR; ipc->us.sq_node = qrtr_local_nid; ipc->us.sq_port = 0; return 0; } static const struct net_proto_family qrtr_family = { .owner = THIS_MODULE, .family = AF_QIPCRTR, .create = qrtr_create, }; static int __init qrtr_proto_init(void) { int rc; rc = proto_register(&qrtr_proto, 1); if (rc) return rc; rc = sock_register(&qrtr_family); if (rc) goto err_proto; rc = qrtr_ns_init(); if (rc) goto err_sock; return 0; err_sock: sock_unregister(qrtr_family.family); err_proto: proto_unregister(&qrtr_proto); return rc; } postcore_initcall(qrtr_proto_init); static void __exit qrtr_proto_fini(void) { qrtr_ns_remove(); sock_unregister(qrtr_family.family); proto_unregister(&qrtr_proto); } module_exit(qrtr_proto_fini); MODULE_DESCRIPTION("Qualcomm IPC-router driver"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_NETPROTO(PF_QIPCRTR);
15 5 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 /* * IPv6 specific functions of netfilter core * * Rusty Russell (C) 2000 -- This code is GPL. * Patrick McHardy (C) 2006-2012 */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/ipv6.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <linux/export.h> #include <net/addrconf.h> #include <net/dst.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/xfrm.h> #include <net/netfilter/nf_queue.h> #include <net/netfilter/nf_conntrack_bridge.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> #include "../bridge/br_private.h" int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); struct sock *sk = sk_to_full_sk(sk_partial); struct net_device *dev = skb_dst_dev(skb); struct flow_keys flkeys; unsigned int hh_len; struct dst_entry *dst; int strict = (ipv6_addr_type(&iph->daddr) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)); struct flowi6 fl6 = { .flowi6_l3mdev = l3mdev_master_ifindex(dev), .flowi6_mark = skb->mark, .flowi6_uid = sock_net_uid(net, sk), .daddr = iph->daddr, .saddr = iph->saddr, .flowlabel = ip6_flowinfo(iph), }; int err; if (sk && sk->sk_bound_dev_if) fl6.flowi6_oif = sk->sk_bound_dev_if; else if (strict) fl6.flowi6_oif = dev->ifindex; fib6_rules_early_flow_dissect(net, skb, &fl6, &flkeys); dst = ip6_route_output(net, sk, &fl6); err = dst->error; if (err) { IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); net_dbg_ratelimited("ip6_route_me_harder: No more route\n"); dst_release(dst); return err; } /* Drop old route. */ skb_dst_drop(skb); skb_dst_set(skb, dst); #ifdef CONFIG_XFRM if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && xfrm_decode_session(net, skb, flowi6_to_flowi(&fl6), AF_INET6) == 0) { /* ignore return value from skb_dstref_steal, xfrm_lookup takes * care of dropping the refcnt if needed. */ skb_dstref_steal(skb); dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0); if (IS_ERR(dst)) return PTR_ERR(dst); skb_dst_set(skb, dst); } #endif /* Change in oif may mean change in hh_len. */ hh_len = skb_dst_dev(skb)->hard_header_len; if (skb_headroom(skb) < hh_len && pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)), 0, GFP_ATOMIC)) return -ENOMEM; return 0; } EXPORT_SYMBOL(ip6_route_me_harder); static int nf_ip6_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry) { struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry); if (entry->state.hook == NF_INET_LOCAL_OUT) { const struct ipv6hdr *iph = ipv6_hdr(skb); if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) || !ipv6_addr_equal(&iph->saddr, &rt_info->saddr) || skb->mark != rt_info->mark) return ip6_route_me_harder(entry->state.net, entry->state.sk, skb); } return 0; } int __nf_ip6_route(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict) { static const struct ipv6_pinfo fake_pinfo; static const struct inet_sock fake_sk = { /* makes ip6_route_output set RT6_LOOKUP_F_IFACE: */ .sk.sk_bound_dev_if = 1, .pinet6 = (struct ipv6_pinfo *) &fake_pinfo, }; const void *sk = strict ? &fake_sk : NULL; struct dst_entry *result; int err; result = ip6_route_output(net, sk, &fl->u.ip6); err = result->error; if (err) dst_release(result); else *dst = result; return err; } EXPORT_SYMBOL_GPL(__nf_ip6_route); int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct nf_bridge_frag_data *data, int (*output)(struct net *, struct sock *sk, const struct nf_bridge_frag_data *data, struct sk_buff *)) { int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; u8 tstamp_type = skb->tstamp_type; ktime_t tstamp = skb->tstamp; struct ip6_frag_state state; u8 *prevhdr, nexthdr = 0; unsigned int mtu, hlen; int hroom, err = 0; __be32 frag_id; err = ip6_find_1stfragopt(skb, &prevhdr); if (err < 0) goto blackhole; hlen = err; nexthdr = *prevhdr; mtu = skb->dev->mtu; if (frag_max_size > mtu || frag_max_size < IPV6_MIN_MTU) goto blackhole; mtu = frag_max_size; if (mtu < hlen + sizeof(struct frag_hdr) + 8) goto blackhole; mtu -= hlen + sizeof(struct frag_hdr); frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr, &ipv6_hdr(skb)->saddr); if (skb->ip_summed == CHECKSUM_PARTIAL && (err = skb_checksum_help(skb))) goto blackhole; hroom = LL_RESERVED_SPACE(skb->dev); if (skb_has_frag_list(skb)) { unsigned int first_len = skb_pagelen(skb); struct ip6_fraglist_iter iter; struct sk_buff *frag2; if (first_len - hlen > mtu) goto blackhole; if (skb_cloned(skb) || skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) goto slow_path; skb_walk_frags(skb, frag2) { if (frag2->len > mtu) goto blackhole; /* Partially cloned skb? */ if (skb_shared(frag2) || skb_headroom(frag2) < (hlen + hroom + sizeof(struct frag_hdr))) goto slow_path; } err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id, &iter); if (err < 0) goto blackhole; for (;;) { /* Prepare header of the next frame, * before previous one went down. */ if (iter.frag) ip6_fraglist_prepare(skb, &iter); skb_set_delivery_time(skb, tstamp, tstamp_type); err = output(net, sk, data, skb); if (err || !iter.frag) break; skb = ip6_fraglist_next(&iter); } kfree(iter.tmp_hdr); if (!err) return 0; kfree_skb_list(iter.frag); return err; } slow_path: /* This is a linearized skbuff, the original geometry is lost for us. * This may also be a clone skbuff, we could preserve the geometry for * the copies but probably not worth the effort. */ ip6_frag_init(skb, hlen, mtu, skb->dev->needed_tailroom, LL_RESERVED_SPACE(skb->dev), prevhdr, nexthdr, frag_id, &state); while (state.left > 0) { struct sk_buff *skb2; skb2 = ip6_frag_next(skb, &state); if (IS_ERR(skb2)) { err = PTR_ERR(skb2); goto blackhole; } skb_set_delivery_time(skb2, tstamp, tstamp_type); err = output(net, sk, data, skb2); if (err) goto blackhole; } consume_skb(skb); return err; blackhole: kfree_skb(skb); return 0; } EXPORT_SYMBOL_GPL(br_ip6_fragment); static const struct nf_ipv6_ops ipv6ops = { #if IS_MODULE(CONFIG_IPV6) .chk_addr = ipv6_chk_addr, .route_me_harder = ip6_route_me_harder, .dev_get_saddr = ipv6_dev_get_saddr, .route = __nf_ip6_route, #if IS_ENABLED(CONFIG_SYN_COOKIES) .cookie_init_sequence = __cookie_v6_init_sequence, .cookie_v6_check = __cookie_v6_check, #endif #endif .route_input = ip6_route_input, .fragment = ip6_fragment, .reroute = nf_ip6_reroute, #if IS_MODULE(CONFIG_IPV6) .br_fragment = br_ip6_fragment, #endif }; int __init ipv6_netfilter_init(void) { RCU_INIT_POINTER(nf_ipv6_ops, &ipv6ops); return 0; } /* This can be called from inet6_init() on errors, so it cannot * be marked __exit. -DaveM */ void ipv6_netfilter_fini(void) { RCU_INIT_POINTER(nf_ipv6_ops, NULL); }
118 5497 1 1 1 8 15 7 1 2 6 29 2 1 25 3 2 18 8 10 1 1 24 5 15 1 11 3 2 53 15 377 354 11 12 55 6 1 1 1 2 1 22 1 1 1 2 1 6 1 1 1 1 5767 1 2 2 26 1 3 2 27 1 9 12 10 90 88 4 5 40 5 28 2 2 5189 374 5841 598 2 355 5421 5763 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ioctl.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/syscalls.h> #include <linux/mm.h> #include <linux/capability.h> #include <linux/compat.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/security.h> #include <linux/export.h> #include <linux/uaccess.h> #include <linux/writeback.h> #include <linux/buffer_head.h> #include <linux/falloc.h> #include <linux/sched/signal.h> #include <linux/fiemap.h> #include <linux/mount.h> #include <linux/fscrypt.h> #include <linux/fileattr.h> #include "internal.h" #include <asm/ioctls.h> /* So that the fiemap access checks can't overflow on 32 bit machines. */ #define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent)) /** * vfs_ioctl - call filesystem specific ioctl methods * @filp: open file to invoke ioctl method on * @cmd: ioctl command to execute * @arg: command-specific argument for ioctl * * Invokes filesystem specific ->unlocked_ioctl, if one exists; otherwise * returns -ENOTTY. * * Returns 0 on success, -errno on error. */ static int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int error = -ENOTTY; if (!filp->f_op->unlocked_ioctl) goto out; error = filp->f_op->unlocked_ioctl(filp, cmd, arg); if (error == -ENOIOCTLCMD) error = -ENOTTY; out: return error; } static int ioctl_fibmap(struct file *filp, int __user *p) { struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; int error, ur_block; sector_t block; if (!capable(CAP_SYS_RAWIO)) return -EPERM; error = get_user(ur_block, p); if (error) return error; if (ur_block < 0) return -EINVAL; block = ur_block; error = bmap(inode, &block); if (block > INT_MAX) { error = -ERANGE; pr_warn_ratelimited("[%s/%d] FS: %s File: %pD4 would truncate fibmap result\n", current->comm, task_pid_nr(current), sb->s_id, filp); } if (error) ur_block = 0; else ur_block = block; if (put_user(ur_block, p)) error = -EFAULT; return error; } /** * fiemap_fill_next_extent - Fiemap helper function * @fieinfo: Fiemap context passed into ->fiemap * @logical: Extent logical start offset, in bytes * @phys: Extent physical start offset, in bytes * @len: Extent length, in bytes * @flags: FIEMAP_EXTENT flags that describe this extent * * Called from file system ->fiemap callback. Will populate extent * info as passed in via arguments and copy to user memory. On * success, extent count on fieinfo is incremented. * * Returns 0 on success, -errno on error, 1 if this was the last * extent that will fit in user array. */ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, u64 phys, u64 len, u32 flags) { struct fiemap_extent extent; struct fiemap_extent __user *dest = fieinfo->fi_extents_start; /* only count the extents */ if (fieinfo->fi_extents_max == 0) { fieinfo->fi_extents_mapped++; return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0; } if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max) return 1; #define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC) #define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED) #define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE) if (flags & SET_UNKNOWN_FLAGS) flags |= FIEMAP_EXTENT_UNKNOWN; if (flags & SET_NO_UNMOUNTED_IO_FLAGS) flags |= FIEMAP_EXTENT_ENCODED; if (flags & SET_NOT_ALIGNED_FLAGS) flags |= FIEMAP_EXTENT_NOT_ALIGNED; memset(&extent, 0, sizeof(extent)); extent.fe_logical = logical; extent.fe_physical = phys; extent.fe_length = len; extent.fe_flags = flags; dest += fieinfo->fi_extents_mapped; if (copy_to_user(dest, &extent, sizeof(extent))) return -EFAULT; fieinfo->fi_extents_mapped++; if (fieinfo->fi_extents_mapped == fieinfo->fi_extents_max) return 1; return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0; } EXPORT_SYMBOL(fiemap_fill_next_extent); /** * fiemap_prep - check validity of requested flags for fiemap * @inode: Inode to operate on * @fieinfo: Fiemap context passed into ->fiemap * @start: Start of the mapped range * @len: Length of the mapped range, can be truncated by this function. * @supported_flags: Set of fiemap flags that the file system understands * * This function must be called from each ->fiemap instance to validate the * fiemap request against the file system parameters. * * Returns 0 on success, or a negative error on failure. */ int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 *len, u32 supported_flags) { u64 maxbytes = inode->i_sb->s_maxbytes; u32 incompat_flags; int ret = 0; if (*len == 0) return -EINVAL; if (start >= maxbytes) return -EFBIG; /* * Shrink request scope to what the fs can actually handle. */ if (*len > maxbytes || (maxbytes - *len) < start) *len = maxbytes - start; supported_flags |= FIEMAP_FLAG_SYNC; supported_flags &= FIEMAP_FLAGS_COMPAT; incompat_flags = fieinfo->fi_flags & ~supported_flags; if (incompat_flags) { fieinfo->fi_flags = incompat_flags; return -EBADR; } if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) ret = filemap_write_and_wait(inode->i_mapping); return ret; } EXPORT_SYMBOL(fiemap_prep); static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) { struct fiemap fiemap; struct fiemap_extent_info fieinfo = { 0, }; struct inode *inode = file_inode(filp); int error; if (!inode->i_op->fiemap) return -EOPNOTSUPP; if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap))) return -EFAULT; if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) return -EINVAL; fieinfo.fi_flags = fiemap.fm_flags; fieinfo.fi_extents_max = fiemap.fm_extent_count; fieinfo.fi_extents_start = ufiemap->fm_extents; error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, fiemap.fm_length); fiemap.fm_flags = fieinfo.fi_flags; fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) error = -EFAULT; return error; } static int ioctl_file_clone(struct file *dst_file, unsigned long srcfd, u64 off, u64 olen, u64 destoff) { CLASS(fd, src_file)(srcfd); loff_t cloned; int ret; if (fd_empty(src_file)) return -EBADF; cloned = vfs_clone_file_range(fd_file(src_file), off, dst_file, destoff, olen, 0); if (cloned < 0) ret = cloned; else if (olen && cloned != olen) ret = -EINVAL; else ret = 0; return ret; } static int ioctl_file_clone_range(struct file *file, struct file_clone_range __user *argp) { struct file_clone_range args; if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; return ioctl_file_clone(file, args.src_fd, args.src_offset, args.src_length, args.dest_offset); } /* * This provides compatibility with legacy XFS pre-allocation ioctls * which predate the fallocate syscall. * * Only the l_start, l_len and l_whence fields of the 'struct space_resv' * are used here, rest are ignored. */ static int ioctl_preallocate(struct file *filp, int mode, void __user *argp) { struct inode *inode = file_inode(filp); struct space_resv sr; if (copy_from_user(&sr, argp, sizeof(sr))) return -EFAULT; switch (sr.l_whence) { case SEEK_SET: break; case SEEK_CUR: sr.l_start += filp->f_pos; break; case SEEK_END: sr.l_start += i_size_read(inode); break; default: return -EINVAL; } return vfs_fallocate(filp, mode | FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len); } /* on ia32 l_start is on a 32-bit boundary */ #if defined CONFIG_COMPAT && defined(CONFIG_X86_64) /* just account for different alignment */ static int compat_ioctl_preallocate(struct file *file, int mode, struct space_resv_32 __user *argp) { struct inode *inode = file_inode(file); struct space_resv_32 sr; if (copy_from_user(&sr, argp, sizeof(sr))) return -EFAULT; switch (sr.l_whence) { case SEEK_SET: break; case SEEK_CUR: sr.l_start += file->f_pos; break; case SEEK_END: sr.l_start += i_size_read(inode); break; default: return -EINVAL; } return vfs_fallocate(file, mode | FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len); } #endif static int file_ioctl(struct file *filp, unsigned int cmd, int __user *p) { switch (cmd) { case FIBMAP: return ioctl_fibmap(filp, p); case FS_IOC_RESVSP: case FS_IOC_RESVSP64: return ioctl_preallocate(filp, 0, p); case FS_IOC_UNRESVSP: case FS_IOC_UNRESVSP64: return ioctl_preallocate(filp, FALLOC_FL_PUNCH_HOLE, p); case FS_IOC_ZERO_RANGE: return ioctl_preallocate(filp, FALLOC_FL_ZERO_RANGE, p); } return -ENOIOCTLCMD; } static int ioctl_fionbio(struct file *filp, int __user *argp) { unsigned int flag; int on, error; error = get_user(on, argp); if (error) return error; flag = O_NONBLOCK; #ifdef __sparc__ /* SunOS compatibility item. */ if (O_NONBLOCK != O_NDELAY) flag |= O_NDELAY; #endif spin_lock(&filp->f_lock); if (on) filp->f_flags |= flag; else filp->f_flags &= ~flag; spin_unlock(&filp->f_lock); return error; } static int ioctl_fioasync(unsigned int fd, struct file *filp, int __user *argp) { unsigned int flag; int on, error; error = get_user(on, argp); if (error) return error; flag = on ? FASYNC : 0; /* Did FASYNC state change ? */ if ((flag ^ filp->f_flags) & FASYNC) { if (filp->f_op->fasync) /* fasync() adjusts filp->f_flags */ error = filp->f_op->fasync(fd, filp, on); else error = -ENOTTY; } return error < 0 ? error : 0; } static int ioctl_fsfreeze(struct file *filp) { struct super_block *sb = file_inode(filp)->i_sb; if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; /* If filesystem doesn't support freeze feature, return. */ if (sb->s_op->freeze_fs == NULL && sb->s_op->freeze_super == NULL) return -EOPNOTSUPP; /* Freeze */ if (sb->s_op->freeze_super) return sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE, NULL); return freeze_super(sb, FREEZE_HOLDER_USERSPACE, NULL); } static int ioctl_fsthaw(struct file *filp) { struct super_block *sb = file_inode(filp)->i_sb; if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; /* Thaw */ if (sb->s_op->thaw_super) return sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE, NULL); return thaw_super(sb, FREEZE_HOLDER_USERSPACE, NULL); } static int ioctl_file_dedupe_range(struct file *file, struct file_dedupe_range __user *argp) { struct file_dedupe_range *same = NULL; int ret; unsigned long size; u16 count; if (get_user(count, &argp->dest_count)) { ret = -EFAULT; goto out; } size = struct_size(same, info, count); if (size > PAGE_SIZE) { ret = -ENOMEM; goto out; } same = memdup_user(argp, size); if (IS_ERR(same)) { ret = PTR_ERR(same); same = NULL; goto out; } same->dest_count = count; ret = vfs_dedupe_file_range(file, same); if (ret) goto out; ret = copy_to_user(argp, same, size); if (ret) ret = -EFAULT; out: kfree(same); return ret; } static int ioctl_getfsuuid(struct file *file, void __user *argp) { struct super_block *sb = file_inode(file)->i_sb; struct fsuuid2 u = { .len = sb->s_uuid_len, }; if (!sb->s_uuid_len) return -ENOTTY; memcpy(&u.uuid[0], &sb->s_uuid, sb->s_uuid_len); return copy_to_user(argp, &u, sizeof(u)) ? -EFAULT : 0; } static int ioctl_get_fs_sysfs_path(struct file *file, void __user *argp) { struct super_block *sb = file_inode(file)->i_sb; if (!strlen(sb->s_sysfs_name)) return -ENOTTY; struct fs_sysfs_path u = {}; u.len = scnprintf(u.name, sizeof(u.name), "%s/%s", sb->s_type->name, sb->s_sysfs_name); return copy_to_user(argp, &u, sizeof(u)) ? -EFAULT : 0; } /* * do_vfs_ioctl() is not for drivers and not intended to be EXPORT_SYMBOL()'d. * It's just a simple helper for sys_ioctl and compat_sys_ioctl. * * When you add any new common ioctls to the switches above and below, * please ensure they have compatible arguments in compat mode. * * The LSM mailing list should also be notified of any command additions or * changes, as specific LSMs may be affected. */ static int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; struct inode *inode = file_inode(filp); switch (cmd) { case FIOCLEX: set_close_on_exec(fd, 1); return 0; case FIONCLEX: set_close_on_exec(fd, 0); return 0; case FIONBIO: return ioctl_fionbio(filp, argp); case FIOASYNC: return ioctl_fioasync(fd, filp, argp); case FIOQSIZE: if (S_ISDIR(inode->i_mode) || (S_ISREG(inode->i_mode) && !IS_ANON_FILE(inode)) || S_ISLNK(inode->i_mode)) { loff_t res = inode_get_bytes(inode); return copy_to_user(argp, &res, sizeof(res)) ? -EFAULT : 0; } return -ENOTTY; case FIFREEZE: return ioctl_fsfreeze(filp); case FITHAW: return ioctl_fsthaw(filp); case FS_IOC_FIEMAP: return ioctl_fiemap(filp, argp); case FIGETBSZ: /* anon_bdev filesystems may not have a block size */ if (!inode->i_sb->s_blocksize) return -EINVAL; return put_user(inode->i_sb->s_blocksize, (int __user *)argp); case FICLONE: return ioctl_file_clone(filp, arg, 0, 0, 0); case FICLONERANGE: return ioctl_file_clone_range(filp, argp); case FIDEDUPERANGE: return ioctl_file_dedupe_range(filp, argp); case FIONREAD: if (!S_ISREG(inode->i_mode) || IS_ANON_FILE(inode)) return vfs_ioctl(filp, cmd, arg); return put_user(i_size_read(inode) - filp->f_pos, (int __user *)argp); case FS_IOC_GETFLAGS: return ioctl_getflags(filp, argp); case FS_IOC_SETFLAGS: return ioctl_setflags(filp, argp); case FS_IOC_FSGETXATTR: return ioctl_fsgetxattr(filp, argp); case FS_IOC_FSSETXATTR: return ioctl_fssetxattr(filp, argp); case FS_IOC_GETFSUUID: return ioctl_getfsuuid(filp, argp); case FS_IOC_GETFSSYSFSPATH: return ioctl_get_fs_sysfs_path(filp, argp); default: if (S_ISREG(inode->i_mode) && !IS_ANON_FILE(inode)) return file_ioctl(filp, cmd, argp); break; } return -ENOIOCTLCMD; } SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { CLASS(fd, f)(fd); int error; if (fd_empty(f)) return -EBADF; error = security_file_ioctl(fd_file(f), cmd, arg); if (error) return error; error = do_vfs_ioctl(fd_file(f), fd, cmd, arg); if (error == -ENOIOCTLCMD) error = vfs_ioctl(fd_file(f), cmd, arg); return error; } #ifdef CONFIG_COMPAT /** * compat_ptr_ioctl - generic implementation of .compat_ioctl file operation * @file: The file to operate on. * @cmd: The ioctl command number. * @arg: The argument to the ioctl. * * This is not normally called as a function, but instead set in struct * file_operations as * * .compat_ioctl = compat_ptr_ioctl, * * On most architectures, the compat_ptr_ioctl() just passes all arguments * to the corresponding ->ioctl handler. The exception is arch/s390, where * compat_ptr() clears the top bit of a 32-bit pointer value, so user space * pointers to the second 2GB alias the first 2GB, as is the case for * native 32-bit s390 user space. * * The compat_ptr_ioctl() function must therefore be use